In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the dataset
file_path="../data/dataset_BreastCancerMETABRIC.csv"

# load the data
data = pd.read_csv(file_path)
print("Dataset loaded successfully")

# size of the dataset
print("The shape of the data: {}".format(data.shape))

Dataset loaded successfully
The shape of the data: (2509, 34)


In [3]:
# display basic information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Patient ID                      2509 non-null   object 
 1   Age at Diagnosis                2498 non-null   float64
 2   Type of Breast Surgery          1955 non-null   object 
 3   Cancer Type                     2509 non-null   object 
 4   Cancer Type Detailed            2509 non-null   object 
 5   Cellularity                     1917 non-null   object 
 6   Chemotherapy                    1980 non-null   object 
 7   Pam50 + Claudin-low subtype     1980 non-null   object 
 8   Cohort                          2498 non-null   float64
 9   ER status measured by IHC       2426 non-null   object 
 10  ER Status                       2469 non-null   object 
 11  Neoplasm Histologic Grade       2388 non-null   float64
 12  HER2 status measured by SNP6    19

In [4]:
# getting the first few rows of the dataset
data.head()

Unnamed: 0,Patient ID,Age at Diagnosis,Type of Breast Surgery,Cancer Type,Cancer Type Detailed,Cellularity,Chemotherapy,Pam50 + Claudin-low subtype,Cohort,ER status measured by IHC,...,Overall Survival Status,PR Status,Radio Therapy,Relapse Free Status (Months),Relapse Free Status,Sex,3-Gene classifier subtype,Tumor Size,Tumor Stage,Patient's Vital Status
0,MB-0000,75.65,Mastectomy,Breast Cancer,Breast Invasive Ductal Carcinoma,,No,claudin-low,1.0,Positve,...,Living,Negative,Yes,138.65,Not Recurred,Female,ER-/HER2-,22.0,2.0,Living
1,MB-0002,43.19,Breast Conserving,Breast Cancer,Breast Invasive Ductal Carcinoma,High,No,LumA,1.0,Positve,...,Living,Positive,Yes,83.52,Not Recurred,Female,ER+/HER2- High Prolif,10.0,1.0,Living
2,MB-0005,48.87,Mastectomy,Breast Cancer,Breast Invasive Ductal Carcinoma,High,Yes,LumB,1.0,Positve,...,Deceased,Positive,No,151.28,Recurred,Female,,15.0,2.0,Died of Disease
3,MB-0006,47.68,Mastectomy,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,Yes,LumB,1.0,Positve,...,Living,Positive,Yes,162.76,Not Recurred,Female,,25.0,2.0,Living
4,MB-0008,76.97,Mastectomy,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,Yes,LumB,1.0,Positve,...,Deceased,Positive,Yes,18.55,Recurred,Female,ER+/HER2- High Prolif,40.0,2.0,Died of Disease


In [5]:
# check for missing values
missing_values=data.isnull().mean()*100
print("Missing values (%):")
print(missing_values[missing_values > 0].sort_values(ascending=False))

Missing values (%):
3-Gene classifier subtype         29.693105
Tumor Stage                       28.736548
Primary Tumor Laterality          25.468314
Cellularity                       23.595058
Type of Breast Surgery            22.080510
Integrative Cluster               21.084097
Hormone Therapy                   21.084097
Radio Therapy                     21.084097
PR Status                         21.084097
Inferred Menopausal State         21.084097
Patient's Vital Status            21.084097
Pam50 + Claudin-low subtype       21.084097
HER2 status measured by SNP6      21.084097
Chemotherapy                      21.084097
HER2 Status                       21.084097
Overall Survival Status           21.044241
Overall Survival (Months)         21.044241
Lymph nodes examined positive     10.601833
Nottingham prognostic index        8.848147
Mutation Count                     6.058191
Tumor Size                         5.938621
Tumor Other Histologic Subtype     5.380630
Relapse Free

#### 1. Columns with 20-30% missing data:
    - 3-Gene classifier sbutype (categorical)
        -- Use Mode imputation or KNN imputation

    - Tumor Stage (categorical)
        -- Impute with Mode

    - Primary Tumor Laterality (categorical)
        -- Impute with Mode or add a placeholder like "Unkown"

    - Cellularity (categorical)
        -- Impute with mode

In [9]:
# filter columns with 20-30% missing values
columns_20_30_missing = missing_values[(missing_values >= 20) & (missing_values < 30)].index
print("Columns with 20%-30% missing data:", columns_20_30_missing.tolist())
print("The total number of columns missing 20% of the values: {}".format(len(columns_20_30_missing)))

Columns with 20%-30% missing data: ['Type of Breast Surgery', 'Cellularity', 'Chemotherapy', 'Pam50 + Claudin-low subtype', 'HER2 status measured by SNP6', 'HER2 Status', 'Hormone Therapy', 'Inferred Menopausal State', 'Integrative Cluster', 'Primary Tumor Laterality', 'Overall Survival (Months)', 'Overall Survival Status', 'PR Status', 'Radio Therapy', '3-Gene classifier subtype', 'Tumor Stage', "Patient's Vital Status"]
The total number of columns missing 20% of the values: 17


In [20]:
# Perform one-hot encoding
columns_to_encode = ['Type of Breast Surgery', 'Cellularity', 'Chemotherapy', 
                     'Pam50 + Claudin-low subtype', 'HER2 status measured by SNP6', 
                     'HER2 Status', 'Hormone Therapy', 'Inferred Menopausal State', 
                     'Integrative Cluster', 'Primary Tumor Laterality', 
                     'Overall Survival (Months)', 'Overall Survival Status', 
                     'PR Status', 'Radio Therapy', '3-Gene classifier subtype', 
                     'Tumor Stage', "Patient's Vital Status"]

# Apply one-hot encoding
encoded_data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)

In [21]:
# Check for non-numeric columns
non_numeric_cols = encoded_data.select_dtypes(include=['object']).columns
print("Non-numeric columns in the dataset:", non_numeric_cols)

Non-numeric columns in the dataset: Index(['Patient ID', 'Cancer Type', 'Cancer Type Detailed',
       'ER status measured by IHC', 'ER Status',
       'Tumor Other Histologic Subtype', 'Oncotree Code',
       'Relapse Free Status', 'Sex'],
      dtype='object')


In [22]:
# Convert non-numeric columns to numeric
for col in non_numeric_cols:
    try:
        encoded_data[col] = pd.to_numeric(encoded_data[col], errors='coerce')
    except Exception as e:
        print(f"Error converting column {col}: {e}")

In [23]:
# Impute missing values
encoded_data.fillna(encoded_data.median(), inplace=True)

In [24]:
# Compute the correlation matrix
correlation_matrix = encoded_data.corr()

In [25]:
# Filter for the columns of interest
encoded_columns_to_check = [col for col in correlation_matrix.columns if any(c in col for c in columns_to_encode)]
correlations_of_interest = correlation_matrix[encoded_columns_to_check].loc[encoded_columns_to_check]

In [26]:
# Display the filtered correlation matrix
print("Filtered Correlation Matrix:")
print(correlations_of_interest)

Filtered Correlation Matrix:
                                             Type of Breast Surgery_Mastectomy  \
Type of Breast Surgery_Mastectomy                                     1.000000   
Cellularity_Low                                                       0.076327   
Cellularity_Moderate                                                  0.186497   
Chemotherapy_Yes                                                      0.146381   
Pam50 + Claudin-low subtype_Her2                                      0.155628   
...                                                                        ...   
Tumor Stage_2.0                                                       0.097407   
Tumor Stage_3.0                                                       0.088793   
Tumor Stage_4.0                                                       0.034712   
Patient's Vital Status_Died of Other Causes                           0.192918   
Patient's Vital Status_Living                                        