In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides library deprecation warnings — confirm
# that is intended, or narrow it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Raw METABRIC breast-cancer export, addressed relative to the notebook folder
file_path = "../data/dataset_BreastCancerMETABRIC.csv"

# Parse the CSV into a DataFrame and confirm that the read went through
data = pd.read_csv(filepath_or_buffer=file_path)
print("Dataset loaded successfully")

# Report the (rows, columns) dimensions of the loaded frame
dataset_shape = data.shape
print("The shape of the data: {}".format(dataset_shape))

Dataset loaded successfully
The shape of the data: (2509, 34)


In [3]:
# Summarize the frame: index range, per-column non-null counts and dtypes.
# A non-null count below the number of entries means the column has missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Patient ID                      2509 non-null   object 
 1   Age at Diagnosis                2498 non-null   float64
 2   Type of Breast Surgery          1955 non-null   object 
 3   Cancer Type                     2509 non-null   object 
 4   Cancer Type Detailed            2509 non-null   object 
 5   Cellularity                     1917 non-null   object 
 6   Chemotherapy                    1980 non-null   object 
 7   Pam50 + Claudin-low subtype     1980 non-null   object 
 8   Cohort                          2498 non-null   float64
 9   ER status measured by IHC       2426 non-null   object 
 10  ER Status                       2469 non-null   object 
 11  Neoplasm Histologic Grade       2388 non-null   float64
 12  HER2 status measured by SNP6    19

In [4]:
# Preview the first rows of the dataset (head() returns five rows by default)
data.head()

Unnamed: 0,Patient ID,Age at Diagnosis,Type of Breast Surgery,Cancer Type,Cancer Type Detailed,Cellularity,Chemotherapy,Pam50 + Claudin-low subtype,Cohort,ER status measured by IHC,...,Overall Survival Status,PR Status,Radio Therapy,Relapse Free Status (Months),Relapse Free Status,Sex,3-Gene classifier subtype,Tumor Size,Tumor Stage,Patient's Vital Status
0,MB-0000,75.65,Mastectomy,Breast Cancer,Breast Invasive Ductal Carcinoma,,No,claudin-low,1.0,Positve,...,Living,Negative,Yes,138.65,Not Recurred,Female,ER-/HER2-,22.0,2.0,Living
1,MB-0002,43.19,Breast Conserving,Breast Cancer,Breast Invasive Ductal Carcinoma,High,No,LumA,1.0,Positve,...,Living,Positive,Yes,83.52,Not Recurred,Female,ER+/HER2- High Prolif,10.0,1.0,Living
2,MB-0005,48.87,Mastectomy,Breast Cancer,Breast Invasive Ductal Carcinoma,High,Yes,LumB,1.0,Positve,...,Deceased,Positive,No,151.28,Recurred,Female,,15.0,2.0,Died of Disease
3,MB-0006,47.68,Mastectomy,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,Yes,LumB,1.0,Positve,...,Living,Positive,Yes,162.76,Not Recurred,Female,,25.0,2.0,Living
4,MB-0008,76.97,Mastectomy,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,Yes,LumB,1.0,Positve,...,Deceased,Positive,Yes,18.55,Recurred,Female,ER+/HER2- High Prolif,40.0,2.0,Died of Disease


In [5]:
# Count the null entries per column, then list only the columns that have any
missing_values = data.isna().sum()
print("\nColumns with missing values:")
print(missing_values.loc[missing_values.gt(0)])


Columns with missing values:
Age at Diagnosis                   11
Type of Breast Surgery            554
Cellularity                       592
Chemotherapy                      529
Pam50 + Claudin-low subtype       529
Cohort                             11
ER status measured by IHC          83
ER Status                          40
Neoplasm Histologic Grade         121
HER2 status measured by SNP6      529
HER2 Status                       529
Tumor Other Histologic Subtype    135
Hormone Therapy                   529
Inferred Menopausal State         529
Integrative Cluster               529
Primary Tumor Laterality          639
Lymph nodes examined positive     266
Mutation Count                    152
Nottingham prognostic index       222
Overall Survival (Months)         528
Overall Survival Status           528
PR Status                         529
Radio Therapy                     529
Relapse Free Status (Months)      121
Relapse Free Status                21
3-Gene classifier su

In [6]:
# Identify the columns that currently contain missing values.
# They are derived from `data` itself (not from a summary built in an earlier
# cell) so this cell gives the same result however often it is re-run.
columns_with_missing = data.columns[data.isnull().any()]

# Apply mode imputation: every gap is filled with the column's most frequent value.
# NOTE(review): this is applied to numeric columns (e.g. the float64 age column)
# as well as categorical ones — confirm that mode, not median, is intended there.
columns_without_mode = []
for col in columns_with_missing:
    modes = data[col].mode()  # mode() ignores NaN, so it is empty for an all-NaN column
    if modes.empty:
        # Nothing to impute from; skip instead of failing on modes[0]
        columns_without_mode.append(col)
        continue
    # Assign the filled Series back to the frame. Calling
    # `data[col].fillna(..., inplace=True)` operates on an intermediate object:
    # recent pandas versions warn about it and, with Copy-on-Write enabled,
    # `data` is never updated.
    data[col] = data[col].fillna(modes.iloc[0])

if columns_without_mode:
    print("\nColumns skipped because they contain no observed values:")
    print(columns_without_mode)

print("\nMode imputation completed for all columns with missing values.")


Mode imputation completed for all columns with missing values.


In [7]:
# Re-count nulls per column to verify that the imputation left nothing behind
remaining_missing_values = data.isna().sum()
if remaining_missing_values.gt(0).any():
    # At least one column still has gaps: list the offenders with their counts
    print("\nColumns with remaining missing values after imputation:")
    print(remaining_missing_values.loc[remaining_missing_values.gt(0)])
else:
    print("\nNo missing values remain in the dataset after mode imputation.")


No missing values remain in the dataset after mode imputation.


In [8]:
# Normalize the column labels: spaces become underscores, letters become lowercase.
# Other punctuation (apostrophes, '+', '-', parentheses) is left as it is.
data.columns = [label.replace(" ", "_").lower() for label in data.columns]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   patient_id                      2509 non-null   object 
 1   age_at_diagnosis                2509 non-null   float64
 2   type_of_breast_surgery          2509 non-null   object 
 3   cancer_type                     2509 non-null   object 
 4   cancer_type_detailed            2509 non-null   object 
 5   cellularity                     2509 non-null   object 
 6   chemotherapy                    2509 non-null   object 
 7   pam50_+_claudin-low_subtype     2509 non-null   object 
 8   cohort                          2509 non-null   float64
 9   er_status_measured_by_ihc       2509 non-null   object 
 10  er_status                       2509 non-null   object 
 11  neoplasm_histologic_grade       2509 non-null   float64
 12  her2_status_measured_by_snp6    25

In [9]:
# Inspect the distinct labels present in the vital-status column
data["patient's_vital_status"].unique()

array(['Living', 'Died of Disease', 'Died of Other Causes'], dtype=object)

In [10]:
# Encode the vital status as a binary outcome: 0 = living, 1 = deceased.
# Both "Died of Disease" and "Died of Other Causes" are collapsed into 1.
vital_status_codes = {"Living": 0,
                      "Died of Disease": 1,
                      "Died of Other Causes": 1}

# Translate labels through the dict; values that are not keys (in particular an
# already-encoded 0 or 1) pass through unchanged. A plain `.map(dict)` would turn
# every already-encoded value into NaN if this cell were run a second time.
encoded_status = data["patient's_vital_status"].map(
    lambda status: vital_status_codes.get(status, status)
)

# Fail loudly on anything that is neither a known label nor an existing 0/1 code,
# instead of letting it slip into the saved dataset as NaN or a stray string.
unexpected = encoded_status[~encoded_status.isin([0, 1])].unique()
if len(unexpected) > 0:
    raise ValueError(f"Unexpected vital status values: {list(unexpected)}")

data["patient's_vital_status"] = encoded_status.astype("int64")
data["patient's_vital_status"].unique()

array([0, 1])

In [11]:
# Write the cleaned frame into the 'data' folder that sits beside 'notebooks',
# leaving the positional index out of the file
output_path = "../data/preprocessed_breast_cancer_data.csv"
data.to_csv(path_or_buf=output_path, index=False)

print(f"\nPreprocessed dataset saved successfully at: {output_path}")


Preprocessed dataset saved successfully at: ../data/preprocessed_breast_cancer_data.csv
