In [2]:
import os

# Show the directory the kernel is running in; relative paths resolve against it
current_dir = os.getcwd()
print(current_dir)

/home/rburke/breast_cancer_survival_project/scripts


In [12]:
import os

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sksurv.util import Surv

# Load raw data
df = pd.read_csv("../data/raw/breast_cancer_data_raw.csv")

# Tumor grade: recode the combined '1 or 2' label as 1.5, then cast the column
# to float. Missing grades stay NaN at this point and are dropped further down.
df["Tumor grade"] = df["Tumor grade"].replace('1 or 2', '1.5').astype(float)

# Feature engineering: number of metastatic sites as a binary indicator
# (0 = fewer than 3 sites, 1 = 3 or more).
# Series.map turns every label that is not a key of the dict into NaN, and such
# rows are then removed by the dropna below. Whitespace is stripped before the
# lookup so that spelling variants ('<3', '< 3', '>=3', '>= 3') are all mapped
# instead of being silently discarded; the original labels map as before.
met_site_labels = df['N of met. Sites'].str.replace(r'\s+', '', regex=True)
df['N of met. Sites numeric'] = met_site_labels.map({'<3': 0, '>=3': 1})

# Categorical features that will be included in the model.
# Rows with a missing value in any of them are dropped BEFORE feature selection.
categorical_features = ['subtype', 'metastatic site', 'N of met. Sites numeric', 'Tumor grade']
# Author's note: 1 missing value in 'metastatic site' and 'N of met. Sites' removed below,
# 17 missing values from tumor grade
df = df.dropna(subset=categorical_features)

# Preprocessing: select the model inputs and the survival target

# Numeric features that will be included in the model
numeric_features = ['AGE', 'PS', 'CTCs counts at baseline', 'MAF of gene used at baseline']

# Model inputs: numeric columns first, followed by the categorical columns
features = df[numeric_features + categorical_features]

# Survival target columns
# df[[...]] selects multiple columns and returns a DataFrame;
# df[...] selects a single column and returns a Series.
target = df[['Status OS', 'OS at baseline']]

# Define preprocessing for numeric and categorical variables


# Numeric pipeline:
# Pipeline (sklearn.pipeline) chains preprocessing steps; the only step here is
# scaling. StandardScaler subtracts the mean and scales to unit variance.
# An imputation step was used previously but is no longer needed.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),  # scale numeric features
])

# Categorical encoding:
# One-hot encoding creates one binary column per category so the model can
# consume categorical data. handle_unknown='ignore' makes categories that were
# not seen during fit encode as all zeros at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine both transformers into a single preprocessor.
# OneHotEncoder produces sparse output by default, and ColumnTransformer returns
# a SciPy sparse matrix whenever the overall density of the stacked result is
# below sparse_threshold (default 0.3). sparse_threshold=0 always yields a dense
# array, which is what the DataFrame construction that follows expects.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),          # numeric transformer -> numeric features
        ('cat', categorical_transformer, categorical_features),  # one-hot encoder -> categorical features
    ],
    sparse_threshold=0,
)


# Fit the preprocessor on the selected features and transform them in one pass
X_processed = preprocessor.fit_transform(features)

# Output column names: the numeric features keep their names and come first,
# followed by the names the one-hot encoder generated for each category
cat_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
processed_columns = [*numeric_features, *cat_columns]

# Wrap the transformed matrix in a DataFrame that keeps the row index of df
df_processed = pd.DataFrame(X_processed, columns=processed_columns, index=df.index)

# Append the target variables to the processed DataFrame; .values assigns by
# position rather than by index label
for target_column in ('Status OS', 'OS at baseline'):
    df_processed[target_column] = target[target_column].values



# Creating the survival dataset directly (currently disabled):
# converts 'target' with columns 'Status OS' and 'OS at baseline' into a
# structured array suitable for survival analysis
# y = Surv.from_dataframe("Status OS", "OS at baseline", target)

# Save processed data.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (e.g. on a fresh checkout without data/processed).
processed_path = "../data/processed/breast_cancer_data_processed.csv"
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
df_processed.to_csv(processed_path, index=False)

print("Data preprocessing completed and saved successfully.")

Data preprocessing completed and saved successfully.
