In [3]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif


from joblib import Parallel, delayed

from pickle import dump , load

In [4]:
# Read data
training_file = "../TrainDataset2024.xls"


data = pd.read_excel(training_file)
data.drop(["ID", "pCR (outcome)"], axis=1, inplace=True)
data_no_na = data.replace(999, np.nan)
data_no_na.dropna(ignore_index=True, inplace=True)
X = data_no_na.drop('RelapseFreeSurvival (outcome)', axis=1)
y = data_no_na['RelapseFreeSurvival (outcome)']

In [5]:
# Drop highly correlated features
CORR_THRESHOLD = 0.8
# Create a correlation matrix
correlation_matrix = X.corr()

highly_correlated_features = set()

for i in range(len(correlation_matrix.columns)):
  for j in range(i):
    if abs(correlation_matrix.iloc[i, j]) > CORR_THRESHOLD:
        highly_correlated_features.add(correlation_matrix.columns[i])

X_no_highly_correlated = X.drop(columns=highly_correlated_features)

In [6]:
scaler = StandardScaler()
Xs = scaler.fit_transform(X_no_highly_correlated)
Xs = pd.DataFrame(Xs, columns=X_no_highly_correlated.columns)

In [7]:
df = pd.concat([y, Xs], axis=1)

corr = df.corr()[["RelapseFreeSurvival (outcome)"]]

corr.drop(["RelapseFreeSurvival (outcome)"], inplace=True)

corr["RelapseFreeSurvival (outcome)"] = abs(corr["RelapseFreeSurvival (outcome)"])

sorted = corr.sort_values(by="RelapseFreeSurvival (outcome)", ascending=False)

sorted

Unnamed: 0,RelapseFreeSurvival (outcome)
original_firstorder_InterquartileRange,0.179408
original_firstorder_Kurtosis,0.146722
TumourStage,0.14049
original_shape_MajorAxisLength,0.133172
original_firstorder_90Percentile,0.131621
ChemoGrade,0.118109
HER2,0.11447
original_shape_Maximum2DDiameterRow,0.110054
original_shape_LeastAxisLength,0.093966
original_shape_Maximum2DDiameterColumn,0.09388


In [11]:
print(sorted.drop(['ER', 'HER2', 'Gene']))

                                                 RelapseFreeSurvival (outcome)
original_firstorder_InterquartileRange                                0.179408
original_firstorder_Kurtosis                                          0.146722
TumourStage                                                           0.140490
original_shape_MajorAxisLength                                        0.133172
original_firstorder_90Percentile                                      0.131621
ChemoGrade                                                            0.118109
original_shape_Maximum2DDiameterRow                                   0.110054
original_shape_LeastAxisLength                                        0.093966
original_shape_Maximum2DDiameterColumn                                0.093880
original_glszm_SmallAreaEmphasis                                      0.085894
Age                                                                   0.076094
original_shape_Sphericity                           

In [13]:
print(len(sorted))

36


In [14]:
num_of_features_list = [5, 10, 15, 20, 25, 30, 35]

for n in num_of_features_list:
  with open(f"pkl/regression_features_corr_{n}_selected_features.pkl", 'wb') as file:
    print(['ER', 'HER2', 'Gene'] + list(sorted[:n-3].index))
    dump(list(sorted[:n].index), file)
    print(f"Saved {file.name}")



['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis']
Saved pkl/regression_features_corr_5_selected_features.pkl
['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2']
Saved pkl/regression_features_corr_10_selected_features.pkl
['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2', 'original_shape_Maximum2DDiameterRow', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_glszm_SmallAreaEmphasis', 'Age']
Saved pkl/regression_features_corr_15_selected_features.pkl
['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_fi