In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import os
from functools import reduce

import numpy as np

In [2]:
# Input files, resolved relative to the notebook's working directory.
XL_PATH = "radiomicsFeatures.csv"  # radiomics feature table
DB_PATH = "DB_INT.csv"  # table providing "Patient ID" and "Upgrade"

In [3]:
# Values of the `sequence` column that are flattened into the wide table,
# in the order their column groups are merged.
# NOTE(review): "SOI" presumably stands for "sequences of interest" - confirm.
SOI = [
    "t2w",
    "adc",
    "sub_win",
    "sub_wout",
]

### 1. Dataset Preparation

In [4]:
# Load the radiomics feature table; index_col=0 uses the first CSV column as the row index.
raw_feats_df = pd.read_csv(XL_PATH, index_col=0)
# Load the label table (read later through its "Patient ID" and "Upgrade" columns).
raw_label_df = pd.read_csv(DB_PATH)

In [5]:
# Attach a binary `label` to every feature row: 1 when the patient's "Upgrade"
# entry in the label table equals "Yes", 0 for any other value (including NaN).
#
# The lookup is a single vectorised map rather than one boolean-mask scan of
# `raw_label_df` per feature row, and a patient id without a label row now
# fails with an explicit message instead of an opaque positional IndexError.
upgrade_by_pid = (
    raw_label_df
    .dropna(subset=["Patient ID"])
    # keep="first" mirrors taking the first matching label row per patient
    .drop_duplicates(subset="Patient ID", keep="first")
    .set_index("Patient ID")["Upgrade"]
)

# Every feature row must have a matching patient in the label table.
unknown_ids = raw_feats_df.loc[
    ~raw_feats_df["id"].isin(upgrade_by_pid.index), "id"
].unique()
if len(unknown_ids) > 0:
    raise KeyError(
        f"{len(unknown_ids)} patient id(s) from the feature table are missing "
        f"in the label table: {list(unknown_ids[:10])}"
    )

# Plain Python ints, one per feature row, in row order.
label_values = (
    raw_feats_df["id"].map(upgrade_by_pid).eq("Yes").astype("int64").tolist()
)

raw_feats_df["label"] = label_values

# Keep only the "original" feature columns and skip the "diagnostics" ones.
original_feats = [
    column
    for column in raw_feats_df.columns
    if "original" in column and "diagnostics" not in column
]
feats_df = raw_feats_df[["sequence", "id", "label"] + original_feats]

100%|████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 2407.20it/s]


In [6]:
# Preview the first rows of the filtered table (sequence, id, label + "original" features).
feats_df.head()

Unnamed: 0,sequence,id,label,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,...,original_glszm_LargeAreaLowGrayLevelEmphasis,original_glszm_LowGrayLevelZoneEmphasis,original_glszm_SizeZoneNonUniformity,original_glszm_SizeZoneNonUniformityNormalized,original_glszm_SmallAreaEmphasis,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance
0,t2w,2535039,1,192.418499,452.058943,680574300.0,5.477913,125.886276,3.375305,600.0,...,0.033786,0.002423,667.188064,0.28647,0.383775,883.573857,0.001106,7.80273,0.340248,10.602551
1,t2w,2417361,0,185.721632,440.305,1168039000.0,6.243059,135.576942,3.062526,600.0,...,0.009861,0.00154,1790.620016,0.350004,0.424907,1147.830748,0.000626,8.169777,0.436705,3.239467
2,adc,2535039,1,219.666667,470.37037,752771300.0,6.25846,123.333333,3.100725,600.0,...,0.010393,0.00147,964.619324,0.350388,0.412557,2036.268619,0.00033,8.081728,0.441752,2.365617
3,adc,2417361,0,203.703704,471.111111,1484331000.0,6.400923,140.740741,3.197323,600.0,...,0.058145,0.003444,1903.238975,0.358696,0.429808,2138.833383,0.001331,8.23919,0.456312,2.085461
4,sub_win,2535039,1,202.467632,414.119741,600167400.0,4.199555,96.205504,4.335266,600.0,...,0.201518,0.005894,954.937228,0.4611,0.706208,3166.349551,0.0042,6.339939,0.33923,522.765026


In [15]:
# Turn the long table (rows tagged by `sequence`) into a wide one: each
# sequence contributes its own block of columns, prefixed with the sequence
# name, while the "id" and "label" key columns keep their names.
def _merge_on_patient(left, right):
    """Join two per-sequence frames on the shared ("id", "label") keys.

    pd.merge defaults to an inner join, so a patient missing from either
    frame is absent from the result.
    """
    return pd.merge(left, right, on=["id", "label"])


flattened_feats_df = [
    feats_df.loc[feats_df["sequence"] == sequence]
    .drop(columns="sequence")
    .rename(
        columns=lambda name, prefix=sequence: (
            name if name in ("id", "label") else f"{prefix}_{name}"
        )
    )
    for sequence in SOI
]

# Fold the per-sequence frames together, left to right, in SOI order.
merged_feats_df = reduce(_merge_on_patient, flattened_feats_df)

In [16]:
# Preview the wide table produced by merging the per-sequence frames on id/label.
merged_feats_df.head()

Unnamed: 0,id,label,t2w_original_firstorder_10Percentile,t2w_original_firstorder_90Percentile,t2w_original_firstorder_Energy,t2w_original_firstorder_Entropy,t2w_original_firstorder_InterquartileRange,t2w_original_firstorder_Kurtosis,t2w_original_firstorder_Maximum,t2w_original_firstorder_MeanAbsoluteDeviation,...,sub_wout_original_glszm_LargeAreaLowGrayLevelEmphasis,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_glszm_SizeZoneNonUniformity,sub_wout_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaEmphasis,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_wout_original_glszm_SmallAreaLowGrayLevelEmphasis,sub_wout_original_glszm_ZoneEntropy,sub_wout_original_glszm_ZonePercentage,sub_wout_original_glszm_ZoneVariance
0,2535039,1,192.418499,452.058943,680574300.0,5.477913,125.886276,3.375305,600.0,77.97818,...,0.018184,0.003103,1647.676735,0.478558,0.719452,2946.8378,0.001447,7.183907,0.563964,2.851537
1,2417361,0,185.721632,440.305,1168039000.0,6.243059,135.576942,3.062526,600.0,79.878761,...,0.008935,0.001672,4680.378895,0.612775,0.810952,3322.225544,0.001139,7.450441,0.714901,0.960889
2,2602563,1,172.244683,438.200403,317396400.0,6.250625,150.863771,2.454526,600.0,83.052248,...,0.019154,0.0016,1436.853244,0.642887,0.829284,3407.597573,0.000765,7.212101,0.743019,0.824004
3,2902440,0,189.364163,434.471326,1038689000.0,6.141777,118.228161,3.547675,600.0,76.227598,...,0.054978,0.002428,4318.753593,0.591205,0.797411,3121.573712,0.001289,7.378925,0.696178,1.131925
4,2921898,0,166.287053,423.935053,1244073000.0,5.020437,122.689524,3.186097,600.0,77.347125,...,0.019192,0.00172,4364.415606,0.55016,0.770309,3175.569089,0.001042,7.548119,0.643129,1.852684


### 2. Removing Features with Near-Zero Variance

In [18]:
# Variance cut-off: features whose variance is <= this value are discarded below.
# NOTE(review): the name `threshold` is rebound to 0.85 in the correlation
# section, so re-running the variance filter after that cell would silently use
# the wrong value; distinct names would avoid this.
threshold = 1e-6

In [19]:
# Candidate features: every merged column apart from the two key columns.
feats = [
    name
    for name in merged_feats_df.columns
    if name != "id" and name != "label"
]

In [20]:
# Variance of each candidate feature, computed over all rows of the wide table.
feats_var = merged_feats_df.loc[:, feats].var()

# Features whose variance does not exceed the threshold are marked for removal.
mask_feats = feats_var.loc[feats_var.le(threshold)].index.to_list()

# Keep the remaining features in their original column order.
low_variance = set(mask_feats)
selected_feats = [name for name in feats if name not in low_variance]

print(f"Deleted {len(mask_feats)}/{len(feats)} features, remaining {len(selected_feats)} features")

Deleted 9/288 features, remaining 279 features


### 3. Correlation Analysis

In [21]:
# Absolute Spearman correlation at or above which a feature is treated as redundant.
# NOTE(review): this rebinds `threshold`, which the variance section set to 1e-6;
# re-running the variance filter after this cell would pick up 0.85 instead.
threshold = 0.85

In [22]:
# Absolute Spearman correlation between every pair of variance-filtered features.
corr_matrix = merged_feats_df.loc[:, selected_feats].corr(method="spearman").abs()

# Mean absolute correlation of each feature with all features (itself included).
mean_corr = corr_matrix.mean()

# Rank features from least to most correlated on average: a feature with a low
# mean correlation is less redundant than one with a high mean correlation, so
# it comes first and is favoured by the pruning step that follows.
ordered_feats = mean_corr.sort_values().index.to_list()

In [23]:
# Recompute the absolute Spearman correlations with the columns in ranked order.
corr_matrix = merged_feats_df.loc[:, ordered_feats].corr(method="spearman").abs()

# Keep only the strict upper triangle, so each column is compared solely with
# the columns that precede it in `ordered_feats` (everything else becomes NaN).
strict_upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
up_tri = corr_matrix.where(strict_upper)

# A column is marked for removal when it reaches the threshold against ANY
# earlier column - including earlier columns that are themselves removed.
too_correlated = up_tri.ge(threshold).any(axis=0)
mask_feats = too_correlated[too_correlated].index.to_list()

redundant = set(mask_feats)
selected_feats = [name for name in ordered_feats if name not in redundant]

print(f"Deleted {len(mask_feats)}/{len(ordered_feats)} features, remaining {len(selected_feats)} features")

Deleted 190/279 features, remaining 89 features


In [24]:
# Final curated table: the two key columns followed by the retained features.
# NOTE(review): this rebinds `feats_df`, which earlier held the long-format
# table with a `sequence` column; re-running the flattening cell after this one
# fails because that column no longer exists. A distinct name would avoid it.
feats_df = merged_feats_df.loc[:, ["id", "label", *selected_feats]]

display(feats_df.head())

Unnamed: 0,id,label,sub_wout_original_glcm_ClusterProminence,adc_original_firstorder_Minimum,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_firstorder_Maximum,adc_original_glcm_ClusterShade,sub_wout_original_firstorder_Mean,sub_win_original_glcm_Autocorrelation,adc_original_glszm_LargeAreaLowGrayLevelEmphasis,...,sub_win_original_glszm_ZoneEntropy,t2w_original_glszm_SizeZoneNonUniformityNormalized,t2w_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaHighGrayLevelEmphasis,sub_win_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_win_original_glcm_MaximumProbability,sub_win_original_glcm_Imc1,sub_wout_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaLowGrayLevelEmphasis
0,2535039,1,4677862.0,0.0,0.003103,600.0,14835.837461,299.900214,3755.933491,0.010393,...,6.339939,0.28647,10.166389,27423.571919,0.4611,2946.8378,0.034622,-0.041978,10.452108,0.033786
1,2417361,0,4834267.0,0.0,0.001672,600.0,-17634.03485,299.918235,3941.494865,0.058145,...,7.42477,0.350004,11.649157,21732.551407,0.604518,3322.225544,0.002107,-0.109242,11.891117,0.009861
2,2602563,1,5159220.0,0.0,0.0016,600.0,-19736.4305,299.820687,2455.254084,0.019202,...,7.23927,0.350692,10.919838,15567.069802,0.574356,3407.597573,0.004002,-0.194449,11.214368,0.018991
3,2902440,0,3613791.0,0.0,0.002428,600.0,-12881.976888,299.240444,3954.079034,0.576021,...,7.45439,0.380537,11.53,18389.243521,0.566131,3121.573712,0.004134,-0.116415,11.669841,0.007846
4,2921898,0,5773968.0,0.0,0.00172,600.0,2116.811733,299.983523,3793.819336,0.011764,...,6.75517,0.265413,9.504938,245786.779116,0.469149,3175.569089,0.027634,-0.05868,11.459667,0.024444


In [25]:
# Write the curated table (id, label and retained features) to disk; index=False omits the row index.
feats_df.to_csv("curated_df.csv", index=False)