In [44]:
# Code to preprocess outcome data and imaging features, then create predictions based on the features
# Outputs include: predicted models, accuracies, feature importance, and Table 1 (basic distributions by outcome)
# Setup, imports
import math
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
#import graphviz
#from graphviz import Source
from IPython.display import SVG
from tableone import TableOne
from IPython.display import HTML

from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn import preprocessing, decomposition
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, roc_auc_score, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import auc 
from sklearn.metrics import RocCurveDisplay

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 106)

# Helper functions

In [51]:
#feature selection
#dropping high VIF columns
def calculate_vif(X, thresh=10):
    """Iteratively drop the column with the highest variance inflation factor.

    On each pass the VIF of every remaining column is computed with
    ``variance_inflation_factor``. If the largest VIF exceeds ``thresh`` that
    column is removed and the pass is repeated on the reduced column set.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns. The values are handed to statsmodels as a
        plain array, so the columns are presumably expected to be numeric.
    thresh : float, default 10
        A column is dropped only while the current maximum VIF is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``X`` restricted to the surviving columns, in
        their original order. Frames with fewer than two columns are returned
        unchanged (as a copy).
    """
    variables = list(range(X.shape[1]))

    # VIF regresses one column on the *other* columns, so there is nothing to
    # test once fewer than two columns remain. This also avoids calling max()
    # on an empty list for a frame without columns.
    while len(variables) > 1:
        # Build the design matrix once per pass instead of once per column.
        values = X.iloc[:, variables].values
        vif = [variance_inflation_factor(values, ix)
               for ix in range(values.shape[1])]

        max_vif = max(vif)
        if not max_vif > thresh:
            break

        maxloc = vif.index(max_vif)
        # The reported index is the position within the remaining columns.
        print('dropping \'' + str(X.columns[variables[maxloc]]) +
              '\' at index: ' + str(maxloc))
        del variables[maxloc]

    remaining = X.iloc[:, variables]
    print('Remaining variables:')
    print(remaining.columns)
    # Return a copy so callers can add columns without touching a slice of X.
    return remaining.copy()

def VIF_feature_select(data, name, id_col_name = 'filename', feature_select = 1):
    """Run (or load from cache) VIF-based feature selection on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table, optionally containing an identifier column.
    name : str
        Tag used to build the cache file name ``dataFiltered_<name>.csv`` in
        the current working directory.
    id_col_name : str, default 'filename'
        Identifier column. If present it is excluded from the VIF computation
        and re-attached (as the last column) to the filtered result.
    feature_select : int, default 1
        ``0`` disables selection: a copy of ``data`` is returned unchanged,
        identifier column included, and the cache is neither read nor written.
        Any other value runs the selection.

    Returns
    -------
    pandas.DataFrame
        The filtered features (plus the identifier column when present), or
        the cached frame read from ``dataFiltered_<name>.csv`` if that file
        already exists.

    Notes
    -----
    The cache is keyed only by ``name``; it is not checked against the
    contents of ``data``. Delete the CSV to force a recomputation.
    """
    # Selection disabled: hand the input back untouched. Checking this first
    # keeps the identifier column and prevents a previously cached, filtered
    # frame from being returned when no filtering was requested.
    if feature_select == 0:
        return data.copy()

    csvName = 'dataFiltered_' + name + '.csv'

    if os.path.isfile(csvName):
        return pd.read_csv(csvName)

    # The identifier is not a feature, so keep it out of the VIF computation.
    has_id = id_col_name in data.columns
    features = data.drop(columns=[id_col_name]) if has_id else data

    # .copy() so the identifier is added to an independent frame.
    dataFiltered = calculate_vif(features, 10).copy()

    if has_id:
        # Index-aligned assignment; rows keep their original identifiers.
        dataFiltered[id_col_name] = data[id_col_name]

    #saving feature selected data
    dataFiltered.to_csv(csvName, index = False)

    return dataFiltered

In [37]:
#data imports and preprocessing
data_radiomics_raw = pd.read_csv('radiomics.csv',engine='python')
data_outcomes_raw = pd.read_csv('segmentation_labels_mgfr_cls.csv',engine='python')

# Joining both tables by image ID; the ID columns are labelled differently,
# so both are normalised first:
#   outcomes : remove the '.png' text from 'filename'
#   radiomics: drop the last '_'-separated token of 'imName'
data_outcomes_raw['filename'] = data_outcomes_raw['filename'].str.replace('.png', '', regex=False)
data_radiomics_raw['imName'] = data_radiomics_raw['imName'].str.split('_').str[:-1].str.join('_')

# removing useless columns
data_radiomics_raw = data_radiomics_raw.drop('maskNum', axis=1)
# .copy() so the flag columns below are added to an independent frame rather
# than to a view-like result of drop_duplicates.
data_outcomes_raw = data_outcomes_raw.drop_duplicates(subset = 'filename', keep = 'first').copy()

# binary mapping of outcomes and file labels
# mgfr_cls values other than the two mapped labels become NaN.
data_outcomes_raw['mgfr_cls_flag'] = data_outcomes_raw['mgfr_cls'].map({'Normal': 0, 'Kidney_Disease': 1})
# Non-string entries (e.g. a missing file_attributes cell read as NaN) map to
# None instead of raising a TypeError on the `in` test.
data_outcomes_raw['file_attributes_flag'] = data_outcomes_raw['file_attributes'].apply(
    lambda x: None if not isinstance(x, str)
    else (0 if 'Normal' in x else (1 if 'Abnormal' in x else None))
)

data_outcomes_raw_filtered = data_outcomes_raw[['filename', 'file_attributes_flag', 'mgfr', 'mgfr_cls_flag']]

# Inner join: only images present in both tables are kept.
data_all = pd.merge(
    data_outcomes_raw_filtered,
    data_radiomics_raw,
    left_on='filename',
    right_on='imName',
    how='inner'  # use 'outer', 'left', or 'right' if needed
)

# 'imName' duplicates 'filename' after the join.
data_all = data_all.drop('imName', axis=1)
# NOTE: the row index is written as the first (unnamed) column of the CSV.
data_all.to_csv("data_cleaned.csv")
data_all

Unnamed: 0,filename,file_attributes_flag,mgfr,mgfr_cls_flag,original_shape2D_Elongation,original_shape2D_MajorAxisLength,original_shape2D_MaximumDiameter,original_shape2D_MeshSurface,original_shape2D_MinorAxisLength,original_shape2D_Perimeter,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1__Still images_IM-0001-0020,0,201.050000,0,0.349595,1622.958220,1180.034745,107203.0,567.378098,4257.947761,...,1.095314,0.038393,6.942736,0.053655,776.825617,77.763435,0.000624,2.261034,0.000480,0.082289
1,1__Still images_IM-0001-0040,0,201.050000,0,0.417717,1905.351612,1404.624505,130856.5,795.898301,5198.174383,...,0.260777,0.040268,6.472604,0.068028,196.311433,529.590261,0.000316,0.445347,0.003166,0.002818
2,2__Still Images_IM-0001-0016,0,151.580000,0,0.346946,2116.748882,1583.480344,160310.5,734.396706,5086.256418,...,1.376564,0.029196,7.326310,0.043765,954.032463,86.649771,0.000504,1.850986,0.000492,0.062204
3,2__Still Images_IM-0001-0062,1,151.580000,0,0.473713,2323.803551,1779.031759,159973.0,1100.815160,5417.976331,...,0.506615,0.028774,7.345738,0.040238,846.827062,218.898343,0.000566,0.389957,0.001143,0.010082
4,3__still_images_5384_IM-0001-0031,0,40.830000,1,0.092461,1587.460963,1385.428814,129115.5,146.777456,3112.052957,...,0.328613,0.020899,7.227684,0.046129,536.441511,150.016135,0.000573,0.417694,0.001739,0.007096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,365a__Still_images_5346_IM-0002-0033,1,30.251345,1,0.484504,1478.479063,1019.656805,104768.5,716.328636,4424.487732,...,0.763487,0.019312,6.851189,0.073648,177.865458,117.500891,0.000471,1.020091,0.002587,0.008424
690,365a__Still_images_5346_IM-0002-0049,1,30.251345,1,0.422497,1592.250172,1176.384291,143944.5,672.720618,4502.239174,...,0.393930,0.038544,6.826292,0.063997,241.866607,380.024690,0.000327,0.354136,0.003597,0.001939
691,383a__Still_images_5375_IM-0001-0006,0,41.953214,1,0.524145,764.781866,558.649264,25410.5,400.856796,2461.336362,...,2.516635,0.076148,5.587849,0.164614,81.154095,62.247313,0.000966,3.585624,0.005910,0.032431
692,383b__Still_images_5379_IM-0001-0031,0,41.953214,1,0.534737,755.302919,561.381332,25080.0,403.888608,2494.182033,...,3.309076,0.063094,5.455190,0.175870,142.155549,43.245019,0.001396,5.264928,0.007239,0.071786


In [45]:
# basic stats
# Table 1: distribution of every variable, grouped by the binary outcome.
# The table is built from data_table1 (data_all without the identifier
# column); the name `data_cleaned` used here before is never assigned in this
# notebook, so the cell failed on a fresh kernel.
data_table1 = data_all.drop('filename', axis=1)
table1_columns = data_table1.columns.tolist()
table1_binary = TableOne(data_table1, columns=table1_columns, pval = True,
                         groupby = ['mgfr_cls_flag'], htest_name=True, smd=True)
table1_binary

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by mgfr_cls_flag,Grouped by mgfr_cls_flag,Grouped by mgfr_cls_flag,Grouped by mgfr_cls_flag,Grouped by mgfr_cls_flag,Grouped by mgfr_cls_flag,Grouped by mgfr_cls_flag
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0,1,P-Value,Test,"SMD (0,1)"
n,,,694,504,190,,,
"file_attributes_flag, n (%)",0.0,0.0,545 (78.5),437 (86.7),108 (56.8),<0.001,Chi-squared,0.703
"file_attributes_flag, n (%)",1.0,,149 (21.5),67 (13.3),82 (43.2),,,
"mgfr, mean (SD)",,0.0,120.2 (49.3),141.4 (39.2),63.8 (20.7),<0.001,Two Sample T-test,-2.479
"original_shape2D_Elongation, mean (SD)",,0.0,0.4 (0.1),0.4 (0.1),0.4 (0.1),0.061,Two Sample T-test,-0.163
"original_shape2D_MajorAxisLength, mean (SD)",,0.0,1610.8 (685.5),1672.9 (670.7),1446.1 (698.7),<0.001,Two Sample T-test,-0.331
"original_shape2D_MaximumDiameter, mean (SD)",,0.0,1189.9 (505.1),1236.7 (493.3),1066.0 (516.2),<0.001,Two Sample T-test,-0.338
"original_shape2D_MeshSurface, mean (SD)",,0.0,117288.8 (81163.5),124602.5 (81119.3),97888.2 (78233.3),<0.001,Two Sample T-test,-0.335
"original_shape2D_MinorAxisLength, mean (SD)",,0.0,690.6 (322.5),722.1 (319.1),606.9 (317.1),<0.001,Two Sample T-test,-0.362
"original_shape2D_Perimeter, mean (SD)",,0.0,4701.7 (2075.1),4901.5 (2028.6),4171.8 (2109.0),<0.001,Two Sample T-test,-0.353


In [53]:
# Feature selection
# Approx n = 350, so max 30-35 features
# Step 1 (this cell): VIF to remove collinear features.
# Step 2 (later cell): mutual-information score selection.

# Outcome columns and the manual abnormal label are kept out of feature selection.
cols_labels = ['mgfr', 'mgfr_cls_flag', 'file_attributes_flag']

# Labels (plus the identifier) are set aside so they can be joined back later.
data_labels_only = data_all.loc[:, ['filename', *cols_labels]]
data_no_labels = data_all.drop(cols_labels, axis=1)

data_all_VIF = VIF_feature_select(
    data=data_no_labels,
    name='data_all',
    id_col_name='filename',
    feature_select=1,
)
data_all_VIF

Unnamed: 0,original_firstorder_10Percentile,original_firstorder_Kurtosis,original_firstorder_Minimum,original_firstorder_TotalEnergy,original_glcm_ClusterProminence,original_glcm_ClusterShade,original_glrlm_ShortRunLowGrayLevelEmphasis,original_glszm_LargeAreaLowGrayLevelEmphasis,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,filename
0,4.0,13.376776,0.0,90659144.0,37.392629,3.989152,0.038393,957.001859,77.763435,0.000624,2.261034,0.000480,0.082289,1__Still images_IM-0001-0020
1,12.0,3.405356,0.0,105680551.0,4.357592,0.920404,0.040268,300.710749,529.590261,0.000316,0.445347,0.003166,0.002818,1__Still images_IM-0001-0040
2,0.0,16.138137,0.0,184878121.0,88.395518,7.074510,0.029196,1019.303855,86.649771,0.000504,1.850986,0.000492,0.062204,2__Still Images_IM-0001-0016
3,7.0,3.813743,0.0,128170438.0,8.154955,1.895926,0.028774,1197.972480,218.898343,0.000566,0.389957,0.001143,0.010082,2__Still Images_IM-0001-0062
4,17.0,2.712345,0.0,229767002.0,8.906822,0.052417,0.020899,420.201331,150.016135,0.000573,0.417694,0.001739,0.007096,3__still_images_5384_IM-0001-0031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,25.0,2.994076,0.0,300490843.0,17.189528,-0.138433,0.019312,97.752753,117.500891,0.000471,1.020091,0.002587,0.008424,365a__Still_images_5346_IM-0002-0033
690,20.0,2.710040,0.0,225745526.0,6.843876,0.199794,0.038544,165.908680,380.024690,0.000327,0.354136,0.003597,0.001939,365a__Still_images_5346_IM-0002-0049
691,10.0,3.931123,0.0,55222386.0,42.609845,4.609357,0.076148,77.489057,62.247313,0.000966,3.585624,0.005910,0.032431,383a__Still_images_5375_IM-0001-0006
692,0.0,2.864715,0.0,55410325.0,98.226246,11.519947,0.063094,159.982082,43.245019,0.001396,5.264928,0.007239,0.071786,383b__Still_images_5379_IM-0001-0031


In [88]:
# Preview of the VIF-filtered features. `data_features` is only assigned in
# the next cell, so displaying it here fails on a fresh top-to-bottom run;
# data_all_VIF is the frame it is derived from.
data_all_VIF

Unnamed: 0,original_firstorder_10Percentile,original_firstorder_Kurtosis,original_firstorder_Minimum,original_firstorder_TotalEnergy,original_glcm_ClusterProminence,original_glcm_ClusterShade,original_glrlm_ShortRunLowGrayLevelEmphasis,original_glszm_LargeAreaLowGrayLevelEmphasis,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,filename
0,4.0,13.376776,0.0,90659144.0,37.392629,3.989152,0.038393,957.001859,77.763435,0.000624,2.261034,0.000480,0.082289,1__Still images_IM-0001-0020
1,12.0,3.405356,0.0,105680551.0,4.357592,0.920404,0.040268,300.710749,529.590261,0.000316,0.445347,0.003166,0.002818,1__Still images_IM-0001-0040
2,0.0,16.138137,0.0,184878121.0,88.395518,7.074510,0.029196,1019.303855,86.649771,0.000504,1.850986,0.000492,0.062204,2__Still Images_IM-0001-0016
3,7.0,3.813743,0.0,128170438.0,8.154955,1.895926,0.028774,1197.972480,218.898343,0.000566,0.389957,0.001143,0.010082,2__Still Images_IM-0001-0062
4,17.0,2.712345,0.0,229767002.0,8.906822,0.052417,0.020899,420.201331,150.016135,0.000573,0.417694,0.001739,0.007096,3__still_images_5384_IM-0001-0031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,25.0,2.994076,0.0,300490843.0,17.189528,-0.138433,0.019312,97.752753,117.500891,0.000471,1.020091,0.002587,0.008424,365a__Still_images_5346_IM-0002-0033
690,20.0,2.710040,0.0,225745526.0,6.843876,0.199794,0.038544,165.908680,380.024690,0.000327,0.354136,0.003597,0.001939,365a__Still_images_5346_IM-0002-0049
691,10.0,3.931123,0.0,55222386.0,42.609845,4.609357,0.076148,77.489057,62.247313,0.000966,3.585624,0.005910,0.032431,383a__Still_images_5375_IM-0001-0006
692,0.0,2.864715,0.0,55410325.0,98.226246,11.519947,0.063094,159.982082,43.245019,0.001396,5.264928,0.007239,0.071786,383b__Still_images_5379_IM-0001-0031


In [92]:
# SELECT OUTCOME TO DO MUTUAL INFO
outcome = 'mgfr_cls_flag'

# Re-attach the label columns to the VIF-filtered features.
data_VIF_merged = pd.merge(data_all_VIF, data_labels_only,
                           how='left', on='filename')

data_features = data_VIF_merged.drop(columns = cols_labels)
data_outcome = data_VIF_merged[outcome]
data_labels_and_filename = data_VIF_merged[cols_labels + ['filename']]

# Numeric feature matrix the selector is fitted on (identifier removed).
# Its shape drives the feature budget below; the previously used `X` is never
# assigned in this notebook.
data_feature_matrix = data_features.drop(columns = 'filename')
num_rows, num_cols = data_feature_matrix.shape

print("rows " + str(num_rows) + ", cols " + str(num_cols))
# Rule of thumb used here: at most one feature per 10 rows.
num_features_to_keep = min(math.floor(num_rows/10), num_cols)


def _mutual_info_seeded(X, y):
    """mutual_info_classif with a fixed seed so the ranking is reproducible."""
    return mutual_info_classif(X, y, random_state=0)


# applying mutual info selector
selector = SelectKBest(_mutual_info_seeded, k=num_features_to_keep)
selector.fit(data_feature_matrix, data_outcome)

# Get columns to keep and create new dataframe with those only.
# The indices refer to data_feature_matrix, so they are applied to that frame.
cols = selector.get_support(indices=True)
data_selected_cols = data_feature_matrix.iloc[:, cols]

# Selected features + identifier + labels, taken row-for-row from the merged
# frame. Column order: features, filename, then cols_labels. Selecting columns
# (instead of merging the frame with itself on 'filename') cannot multiply
# rows if an identifier occurs more than once.
data_processed = data_VIF_merged[
    list(data_selected_cols.columns) + ['filename'] + cols_labels
]

data_processed_no_id = data_processed.drop(columns = 'filename')

rows 694, cols 13


Unnamed: 0,original_firstorder_10Percentile,original_firstorder_Kurtosis,original_firstorder_Minimum,original_firstorder_TotalEnergy,original_glcm_ClusterProminence,original_glcm_ClusterShade,original_glrlm_ShortRunLowGrayLevelEmphasis,original_glszm_LargeAreaLowGrayLevelEmphasis,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,filename,mgfr,mgfr_cls_flag,file_attributes_flag
0,4.0,13.376776,0.0,90659144.0,37.392629,3.989152,0.038393,957.001859,77.763435,0.000624,2.261034,0.000480,0.082289,1__Still images_IM-0001-0020,201.050000,0,0
1,12.0,3.405356,0.0,105680551.0,4.357592,0.920404,0.040268,300.710749,529.590261,0.000316,0.445347,0.003166,0.002818,1__Still images_IM-0001-0040,201.050000,0,0
2,0.0,16.138137,0.0,184878121.0,88.395518,7.074510,0.029196,1019.303855,86.649771,0.000504,1.850986,0.000492,0.062204,2__Still Images_IM-0001-0016,151.580000,0,0
3,7.0,3.813743,0.0,128170438.0,8.154955,1.895926,0.028774,1197.972480,218.898343,0.000566,0.389957,0.001143,0.010082,2__Still Images_IM-0001-0062,151.580000,0,1
4,17.0,2.712345,0.0,229767002.0,8.906822,0.052417,0.020899,420.201331,150.016135,0.000573,0.417694,0.001739,0.007096,3__still_images_5384_IM-0001-0031,40.830000,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,25.0,2.994076,0.0,300490843.0,17.189528,-0.138433,0.019312,97.752753,117.500891,0.000471,1.020091,0.002587,0.008424,365a__Still_images_5346_IM-0002-0033,30.251345,1,1
690,20.0,2.710040,0.0,225745526.0,6.843876,0.199794,0.038544,165.908680,380.024690,0.000327,0.354136,0.003597,0.001939,365a__Still_images_5346_IM-0002-0049,30.251345,1,1
691,10.0,3.931123,0.0,55222386.0,42.609845,4.609357,0.076148,77.489057,62.247313,0.000966,3.585624,0.005910,0.032431,383a__Still_images_5375_IM-0001-0006,41.953214,1,0
692,0.0,2.864715,0.0,55410325.0,98.226246,11.519947,0.063094,159.982082,43.245019,0.001396,5.264928,0.007239,0.071786,383b__Still_images_5379_IM-0001-0031,41.953214,1,0


In [85]:
# plotting correlation ratios for visualization:
# Correlations are computed on data_all, the frame that was saved to
# data_cleaned.csv; the variable `data_cleaned` is never assigned in this
# notebook. Only numeric columns are used because the identifier column holds
# strings.
corrMatrix = data_all.select_dtypes(include='number').corr()
print('Correlation Heatmap')
fig, ax = plt.subplots(figsize=[10,10])
im = ax.imshow(corrMatrix, cmap='plasma')
ax.set_xticks(np.arange(len(corrMatrix)))
ax.set_yticks(np.arange(len(corrMatrix)))
ax.set_xticklabels(corrMatrix.columns, rotation='vertical')
ax.set_yticklabels(corrMatrix.columns)
fig.colorbar(im, ax=ax)
# bbox_inches='tight' keeps the long vertical tick labels inside the image.
fig.savefig('corrMatrix.png', bbox_inches='tight')
# Row labels are not written; rows are in the same order as the header columns.
corrMatrix.to_csv('corrMatrix.csv', index = False)


Unnamed: 0,original_firstorder_10Percentile,original_firstorder_Kurtosis,original_firstorder_Minimum,original_firstorder_TotalEnergy,original_glcm_ClusterProminence,original_glcm_ClusterShade,original_glrlm_ShortRunLowGrayLevelEmphasis,original_glszm_LargeAreaLowGrayLevelEmphasis,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,4.0,13.376776,0.0,90659144.0,37.392629,3.989152,0.038393,957.001859,77.763435,0.000624,2.261034,0.000480,0.082289
1,12.0,3.405356,0.0,105680551.0,4.357592,0.920404,0.040268,300.710749,529.590261,0.000316,0.445347,0.003166,0.002818
2,0.0,16.138137,0.0,184878121.0,88.395518,7.074510,0.029196,1019.303855,86.649771,0.000504,1.850986,0.000492,0.062204
3,7.0,3.813743,0.0,128170438.0,8.154955,1.895926,0.028774,1197.972480,218.898343,0.000566,0.389957,0.001143,0.010082
4,17.0,2.712345,0.0,229767002.0,8.906822,0.052417,0.020899,420.201331,150.016135,0.000573,0.417694,0.001739,0.007096
...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,25.0,2.994076,0.0,300490843.0,17.189528,-0.138433,0.019312,97.752753,117.500891,0.000471,1.020091,0.002587,0.008424
690,20.0,2.710040,0.0,225745526.0,6.843876,0.199794,0.038544,165.908680,380.024690,0.000327,0.354136,0.003597,0.001939
691,10.0,3.931123,0.0,55222386.0,42.609845,4.609357,0.076148,77.489057,62.247313,0.000966,3.585624,0.005910,0.032431
692,0.0,2.864715,0.0,55410325.0,98.226246,11.519947,0.063094,159.982082,43.245019,0.001396,5.264928,0.007239,0.071786
