In [1]:
import os
import glob
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.preprocessing import MinMaxScaler
import pingouin as pg
import numpy as np

In [2]:
homedir = '/home/raghuram/Desktop/radiomics/TEXTURES/'
os.chdir(homedir)

In [3]:
t1ce_file = 'expt_t1ce.csv'
t1w_file = 'expt_t1w.csv'
t2f_file = 'expt_t2f.csv'
t2w_file = 'expt_t2w.csv'

In [4]:
def pre_process_dataframe(csv_file_name, experiment_number):
    # Binarize magnetization strength values
    # Scanner names and manufacturer are categorical 
    # One hot encode them
    
    # Code for the above here
    # Experiments in ascending order
    
    experiment_df = pd.read_csv(csv_file_name)
    experiment_df = experiment_df[experiment_df['experiment_number'] == experiment_number]
    fill_values = {'mag_field_strength':1.5}
    experiment_df.fillna(value=fill_values, inplace=True)
    experiment_df['mag_field_strength_binarized'] = (experiment_df['mag_field_strength']>=1.5).astype(int)
    experiment_df.drop(columns=[ 'Tumor','experiment_number', 'scale', 'algo', 'ng', 'flip_angle', 
                                'VOLUME_ET', 'VOLUME_NET', 'VOLUME_ED', 'VOLUME_TC',
                               'VOLUME_BRAIN', 'mag_field_strength', 'scanner_model'], inplace=True)
    
    experiment_df.dropna(subset=['VOLUME_WT'], inplace=True)
    print(experiment_df.groupby('scanner_manufacturer').count())
    experiment_df = pd.get_dummies(experiment_df, columns=['scanner_manufacturer'], drop_first=False)
    radiomics_features  = list(experiment_df.columns)[:42]

    radiomics_df = experiment_df[radiomics_features]
    scaled_df = experiment_df.drop(columns=radiomics_features)
    print(list(scaled_df.columns))
    scaled_df = scaled_df - scaled_df.min()/(scaled_df.max()-scaled_df.min())
    
    return (scaled_df, radiomics_df)
        


In [5]:
def results_summary_to_dataframe(results,response_variable):
    '''take the result of an statsmodel results table and transforms it into a dataframe'''
    pvals = results.pvalues
    coeff = results.params
    ci_bounds = results.conf_int()
 
    results_df = pd.DataFrame({"pvals":pvals,
                               "coeff":coeff,
                               "response_variable":response_variable,
                              })
    #Reordering...
    results_df = results_df[["coeff","pvals", "response_variable"]]
    return results_df

In [13]:
def linear_regression(results_folder, radiomic_df, scaled_df, experiment_number):
    # Load the data corresponding to the sequence argument here
    # Perform regression with Tr and Te 
    
    radiomic_features = list(radiomic_df.columns)
    y = np.array(radiomic_df)
    X = np.array(scaled_df)
#     X = sm.add_constant(X)
    result_list = []
    for idx, column in enumerate(radiomic_features):
        
        model = sm.OLS(y[:, idx], X)
        results = model.fit()
        results_df = results_summary_to_dataframe(results, column)
#         results_df['significant'] = (results_df['pvals']<0.05).astype(bool)
        result_list.append(results_df)
       
    pd.concat(result_list).to_csv(os.path.join(results_folder, str(experiment_number)+'_'+'t1w.csv'), index=False)

In [17]:
results_folder = '/home/raghuram/Desktop/radiomics/TEXTURES/results/t1w/linear_regression'
for expt_number in range(1, 26):

    scaled_df, radiomics_df = pre_process_dataframe(t2f_file, expt_number)
    if expt_number > 1:
        break
    print
    linear_regression(results_folder, radiomics_df, scaled_df, expt_number) 
    print('Finished experiment {}'.format(expt_number))

                      GLCM_Contrast  GLCM_Correlation  GLCM_Dissimilarity  \
scanner_manufacturer                                                        
GE                               58                58                  58   
Philips                          10                10                  10   
Siemens                           9                 9                   9   

                      GLCM_Energy  GLCM_Entropy  GLCM_Homogeneity  \
scanner_manufacturer                                                
GE                             58            58                58   
Philips                        10            10                10   
Siemens                         9             9                 9   

                      GLCM_SumAverage  GLCM_Variance  GLRLM_GLN  GLRLM_GLV  \
scanner_manufacturer                                                         
GE                                 58             58         58         58   
Philips                           

In [6]:
results_folder = '/home/raghuram/Desktop/radiomics/TEXTURES/results/t2w/linear_regression/without_idh1'
os.chdir(results_folder)

In [7]:
csv_file_list = sorted(glob.glob('*.csv'), key=lambda _: int(_.split('_')[0]))

In [8]:
csv_file_list

['1_t2w.csv',
 '2_t2w.csv',
 '3_t2w.csv',
 '4_t2w.csv',
 '5_t2w.csv',
 '6_t2w.csv',
 '7_t2w.csv',
 '8_t2w.csv',
 '9_t2w.csv',
 '10_t2w.csv',
 '11_t2w.csv',
 '12_t2w.csv',
 '13_t2w.csv',
 '14_t2w.csv',
 '15_t2w.csv',
 '16_t2w.csv',
 '17_t2w.csv',
 '18_t2w.csv',
 '19_t2w.csv',
 '20_t2w.csv',
 '21_t2w.csv',
 '22_t2w.csv',
 '23_t2w.csv',
 '24_t2w.csv',
 '25_t2w.csv']

In [120]:
input_variables = ['repetition_time', 'excitation_time', 'VOLUME_WT', 'mag_field_strength_binarized', 'scanner_manufacturer_GE', 'scanner_manufacturer_Philips', 'scanner_manufacturer_Siemens']
input_columns = input_variables*42
for idx, csv_file in enumerate(csv_file_list):
    df = pd.read_csv(csv_file)
    df['signifcant'] = (df['pvals']<(0.00017)).astype(bool)
    df['input_variable'] = input_columns
#     df.set_index('response_variable', inplace=True)
#     if idx != 0:
#         df.drop(columns=['Unnamed: 0'], inplace=True)
    df.to_csv(csv_file, index=False)

# Apply the Benjamini-Hochberg FDR

# Procedure. Arrange p-value as below:

In [110]:
df = pd.read_csv(csv_file_list[0])

In [111]:
sorted_df = df.sort_values(by=['pvals'])

In [165]:
sorted_df['rank'] = sorted_df.reset_index().index+1
sorted_df['iQ_by_m'] = sorted_df['rank']*(0.05/210)

In [166]:
index = np.where(sorted_df['pvals']<sorted_df['iQ_by_m'])

In [182]:
sorted_df['significant'] = index

In [183]:
sorted_df.groupby('response_variable').count()

Unnamed: 0_level_0,coeff,pvals,input_variable,rank,iQ_by_m,significant
response_variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GLCM_Contrast,7,7,7,7,7,6
GLCM_Correlation,7,7,7,7,7,6
GLCM_Dissimilarity,7,7,7,7,7,6
GLCM_Energy,7,7,7,7,7,6
GLCM_Entropy,7,7,7,7,7,6
GLCM_Homogeneity,7,7,7,7,7,6
GLCM_SumAverage,7,7,7,7,7,6
GLCM_Variance,7,7,7,7,7,6
GLRLM_GLN,7,7,7,7,7,4
GLRLM_GLV,7,7,7,7,7,6


In [180]:
sorted_df[sorted_df['response_variable'] == 'GLSZM_ZSV']

Unnamed: 0,coeff,pvals,response_variable,input_variable,rank,iQ_by_m,significant
234,0.6471895,0.021041,GLSZM_ZSV,mag_field_strength_binarized,183,0.043571,True
235,-0.5771025,0.04139,GLSZM_ZSV,scanner_manufacturer_GE,194,0.04619,True
236,-0.3151999,0.281927,GLSZM_ZSV,scanner_manufacturer_Philips,223,0.053095,
233,-2.69e-07,0.602272,GLSZM_ZSV,VOLUME_WT,256,0.060952,
231,2.461455e-05,0.679263,GLSZM_ZSV,repetition_time,266,0.063333,
232,0.0003829101,0.821922,GLSZM_ZSV,excitation_time,277,0.065952,
237,0.009192598,0.968119,GLSZM_ZSV,scanner_manufacturer_Siemens,293,0.069762,


In [187]:
0.021041*210/195

0.022659538461538464