In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import csv
import itertools
import os
import pandas as pd
import numpy as np
import glob

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, accuracy_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.utils import shuffle
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn import svm
from sklearn.pipeline import Pipeline

In [2]:
# Seed handed to LogisticRegression (as `random_state`) inside predict_function.
RANDOM_STATE: int = 3

In [86]:
# Modality names; predict_function uses each one as a sub-directory name under
# both `data_folder` and `results_folder`, and iterates over them in this order.
modality_list = [
    'T1CE',
    'T1W',
    'T2F',
    'T2W',
]

In [94]:
# Root directories read by predict_function:
#   data_folder    - contains one sub-directory of feature CSVs per modality
#                    (result CSVs are also written directly into this folder).
#   results_folder - contains <modality>/scanner_manufacturer/anova/ result tables.
# Both can be overridden through environment variables so the notebook is not
# tied to one machine; when the variables are unset the original paths are used.
data_folder = os.environ.get(
    'RADIOMICS_DATA_FOLDER',
    '/home/raghuram/Desktop/radiomics/expt/data',
)
results_folder = os.environ.get(
    'RADIOMICS_RESULTS_FOLDER',
    '/home/raghuram/Desktop/radiomics/expt/results/',
)

In [95]:
def _load_features_and_labels(csv_path, rejected_features):
    '''
    Read one experiment CSV and return the design matrix and labels.

    csv_path: Path to the feature CSV of a single experiment
    rejected_features: Column names to exclude from the design matrix

    Returns a tuple (X, y) of numpy arrays: X holds the surviving columns that
    lie between 'GLCM_Contrast' and 'NGTDM_Strength' (inclusive), y holds
    'codel_status'. Rows without a 'codel_status' value are discarded.

    Raises KeyError if an expected column (or a rejected feature) is absent and
    ValueError if no feature column survives the rejection.
    '''
    data = pd.read_csv(csv_path)
    data = data.dropna(subset=['codel_status'])
    data = data.drop(columns=['Filename', 'Magnetic Strength', 'death_days', 'Scanner Manufacturer',
                              'idh_status', 'parameters_Algo', 'parameters_Scale', 'parameters_Ng'])

    # Resolve the feature span BEFORE removing rejected columns: slicing by label
    # afterwards fails with KeyError whenever an endpoint column is itself rejected.
    feature_span = list(data.loc[:, 'GLCM_Contrast':'NGTDM_Strength'].columns)
    data = data.drop(columns=rejected_features)
    kept_features = [name for name in feature_span if name in data.columns]
    if not kept_features:
        raise ValueError('No feature columns left after removing rejected features')

    X = np.array(data.loc[:, kept_features])
    y = np.array(data.loc[:, 'codel_status'])
    return X, y


def _nested_cv_scores(X, y, num_trials, folds):
    '''
    Run repeated nested cross-validation of a scaled logistic regression.

    X, y: Design matrix and labels
    num_trials: Number of repetitions; trial i seeds both splitters with i
    folds: Number of inner splits (the outer splitter uses folds - 1 splits)

    Returns a dict mapping trial number to the array of outer 'roc_auc' scores.
    '''
    log_reg = LogisticRegression(random_state=RANDOM_STATE, solver='lbfgs', max_iter=1000)
    pipeline = Pipeline(memory=None,
                        steps=[('scaler', StandardScaler(with_mean=True, with_std=True)),
                               ('logistic', log_reg)])
    p_grid = {
        'logistic__penalty': ['l2'],
        'logistic__C': [1e-6, 1e-5, 1e-4, 1e-3],
    }

    results_dict = {}
    for trial_number in range(num_trials):
        inner_cv = StratifiedShuffleSplit(n_splits=folds, test_size=0.2, random_state=trial_number)
        outer_cv = StratifiedShuffleSplit(n_splits=folds - 1, test_size=0.2, random_state=trial_number)
        # NOTE(review): no `scoring` is passed to GridSearchCV, so the inner model
        # selection uses the estimator's default scorer while the outer loop is
        # evaluated with 'roc_auc' - confirm this mismatch is intended.
        results_dict[trial_number] = cross_val_score(GridSearchCV(pipeline, p_grid, cv=inner_cv),
                                                     X, y, cv=outer_cv, scoring='roc_auc')
    return results_dict


def predict_function(noise_pc=0):
    '''
    Nested-CV prediction of 'codel_status' for every modality and experiment.

    noise_pc: Value embedded in the name of each output CSV. This function does
              not add any noise itself; the value is only used as a label.
              TODO confirm the intended default - the previous default literal
              was not a valid Python integer, 0 is assumed here.

    For each modality in `modality_list` the sorted feature CSVs under
    data_folder/<modality> are paired by position with the sorted ANOVA result
    files under results_folder/<modality>/scanner_manufacturer/anova. Features
    whose ANOVA status is 'REJECTED' are removed, a nested cross-validation is
    run and the per-trial outer scores are written to
    data_folder/results_features_removed_<expt>_<noise_pc>_<modality>.csv.

    Experiments numbered 25 or above are skipped. Any error while handling one
    experiment is printed and the loop moves on to the next experiment.
    Returns None.
    '''
    NUM_TRIALS = 50
    FOLDS = 10

    for modality in modality_list:
        print ('The modality is {}'.format(modality))
        path = os.path.join(data_folder, modality)
        results_path = os.path.join(results_folder, modality, 'scanner_manufacturer', 'anova')
        csv_file_list = sorted(os.listdir(path))
        results_file_list = sorted(os.listdir(results_path))

        for idx, csv_file in enumerate(csv_file_list):
            # Deliberately best-effort: one bad experiment must not abort the sweep.
            try:
                # IndexError here (fewer result files than CSVs) is reported below.
                results_file = results_file_list[idx]
                print('Index value is {}'.format(results_file))
                results_number = int(results_file.split('anova')[-1].split('.')[0])
                print('Results number is {}'.format(results_number))
                expt_num = int(csv_file.split('.')[0].split('_Experiment')[1])
                # Explicit check instead of `assert`: survives `python -O` and
                # produces a readable message in the except handler.
                if expt_num != results_number:
                    raise ValueError('Experiment number mismatch: {} is experiment {} but {} is experiment {}'
                                     .format(csv_file, expt_num, results_file, results_number))
                print('Current experiment number is {}'.format(expt_num))
                if expt_num >= 25:
                    continue

                results_df = pd.read_csv(os.path.join(results_path, results_file))
                features = list(results_df[results_df['status'] == 'REJECTED']['feature_names'])
                X, y = _load_features_and_labels(os.path.join(data_folder, modality, csv_file), features)

                results_dict = _nested_cv_scores(X, y, NUM_TRIALS, FOLDS)

                filename = 'results_features_removed'+'_'+str(expt_num)+'_'+str(noise_pc)+'_'+str(modality)+'.csv'
                scores_df = pd.DataFrame.from_dict(results_dict, orient='index')
                # NOTE(review): output goes to data_folder, not results_folder - confirm.
                scores_df.to_csv(os.path.join(data_folder, filename))

            except Exception as e:
                print('Exception has occurred')
                print(e)

In [96]:
# Run the sweep over every modality/experiment using the default `noise_pc`.
predict_function()

The modality is T1W
Index value is Experiment_anova1.csv
Results number is 1
Current experiment number is 1
Index value is Experiment_anova10.csv
Results number is 10
Current experiment number is 10
Index value is Experiment_anova11.csv
Results number is 11
Current experiment number is 11
Index value is Experiment_anova12.csv
Results number is 12
Current experiment number is 12
Index value is Experiment_anova13.csv
Results number is 13
Current experiment number is 13
Index value is Experiment_anova14.csv
Results number is 14
Current experiment number is 14
Index value is Experiment_anova15.csv
Results number is 15
Current experiment number is 15
Index value is Experiment_anova16.csv
Results number is 16
Current experiment number is 16
Index value is Experiment_anova17.csv
Results number is 17
Current experiment number is 17
Index value is Experiment_anova18.csv
Results number is 18
Current experiment number is 18
Index value is Experiment_anova19.csv
Results number is 19
Current experi

Index value is Experiment_anova19.csv
Results number is 19
Current experiment number is 19
Index value is Experiment_anova2.csv
Results number is 2
Current experiment number is 2
Index value is Experiment_anova20.csv
Results number is 20
Current experiment number is 20
Index value is Experiment_anova21.csv
Results number is 21
Current experiment number is 21
Index value is Experiment_anova22.csv
Results number is 22
Current experiment number is 22
Index value is Experiment_anova23.csv
Results number is 23
Current experiment number is 23
Index value is Experiment_anova24.csv
Results number is 24
Current experiment number is 24
Index value is Experiment_anova25.csv
Results number is 25
Current experiment number is 25
Index value is Experiment_anova26.csv
Results number is 26
Current experiment number is 26
Index value is Experiment_anova27.csv
Results number is 27
Current experiment number is 27
Index value is Experiment_anova28.csv
Results number is 28
Current experiment number is 28
In

In [32]:
# NOTE(review): `df` and `features` are not assigned at notebook scope in any
# visible cell (inside predict_function they are locals), so this cell presumably
# relies on leftover kernel state and would likely raise NameError after
# Restart & Run All - confirm, then remove or move it next to where they are defined.
# `axis=1` is redundant when `columns=` is given.
df.drop(columns=features, axis=1, inplace=True)