Overall Process

    Reading Source Data and Preprocessing
    Data formatting
    Scaling
    Split the data into train and test sets
    Create & Compile model
    Train
    Predict
    Error metrics



In [143]:
#import all necessary libraries
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler




In [144]:
# Mount Google Drive into the Colab runtime at /content/gdrive; the data files
# read by the cells below live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Reading source data and preprocessing

In [145]:
# Load the phenotype metadata CSV from the mounted Drive folder.
phenotype_path = '/content/gdrive/My Drive/ResearchProject/dat/Phenotypes/Phenotypic_V1_0b_preprocessed1.csv'
df = pd.read_csv(phenotype_path)

In [146]:
# Trim the phenotype table to the join key (SUB_ID) and the classification
# target (DX_GROUP) -- the only two columns the later cells use.
df1 = df.loc[:, ['SUB_ID', 'DX_GROUP']]
df1.head()



Unnamed: 0,SUB_ID,DX_GROUP
0,50002,1
1,50003,1
2,50004,1
3,50005,1
4,50006,1


In [147]:
# Row and column counts of the trimmed phenotype table.
df1.shape

(1112, 2)

In [148]:
# Load a single ROI (Region of Interest) file. It is tab-separated and its
# header row names the ROI columns.
file_name = "/content/gdrive/My Drive/ResearchProject/dat/Caltech_0051456_rois_cc200.1D"
data_df = pd.read_csv(file_name, sep='\t')
data_df.head(n=2)


Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14,#15,#16,#17,#18,#19,#20,#21,#22,#23,#24,#25,#26,#27,#28,#29,#30,#31,#32,#33,#34,#35,#36,#37,#38,#39,#40,...,#161,#162,#163,#164,#165,#166,#167,#168,#169,#170,#171,#172,#173,#174,#175,#176,#177,#178,#179,#180,#181,#182,#183,#184,#185,#186,#187,#188,#189,#190,#191,#192,#193,#194,#195,#196,#197,#198,#199,#200
0,0.726714,-6.220083,-0.1055,0.827082,2.979758,8.791881,17.111346,3.28422,-16.968309,-22.130082,-3.73296,3.312534,2.072158,-0.305862,7.63301,3.79085,-3.982335,8.093873,-1.792861,-6.764295,3.579822,5.235932,-2.743671,6.414772,-8.899544,-0.957038,0.127942,19.101731,0.334213,-16.255181,14.892796,0.0,-3.154229,-10.20107,5.601131,-12.629192,4.955706,2.693456,10.786329,1.488354,...,-3.110225,-5.95469,-4.000533,-10.164482,7.5901,-4.566786,7.824773,-5.109869,-4.055288,-3.181114,-3.863731,0.720803,1.013343,5.613486,0.525683,-36.72304,-9.9083,3.507208,-2.973038,13.994154,1.77134,-7.848021,-7.496919,5.487471,6.147989,14.692906,-0.248979,3.641608,-7.739103,1.44185,5.885835,-0.869757,6.564439,8.814768,-1.190129,-0.645136,5.23925,-0.091448,-2.531757,4.341991
1,0.769439,3.99801,-1.046632,2.548084,8.51892,10.375453,10.571439,6.050726,-7.508605,-19.843585,-4.112897,-5.368731,-4.7049,10.988172,10.334088,2.732998,-13.076623,4.317733,-1.041544,-8.743133,4.292422,8.289673,-11.477216,-1.843866,-14.927714,1.342886,-0.554138,25.22226,1.361751,-1.635876,20.356237,0.0,-8.265292,-12.273486,9.052045,-11.983986,1.781613,4.563817,6.549014,7.916988,...,-2.434796,-2.985292,-10.072803,-14.960779,8.046342,-4.187571,6.945285,-12.780204,6.462198,-11.002358,-10.996264,-3.223731,1.943647,9.231075,1.022486,-31.675386,-13.760443,6.419978,-0.918588,19.956091,3.509222,-8.856135,-10.323292,6.607094,9.020284,21.478172,-2.607743,7.237246,-4.351354,-0.929499,15.715102,-0.143579,-4.519351,9.938296,-5.155952,-1.705809,6.912461,-0.013286,-5.128482,-0.085926


In [149]:
# Tag every row with the subject ID taken from the file name so the rows can be
# joined to the phenotype table on SUB_ID.
# The file name looks like "Caltech_0051456_rois_cc200.1D". The ID is the token
# right before "_rois_"; reading it that way, instead of a fixed [10:15]
# character slice, still works when the site prefix has a different length or
# contains "_" itself. int() strips the zero padding and makes the column
# numeric; a name without a numeric ID token raises ValueError instead of
# silently producing a wrong ID.
subject_token = os.path.basename(file_name).split('_rois_')[0].rsplit('_', 1)[-1]
data_df['SUB_ID'] = int(subject_token)
data_df.head(2)

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14,#15,#16,#17,#18,#19,#20,#21,#22,#23,#24,#25,#26,#27,#28,#29,#30,#31,#32,#33,#34,#35,#36,#37,#38,#39,#40,...,#162,#163,#164,#165,#166,#167,#168,#169,#170,#171,#172,#173,#174,#175,#176,#177,#178,#179,#180,#181,#182,#183,#184,#185,#186,#187,#188,#189,#190,#191,#192,#193,#194,#195,#196,#197,#198,#199,#200,SUB_ID
0,0.726714,-6.220083,-0.1055,0.827082,2.979758,8.791881,17.111346,3.28422,-16.968309,-22.130082,-3.73296,3.312534,2.072158,-0.305862,7.63301,3.79085,-3.982335,8.093873,-1.792861,-6.764295,3.579822,5.235932,-2.743671,6.414772,-8.899544,-0.957038,0.127942,19.101731,0.334213,-16.255181,14.892796,0.0,-3.154229,-10.20107,5.601131,-12.629192,4.955706,2.693456,10.786329,1.488354,...,-5.95469,-4.000533,-10.164482,7.5901,-4.566786,7.824773,-5.109869,-4.055288,-3.181114,-3.863731,0.720803,1.013343,5.613486,0.525683,-36.72304,-9.9083,3.507208,-2.973038,13.994154,1.77134,-7.848021,-7.496919,5.487471,6.147989,14.692906,-0.248979,3.641608,-7.739103,1.44185,5.885835,-0.869757,6.564439,8.814768,-1.190129,-0.645136,5.23925,-0.091448,-2.531757,4.341991,51456
1,0.769439,3.99801,-1.046632,2.548084,8.51892,10.375453,10.571439,6.050726,-7.508605,-19.843585,-4.112897,-5.368731,-4.7049,10.988172,10.334088,2.732998,-13.076623,4.317733,-1.041544,-8.743133,4.292422,8.289673,-11.477216,-1.843866,-14.927714,1.342886,-0.554138,25.22226,1.361751,-1.635876,20.356237,0.0,-8.265292,-12.273486,9.052045,-11.983986,1.781613,4.563817,6.549014,7.916988,...,-2.985292,-10.072803,-14.960779,8.046342,-4.187571,6.945285,-12.780204,6.462198,-11.002358,-10.996264,-3.223731,1.943647,9.231075,1.022486,-31.675386,-13.760443,6.419978,-0.918588,19.956091,3.509222,-8.856135,-10.323292,6.607094,9.020284,21.478172,-2.607743,7.237246,-4.351354,-0.929499,15.715102,-0.143579,-4.519351,9.938296,-5.155952,-1.705809,6.912461,-0.013286,-5.128482,-0.085926,51456


In [150]:

# Force every column to a numeric dtype; entries that cannot be parsed become
# NaN because of errors='coerce'.
data_df1 = data_df.apply(pd.to_numeric, errors='coerce')
data_df1.head(n=2)


Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14,#15,#16,#17,#18,#19,#20,#21,#22,#23,#24,#25,#26,#27,#28,#29,#30,#31,#32,#33,#34,#35,#36,#37,#38,#39,#40,...,#162,#163,#164,#165,#166,#167,#168,#169,#170,#171,#172,#173,#174,#175,#176,#177,#178,#179,#180,#181,#182,#183,#184,#185,#186,#187,#188,#189,#190,#191,#192,#193,#194,#195,#196,#197,#198,#199,#200,SUB_ID
0,0.726714,-6.220083,-0.1055,0.827082,2.979758,8.791881,17.111346,3.28422,-16.968309,-22.130082,-3.73296,3.312534,2.072158,-0.305862,7.63301,3.79085,-3.982335,8.093873,-1.792861,-6.764295,3.579822,5.235932,-2.743671,6.414772,-8.899544,-0.957038,0.127942,19.101731,0.334213,-16.255181,14.892796,0.0,-3.154229,-10.20107,5.601131,-12.629192,4.955706,2.693456,10.786329,1.488354,...,-5.95469,-4.000533,-10.164482,7.5901,-4.566786,7.824773,-5.109869,-4.055288,-3.181114,-3.863731,0.720803,1.013343,5.613486,0.525683,-36.72304,-9.9083,3.507208,-2.973038,13.994154,1.77134,-7.848021,-7.496919,5.487471,6.147989,14.692906,-0.248979,3.641608,-7.739103,1.44185,5.885835,-0.869757,6.564439,8.814768,-1.190129,-0.645136,5.23925,-0.091448,-2.531757,4.341991,51456
1,0.769439,3.99801,-1.046632,2.548084,8.51892,10.375453,10.571439,6.050726,-7.508605,-19.843585,-4.112897,-5.368731,-4.7049,10.988172,10.334088,2.732998,-13.076623,4.317733,-1.041544,-8.743133,4.292422,8.289673,-11.477216,-1.843866,-14.927714,1.342886,-0.554138,25.22226,1.361751,-1.635876,20.356237,0.0,-8.265292,-12.273486,9.052045,-11.983986,1.781613,4.563817,6.549014,7.916988,...,-2.985292,-10.072803,-14.960779,8.046342,-4.187571,6.945285,-12.780204,6.462198,-11.002358,-10.996264,-3.223731,1.943647,9.231075,1.022486,-31.675386,-13.760443,6.419978,-0.918588,19.956091,3.509222,-8.856135,-10.323292,6.607094,9.020284,21.478172,-2.607743,7.237246,-4.351354,-0.929499,15.715102,-0.143579,-4.519351,9.938296,-5.155952,-1.705809,6.912461,-0.013286,-5.128482,-0.085926,51456


In [151]:
# Attach the DX_GROUP classification label to each ROI row by an inner join on
# SUB_ID, then discard the key so only the ROI columns and the label remain.
merged = (
    data_df1
    .merge(df1, on='SUB_ID', how='inner')
    .drop(columns='SUB_ID')
)
merged.head(n=2)

Unnamed: 0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14,#15,#16,#17,#18,#19,#20,#21,#22,#23,#24,#25,#26,#27,#28,#29,#30,#31,#32,#33,#34,#35,#36,#37,#38,#39,#40,...,#162,#163,#164,#165,#166,#167,#168,#169,#170,#171,#172,#173,#174,#175,#176,#177,#178,#179,#180,#181,#182,#183,#184,#185,#186,#187,#188,#189,#190,#191,#192,#193,#194,#195,#196,#197,#198,#199,#200,DX_GROUP
0,0.726714,-6.220083,-0.1055,0.827082,2.979758,8.791881,17.111346,3.28422,-16.968309,-22.130082,-3.73296,3.312534,2.072158,-0.305862,7.63301,3.79085,-3.982335,8.093873,-1.792861,-6.764295,3.579822,5.235932,-2.743671,6.414772,-8.899544,-0.957038,0.127942,19.101731,0.334213,-16.255181,14.892796,0.0,-3.154229,-10.20107,5.601131,-12.629192,4.955706,2.693456,10.786329,1.488354,...,-5.95469,-4.000533,-10.164482,7.5901,-4.566786,7.824773,-5.109869,-4.055288,-3.181114,-3.863731,0.720803,1.013343,5.613486,0.525683,-36.72304,-9.9083,3.507208,-2.973038,13.994154,1.77134,-7.848021,-7.496919,5.487471,6.147989,14.692906,-0.248979,3.641608,-7.739103,1.44185,5.885835,-0.869757,6.564439,8.814768,-1.190129,-0.645136,5.23925,-0.091448,-2.531757,4.341991,1
1,0.769439,3.99801,-1.046632,2.548084,8.51892,10.375453,10.571439,6.050726,-7.508605,-19.843585,-4.112897,-5.368731,-4.7049,10.988172,10.334088,2.732998,-13.076623,4.317733,-1.041544,-8.743133,4.292422,8.289673,-11.477216,-1.843866,-14.927714,1.342886,-0.554138,25.22226,1.361751,-1.635876,20.356237,0.0,-8.265292,-12.273486,9.052045,-11.983986,1.781613,4.563817,6.549014,7.916988,...,-2.985292,-10.072803,-14.960779,8.046342,-4.187571,6.945285,-12.780204,6.462198,-11.002358,-10.996264,-3.223731,1.943647,9.231075,1.022486,-31.675386,-13.760443,6.419978,-0.918588,19.956091,3.509222,-8.856135,-10.323292,6.607094,9.020284,21.478172,-2.607743,7.237246,-4.351354,-0.929499,15.715102,-0.143579,-4.519351,9.938296,-5.155952,-1.705809,6.912461,-0.013286,-5.128482,-0.085926,1


In [152]:
# Scale the ROI feature columns to [0, 1] with a min-max scaler.
# The DX_GROUP label is deliberately kept out of the scaler: it is a class
# code, not a measurement, and min-max scaling would rewrite it (a constant
# label column collapses to 0.0). The untouched label is appended back as the
# last column so `scaled` keeps its (rows, features + 1) layout.
# NOTE(review): the scaler is still fitted on all rows before the train/test
# split, so test rows influence the feature ranges -- consider fitting it on
# the training rows only.
feature_values = merged.drop(columns='DX_GROUP').to_numpy()
label_values = merged[['DX_GROUP']].to_numpy()
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = np.hstack([scaler.fit_transform(feature_values), label_values])
scaled.shape

(146, 201)

In [153]:
# Split the rows into train and test sets (70/30) in their original order,
# without shuffling: the first 70% of rows train the model, the rest test it.
train_data_percent = 0.70
training_size = int(len(scaled) * train_data_percent)
test_size = len(scaled) - training_size

train_data, test_data = scaled[:training_size], scaled[training_size:]

# The label is the last column and everything before it is a feature. Slicing
# relative to the end (instead of a hard-coded column 200) keeps the split
# correct if the number of ROI columns changes. `-1:` keeps y two-dimensional,
# shape (rows, 1), exactly as before.
x_train, y_train = train_data[:, :-1], train_data[:, -1:]
x_test, y_test = test_data[:, :-1], test_data[:, -1:]



In [154]:
# Check the train split shapes. Each print is its own statement: joining them
# with a comma builds a tuple of the two None return values, which the
# notebook then echoes as a stray "(None, None)".
print(x_train.shape)
print(y_train.shape)

(102, 200)
(102, 1)


(None, None)

In [155]:
# Check the test split shapes. Each print is its own statement: joining them
# with a comma builds a tuple of the two None return values, which the
# notebook then echoes as a stray "(None, None)".
print(x_test.shape)
print(y_test.shape)

(44, 200)
(44, 1)


(None, None)

In [156]:
# Fit a random forest classifier on the training rows and predict both splits.
# random_state pins the forest's internal randomness so a fresh
# Restart & Run All reproduces the same model and predictions.
# y_train has shape (rows, 1); ravel() flattens it to the 1-D target fit() expects.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(x_train, y_train.ravel())
train_predict = model.predict(x_train)
test_predict = model.predict(x_test)




In [157]:
# Training-set RMSE: square root of the mean squared difference between the
# true labels and the predictions for the rows the model was fitted on.
# NOTE(review): all rows come from a single ROI file, so they may all carry the
# same DX_GROUP; if so an RMSE of exactly 0.0 is trivial and says nothing about
# model quality -- confirm the data contains more than one class.
import math
from sklearn.metrics import mean_squared_error
train_mse = mean_squared_error(y_train, train_predict)
math.sqrt(train_mse)

0.0

In [158]:
# Test-set RMSE: square root of the mean squared difference between the true
# test labels and the predictions for rows held out from fitting.
test_mse = mean_squared_error(y_test, test_predict)
math.sqrt(test_mse)



0.0