### IPython Notebook for PCF Computational Oncology > Computational Oncology contest (Dec 2015)

In [15]:
# load modules
from scipy.misc import imread
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import mahotas as mh
import numpy as np
import pandas as pd
import re
from PIL import Image
import time
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline  

Load the training and test CSV files: both contain case IDs, and the training file also contains the FGA results.

In [2]:
# Read the headerless training table; column 0 is the case ID and column 1
# the FGA value (the columns are given names in a later cell).
train_df = pd.read_csv(
    'csv/training.csv',
    header=None,
)
# Preview the first rows.
train_df.head()

Unnamed: 0,0,1
0,10500,0.19
1,10549,0.58
2,11169,0.05
3,13188,0.09
4,13699,0.04


In [3]:
# Read the headerless test table; its single column holds the case IDs.
test_df = pd.read_csv(
    'csv/test.csv',
    header=None,
)
# Preview the first rows.
test_df.head()

Unnamed: 0,0
0,11229
1,11996
2,12059
3,12796
4,13553


Load all image files and extract the 13 Haralick texture features from each image (13 from the TS image set plus 13 from the DX image set gives 26 features per CaseID)

In [5]:
# Collect the paths of every PNG in the TS and DX image folders
# (images were saved to Google Drive from topcoder).
import glob

TS_path, DX_path = (
    glob.glob('C:\\Users\\Shonket\\Google Drive\\Contest Images\\TS\\*.png'),
    glob.glob('C:\\Users\\Shonket\\Google Drive\\Contest Images\\DX\\*.png'),
)

In [6]:
# Form the TS feature matrix (it contains both training and test cases for
# now; they are separated later by merging against the case-ID tables).
# Each row is [case_id, f1, ..., f13].
df_TS = []
start_time = time.time()
for im_filename in TS_path:
    # Load the image and convert it to grayscale (PIL "L" mode).
    im = Image.open(im_filename).convert("L")
    arr = np.asarray(im)
    # 13 Haralick texture features; return_mean=True yields the averaged values.
    f = mh.features.haralick(arr, return_mean=True)
    # The case ID is the leading token of the file name (up to the first '.'
    # or '-'). Parse the file name only, so the result does not depend on how
    # many directories precede it or on which path separator is used.
    base_name = re.split(r"[\\/]", im_filename)[-1]
    TS_case_id = re.split(r"\.|\-", base_name)[0]
    data_row = [TS_case_id] + f.tolist()
    df_TS.append(data_row)

# display loop time
end_time = time.time()
print("Elapsed time was %g seconds" % (end_time - start_time))



Elapsed time was 1462.69 seconds




In [7]:
# Turn the list of TS rows into a dataframe, label the columns
# (case ID followed by the 13 TS features) and save a copy as CSV.
TS_df = pd.DataFrame(df_TS)
TS_df.columns = ['CaseID'] + list('f%d_TS' % feature_no for feature_no in range(1, 14))
TS_df.head()
TS_df.to_csv('TS_val.csv')

In [8]:
# Form the DX feature matrix (it contains both training and test cases for
# now; they are separated later by merging against the case-ID tables).
# Each row is [case_id, f1, ..., f13].
df_DX = []
start_time = time.time()
for im_filename in DX_path:
    # Load the image and convert it to grayscale (PIL "L" mode).
    im = Image.open(im_filename).convert("L")
    arr = np.asarray(im)
    # 13 Haralick texture features; return_mean=True yields the averaged values.
    f = mh.features.haralick(arr, return_mean=True)
    # The case ID is the leading token of the file name (up to the first '.'
    # or '-'). Parse the file name only, so the result does not depend on how
    # many directories precede it or on which path separator is used.
    base_name = re.split(r"[\\/]", im_filename)[-1]
    DX_case_id = re.split(r"\.|\-", base_name)[0]
    data_row = [DX_case_id] + f.tolist()
    df_DX.append(data_row)

# display loop time
end_time = time.time()
print("Elapsed time was %g seconds" % (end_time - start_time))

Elapsed time was 2632.19 seconds


In [9]:
# Turn the list of DX rows into a dataframe, label the columns
# (case ID followed by the 13 DX features) and save a copy as CSV.
DX_df = pd.DataFrame(df_DX)
DX_df.columns = ['CaseID'] + list('f%d_DX' % feature_no for feature_no in range(1, 14))
DX_df.head()
DX_df.to_csv('DX_val.csv')

In [11]:
# Combine the DX and TS feature tables into one row per CaseID
# (pd.merge defaults to an inner join, so only cases present in both remain).
df_full = pd.merge(DX_df, TS_df, on='CaseID')
# Case IDs were parsed from file names as strings; cast them to 64-bit
# integers so they can be merged with the integer IDs read from the CSV files.
# np.int64 is used instead of the Python-2-only builtin `long`.
df_full['CaseID'] = df_full['CaseID'].astype(np.int64)
df_full.head()

Unnamed: 0,CaseID,f1_DX,f2_DX,f3_DX,f4_DX,f5_DX,f6_DX,f7_DX,f8_DX,f9_DX,...,f4_TS,f5_TS,f6_TS,f7_TS,f8_TS,f9_TS,f10_TS,f11_TS,f12_TS,f13_TS
0,10500,0.000178,636.979923,0.74252,1236.960925,0.071534,374.374775,4310.863779,7.817773,13.343517,...,2110.183073,0.098998,225.345156,7912.741144,8.360827,13.635484,0.000115,5.407751,-0.160784,0.948502
1,10549,0.000263,889.757091,0.848634,2939.139272,0.1034,260.825705,10866.799997,8.427585,13.950134,...,2787.779327,0.105171,267.863826,10595.395747,8.510544,13.758875,0.000112,5.454939,-0.176409,0.961644
2,11169,0.001117,503.788679,0.837511,1550.270937,0.140879,325.805807,5697.295069,7.96567,13.029051,...,2859.590389,0.097938,263.796433,10826.487969,8.523251,13.85781,0.000105,5.530232,-0.167069,0.955874
3,11229,0.000129,747.830797,0.786749,1753.415849,0.077938,335.755439,6265.832599,8.238044,13.83815,...,2310.808478,0.094541,341.599007,8603.876428,8.391743,13.747654,9.9e-05,5.570626,-0.150675,0.941751
4,11996,0.003496,215.596175,0.881285,908.038743,0.247822,411.400182,3416.558797,7.441903,11.51888,...,3147.025011,0.112963,284.624617,12037.631286,8.568638,13.784936,0.000111,5.453467,-0.185569,0.967272


In [12]:
# Replace the default integer column labels of the training table with names.
training_column_names = {0: 'CaseID', 1: 'FGA'}
train_df = train_df.rename(columns=training_column_names)
train_df.head()
# Show the CaseID dtype (it must match df_full's CaseID for merging).
train_df['CaseID'].dtype

dtype('int64')

Build feature matrix for training data from full feature matrix

In [13]:
# Build the training feature dataframe: keep every training case (left join)
# and attach its DX/TS texture features by CaseID.
train_feature = train_df.merge(df_full, on="CaseID", how='left')
train_feature.head()

Unnamed: 0,CaseID,FGA,f1_DX,f2_DX,f3_DX,f4_DX,f5_DX,f6_DX,f7_DX,f8_DX,...,f4_TS,f5_TS,f6_TS,f7_TS,f8_TS,f9_TS,f10_TS,f11_TS,f12_TS,f13_TS
0,10500,0.19,0.000178,636.979923,0.74252,1236.960925,0.071534,374.374775,4310.863779,7.817773,...,2110.183073,0.098998,225.345156,7912.741144,8.360827,13.635484,0.000115,5.407751,-0.160784,0.948502
1,10549,0.58,0.000263,889.757091,0.848634,2939.139272,0.1034,260.825705,10866.799997,8.427585,...,2787.779327,0.105171,267.863826,10595.395747,8.510544,13.758875,0.000112,5.454939,-0.176409,0.961644
2,11169,0.05,0.001117,503.788679,0.837511,1550.270937,0.140879,325.805807,5697.295069,7.96567,...,2859.590389,0.097938,263.796433,10826.487969,8.523251,13.85781,0.000105,5.530232,-0.167069,0.955874
3,13188,0.09,0.013913,314.6895,0.918697,1935.295697,0.329966,373.522241,7426.493287,7.462422,...,2156.486873,0.099525,241.318143,8164.852426,8.416838,13.637209,0.000118,5.346865,-0.172101,0.95773
4,13699,0.04,0.00378,594.823683,0.89011,2706.468779,0.212569,320.116969,10231.051432,7.873309,...,2972.278906,0.121855,227.918753,11482.329943,8.558484,13.594024,0.00013,5.240751,-0.209421,0.977188


Now build the x_train feature matrix and the y_train label array, then fit a regression model using a random forest ensemble estimator

In [14]:
# Make the x, y arrays for regression.
# x: every feature column (everything except the case ID and the FGA label).
x_df = train_feature.drop(['CaseID', 'FGA'], axis=1)
# `.values` replaces DataFrame.as_matrix(), which newer pandas releases
# deprecated and then removed; both return the underlying NumPy array.
x_train = x_df.values
# y: the FGA label of each training case.
y_df = train_feature['FGA']
y_train = y_df.values
y_train

array([ 0.19,  0.58,  0.05,  0.09,  0.04,  0.18,  0.17,  0.12,  0.06,
        0.14,  0.12,  0.22,  0.05,  0.15,  0.22,  0.21,  0.66,  0.16,
        0.01,  0.05,  0.02,  0.07,  0.  ,  0.08,  0.05,  0.03,  0.07,
        0.1 ,  0.04,  0.02,  0.11,  0.01,  0.03,  0.4 ,  0.33,  0.54,
        0.13,  0.08,  0.08,  0.09,  0.72,  0.13,  0.11,  0.36,  0.14,
        0.04,  0.15,  0.01,  0.02,  0.09,  0.09,  0.12,  0.03,  0.05,
        0.14,  0.09,  0.04,  0.07,  0.01,  0.09,  0.04,  0.36,  0.06,
        0.02,  0.03,  0.19,  0.17,  0.04,  0.22,  0.06,  0.03,  0.06,
        0.01,  0.05,  0.12,  0.03,  0.07,  0.16,  0.14,  0.1 ,  0.01,
        0.03,  0.34,  0.07,  0.08,  0.14,  0.08,  0.15,  0.15,  0.23,
        0.05,  0.06,  0.01,  0.08,  0.33,  0.08,  0.07,  0.08,  0.13,
        0.3 ,  0.09,  0.45,  0.06,  0.04,  0.3 ,  0.05,  0.12,  0.14,
        0.22,  0.39,  0.04,  0.07,  0.02,  0.03,  0.1 ,  0.02,  0.41,
        0.03,  0.01,  0.07,  0.08,  0.  ,  0.03,  0.05,  0.04,  0.14,
        0.03,  0.01,

In [16]:
# Build a random forest regressor from the training data.
# min_samples_split=2 is the smallest value newer scikit-learn accepts;
# NOTE(review): the original passed 1, presumably treated like 2 by the old release, so predictions should not change - confirm.
# random_state pins the forest's randomness so re-running the notebook
# reproduces the same model and predictions.
regressor = RandomForestRegressor(n_estimators=150, min_samples_split=2,
                                  random_state=0)
regressor.fit(x_train, y_train)
# Show the model's predictions on the training data itself (a sanity check,
# not an estimate of generalisation error).
print(regressor.predict(x_train))

[ 0.19393333  0.45566667  0.08013333  0.12693333  0.04986667  0.1582
  0.1646      0.14446667  0.08653333  0.14906667  0.12513333  0.15513333
  0.0448      0.16006667  0.183       0.19326667  0.46493333  0.14586667
  0.02033333  0.08193333  0.10713333  0.11253333  0.0176      0.1254
  0.07306667  0.0584      0.08906667  0.10486667  0.05006667  0.02993333
  0.1456      0.0858      0.03886667  0.31566667  0.231       0.40706667
  0.14133333  0.0802      0.09206667  0.0966      0.444       0.157       0.1888
  0.2724      0.17133333  0.0434      0.15146667  0.04913333  0.06986667
  0.11166667  0.13346667  0.14706667  0.06373333  0.052       0.13966667
  0.1132      0.12666667  0.08786667  0.0272      0.09006667  0.0546
  0.3132      0.0872      0.06966667  0.04766667  0.16713333  0.14833333
  0.10733333  0.18046667  0.069       0.072       0.0736      0.05086667
  0.06586667  0.1142      0.03853333  0.1352      0.20833333  0.1322
  0.0928      0.0408      0.06626667  0.254       0.0896   

Build the feature-matrix dataframe for the test cases from the full dataframe, predict FGA for the test cases with the trained model, and save the predictions to a CSV file for submission

In [17]:
# Build the test-case feature dataframe.
# First give the single test column its name ...
test_column_names = {0: 'CaseID'}
test_df = test_df.rename(columns=test_column_names)
test_df.head()
test_df['CaseID'].dtype
# ... then keep every test case (left join) and attach its features by CaseID.
test_feature = test_df.merge(df_full, on="CaseID", how='left')
test_feature.head()

Unnamed: 0,CaseID,f1_DX,f2_DX,f3_DX,f4_DX,f5_DX,f6_DX,f7_DX,f8_DX,f9_DX,...,f4_TS,f5_TS,f6_TS,f7_TS,f8_TS,f9_TS,f10_TS,f11_TS,f12_TS,f13_TS
0,11229,0.000129,747.830797,0.786749,1753.415849,0.077938,335.755439,6265.832599,8.238044,13.83815,...,2310.808478,0.094541,341.599007,8603.876428,8.391743,13.747654,9.9e-05,5.570626,-0.150675,0.941751
1,11996,0.003496,215.596175,0.881285,908.038743,0.247822,411.400182,3416.558797,7.441903,11.51888,...,3147.025011,0.112963,284.624617,12037.631286,8.568638,13.784936,0.000111,5.453467,-0.185569,0.967272
2,12059,0.000153,763.569634,0.85324,2601.46983,0.099195,253.437615,9642.309684,8.482255,13.985507,...,4470.127578,0.131369,292.23268,16817.195282,8.558706,13.996703,9e-05,5.863353,-0.165541,0.956562
3,12796,0.000196,561.27671,0.763081,1184.539811,0.089579,375.160107,4176.882535,7.886615,13.240285,...,3974.814814,0.128532,295.236285,15329.401068,8.574868,13.691855,0.000115,5.460737,-0.198981,0.973415
4,13553,0.00017,501.849726,0.736065,950.712969,0.075178,309.21509,3301.002151,7.840456,13.263432,...,1833.206178,0.083901,309.561942,6504.596508,8.254674,13.872549,7.9e-05,5.808166,-0.111328,0.892058


In [18]:
# Predict FGA for the test cases.
# x: every feature column (everything except the case ID).
x2_df = test_feature.drop(['CaseID'], axis=1)
# `.values` replaces DataFrame.as_matrix(), which newer pandas releases
# deprecated and then removed; both return the underlying NumPy array.
x_test = x2_df.values
# Run the model once and reuse the result for both display and saving.
y_test = regressor.predict(x_test)
print(y_test)

[ 0.13293333  0.11026667  0.14073333  0.12853333  0.108       0.06046667
  0.12386667  0.19973333  0.08226667  0.18486667  0.04633333  0.2456
  0.1372      0.18733333  0.1576      0.1324      0.04926667  0.0714
  0.07626667  0.11253333  0.1554      0.19053333  0.12533333  0.18293333
  0.16273333  0.1174      0.1098      0.109       0.1968      0.14586667
  0.1768      0.0732      0.07253333  0.16733333  0.12953333  0.16493333
  0.06373333  0.27593333  0.06093333  0.11893333  0.07973333  0.17466667
  0.15293333  0.12446667  0.0672      0.06626667  0.05426667  0.10313333
  0.12046667  0.04606667  0.1022      0.15806667  0.12086667  0.1464
  0.0856      0.13893333  0.25686667  0.0618      0.3016      0.0514      0.086
  0.18726667  0.09513333  0.16013333  0.20173333  0.07713333  0.13793333
  0.1758      0.17793333  0.04066667  0.18673333  0.07633333  0.18106667
  0.14153333  0.17633333  0.1342      0.08533333  0.1416      0.2158
  0.0848      0.1454      0.06773333  0.10306667  0.0444    

In [19]:
# Save the predicted FGA values to a CSV file: one "CaseID,FGA" row per test
# case, with no header row and no index column.
# Every prediction must pair with exactly one test case; check this instead
# of slicing to a hard-coded number of rows.
if len(y_test) != len(test_df):
    raise ValueError("got %d predictions for %d test cases"
                     % (len(y_test), len(test_df)))
# Build the two columns explicitly (0 = case ID as text, 1 = predicted FGA).
# This avoids relying on the column labels pd.concat assigns to a mix of
# named and unnamed Series, and `.values` avoids any index alignment.
final_df = pd.DataFrame(
    {0: test_df['CaseID'].astype(str).values, 1: y_test},
    columns=[0, 1],
)
final_df.info()

final_df.to_csv('predict_1b.csv', header=False, index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 139
Data columns (total 2 columns):
0    140 non-null object
1    140 non-null float64
dtypes: float64(1), object(1)
memory usage: 3.3+ KB
