In [1]:
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
all_files = [f for f in listdir('result_groups') if isfile(join('result_groups', f))]
df_all_groups = pd.DataFrame()
sum_index = 0

for cur_file in all_files:
    cur_df = pd.DataFrame.from_csv('result_groups/' + cur_file)
    for index, row in cur_df.iterrows():
        if int(row['dados']) >= 1:
            data = pd.DataFrame({"marcador": row['marcador'],
                                 "grupo": row['grupo'],
                                 "dados": row['dados']}, 
                                index=[sum_index],
                                columns=["marcador", "grupo", "dados"])
            
            df_all_groups = df_all_groups.append(data)
        

In [3]:
df_all_groups

Unnamed: 0,marcador,grupo,dados
0,BV421:A,Neuroblastic tumor,8
0,CD10:A,Neuroblastic tumor,21
0,CD105:A,Neuroblastic tumor,4
0,CD117:A,Neuroblastic tumor,21
0,CD123:A,Neuroblastic tumor,16
0,CD14:A,Neuroblastic tumor,17
0,CD15:A,Neuroblastic tumor,3
0,CD16:A,Neuroblastic tumor,16
0,CD19:A,Neuroblastic tumor,16
0,CD19_CD4:A,Neuroblastic tumor,12


In [4]:
all_groups = df_all_groups[df_all_groups['dados'] >= 1]['grupo'].unique()
all_groups

array(['Neuroblastic tumor', 'mesoblastic nephroma', 'renal carcinoma',
       'sopharyngeal carcinoma', 'normal kidney',
       'Extraesqueletal Ewing Sarcoma', 'vascular tumor', 'osteosarcoma',
       'germ cell tumor', 'Adrenal carcinoma',
       'undiferenciated malignt neoplasm',
       'thyroid-like follicular carcinoma of kidney', 'chondrosarcoma',
       'wilms tumor', 'cystic nephroma', 'rhabdomyosarcoma'], dtype=object)

In [5]:
all_groups = df_all_groups[df_all_groups['dados'] >= 2]['grupo'].unique()
all_groups

array(['Neuroblastic tumor', 'Extraesqueletal Ewing Sarcoma',
       'vascular tumor', 'germ cell tumor',
       'undiferenciated malignt neoplasm', 'wilms tumor',
       'rhabdomyosarcoma'], dtype=object)

In [6]:
all_groups = df_all_groups[df_all_groups['dados'] >= 3]['grupo'].unique()
all_groups

array(['Neuroblastic tumor', 'Extraesqueletal Ewing Sarcoma',
       'germ cell tumor', 'wilms tumor', 'rhabdomyosarcoma'], dtype=object)

In [7]:
all_groups = df_all_groups[df_all_groups['dados'] >= 4]['grupo'].unique()
all_groups

array(['Neuroblastic tumor', 'germ cell tumor', 'wilms tumor',
       'rhabdomyosarcoma'], dtype=object)

In [8]:
all_groups = df_all_groups[df_all_groups['dados'] >= 5]['grupo'].unique()
all_groups

array(['Neuroblastic tumor', 'germ cell tumor', 'wilms tumor',
       'rhabdomyosarcoma'], dtype=object)

In [9]:
df_5lines = df_all_groups[df_all_groups['dados'] >= 5]
result_group_df = pd.DataFrame()

for i in range(0, len(all_groups) - 1):
    cur_group_df = df_5lines[df_5lines['grupo'] == all_groups[i]]
    cur_group_df = cur_group_df.set_index(['marcador'])
    for j in range(i + 1, len(all_groups)):
        next_group_df = df_5lines[df_5lines['grupo'] == all_groups[j]]
        next_group_df = next_group_df.set_index(['marcador'])

        merge_result = pd.concat([cur_group_df, next_group_df], axis=1, join='inner')
        merge_result.columns = ['grupo1', 'dados1', 'grupo2', 'dados2']
        result_group_df = result_group_df.append(merge_result)

result_group_df = result_group_df.reset_index()
result_group_df

Unnamed: 0,marcador,grupo1,dados1,grupo2,dados2
0,BV421:A,Neuroblastic tumor,8,germ cell tumor,5
1,CD10:A,Neuroblastic tumor,21,germ cell tumor,9
2,CD34:A,Neuroblastic tumor,23,germ cell tumor,6
3,CD38:A,Neuroblastic tumor,26,germ cell tumor,8
4,CD45:A,Neuroblastic tumor,36,germ cell tumor,10
5,CD56:A,Neuroblastic tumor,36,germ cell tumor,9
6,CD57:A,Neuroblastic tumor,22,germ cell tumor,8
7,CD81:A,Neuroblastic tumor,27,germ cell tumor,7
8,CD9:A,Neuroblastic tumor,28,germ cell tumor,8
9,CD90:A,Neuroblastic tumor,24,germ cell tumor,7


## *** ML Test ***

In [10]:
result_group_df = result_group_df[(result_group_df.grupo1 == all_groups[0]) & (result_group_df.grupo2 == all_groups[1])]

In [11]:
cur_marks = result_group_df['marcador'].unique()
cur_marks

array(['BV421:A', 'CD10:A', 'CD34:A', 'CD38:A', 'CD45:A', 'CD56:A',
       'CD57:A', 'CD81:A', 'CD9:A', 'CD90:A', 'CD99:A', 'EPCAM:A',
       'FSC-A:A', 'FSC-H:A', 'GD2:A', 'nuMIOGENIN:A', 'nuMYOD:A',
       'SSC-A:A', 'SSC-H:A', 'CD45:B', 'FSC-A:B', 'FSC-H:B', 'SSC-A:B',
       'BV421:L', 'CD10:L', 'CD34:L', 'CD38:L', 'CD45:L', 'CD56:L',
       'CD57:L', 'CD81:L', 'CD9:L', 'CD90:L', 'CD99:L', 'EPCAM:L',
       'FSC-A:L', 'FSC-H:L', 'GD2:L', 'nuMIOGENIN:L', 'SSC-A:L', 'SSC-H:L',
       'CD10:MO', 'CD34:MO', 'CD38:MO', 'CD45:MO', 'CD56:MO', 'CD57:MO',
       'CD9:MO', 'CD90:MO', 'CD99:MO', 'EPCAM:MO', 'FSC-A:MO', 'FSC-H:MO',
       'GD2:MO', 'nuMIOGENIN:MO', 'SSC-A:MO', 'SSC-H:MO', 'BV421:N',
       'CD10:N', 'CD34:N', 'CD38:N', 'CD45:N', 'CD56:N', 'CD57:N',
       'CD81:N', 'CD9:N', 'CD90:N', 'CD99:N', 'EPCAM:N', 'FSC-A:N',
       'FSC-H:N', 'GD2:N', 'SSC-A:N', 'SSC-H:N', 'BV421:NK', 'CD45:NK',
       'CD56:NK', 'FSC-A:NK', 'FSC-H:NK', 'GD2:NK', 'SSC-A:NK', 'SSC-H:NK',
       'CD45:T',

In [12]:
tumors = pd.read_csv('train/tumores_20_06_2017_no_comma.csv', encoding='latin1')
two_tumors = tumors[(tumors.GROUP == all_groups[0]) | (tumors.GROUP == all_groups[1])]
columns = np.hstack(('GROUP', cur_marks))
two_tumors = two_tumors[columns]

In [13]:
two_tumors

Unnamed: 0,GROUP,BV421:A,CD10:A,CD34:A,CD38:A,CD45:A,CD56:A,CD57:A,CD81:A,CD9:A,...,FSC-A:NK,FSC-H:NK,GD2:NK,SSC-A:NK,SSC-H:NK,CD45:T,CD56:T,FSC-A:T,FSC-H:T,SSC-A:T
41,Neuroblastic tumor,,-109.93,,-70.03,-57.95,14324.18,752.94,9190.5,18576.51,...,118552.5,,,16094.88,,4548.58,846.48,116437.95,,14032.8
42,Neuroblastic tumor,,,293.98,0.94,429.33,62394.24,,10114.03,24035.58,...,77669.41,63962.0,311.93,12090.24,,4350.71,22.69,92521.52,77129.0,9616.32
43,Neuroblastic tumor,,,718.65,53.99,1005.77,5012.91,,6330.71,32724.67,...,75823.93,60528.0,595.77,12487.68,,3824.95,-80.67,86071.45,72099.0,9364.32
44,Neuroblastic tumor,,,896.36,-13.39,1039.45,24861.46,,6598.98,25796.1,...,82870.97,69585.0,241.0,9695.52,,4890.43,40.48,88713.17,76691.0,9383.04
46,Neuroblastic tumor,,175.11,1125.95,0.47,321.07,8739.53,223.4,12447.49,30057.14,...,98415.59,83157.5,126.13,10535.04,8134.0,2834.55,-605.22,106444.52,91522.0,7927.2
48,germ cell tumor,,104.77,76.45,-7.69,14.43,24767.43,90.76,17709.95,3985.55,...,85626.9,73406.5,,10329.12,7218.0,,,74623.83,,9463.68
49,germ cell tumor,,23.3,146.42,59.44,353.85,887.48,581.51,6156.41,4657.34,...,108370.99,93411.5,,16585.92,,2913.3,,107248.97,91133.5,16175.52
50,germ cell tumor,1289.59,355.01,,-4.17,493.78,2549.15,328.61,12261.62,8979.36,...,90231.96,73048.0,174.32,10435.68,,6599.26,26.46,90673.31,75042.0,7741.44
51,germ cell tumor,,215.48,776.13,138.71,212.67,10611.93,236.75,4550.03,5635.1,...,80284.5,68390.0,,10276.88,7124.0,2967.3,-35.82,71377.2,59971.0,9340.9
52,germ cell tumor,,598.91,832.01,59.18,63.63,,,,,...,,,,,,,,,,


In [14]:
two_tumors = two_tumors.dropna(axis=1,how='any')
two_tumors

Unnamed: 0,GROUP,CD45:A,FSC-A:A,CD45:L,FSC-A:L,CD45:N,FSC-A:N
41,Neuroblastic tumor,-57.95,148847.84,4446.7,126583.64,3311.68,140084.55
42,Neuroblastic tumor,429.33,130303.81,3810.5,83476.58,2987.54,107020.55
43,Neuroblastic tumor,1005.77,176965.89,3298.35,89773.32,3130.71,127244.84
44,Neuroblastic tumor,1039.45,179886.98,5144.98,87134.32,2218.82,133294.08
46,Neuroblastic tumor,321.07,46691.19,3118.85,96214.3,3731.22,113540.7
48,germ cell tumor,14.43,136597.2,701.27,77117.16,585.06,150280.5
49,germ cell tumor,353.85,182620.62,5160.46,103724.98,3133.58,161260.19
50,germ cell tumor,493.78,137133.36,5726.27,73830.58,2095.97,115851.2
51,germ cell tumor,212.67,145741.5,2887.44,63308.7,1751.21,119511.0
52,germ cell tumor,63.63,98496.0,356.14,121320.0,1295.58,147298.94


In [15]:
my_Y = two_tumors.as_matrix(columns=['GROUP'])
del two_tumors['GROUP']
my_X = two_tumors.as_matrix()

In [16]:
my_Y

array([[u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'germ cell tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
       [u'Neuroblastic tumor'],
      

In [17]:
my_X

array([[ -5.79500000e+01,   1.48847840e+05,   4.44670000e+03,
          1.26583640e+05,   3.31168000e+03,   1.40084550e+05],
       [  4.29330000e+02,   1.30303810e+05,   3.81050000e+03,
          8.34765800e+04,   2.98754000e+03,   1.07020550e+05],
       [  1.00577000e+03,   1.76965890e+05,   3.29835000e+03,
          8.97733200e+04,   3.13071000e+03,   1.27244840e+05],
       [  1.03945000e+03,   1.79886980e+05,   5.14498000e+03,
          8.71343200e+04,   2.21882000e+03,   1.33294080e+05],
       [  3.21070000e+02,   4.66911900e+04,   3.11885000e+03,
          9.62143000e+04,   3.73122000e+03,   1.13540700e+05],
       [  1.44300000e+01,   1.36597200e+05,   7.01270000e+02,
          7.71171600e+04,   5.85060000e+02,   1.50280500e+05],
       [  3.53850000e+02,   1.82620620e+05,   5.16046000e+03,
          1.03724980e+05,   3.13358000e+03,   1.61260190e+05],
       [  4.93780000e+02,   1.37133360e+05,   5.72627000e+03,
          7.38305800e+04,   2.09597000e+03,   1.15851200e+05],


## Logistic Regression

In [18]:
kf = KFold(n_splits=10)
kf.get_n_splits(my_X)
scores = []

for train_index, test_index in kf.split(my_X):
    X_train, X_test = my_X[train_index], my_X[test_index]
    y_train, y_test = my_Y[train_index], my_Y[test_index]
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    scores.append(lr.score(X_test, y_test))
    
print 'Score: ' +  str(np.mean(scores))
lr = LogisticRegressionCV()
lr.fit(my_X, my_Y)
print lr.predict(my_X)


Score: 0.725


  y = column_or_1d(y, warn=True)


NameError: name 'LogisticRegressionCV' is not defined

## Random Forest

In [None]:
kf = KFold(n_splits=10)
kf.get_n_splits(my_X)
scores = []

for train_index, test_index in kf.split(my_X):
    X_train, X_test = my_X[train_index], my_X[test_index]
    y_train, y_test = my_Y[train_index], my_Y[test_index]
    lr = RandomForestClassifier()
    lr.fit(X_train, y_train.ravel())
    scores.append(lr.score(X_test, y_test.ravel()))
    
print 'Score: ' +  str(np.mean(scores))
lr = RandomForestClassifier()
lr.fit(my_X, my_Y.ravel())
print lr.predict(my_X)