### Import library

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as skl
import pandas as pd
%matplotlib inline

### Data Study

In [71]:
trainDataframe = pd.read_csv("train.csv")
trainDataframe = trainDataframe.set_index("id")
trainDataframe

Unnamed: 0_level_0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3556.0,2489.0,265.19,77.53,176.55,0.00,4.20,307.91,52,0,7515.0,1
1,1906.0,134.0,1442.61,551.90,876.07,112.10,168.15,1735.48,20,1,1756.0,0
2,1586.0,71.0,1332.74,684.20,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,683.0,94.0,419.23,255.80,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,1032.0,71.0,1102.72,480.27,625.30,188.78,130.77,1427.97,28,1,1542.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
82,626.0,68.0,1771.57,666.99,1117.48,360.21,118.84,2306.82,42,1,1521.0,0
83,1237.0,71.0,1348.53,428.09,924.69,120.02,48.67,1524.78,56,0,1345.0,0
84,634.0,1002.0,1300.00,558.00,724.00,67.00,105.00,1484.26,34,0,2926.0,1
85,112.0,884.0,942.83,378.49,567.06,116.77,31.81,1104.59,33,1,2352.0,1


In [153]:
print(trainDataframe.dtypes)

MO HLADR+ MFI (cells/ul)    float64
Neu CD64+MFI (cells/ul)     float64
CD3+T (cells/ul)            float64
CD8+T (cells/ul)            float64
CD4+T (cells/ul)            float64
NK (cells/ul)               float64
CD19+ (cells/ul)            float64
CD45+ (cells/ul)            float64
Age                           int64
Sex 0M1F                      int64
Mono CD64+MFI (cells/ul)    float64
label                         int64
dtype: object


In [154]:
trainDataframe.iloc[:,-1].describe()

count    86.000000
mean      0.337209
std       0.475530
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: label, dtype: float64

In [143]:
trainDataframe.iloc[:,-1].value_counts()

0    57
1    29
Name: label, dtype: int64

We found 3 missing values in the data.

In [76]:
trainDataframe.isnull().sum(axis = 0)
# axis = 0 finds the sum along columns 
# axis = 1 finds the sum along rows 

MO HLADR+ MFI (cells/ul)    1
Neu CD64+MFI (cells/ul)     1
CD3+T (cells/ul)            0
CD8+T (cells/ul)            0
CD4+T (cells/ul)            0
NK (cells/ul)               0
CD19+ (cells/ul)            0
CD45+ (cells/ul)            0
Age                         0
Sex 0M1F                    0
Mono CD64+MFI (cells/ul)    1
label                       0
dtype: int64

### Data Preprocessing

In [77]:
trainDataframe[trainDataframe["MO HLADR+ MFI (cells/ul)"].isna() ]

Unnamed: 0_level_0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
39,,,1336.54,739.71,550.3,68.46,192.07,1615.68,21,0,,0


We notice that the three missing values in fact come from the same record. 
Therefore, dropping that record with missing values would be a suitable approach to deal with missing data.

In [90]:
trainDataframe = trainDataframe.dropna(how='any', axis = 0)
# How = any: drop the row when any column is NA

In [112]:
trainDataframeProcessed = trainDataframe.copy()
trainDataframeProcessed.loc[trainDataframe["Sex 0M1F"] == 0, 'Sex_M'] = 1
trainDataframeProcessed.loc[trainDataframe["Sex 0M1F"] == 0, 'Sex_F'] = 0
trainDataframeProcessed.loc[trainDataframe["Sex 0M1F"] == 1, 'Sex_M'] = 0
trainDataframeProcessed.loc[trainDataframe["Sex 0M1F"] == 1, 'Sex_F'] = 1
trainDataframeProcessed.Sex_M = trainDataframeProcessed.Sex_M.astype(np.int64)
trainDataframeProcessed.Sex_F = trainDataframeProcessed.Sex_F.astype(np.int64)

trainDataframeProcessed = trainDataframeProcessed.drop("Sex 0M1F", axis = 1)

In [114]:
# Normalize
features = trainDataframeProcessed.loc[:, trainDataframeProcessed.columns != 'label'] 
features = (features-features.mean())/features.std()
features

Unnamed: 0_level_0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Mono CD64+MFI (cells/ul),Sex_M,Sex_F
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2.993989,4.484377,-1.150888,-1.155294,-1.013082,-1.233235,-1.179124,-1.274245,1.120810,4.546444,0.971312,-0.971312
1,0.838401,-0.318966,0.748939,0.218978,1.217478,-0.588464,0.520688,0.520054,-1.982278,-0.259124,-1.017564,1.017564
2,0.420347,-0.447463,0.571658,0.602257,0.513381,0.175656,1.022181,0.626337,-1.206506,-0.630452,-1.017564,1.017564
3,-0.759347,-0.400551,-0.902336,-0.638837,-1.058935,-0.818821,-0.759433,-0.984771,1.411725,-0.520306,-1.017564,1.017564
4,-0.303408,-0.447463,0.200509,0.011463,0.417848,-0.147420,0.133137,0.133548,-1.206506,-0.437695,-1.017564,1.017564
...,...,...,...,...,...,...,...,...,...,...,...,...
82,-0.833813,-0.453582,1.279732,0.552399,1.987262,0.838602,0.009448,1.238166,0.151095,-0.455219,-1.017564,1.017564
83,-0.035592,-0.447463,0.597136,-0.139705,1.372513,-0.542910,-0.718065,0.255227,1.508697,-0.602081,0.971312,-0.971312
84,-0.823362,1.451438,0.518830,0.236650,0.732572,-0.847868,-0.134043,0.204298,-0.624677,0.717176,0.971312,-0.971312
85,-1.505311,1.210761,-0.057481,-0.283399,0.232138,-0.561603,-0.892867,-0.272906,-0.721649,0.238205,-1.017564,1.017564


In [115]:
# trainDataframeProcessed = trainDataframe.copy()
trainDataframeProcessed.loc[:, trainDataframeProcessed.columns != 'label']  = features
trainDataframeProcessed

Unnamed: 0_level_0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Mono CD64+MFI (cells/ul),label,Sex_M,Sex_F
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2.993989,4.484377,-1.150888,-1.155294,-1.013082,-1.233235,-1.179124,-1.274245,1.120810,4.546444,1,0.971312,-0.971312
1,0.838401,-0.318966,0.748939,0.218978,1.217478,-0.588464,0.520688,0.520054,-1.982278,-0.259124,0,-1.017564,1.017564
2,0.420347,-0.447463,0.571658,0.602257,0.513381,0.175656,1.022181,0.626337,-1.206506,-0.630452,0,-1.017564,1.017564
3,-0.759347,-0.400551,-0.902336,-0.638837,-1.058935,-0.818821,-0.759433,-0.984771,1.411725,-0.520306,0,-1.017564,1.017564
4,-0.303408,-0.447463,0.200509,0.011463,0.417848,-0.147420,0.133137,0.133548,-1.206506,-0.437695,0,-1.017564,1.017564
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,-0.833813,-0.453582,1.279732,0.552399,1.987262,0.838602,0.009448,1.238166,0.151095,-0.455219,0,-1.017564,1.017564
83,-0.035592,-0.447463,0.597136,-0.139705,1.372513,-0.542910,-0.718065,0.255227,1.508697,-0.602081,0,0.971312,-0.971312
84,-0.823362,1.451438,0.518830,0.236650,0.732572,-0.847868,-0.134043,0.204298,-0.624677,0.717176,1,0.971312,-0.971312
85,-1.505311,1.210761,-0.057481,-0.283399,0.232138,-0.561603,-0.892867,-0.272906,-0.721649,0.238205,1,-1.017564,1.017564


### Testing Data Preparation

In [156]:
testing_data = pd.read_csv("test.csv")
testing_data = testing_data.set_index("id")

In [157]:
testing_data_processed = testing_data.copy()
testing_data_processed.loc[testing_data["Sex 0M1F"] == 0, 'Sex_M'] = 1
testing_data_processed.loc[testing_data["Sex 0M1F"] == 0, 'Sex_F'] = 0
testing_data_processed.loc[testing_data["Sex 0M1F"] == 1, 'Sex_M'] = 0
testing_data_processed.loc[testing_data["Sex 0M1F"] == 1, 'Sex_F'] = 1
testing_data_processed.Sex_M = testing_data_processed.Sex_M.astype(np.int64)
testing_data_processed.Sex_F = testing_data_processed.Sex_F.astype(np.int64)

testing_data_processed = testing_data_processed.drop("Sex 0M1F", axis = 1)

In [160]:
testing_data_processed = testing_data_processed.loc[:, testing_data_processed.columns != 'label'] 
testing_data_processed = (testing_data_processed-testing_data_processed.mean())/testing_data_processed.std()

testing_data_processed.head()

Unnamed: 0_level_0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Mono CD64+MFI (cells/ul),Sex_M,Sex_F
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2.111764,-0.203435,0.484071,0.539065,0.421785,-0.527675,-0.231982,0.2162,0.404043,1.129592,-1.333737,1.333737
1,-1.004254,-0.279988,-1.020508,-0.812485,-1.031049,0.867676,-0.192762,-0.634078,1.039737,-1.301425,-1.333737,1.333737
2,-0.500459,-0.501588,0.260269,-0.15422,0.812723,1.531076,2.352054,1.104358,-0.973294,-0.518928,0.737065,-0.737065
3,-0.45513,-0.630518,0.75756,1.107835,0.245949,0.811555,-1.023488,0.71035,0.933788,-0.522445,0.737065,-0.737065
4,-0.067894,-0.199406,-0.345397,-0.415662,-0.124049,0.212104,-0.314435,-0.273044,0.192145,-0.111853,0.737065,-0.737065


### Trial Model

In [161]:
# loading training/testing data
training_data = trainDataframeProcessed


In [162]:
train_x = training_data.drop(["label"], axis = 1)
train_y = training_data["label"]

In [163]:
model = skl.LogisticRegression()
model.fit(train_x, train_y)
predict_y = model.predict(train_x)
print("training accuracy:", (predict_y == train_y).sum()/ len(predict_y))

predict_y = model.predict(testing_data_processed)
# print("testing accuracy:", (predict_y == test_y).sum()/ len(predict_y))
print(predict_y)

training accuracy: 0.9186046511627907
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 1 1 0 1 0 0 0
 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1]


In [147]:
result = pd.DataFrame(predict_y,index=testing_data_processed.index,columns = ["label"])
result.to_csv("prediction.csv", index=True)

In [164]:
result.iloc[:,-1].value_counts()

0    44
1    15
Name: label, dtype: int64

### Random Forest

In [166]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report 
from sklearn.ensemble import RandomForestClassifier

In [167]:
modelRF = RandomForestClassifier(max_depth=100, n_estimators=10)
modelRF.fit(train_x, train_y)
predict_y = model.predict(train_x)
print("training accuracy:", (predict_y == train_y).sum()/ len(predict_y))

training accuracy: 0.9186046511627907


In [168]:
# Set the parameters by cross-validation
tuned_parameters = [{'n_estimators': [5,10,30,50,100], 'max_depth': [3,5,10,20]}]
# 5x4 = 20 combination 

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                   scoring='accuracy')

clf.fit(train_x, train_y)

print("best parameters: ", clf.best_params_)
print("best score:", clf.best_score_ )
print(clf.cv_results_['mean_test_score'])
print(clf.cv_results_['std_test_score'])
print(clf.cv_results_["params"])

best parameters:  {'max_depth': 3, 'n_estimators': 50}
best score: 0.918954248366013
[0.84836601 0.88366013 0.90718954 0.91895425 0.90718954 0.80130719
 0.83660131 0.88431373 0.90718954 0.89607843 0.8248366  0.88366013
 0.87189542 0.90784314 0.91895425 0.8372549  0.89542484 0.87189542
 0.89542484 0.89542484]
[0.04810031 0.03729501 0.02785265 0.04614212 0.04647422 0.13800604
 0.06953954 0.05052588 0.02785265 0.04094238 0.05411828 0.06449096
 0.08666972 0.0573676  0.05927207 0.07784091 0.02301545 0.05796026
 0.04374693 0.02301545]
[{'max_depth': 3, 'n_estimators': 5}, {'max_depth': 3, 'n_estimators': 10}, {'max_depth': 3, 'n_estimators': 30}, {'max_depth': 3, 'n_estimators': 50}, {'max_depth': 3, 'n_estimators': 100}, {'max_depth': 5, 'n_estimators': 5}, {'max_depth': 5, 'n_estimators': 10}, {'max_depth': 5, 'n_estimators': 30}, {'max_depth': 5, 'n_estimators': 50}, {'max_depth': 5, 'n_estimators': 100}, {'max_depth': 10, 'n_estimators': 5}, {'max_depth': 10, 'n_estimators': 10}, {'max_d

In [169]:
model = RandomForestClassifier(max_depth=5, n_estimators=30)
cross_val_score(model, train_x, train_y, cv=5, scoring="accuracy")

array([0.88888889, 0.94117647, 0.94117647, 0.88235294, 1.        ])

In [172]:
print("Detailed classification report:") 
y_true, y_pred = train_y, clf.predict(train_x)
print(classification_report(y_true, y_pred))

Detailed classification report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        57
           1       1.00      0.86      0.93        29

    accuracy                           0.95        86
   macro avg       0.97      0.93      0.95        86
weighted avg       0.96      0.95      0.95        86

