In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix ,classification_report,precision_score, recall_score ,f1_score, roc_curve, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Load Data

In [3]:
train = pd.read_csv('./msbd5001-spring-2022/train.csv')
test = pd.read_csv('./msbd5001-spring-2022/test.csv')
sample = pd.read_csv('./msbd5001-spring-2022/sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.8,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.3,188.78,130.77,1427.97,28,1,1542.0,0


In [5]:
test.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul)
0,0,2843.0,156.0,1358.52,730.78,637.85,127.06,94.82,1588.62,45,1,3256.0
1,1,437.0,137.0,509.43,268.05,243.07,390.86,98.24,1002.76,51,1,491.0
2,2,826.0,82.0,1232.22,493.42,744.08,516.28,320.15,2200.58,32,0,1381.0
3,3,861.0,50.0,1512.86,925.51,590.07,380.25,25.8,1929.1,50,0,1377.0
4,4,1160.0,157.0,890.42,403.91,489.53,266.92,87.63,1251.52,43,0,1844.0


In [6]:
train.isnull().sum()

id                          0
MO HLADR+ MFI (cells/ul)    1
Neu CD64+MFI (cells/ul)     1
CD3+T (cells/ul)            0
CD8+T (cells/ul)            0
CD4+T (cells/ul)            0
NK (cells/ul)               0
CD19+ (cells/ul)            0
CD45+ (cells/ul)            0
Age                         0
Sex 0M1F                    0
Mono CD64+MFI (cells/ul)    1
label                       0
dtype: int64

In [7]:
train.drop('id', axis = 1, inplace = True)

In [8]:
# dealing with missing values
train.dropna(inplace=True)

In [9]:
#test.isnull().sum()

In [10]:
data_num = train[['MO HLADR+ MFI (cells/ul)','Neu CD64+MFI (cells/ul)','CD3+T (cells/ul)','CD8+T (cells/ul)',
                  'CD4+T (cells/ul)','NK (cells/ul)','CD19+ (cells/ul)','CD45+ (cells/ul)','Age','Mono CD64+MFI (cells/ul)']]
data_cat = train[['Sex 0M1F']]

In [11]:
# for i in data_num.columns:
#     plt.hist(data_num[i])
#     plt.title(i)
#     plt.show()

In [12]:
#####to handel the skewness in the data 
def handel_outlier(col):
    train[col] =np.log1p(train[col])

In [13]:
#handel_outlier('Neu CD64+MFI (cells/ul)')

In [14]:

trainX = train.drop(columns = ['label'])
trainY = train['label']
testX = test.drop(['id'],axis=1)

In [15]:
# # Feature Scaling
# sc = StandardScaler()
# trainX = sc.fit_transform(trainX)
# testX = sc.fit_transform(testX)

In [16]:
# split the data into train and test
X_train, X_val, y_train, y_val = train_test_split(trainX,trainY, test_size = 0.25, random_state=12)

# Model Building

In [30]:
#------------------------LogisticRegression-----------------------
c_space = np.logspace(-5, 8, 15) 
param_grid = {'max_iter':[20,40,60,100],
    'C':[0.01,0.1,1,10]} 

lr = LogisticRegression() 

lr = GridSearchCV(lr, param_grid, cv = 5) 

lr.fit(X_train,y_train)

y_pred =  lr.predict(X_val)
print(lr.best_params_)
print(' f1 score: ',f1_score(y_val, y_val))
print(' accuracy score: ',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(' precision score: ',precision_score(y_val, y_pred),'\n')
print(' recall score: ',recall_score(y_val, y_pred),'\n')
print(classification_report(y_val, y_pred))

{'C': 0.01, 'max_iter': 20}
 f1 score:  1.0
 accuracy score:  0.9090909090909091
[[15  0]
 [ 2  5]]
 precision score:  1.0 

 recall score:  0.7142857142857143 

              precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       1.00      0.71      0.83         7

    accuracy                           0.91        22
   macro avg       0.94      0.86      0.89        22
weighted avg       0.92      0.91      0.90        22



In [31]:
test_pred =  lr.predict(testX)
submission = pd.DataFrame({'id':test['id'], 'label':test_pred})
submission.to_csv('lr_submission.csv', index=False)

In [19]:
#------------------------k-nearest neighbors (K-nn)-----------------------
param_grid =[{
"weights":["uniform"],
"n_neighbors":[i for i in range(1,11)]
},
{"weights":["distance"],
"n_neighbors":[i for i in range(1,11)],
"p":[i for i in range(1,6)]
}
]

Knnmodel= KNeighborsClassifier()  

Knnmodel = GridSearchCV(Knnmodel, param_grid, cv = 5) 

Knnmodel.fit(X_train, y_train)

y_pred =  Knnmodel.predict(X_val)
print(Knnmodel.best_params_)
print(' f1 score: ',f1_score(y_val, y_val))
print(' accuracy score: ',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(' precision score: ',precision_score(y_val, y_pred),'\n')
print(' recall score: ',recall_score(y_val, y_pred),'\n')
print(classification_report(y_val, y_pred))

{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
 f1 score:  1.0
 accuracy score:  0.8636363636363636
[[14  1]
 [ 2  5]]
 precision score:  0.8333333333333334 

 recall score:  0.7142857142857143 

              precision    recall  f1-score   support

           0       0.88      0.93      0.90        15
           1       0.83      0.71      0.77         7

    accuracy                           0.86        22
   macro avg       0.85      0.82      0.84        22
weighted avg       0.86      0.86      0.86        22



In [20]:
test_pred =  Knnmodel.predict(testX)
submission = pd.DataFrame({'id':test['id'], 'label':test_pred})
submission.to_csv('knn_submission.csv', index=False)

In [19]:
#------------------------naive bayes-----------------------
param_grid = {}
NBmodel=  GaussianNB()  
NBmodel.fit(X_train, y_train)

y_pred =  NBmodel.predict(X_val)
print(' f1 score: ',f1_score(y_val, y_val))
print(' accuracy score: ',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(' precision score: ',precision_score(y_val, y_pred),'\n')
print(' recall score: ',recall_score(y_val, y_pred),'\n')
print(classification_report(y_val, y_pred))

 f1 score:  1.0
 accuracy score:  0.6363636363636364
[[9 6]
 [2 5]]
 precision score:  0.45454545454545453 

 recall score:  0.7142857142857143 

              precision    recall  f1-score   support

           0       0.82      0.60      0.69        15
           1       0.45      0.71      0.56         7

    accuracy                           0.64        22
   macro avg       0.64      0.66      0.62        22
weighted avg       0.70      0.64      0.65        22



In [28]:
#------------------------support vector classification-----------------------
param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
SVCmodel=  SVC(probability=True)  
SVCmodel = GridSearchCV(SVCmodel, param_grid, cv = 5) 
SVCmodel.fit(X_train, y_train)

y_pred =  SVCmodel.predict(X_val)
print(' f1 score: ',f1_score(y_val, y_val))
print('best para: ', SVCmodel.best_params_)
print(' accuracy score: ',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(' precision score: ',precision_score(y_val, y_pred),'\n')
print(' recall score: ',recall_score(y_val, y_pred),'\n')
print(classification_report(y_val, y_pred))

 f1 score:  1.0
best para:  {'C': 1, 'kernel': 'linear'}
 accuracy score:  0.9090909090909091
[[15  0]
 [ 2  5]]
 precision score:  1.0 

 recall score:  0.7142857142857143 

              precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       1.00      0.71      0.83         7

    accuracy                           0.91        22
   macro avg       0.94      0.86      0.89        22
weighted avg       0.92      0.91      0.90        22



In [29]:
test_pred =  SVCmodel.predict(testX)
submission = pd.DataFrame({'id':test['id'], 'label':test_pred})
submission.to_csv('svc_submission.csv', index=False)

In [23]:
#------------------------Decision Tree-----------------------
param_grid = {
    'criterion':['gini','entropy'],
    'max_depth':[1,2,3,4,5,6,7,8,9,10]
}
DTmodel=  DecisionTreeClassifier()
DTmodel = GridSearchCV(DTmodel, param_grid, cv =5)
DTmodel.fit(X_train, y_train)

y_pred =  DTmodel.predict(X_val)
print(DTmodel.best_params_)
print(' f1 score: ',f1_score(y_val, y_val))
print(' accuracy score: ',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(' precision score: ',precision_score(y_val, y_pred),'\n')
print(' recall score: ',recall_score(y_val, y_pred),'\n')
print(classification_report(y_val, y_pred))

{'criterion': 'gini', 'max_depth': 2}
 f1 score:  1.0
 accuracy score:  0.8181818181818182
[[14  1]
 [ 3  4]]
 precision score:  0.8 

 recall score:  0.5714285714285714 

              precision    recall  f1-score   support

           0       0.82      0.93      0.87        15
           1       0.80      0.57      0.67         7

    accuracy                           0.82        22
   macro avg       0.81      0.75      0.77        22
weighted avg       0.82      0.82      0.81        22



In [24]:
test_pred =  DTmodel.predict(testX)
submission = pd.DataFrame({'id':test['id'], 'label':test_pred})
submission.to_csv('dt_submission.csv', index=False)

In [26]:
#------------------------Random Forest-----------------------
param_grid = {
    'criterion':['gini','entropy'],
    "n_estimators": np.arange(2, 100, 2),
    "max_depth": np.arange(1, 10, 1),
    
}

RFmodel=  RandomForestClassifier() 
RFmodel = GridSearchCV(RFmodel, param_grid, cv =5)
RFmodel.fit(X_train, y_train)

y_pred =  RFmodel.predict(X_val)
print(' f1 score: ',f1_score(y_val, y_val))
print(' accuracy score: ',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(' precision score: ',precision_score(y_val, y_pred),'\n')
print(' recall score: ',recall_score(y_val, y_pred),'\n')
print(classification_report(y_val, y_pred))

 f1 score:  1.0
 accuracy score:  0.7727272727272727
[[12  3]
 [ 2  5]]
 precision score:  0.625 

 recall score:  0.7142857142857143 

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.62      0.71      0.67         7

    accuracy                           0.77        22
   macro avg       0.74      0.76      0.75        22
weighted avg       0.78      0.77      0.78        22



In [27]:
test_pred =  RFmodel.predict(testX)
submission = pd.DataFrame({'id':test['id'], 'label':test_pred})
submission.to_csv('rf_submission.csv', index=False)