In [114]:

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from array import *
import warnings
warnings.filterwarnings('ignore')

# Random Forest
# Logistic Regression
# Linear Discriminant Analysis (LDA)
# Gaussian Naive Bayes
# Support Vector Machine (SVM)

We split the data-points into 85% for training and 15% for testing, selected randomly using ***sklearn.model_selection.test_train_split()*** with stratification.

Then we create 25 seperate dataframes, each with 18 unique features.
This is done to make a seperate data-frame object for each of the 25 groups present in the original data, as required for the training of the Multi-Classifiers.

In [101]:
# data
data = pd.read_csv("C:/Users/ssmsa/Documents/CODE/ml/project_p2/training_data.csv")
data1 = pd.read_csv("C:/Users/ssmsa/Documents/CODE/ml/project_p2/training_data_targets.csv", names = ['target'])

X = data
y = data1['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, stratify = y)

# X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

X_train_split = [0]*25

i = 0
a = 0
b = 17
while i < 25 :
    X_train_split[i]  = X_train.iloc[:,a:b+1]
    a = a + 18
    b = b + 18
    i = i + 1
    
X_test_split = [0]*25

i = 0
a = 0
b = 17
while i < 25 :
    X_test_split[i]  = X_test.iloc[:,a:b+1]
    a = a + 18
    b = b + 18
    i = i + 1

Defining all the classifiers with hyperparameter tuning enabled, using ***sklearn.model_selection.GridSearchCV()***

In [102]:
# random forest
search_space = {'n_estimators' : [int(x) for x in np.linspace(start = 20, stop = 50, num = 10)],
                'max_features' : ['log2' , 'sqrt'],
               'max_depth' : [4, 5]}

rf_model = RandomForestClassifier(class_weight='balanced')
rf_tuned = GridSearchCV(estimator = rf_model, param_grid = search_space, scoring = 'f1_macro', n_jobs = -1)

In [103]:
# logistic regression
search_space = {'solver' : ['liblinear', 'sag', 'saga'],
               'max_iter' : [200,  10000]}

lr_model = LogisticRegression()
lr_tuned = GridSearchCV(estimator = lr_model, param_grid = search_space, scoring = 'f1_macro', n_jobs = -1)

In [104]:
# linear discriminant analysis (lda)
search_space = {'solver' : ['lsqr', 'eigen'],
               'shrinkage' : ['auto', 0.5]}

lda_model = LinearDiscriminantAnalysis()
lda_tuned = GridSearchCV(estimator = lda_model, param_grid = search_space, scoring = 'f1_macro', n_jobs = -1)

In [105]:
# Gaussian Naive Bayes
cv_method = RepeatedStratifiedKFold(n_splits=5,  n_repeats=3)
search_space = {'var_smoothing': np.logspace(0,-9, num=100)}

gnb_model = GaussianNB()
gnb_tuned = GridSearchCV(estimator = gnb_model, param_grid = search_space, scoring = 'f1_macro', cv = cv_method, n_jobs = -1)

In [106]:
# Support Vector Machine
cv_method1 = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 0)
search_space = {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
               'gamma' : ['scale', 'auto']}

svm_model = SVC()
svm_tuned = GridSearchCV(estimator = svm_model, param_grid = search_space, cv = cv_method1, n_jobs = -1)

#### Fitting Data and Evaluating Performance with Confusion Matrix for ***precision, recall and f1-score***

(Hyperparameter tuning using GridSearchCV has been used for all the classifiers below for best performance.)

Random Forest classifier

In [7]:
rf_tuned.fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
 #print("The best selected best parameters are :", rf_tuned.best_params_, "\n\n")
print("Confusion Matrix : \n")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report : \n")
print(classification_report(y_test, y_pred))
print("Parameters : ", rf_tuned.best_params_)

Confusion Matrix : 

[[11  1]
 [ 1 11]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.92      0.92      0.92        12
           P       0.92      0.92      0.92        12

    accuracy                           0.92        24
   macro avg       0.92      0.92      0.92        24
weighted avg       0.92      0.92      0.92        24

Parameters :  {'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 33}


Logistic Regression classifier

In [59]:
lr_tuned.fit(X_train, y_train)
y_pred = lr_tuned.predict(X_test)
print("Confusion Matrix : \n")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report : \n")
print(classification_report(y_test, y_pred))
print("Parameters : ", lr_tuned.best_params_)

Confusion Matrix : 

[[11  1]
 [ 2 10]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.85      0.92      0.88        12
           P       0.91      0.83      0.87        12

    accuracy                           0.88        24
   macro avg       0.88      0.88      0.87        24
weighted avg       0.88      0.88      0.87        24

Parameters :  {'max_iter': 10000, 'solver': 'saga'}


Linear Discriminant Analysis classifier

In [42]:
lda_tuned.fit(X_train, y_train)
y_pred = lda_tuned.predict(X_test)
print("Confusion Matrix : \n")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report : \n")
print(classification_report(y_test, y_pred))
print("Parameters : ", lda_tuned.best_params_)

Confusion Matrix : 

[[12  0]
 [ 3  9]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.80      1.00      0.89        12
           P       1.00      0.75      0.86        12

    accuracy                           0.88        24
   macro avg       0.90      0.88      0.87        24
weighted avg       0.90      0.88      0.87        24

Parameters :  {'shrinkage': 'auto', 'solver': 'eigen'}


Gaussian Naive Bayes classifier

In [46]:
gnb_tuned.fit(X_train, y_train)
y_pred = gnb_tuned.predict(X_test)
print("Confusion Matrix : \n")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report : \n")
print(classification_report(y_test, y_pred))
print("Parameters : ", gnb_tuned.best_params_)

Confusion Matrix : 

[[10  2]
 [ 1 11]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.91      0.83      0.87        12
           P       0.85      0.92      0.88        12

    accuracy                           0.88        24
   macro avg       0.88      0.88      0.87        24
weighted avg       0.88      0.88      0.87        24

Parameters :  {'var_smoothing': 6.579332246575682e-08}


Support Vector Machine classifier

In [47]:
svm_tuned.fit(X_train, y_train)
y_pred = svm_tuned.predict(X_test)
print("Confusion Matrix : \n")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report : \n")
print(classification_report(y_test, y_pred, zero_division=1))
print("Parameters : ", svm_tuned.best_params_)

Confusion Matrix : 

[[11  1]
 [ 2 10]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.85      0.92      0.88        12
           P       0.91      0.83      0.87        12

    accuracy                           0.88        24
   macro avg       0.88      0.88      0.87        24
weighted avg       0.88      0.88      0.87        24

Parameters :  {'gamma': 'scale', 'kernel': 'linear'}


**Multi-Classifiers**

**For Multi-Classifier approach, the classifier is trained on each feature group seperately and to predict the the labels. Subsequently 25 different predictions are obtained for the class label of each datapoint. The final label is then decided by majority vote.**

**The Confusion Matrix, prescision, recall and f1-score are shown.**

In [2]:
ym_pred = [0]*25
ym_pred_final = [0]*24
ym_score = [0]*25

**Random Forest Multi-Classifier**

In [32]:
i = 0
while i < 25 :
    rf_tuned.fit(X_train_split[i], y_train)
    ym_pred[i] = rf_tuned.predict(X_test_split[i])
    i = i + 1

i = 0
h = 0
p = 0
while i < 24:
    j = 0
    h = 0
    p = 0
    while j < 25:
        if (ym_pred[j][i]) == 'H':
            h = h + 1
        if (ym_pred[j][i]) == 'P' :
            p = p + 1
        j=j+1
    if h > p :
        ym_pred_final[i] = 'H'
    else :
        ym_pred_final[i] = 'P'
    i = i+1

print("Confusion Matrix : \n")
print(confusion_matrix(y_test, ym_pred_final))
print("\nClassification Report : \n")
print(classification_report(y_test, ym_pred_final))

Confusion Matrix : 

[[12  0]
 [ 1 11]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.92      1.00      0.96        12
           P       1.00      0.92      0.96        12

    accuracy                           0.96        24
   macro avg       0.96      0.96      0.96        24
weighted avg       0.96      0.96      0.96        24



In [49]:
i = 0
while i < 25 :
    ym_score[i]  = f1_score(y_test, ym_pred[i], average='macro')
    i = i + 1
    
print(ym_score)

[0.6243478260869566, 0.916083916083916, 0.5714285714285714, 0.7482517482517481, 0.7913043478260869, 0.7078260869565216, 0.8321678321678322, 0.6666666666666666, 0.7037037037037037, 0.5208711433756806, 0.6571428571428571, 0.6643356643356644, 0.6643356643356644, 0.6243478260869566, 0.8321678321678322, 0.6643356643356644, 0.8333333333333334, 0.7913043478260869, 0.7428571428571429, 0.6666666666666666, 0.5555555555555556, 0.6571428571428571, 0.7913043478260869, 0.7913043478260869, 0.6643356643356644]


**Logistic Regression Multi-Classifier**

In [68]:
i = 0
while i < 25 :
    lr_tuned.fit(X_train_split[i], y_train)
    ym_pred[i] = lr_tuned.predict(X_test_split[i])
    i = i + 1

i = 0
h = 0
p = 0
while i < 24:
    j = 0
    h = 0
    p = 0
    while j < 25:
        if (ym_pred[j][i]) == 'H':
            h = h + 1
        if (ym_pred[j][i]) == 'P' :
            p = p + 1
        j=j+1
    if h > p :
        ym_pred_final[i] = 'H'
    else :
        ym_pred_final[i] = 'P'
    i = i+1

print("Confusion Matrix : \n")
print(confusion_matrix(y_test, ym_pred_final))
print("\nClassification Report : \n")
print(classification_report(y_test, ym_pred_final))



Confusion Matrix : 

[[11  1]
 [ 0 12]]

Classification Report : 

              precision    recall  f1-score   support

           H       1.00      0.92      0.96        12
           P       0.92      1.00      0.96        12

    accuracy                           0.96        24
   macro avg       0.96      0.96      0.96        24
weighted avg       0.96      0.96      0.96        24



**Linear Discriminant Alalysis Multi-Classifier**

In [61]:
i = 0
while i < 25 :
    lda_tuned.fit(X_train_split[i], y_train)
    ym_pred[i] = lda_tuned.predict(X_test_split[i])
    i = i + 1

i = 0
h = 0
p = 0
while i < 24:
    j = 0
    h = 0
    p = 0
    while j < 25:
        if (ym_pred[j][i]) == 'H':
            h = h + 1
        if (ym_pred[j][i]) == 'P' :
            p = p + 1
        j=j+1
    if h > p :
        ym_pred_final[i] = 'H'
    else :
        ym_pred_final[i] = 'P'
    i = i+1

print("Confusion Matrix : \n")
print(confusion_matrix(y_test, ym_pred_final))
print("\nClassification Report : \n")
print(classification_report(y_test, ym_pred_final))

Confusion Matrix : 

[[11  1]
 [ 3  9]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.79      0.92      0.85        12
           P       0.90      0.75      0.82        12

    accuracy                           0.83        24
   macro avg       0.84      0.83      0.83        24
weighted avg       0.84      0.83      0.83        24



**Gaussian Naive Bayes Multi-Classifier**

In [63]:
i = 0
while i < 25 :
    gnb_tuned.fit(X_train_split[i], y_train)
    ym_pred[i] = gnb_tuned.predict(X_test_split[i])
    i = i + 1

i = 0
h = 0
p = 0
while i < 24:
    j = 0
    h = 0
    p = 0
    while j < 25:
        if (ym_pred[j][i]) == 'H':
            h = h + 1
        if (ym_pred[j][i]) == 'P' :
            p = p + 1
        j=j+1
    if h > p :
        ym_pred_final[i] = 'H'
    else :
        ym_pred_final[i] = 'P'
    i = i+1

print("Confusion Matrix : \n")
print(confusion_matrix(y_test, ym_pred_final))
print("\nClassification Report : \n")
print(classification_report(y_test, ym_pred_final))

Confusion Matrix : 

[[12  0]
 [ 3  9]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.80      1.00      0.89        12
           P       1.00      0.75      0.86        12

    accuracy                           0.88        24
   macro avg       0.90      0.88      0.87        24
weighted avg       0.90      0.88      0.87        24



**Support Vector Machine Multi-Classifier**

In [67]:
i = 0
while i < 25 :
    svm_model.fit(X_train_split[i], y_train)
    ym_pred[i] = svm_model.predict(X_test_split[i])
    i = i + 1

i = 0
h = 0
p = 0
while i < 24:
    j = 0
    h = 0
    p = 0
    while j < 25:
        if (ym_pred[j][i]) == 'H':
            h = h + 1
        if (ym_pred[j][i]) == 'P' :
            p = p + 1
        j=j+1
    if h > p :
        ym_pred_final[i] = 'H'
    else :
        ym_pred_final[i] = 'P'
    i = i+1

print("Confusion Matrix : \n")
print(confusion_matrix(y_test, ym_pred_final))
print("\nClassification Report : \n")
print(classification_report(y_test, ym_pred_final))

Confusion Matrix : 

[[12  0]
 [ 2 10]]

Classification Report : 

              precision    recall  f1-score   support

           H       0.86      1.00      0.92        12
           P       1.00      0.83      0.91        12

    accuracy                           0.92        24
   macro avg       0.93      0.92      0.92        24
weighted avg       0.93      0.92      0.92        24



**From the above results, we can see that the Random Forest Multi-Classifier and the Linear Regression Multi-Classifier have the best results in terms of f1-score (0.96)**

**I have selected Linear Regression as it has higher Recall (1.0) compared to Linear Regression (0.96) for patients(P) as it is not as big a problem if a healthy person is predicted patient as compared to when a patient is predicted healthy**

**Now, training the model on the entire training data and outputing the results into a text file -** 

In [73]:
train = data

train_split = [0]*25

i = 0
a = 0
b = 17
while i < 25 :
    train_split[i]  = train.iloc[:,a:b+1]
    a = a + 18
    b = b + 18
    i = i + 1

test = pd.read_csv("C:/Users/ssmsa/Documents/CODE/ml/project_p2/test_data.csv")

test_split = [0]*25

i = 0
a = 0
b = 17
while i < 25 :
    test_split[i]  = test.iloc[:,a:b+1]
    a = a + 18
    b = b + 18
    i = i + 1

pred = [0]*25
pred_final = [0]*18

In [115]:
i = 0
while i < 25 :
    lr_tuned.fit(train_split[i], y)
    pred[i] = lr_tuned.predict(test_split[i])
    i = i + 1

i = 0
h = 0
p = 0
while i < 18:
    j = 0
    h = 0
    p = 0
    while j < 25:
        if (pred[j][i]) == 'H':
            h = h + 1
        if (pred[j][i]) == 'P' :
            p = p + 1
        j=j+1
    if h > p :
        pred_final[i] = 'H'
    else :
        pred_final[i] = 'P'
    i = i+1

In [116]:
df = pd.DataFrame(pred_final, columns = ['A']) 
np.savetxt('21233.txt', df['A'], fmt='%c', delimiter='\n')