In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import uniform

# Preprocessing

In [None]:
# Read the dataset from the CSV in the working directory and preview the
# first five rows as the cell's rich output.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,age,case
0,0.019818,0.023967,0.029276,-0.006681,-0.071343,-0.03392,-0.038496,-0.060975,-0.000455,0.021085,...,-0.070779,-0.035411,-0.043172,-0.06925,0.00127,0.028683,0.035498,0.0281,63,0
1,0.020079,0.023657,0.028958,-0.002345,-0.067365,-0.030564,-0.03908,-0.076808,-0.000662,0.022751,...,-0.069595,-0.032711,-0.045286,-0.091273,0.000954,0.032006,0.039094,0.029718,63,0
2,0.01408,0.018367,0.020782,0.001239,-0.039494,-0.030699,-0.016737,-0.037789,-0.030324,0.00866,...,-0.039816,-0.032797,-0.019426,-0.048459,-0.036536,0.014475,0.024109,0.023303,23,0
3,0.008646,0.010663,0.011732,0.006878,-0.002025,-0.013261,-0.025155,-0.035414,-0.018992,0.010588,...,-0.002679,-0.012373,-0.023788,-0.035602,-0.017267,0.013834,0.011803,0.00935,53,1
4,0.018779,0.024554,0.027306,0.002198,-0.043008,-0.059243,-0.022734,-0.044385,-0.036621,0.011433,...,-0.044551,-0.06301,-0.025162,-0.057268,-0.043719,0.019238,0.032469,0.030944,23,0


In [None]:
# Sanity-check the loaded frame: dimensions first, then missing values.
print(df.shape)

# Per-column NaN counts. pandas elides the middle of a long Series when
# printing, so for a wide frame most columns are not visible in this listing.
na_counts = df.isna().sum()
print(na_counts)

# Print one grand total as well, so the outcome of the check does not depend
# on which columns happened to survive the display truncation.
print('Total missing values:', int(na_counts.sum()))

(800, 210)
0       0
1       0
2       0
3       0
4       0
       ..
205     0
206     0
207     0
age     0
case    0
Length: 210, dtype: int64


In [None]:
# Target is the 'case' column; every remaining column (including 'age') is
# kept as a predictor.
y = df['case']
x = df.drop(columns='case')

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is row-level and not stratified. The df.head()
# preview shows pairs of rows with the same age and near-identical feature
# values; if those are repeated measurements of one subject, related rows can
# land on both sides of the split. Confirm whether a grouped split is needed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Candidate hyperparameters for the forest: 3 * 3 * 3 * 5 = 135 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4, 6, 8],
)

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Base estimator; class weights are balanced and the forest itself is seeded.
random_forest = RandomForestClassifier(class_weight='balanced', random_state=4)

# Exhaustive search over the grid using all available cores. No `scoring` is
# passed, so candidates are ranked by the estimator's default score.
grid_search_RF = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    cv=kf,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated grid search on the training split only.
grid_search_RF.fit(x_train, y_train)

# Keep the winning parameter combination and its cross-validation score for
# later comparison.
RF_standard_noFeaturesRemoved_best_params = grid_search_RF.best_params_
RF_standard_noFeaturesRemoved_best_score = grid_search_RF.best_score_

# The first value printed is the parameter dict itself (not a count), so it is
# labelled the same way as in the SVM section.
print('Optimal params:', RF_standard_noFeaturesRemoved_best_params)
print('Best score:', RF_standard_noFeaturesRemoved_best_score)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
Optimal number of params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.9892857142857142


In [None]:
# Label the held-out test split with the fitted grid search.
y_pred_rf = grid_search_RF.predict(x_test)

# Compute the headline metrics up front, then report them together.
accuracy_rf_1 = accuracy_score(y_test, y_pred_rf)
recall_rf_1 = recall_score(y_test, y_pred_rf)
f1score_rf_1 = f1_score(y_test, y_pred_rf)

print("Accuracy of Random Forest classifier:", accuracy_rf_1)
print("Recall of Random Forest classifier:", recall_rf_1)
print("F1 Score of Random Forest classifier:", f1score_rf_1)

# Per-class breakdown followed by the raw confusion matrix; the heading and
# the body are separated by a newline.
print("Classification Report:", classification_report(y_test, y_pred_rf), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_rf), sep="\n")

Accuracy of Random Forest classifier: 0.9875
Recall of Random Forest classifier: 0.9743589743589743
F1 Score of Random Forest classifier: 0.9870129870129869
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       123
           1       1.00      0.97      0.99       117

    accuracy                           0.99       240
   macro avg       0.99      0.99      0.99       240
weighted avg       0.99      0.99      0.99       240

Confusion Matrix:
[[123   0]
 [  3 114]]


# SVM

In [None]:
# Base support-vector classifier; every tuned setting comes from the grid.
# NOTE(review): the features are passed to SVC unscaled even though
# StandardScaler is imported and the result variables are named "standard".
# 'age' is on a much larger scale than the other columns shown by df.head(),
# which matters for an RBF kernel. Confirm whether scaling was intended here.
svm = SVC()

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

# One sub-grid per kernel. `gamma` only affects the RBF kernel, so crossing it
# with kernel='linear' would refit the same linear model once per gamma value.
# Splitting the grid keeps the same set of distinct models with fewer fits
# (4 linear + 16 rbf candidates instead of 32).
# NOTE(review): the C values skip 0.01 (0.0001, 0.001, 0.1, 1); kept as-is,
# confirm this is intentional.
param_grid = [
    {'kernel': ['linear'],
     'C': [0.0001, 0.001, 0.1, 1]},
    {'kernel': ['rbf'],
     'C': [0.0001, 0.001, 0.1, 1],
     'gamma': [0.01, 0.1, 1, 10]},
]

# No `scoring` is passed, so candidates are ranked by the estimator's default
# score.
svm_gridsearchcv = GridSearchCV(svm, param_grid, cv=kf )

In [None]:
# Run the cross-validated search for the SVM on the training split.
svm_gridsearchcv.fit(x_train, y_train)

# Capture the winning parameter dict and its cross-validation score in one
# step for later comparison.
(
    svm_standard_noFeaturesRemoved_best_params,
    svm_standard_noFeaturesRemoved_best_score,
) = (
    svm_gridsearchcv.best_params_,
    svm_gridsearchcv.best_score_,
)

print('Optimal params:', svm_standard_noFeaturesRemoved_best_params)
print('Best score:', svm_standard_noFeaturesRemoved_best_score)

Optimal params: {'C': 1, 'gamma': 10, 'kernel': 'rbf'}
Best score: 0.9946428571428572


In [None]:
# Label the held-out test split once; every metric below reuses y_pred.
y_pred = svm_gridsearchcv.predict(x_test)

# Compute accuracy from the existing predictions rather than calling
# svm_gridsearchcv.score(x_test, y_test): that would predict on the test set a
# second time, and it would stop being accuracy if a `scoring` argument were
# ever passed to the grid search. This also matches the other model sections.
svm_standard_noFeaturesRemoved_accuracy_score = accuracy_score(y_test, y_pred)
print('Accuracy of SVM classifier:', svm_standard_noFeaturesRemoved_accuracy_score)

svm_standard_noFeaturesRemoved_recall_score = recall_score(y_test, y_pred)
print("Recall of SVM classifier:", svm_standard_noFeaturesRemoved_recall_score)

svm_standard_noFeaturesRemoved_f1_score = f1_score(y_test, y_pred)
print("F1 of SVM:", svm_standard_noFeaturesRemoved_f1_score)

# Raw confusion matrix followed by the per-class breakdown.
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy of SVM classifier: 0.9875
Recall of SVM classifier: 0.9743589743589743
F1 of SVM: 0.9870129870129869

Confusion Matrix:
[[123   0]
 [  3 114]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       123
           1       1.00      0.97      0.99       117

    accuracy                           0.99       240
   macro avg       0.99      0.99      0.99       240
weighted avg       0.99      0.99      0.99       240



# Logistic Regression

In [None]:
# The solver's iteration budget is fixed here and deliberately NOT tuned.
# max_iter is a convergence cap rather than a model hyperparameter: searching
# over it overrode this constructor value, multiplied the number of fits by
# six, and allowed the search to pick a model only because a small cap
# happened to come first among tied candidates.
# NOTE(review): the features are passed to LogisticRegression unscaled even
# though StandardScaler is imported. Confirm whether scaling was intended.
logistic_regression = LogisticRegression(max_iter=1000)

# Only the inverse regularisation strength C is actually searched; solver and
# penalty are single-valued and kept in the grid so they still appear in
# best_params_.
param_grid = [
    {'solver' : ['newton-cg'],
      'penalty' : ['l2'],
      'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
]

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits = 5, shuffle = True, random_state = 4)

# error_score='raise' makes a failing fit abort the search instead of being
# recorded as a placeholder score.
logistic_regression_grid_search = GridSearchCV(logistic_regression, param_grid=param_grid, cv=kf, error_score='raise')

In [None]:
# Run the cross-validated grid search on the training split only.
logistic_regression_grid_search.fit(x_train, y_train)

# Keep the winning parameter combination and its cross-validation score for
# later comparison.
logistic_regression_standard_noFeaturesRemoved_best_params = logistic_regression_grid_search.best_params_
logistic_regression_standard_noFeaturesRemoved_best_score = logistic_regression_grid_search.best_score_

# The first value printed is the parameter dict itself (not a count), so it is
# labelled the same way as in the SVM section.
print('Optimal params:', logistic_regression_standard_noFeaturesRemoved_best_params)
print('Best score:', logistic_regression_standard_noFeaturesRemoved_best_score)

Optimal number of params: {'C': 1000, 'max_iter': 50, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.9446428571428571


In [None]:
# Label the held-out test split with the fitted grid search.
y_pred_logistic_regression = logistic_regression_grid_search.predict(x_test)

# Compute the headline metrics up front, then report them together.
accuracy_logreg_1 = accuracy_score(y_test, y_pred_logistic_regression)
recall_logreg_1 = recall_score(y_test, y_pred_logistic_regression)
f1score_logreg_1 = f1_score(y_test, y_pred_logistic_regression)

print("Accuracy of Logistic Regression classifier:", accuracy_logreg_1)
print("Recall of Logistic Regression classifier:", recall_logreg_1)
print("F1 Score of Logistic Regression classifier:", f1score_logreg_1)

# Per-class breakdown followed by the raw confusion matrix; the heading and
# the body are separated by a newline.
print(
    "Classification Report:",
    classification_report(y_test, y_pred_logistic_regression),
    sep="\n",
)
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred_logistic_regression),
    sep="\n",
)

Accuracy of Logistic Regression classifier: 0.9708333333333333
Recall of Logistic Regression classifier: 0.9572649572649573
F1 Score of Logistic Regression classifier: 0.9696969696969696
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       123
           1       0.98      0.96      0.97       117

    accuracy                           0.97       240
   macro avg       0.97      0.97      0.97       240
weighted avg       0.97      0.97      0.97       240

Confusion Matrix:
[[121   2]
 [  5 112]]
