In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Pre-Processing

In [None]:
# Read the dataset CSV from the working directory into a DataFrame,
# then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='balanced_dataset_m.csv')
df.head(n=5)

In [None]:
# Show the (rows, columns) tuple followed by the per-column count of missing values.
print(df.shape, df.isna().sum(), sep='\n')

In [13]:
from sklearn.decomposition import PCA
# NOTE(review): `LDA` here aliases LatentDirichletAllocation (a topic model).
# If Linear Discriminant Analysis was intended, that class is
# sklearn.discriminant_analysis.LinearDiscriminantAnalysis — confirm before use.
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import Pipeline

In [14]:
# Target is the 'status' column; every other column is used as a predictor.
y = df['status']
x = df.drop(columns='status')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

# Models

## Random Forest

In [16]:
# Candidate hyper-parameter values for the Random Forest search
# (3 * 3 * 3 * 5 = 135 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4, 6, 8],
)

# Base estimator: seeded for reproducibility, with class weights set to 'balanced'.
random_forest = RandomForestClassifier(class_weight='balanced', random_state=4)

# Shuffled 5-fold cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Exhaustive search over the grid, using all available cores and logging each fit.
grid_search_RF = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    cv=kf,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated grid search for the Random Forest on the training split.
grid_search_RF.fit(x_train, y_train)

# Keep the winning hyper-parameter combination and its mean cross-validated score.
RF_standard_noFeaturesRemoved_best_params = grid_search_RF.best_params_
RF_standard_noFeaturesRemoved_best_score = grid_search_RF.best_score_

# best_params_ is a dict of parameter values (not a count), so it is labelled as such.
print('Optimal params:', RF_standard_noFeaturesRemoved_best_params)
print('Best score:', RF_standard_noFeaturesRemoved_best_score)

In [None]:
# Score the held-out split with the search object (it predicts using the refit best model).
y_pred_rf = grid_search_RF.predict(x_test)

# Headline metrics on the test split.
# NOTE(review): recall_score / f1_score are called with their defaults
# (binary averaging, positive label 1) — assumes 'status' is a two-class
# target encoded so that label 1 exists; confirm against the dataset.
accuracy_rf_1 = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Accuracy of Random Forest classifier:", accuracy_rf_1)

recall_rf_1 = recall_score(y_true=y_test, y_pred=y_pred_rf)
print("Recall of Random Forest classifier:", recall_rf_1)

f1score_rf_1 = f1_score(y_true=y_test, y_pred=y_pred_rf)
print("F1 Score of Random Forest classifier:", f1score_rf_1)

# Per-class breakdown followed by the raw confusion counts.
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred_rf))

## SVM

In [19]:
# Support Vector Classifier with default settings; kernel, C and gamma are tuned below.
svm = SVC()

# Shuffled 5-fold cross-validation with a fixed seed so the folds are reproducible.
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

# One sub-grid per kernel: the linear kernel does not use `gamma`, so pairing it
# with every gamma value would only repeat identical fits. Splitting the grid
# keeps the same distinct candidates (4 linear + 16 rbf instead of 32 total).
# When the linear kernel wins, best_params_ therefore contains no 'gamma' entry.
# NOTE(review): the C candidates jump from 0.001 to 0.1 — confirm 0.01 was
# intentionally left out.
param_grid = [
    {'kernel': ['linear'],
     'C': [0.0001, 0.001, 0.1, 1]},
    {'kernel': ['rbf'],
     'C': [0.0001, 0.001, 0.1, 1],
     'gamma': [0.01, 0.1, 1, 10]},
]

# n_jobs=-1 runs the fits in parallel, matching the Random Forest search.
svm_gridsearchcv = GridSearchCV(svm, param_grid, cv=kf, n_jobs=-1)

In [None]:
# Run the cross-validated SVM search on the training split.
svm_gridsearchcv.fit(x_train, y_train)

# Capture the best hyper-parameter combination and its mean cross-validated score.
(
    svm_standard_noFeaturesRemoved_best_params,
    svm_standard_noFeaturesRemoved_best_score,
) = (
    svm_gridsearchcv.best_params_,
    svm_gridsearchcv.best_score_,
)

print('Optimal params:', svm_standard_noFeaturesRemoved_best_params)
print('Best score:', svm_standard_noFeaturesRemoved_best_score)

In [None]:
# Predict once on the held-out split; every metric below is derived from y_pred.
y_pred = svm_gridsearchcv.predict(x_test)

# The search was built without a custom `scoring`, so its .score() is plain
# accuracy. Computing accuracy from the existing predictions gives the same
# value without a second prediction pass over x_test, and matches how the
# other models in this notebook are evaluated.
svm_standard_noFeaturesRemoved_accuracy_score = accuracy_score(y_test, y_pred)
print('Accuracy of SVM classifier:', svm_standard_noFeaturesRemoved_accuracy_score)

# NOTE(review): recall_score / f1_score use their defaults (binary averaging,
# positive label 1) — assumes 'status' is a two-class target in which label 1
# exists; confirm against the dataset.
svm_standard_noFeaturesRemoved_recall_score = recall_score(y_test, y_pred)
print("Recall of SVM classifier:", svm_standard_noFeaturesRemoved_recall_score)

svm_standard_noFeaturesRemoved_f1_score = f1_score(y_test, y_pred)
print("F1 of SVM:", svm_standard_noFeaturesRemoved_f1_score)

# Raw confusion counts followed by the per-class breakdown.
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

## Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import uniform

In [23]:
# Base estimator; the max_iter given here is overridden by the grid values below.
logistic_regression = LogisticRegression(max_iter=1000)

# Single sub-grid: newton-cg solver with an L2 penalty, varying the solver's
# iteration cap and the inverse regularisation strength C (6 * 7 = 42 candidates).
# NOTE(review): max_iter only caps solver iterations; searching over it
# multiplies the number of fits by six — consider fixing it at a value large
# enough to converge and tuning C alone.
param_grid = [
    dict(
        solver=['newton-cg'],
        penalty=['l2'],
        max_iter=[50, 100, 200, 500, 1000, 2500],
        C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    ),
]

# Shuffled 5-fold cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=4)

# error_score='raise' makes any failing fit abort the search instead of being scored as NaN.
logistic_regression_grid_search = GridSearchCV(
    logistic_regression,
    param_grid=param_grid,
    cv=kf,
    error_score='raise',
)

In [None]:
# Run the cross-validated grid search for Logistic Regression on the training split.
logistic_regression_grid_search.fit(x_train, y_train)

# Keep the winning hyper-parameter combination and its mean cross-validated score.
logistic_regression_standard_noFeaturesRemoved_best_params = logistic_regression_grid_search.best_params_
logistic_regression_standard_noFeaturesRemoved_best_score = logistic_regression_grid_search.best_score_

# best_params_ is a dict of parameter values (not a count), so it is labelled as such.
print('Optimal params:', logistic_regression_standard_noFeaturesRemoved_best_params)
print('Best score:', logistic_regression_standard_noFeaturesRemoved_best_score)

In [None]:
# Score the held-out split with the search object (it predicts using the refit best model).
y_pred_logistic_regression = logistic_regression_grid_search.predict(x_test)

# Headline metrics on the test split.
# NOTE(review): recall_score / f1_score are called with their defaults
# (binary averaging, positive label 1) — assumes 'status' is a two-class
# target encoded so that label 1 exists; confirm against the dataset.
accuracy_logreg_1 = accuracy_score(
    y_true=y_test, y_pred=y_pred_logistic_regression
)
print("Accuracy of Logistic Regression classifier:", accuracy_logreg_1)

recall_logreg_1 = recall_score(
    y_true=y_test, y_pred=y_pred_logistic_regression
)
print("Recall of Logistic Regression classifier:", recall_logreg_1)

f1score_logreg_1 = f1_score(
    y_true=y_test, y_pred=y_pred_logistic_regression
)
print("F1 Score of Logistic Regression classifier:", f1score_logreg_1)

# Per-class breakdown followed by the raw confusion counts.
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_logistic_regression))
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred_logistic_regression))