In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Pre-Processing

In [2]:
df = pd.read_csv('downsized_dataset.csv')
df.head()

Unnamed: 0,rms_feat,integral_signal,zero_crossings,waveform_length,auto_regressive_coefficients,temporal_moment,avg_amp_change,min_value,max_value,variance,log_detector,linear_env,v_order,modified_mav,skewness,fourier_series,status
0,0.181467,2.458629,9.0,14.999088,0.112136,0.040264,0.072811,-0.173261,0.125012,0.009854,0.045373,0.643452,-0.040235,0.097932,-2.779578,0.001135,1
1,0.091098,1.225373,11.333333,8.493909,0.113538,0.005682,0.041033,-0.100648,0.062292,0.002716,0.024587,0.637681,-0.005677,0.049322,-3.008738,0.000582,0
2,0.115747,1.685769,9.666667,11.712176,0.110252,0.022562,0.057133,-0.136661,0.079951,0.005886,0.033667,0.625835,-0.022547,0.071522,-3.327338,0.000819,1
3,0.078959,1.177512,10.0,7.179359,0.124593,0.00371,0.035366,-0.086914,0.058312,0.00257,0.029398,0.676005,-0.003698,0.04899,-2.305951,0.000563,0
4,0.074216,1.071029,11.0,7.232317,0.108869,0.003398,0.03528,-0.086399,0.046381,0.001836,0.018263,0.612602,-0.003396,0.04074,-3.068989,0.000471,1


In [3]:
print(df.shape)
print(df.isna().sum())

(40000, 17)
rms_feat                        0
integral_signal                 0
zero_crossings                  0
waveform_length                 0
auto_regressive_coefficients    0
temporal_moment                 0
avg_amp_change                  0
min_value                       0
max_value                       0
variance                        0
log_detector                    0
linear_env                      0
v_order                         0
modified_mav                    0
skewness                        0
fourier_series                  0
status                          0
dtype: int64


In [4]:
x = df.drop('status', axis = 1)
y = df['status']
x_train, x_test, y_train, y_test = train_test_split(x, y , random_state = 2, test_size=0.3)

# Models

## Random Forest

In [5]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8]
}

# Initialize Random Forest classifier
random_forest = RandomForestClassifier(random_state=4, class_weight='balanced')

kf = KFold(n_splits = 5, shuffle = True, random_state = 10)

# Initialize GridSearchCV
grid_search_RF = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=kf, n_jobs=-1, verbose=2)

In [6]:
grid_search_RF.fit(x_train, y_train)

RF_standard_noFeaturesRemoved_best_params = grid_search_RF.best_params_
RF_standard_noFeaturesRemoved_best_score = grid_search_RF.best_score_

print('Optimal number of params:', RF_standard_noFeaturesRemoved_best_params)
print('Best score:', RF_standard_noFeaturesRemoved_best_score)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
Optimal number of params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.9803571428571429


In [7]:
# Predict on the test set
y_pred_rf = grid_search_RF.predict(x_test)

# Evaluate the model
accuracy_rf_1 = accuracy_score(y_test, y_pred_rf)
print("Accuracy of Random Forest classifier:", accuracy_rf_1)
recall_rf_1 = recall_score(y_test, y_pred_rf)
print("Recall of Random Forest classifier:", recall_rf_1)
f1score_rf_1 = f1_score(y_test, y_pred_rf)
print("F1 Score of Random Forest classifier:", f1score_rf_1)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Accuracy of Random Forest classifier: 0.9833333333333333
Recall of Random Forest classifier: 0.9845420161157704
F1 Score of Random Forest classifier: 0.9835715459175292
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5919
           1       0.98      0.98      0.98      6081

    accuracy                           0.98     12000
   macro avg       0.98      0.98      0.98     12000
weighted avg       0.98      0.98      0.98     12000

Confusion Matrix:
[[5813  106]
 [  94 5987]]


## SVM

In [8]:
svm = SVC()

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
param_grid = {'C': [0.0001, 0.001, 0.1, 1],
              'gamma': [0.01, 0.1, 1, 10],
              'kernel': ['linear', 'rbf']}

svm_gridsearchcv = GridSearchCV(svm, param_grid, cv=kf )

In [9]:
svm_gridsearchcv.fit(x_train, y_train)

svm_standard_noFeaturesRemoved_best_params = svm_gridsearchcv.best_params_
svm_standard_noFeaturesRemoved_best_score = svm_gridsearchcv.best_score_

print('Optimal params:', svm_standard_noFeaturesRemoved_best_params)
print('Best score:', svm_standard_noFeaturesRemoved_best_score)

Optimal params: {'C': 1, 'gamma': 10, 'kernel': 'rbf'}
Best score: 0.8963214285714287


In [10]:
y_pred = svm_gridsearchcv.predict(x_test)

svm_standard_noFeaturesRemoved_accuracy_score = svm_gridsearchcv.score(x_test, y_test)
print('Accuracy of SVM classifier:', svm_standard_noFeaturesRemoved_accuracy_score)

svm_standard_noFeaturesRemoved_recall_score = recall_score(y_test, y_pred)
print("Recall of SVM classifier:", svm_standard_noFeaturesRemoved_recall_score)

svm_standard_noFeaturesRemoved_f1_score = f1_score(y_test, y_pred)
print("F1 of SVM:", svm_standard_noFeaturesRemoved_f1_score)

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy of SVM classifier: 0.9045833333333333
Recall of SVM classifier: 0.8840651208682783
F1 of SVM: 0.9037572497268219

Confusion Matrix:
[[5479  440]
 [ 705 5376]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      5919
           1       0.92      0.88      0.90      6081

    accuracy                           0.90     12000
   macro avg       0.91      0.90      0.90     12000
weighted avg       0.91      0.90      0.90     12000



# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import uniform

In [12]:
logistic_regression = LogisticRegression(max_iter=1000)

param_grid = [
    {'solver' : ['newton-cg'],
      'penalty' : ['l2'],
      'max_iter' : [50,100,200,500,1000,2500],
      'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
]

kf = KFold(n_splits = 5, shuffle = True, random_state = 4)

logistic_regression_grid_search = GridSearchCV(logistic_regression, param_grid=param_grid, cv=kf, error_score='raise')

In [13]:
logistic_regression_grid_search.fit(x_train, y_train)

logistic_regression_standard_noFeaturesRemoved_best_params = logistic_regression_grid_search.best_params_
logistic_regression_standard_noFeaturesRemoved_best_score = logistic_regression_grid_search.best_score_

print('Optimal number of params:', logistic_regression_standard_noFeaturesRemoved_best_params)
print('Best score:', logistic_regression_standard_noFeaturesRemoved_best_score)

Optimal number of params: {'C': 100, 'max_iter': 50, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.7999285714285713


In [14]:
# Predict on the test set
y_pred_logistic_regression = logistic_regression_grid_search.predict(x_test)

# Evaluate the model
accuracy_logreg_1 = accuracy_score(y_test, y_pred_logistic_regression)
print("Accuracy of Logistic Regression classifier:", accuracy_logreg_1)
recall_logreg_1 = recall_score(y_test, y_pred_logistic_regression)
print("Recall of Logistic Regression classifier:", recall_logreg_1)
f1score_logreg_1 = f1_score(y_test, y_pred_logistic_regression)
print("F1 Score of Logistic Regression classifier:", f1score_logreg_1)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_logistic_regression))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic_regression))

Accuracy of Logistic Regression classifier: 0.7988333333333333
Recall of Logistic Regression classifier: 0.7932905772076961
F1 Score of Logistic Regression classifier: 0.7998673520145912
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.80      0.80      5919
           1       0.81      0.79      0.80      6081

    accuracy                           0.80     12000
   macro avg       0.80      0.80      0.80     12000
weighted avg       0.80      0.80      0.80     12000

Confusion Matrix:
[[4762 1157]
 [1257 4824]]
