In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Pre-Processing

In [25]:
df = pd.read_csv('balanced_dataset_m.csv')
df.head()

Unnamed: 0,rms_feat,integral_signal,zero_crossings,waveform_length,auto_regressive_coefficients,temporal_moment,avg_amp_change,min_value,max_value,variance,log_detector,linear_env,v_order,modified_mav,skewness,fourier_series,status,Age
0,0.140082,1.927722,9.666667,13.050424,0.108009,0.025644,0.063352,-0.150793,0.083833,0.006556,0.036135,0.588843,-0.025626,0.075217,-3.266544,0.000842,0,27
1,0.110296,1.513884,10.333333,10.587024,0.113399,0.011723,0.051145,-0.106041,0.069044,0.004046,0.030241,0.636949,-0.011714,0.061025,-2.916992,0.000678,0,27
2,0.092105,1.259206,10.0,8.259146,0.10949,0.004074,0.039899,-0.074507,0.063078,0.002315,0.030243,0.679395,-0.004066,0.048415,-2.494197,0.000552,1,23
3,0.16003,2.224544,10.0,16.377881,0.100938,0.052523,0.079504,-0.186571,0.093419,0.009781,0.039544,0.605646,-0.052484,0.088825,-3.570959,0.001021,1,27
4,0.051452,0.739716,9.666667,4.432552,0.118829,0.00077,0.021517,-0.047927,0.035715,0.000845,0.018044,0.65511,-0.000768,0.029428,-2.28361,0.00032,0,53


In [26]:
print(df.shape)
print(df.isna().sum())

(8000, 18)
rms_feat                        0
integral_signal                 0
zero_crossings                  0
waveform_length                 0
auto_regressive_coefficients    0
temporal_moment                 0
avg_amp_change                  0
min_value                       0
max_value                       0
variance                        0
log_detector                    0
linear_env                      0
v_order                         0
modified_mav                    0
skewness                        0
fourier_series                  0
status                          0
Age                             0
dtype: int64


In [27]:
from sklearn.decomposition import PCA
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import Pipeline

In [28]:
x = df.drop('status', axis = 1)
y = df['status']
x_train, x_test, y_train, y_test = train_test_split(x, y , random_state = 2, test_size=0.3)

In [29]:
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    #('lda', LDA()),  # Keep 85% of the variance
    ('classifier', RandomForestClassifier())
])

# Training the pipeline
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
# Evaluating the model
print(pipeline.score(x_test, y_test))
print(pipeline.score(x_train, y_train))


0.9816666666666667
1.0


# Models

## Random Forest

In [30]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8]
}

# Initialize Random Forest classifier
random_forest = RandomForestClassifier(random_state=4, class_weight='balanced')

kf = KFold(n_splits = 5, shuffle = True, random_state = 10)

# Initialize GridSearchCV
grid_search_RF = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=kf, n_jobs=-1, verbose=2)

In [31]:
grid_search_RF.fit(x_train, y_train)

RF_standard_noFeaturesRemoved_best_params = grid_search_RF.best_params_
RF_standard_noFeaturesRemoved_best_score = grid_search_RF.best_score_

print('Optimal number of params:', RF_standard_noFeaturesRemoved_best_params)
print('Best score:', RF_standard_noFeaturesRemoved_best_score)

Fitting 5 folds for each of 135 candidates, totalling 675 fits


Optimal number of params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best score: 0.9816071428571428


In [33]:
# Predict on the test set
y_pred_rf = grid_search_RF.predict(x_test)

# Evaluate the model
accuracy_rf_1 = accuracy_score(y_test, y_pred_rf)
print("Accuracy of Random Forest classifier:", accuracy_rf_1)
recall_rf_1 = recall_score(y_test, y_pred_rf)
print("Recall of Random Forest classifier:", recall_rf_1)
f1score_rf_1 = f1_score(y_test, y_pred_rf)
print("F1 Score of Random Forest classifier:", f1score_rf_1)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Accuracy of Random Forest classifier: 0.9825
Recall of Random Forest classifier: 0.9825145711906744
F1 Score of Random Forest classifier: 0.9825145711906744
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1199
           1       0.98      0.98      0.98      1201

    accuracy                           0.98      2400
   macro avg       0.98      0.98      0.98      2400
weighted avg       0.98      0.98      0.98      2400

Confusion Matrix:
[[1178   21]
 [  21 1180]]


## SVM

In [34]:
svm = SVC()

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
param_grid = {'C': [0.0001, 0.001, 0.1, 1],
              'gamma': [0.01, 0.1, 1, 10],
              'kernel': ['linear', 'rbf']}

svm_gridsearchcv = GridSearchCV(svm, param_grid, cv=kf )

In [35]:
svm_gridsearchcv.fit(x_train, y_train)

svm_standard_noFeaturesRemoved_best_params = svm_gridsearchcv.best_params_
svm_standard_noFeaturesRemoved_best_score = svm_gridsearchcv.best_score_

print('Optimal params:', svm_standard_noFeaturesRemoved_best_params)
print('Best score:', svm_standard_noFeaturesRemoved_best_score)

Optimal params: {'C': 1, 'gamma': 10, 'kernel': 'rbf'}
Best score: 0.9428571428571428


In [36]:
y_pred = svm_gridsearchcv.predict(x_test)

svm_standard_noFeaturesRemoved_accuracy_score = svm_gridsearchcv.score(x_test, y_test)
print('Accuracy of SVM classifier:', svm_standard_noFeaturesRemoved_accuracy_score)

svm_standard_noFeaturesRemoved_recall_score = recall_score(y_test, y_pred)
print("Recall of SVM classifier:", svm_standard_noFeaturesRemoved_recall_score)

svm_standard_noFeaturesRemoved_f1_score = f1_score(y_test, y_pred)
print("F1 of SVM:", svm_standard_noFeaturesRemoved_f1_score)

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy of SVM classifier: 0.9383333333333334
Recall of SVM classifier: 0.9267277268942548
F1 of SVM: 0.9376579612468408

Confusion Matrix:
[[1139   60]
 [  88 1113]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1199
           1       0.95      0.93      0.94      1201

    accuracy                           0.94      2400
   macro avg       0.94      0.94      0.94      2400
weighted avg       0.94      0.94      0.94      2400



In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import uniform

In [38]:
logistic_regression = LogisticRegression(max_iter=1000)

param_grid = [
    {'solver' : ['newton-cg'],
      'penalty' : ['l2'],
      'max_iter' : [50,100,200,500,1000,2500],
      'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
]

kf = KFold(n_splits = 5, shuffle = True, random_state = 4)

logistic_regression_grid_search = GridSearchCV(logistic_regression, param_grid=param_grid, cv=kf, error_score='raise')

In [39]:
logistic_regression_grid_search.fit(x_train, y_train)

logistic_regression_standard_noFeaturesRemoved_best_params = logistic_regression_grid_search.best_params_
logistic_regression_standard_noFeaturesRemoved_best_score = logistic_regression_grid_search.best_score_

print('Optimal number of params:', logistic_regression_standard_noFeaturesRemoved_best_params)
print('Best score:', logistic_regression_standard_noFeaturesRemoved_best_score)

Optimal number of params: {'C': 1000, 'max_iter': 50, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.8926785714285714


In [40]:
# Predict on the test set
y_pred_logistic_regression = logistic_regression_grid_search.predict(x_test)

# Evaluate the model
accuracy_logreg_1 = accuracy_score(y_test, y_pred_logistic_regression)
print("Accuracy of Logistic Regression classifier:", accuracy_logreg_1)
recall_logreg_1 = recall_score(y_test, y_pred_logistic_regression)
print("Recall of Logistic Regression classifier:", recall_logreg_1)
f1score_logreg_1 = f1_score(y_test, y_pred_logistic_regression)
print("F1 Score of Logistic Regression classifier:", f1score_logreg_1)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_logistic_regression))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic_regression))

Accuracy of Logistic Regression classifier: 0.895
Recall of Logistic Regression classifier: 0.8825978351373855
F1 Score of Logistic Regression classifier: 0.893760539629005
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      1199
           1       0.91      0.88      0.89      1201

    accuracy                           0.90      2400
   macro avg       0.90      0.90      0.89      2400
weighted avg       0.90      0.90      0.89      2400

Confusion Matrix:
[[1088  111]
 [ 141 1060]]
