In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import figure
from numpy import mean

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any convergence or deprecation
# warnings the estimators below might raise; consider narrowing the filter to
# specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Colab-only setup: mount Google Drive at /content/drive, the location the
# data-loading cell reads its CSV from.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the MFCC feature table from the mounted Drive folder and preview it.
mfcc_csv_path = '/content/drive/MyDrive/Data mining/mfcc_values_0_49.csv'
df = pd.read_csv(mfcc_csv_path)
df.head()

In [None]:
# Split the frame into predictors (every column except 'target') and the label.

feature_columns = [column for column in df.columns if column != 'target']
X = df[feature_columns]
y = df['target']

In [None]:

# Undersampling experiment.
#
# For every target minority/majority ratio the data is undersampled, 25% of it
# is held out, and a fixed line-up of classifiers is scored twice:
#   * repeated stratified 10-fold cross-validation on the training split
#     (mean ROC AUC, accuracy, F1, precision, recall), and
#   * a single fit on the training split evaluated on the hold-out split.
# Each (ratio, classifier) pair contributes exactly one entry to every list in
# `dict_values_cv`, which the next cell turns into a DataFrame.
#
# NOTE(review): undersampling is applied before train_test_split, so the
# hold-out split is drawn from the resampled data rather than from the original
# class balance. Confirm that this is the intended evaluation set-up.

# Result column -> scikit-learn scorer name used during cross-validation.
cv_scorers = {
  'ROC_AUC_cv': 'roc_auc',
  'Accuracy_cv': 'accuracy',
  'F1_cv': 'f1',
  'Precision_cv': 'precision',
  'Recall_cv': 'recall',
}

result_columns = [
  'sampling_rate', 'Algorithm_detailed_name', 'Algorithm',
  'ROC_AUC_cv', 'Accuracy_cv', 'F1_cv', 'Precision_cv', 'Recall_cv',
  'Accuracy', 'F1', 'Precision', 'Recall',
]
dict_values_cv = {column: [] for column in result_columns}

# List of undersampling rates ranging from 0.1 to 1.0
sampling_ratios = [round(step / 10, 1) for step in range(1, 11)]

# With an integer random_state every split() call yields the same folds, so a
# single splitter can be shared by all models and all rates.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

for rate in sampling_ratios:

  # Undersample the majority class; seeded so a re-run reproduces the same rows.
  under = RandomUnderSampler(sampling_strategy=rate, random_state=42)
  X_under_sampling, y_under_sampling = under.fit_resample(X, y)

  # Train Test split
  X_train, X_test, y_train, y_test = train_test_split(
    X_under_sampling, y_under_sampling, test_size=0.25, random_state=42)

  # Class weights for the weighted models.
  # Assumes label 0 is the majority class and 1 the minority -- TODO confirm.
  w = {0: rate, 1: 1}

  # (detailed name, grouping name used by the plots, estimator)
  models = [
    ("Default Logistic Regression", "Default Logistic Regression",
     LogisticRegression(random_state=0)),
    (f"Weighted Logistic Regression {w}", "Weighted Logistic Regression",
     LogisticRegression(random_state=0, class_weight=w)),
    ("Naive Bayes Classifier", "Naive Bayes Classifier",
     GaussianNB()),
    ("SVM Classifier", "SVM Classifier",
     SVC(gamma='scale')),
    (f"Weighted SVM Classifier {w}", "Weighted SVM Classifier",
     SVC(kernel="linear", class_weight=w)),
    # NOTE(review): this is a default BaggingClassifier, not scikit-learn's
    # RandomForestClassifier. The label is kept so existing tables and plots
    # still match; decide whether the label or the model should change.
    ("Random Forest Classifier", "Random Forest Classifier",
     BaggingClassifier(random_state=0)),
    # The base estimator is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling
    # was later removed, so the positional form works on both.
    ("SVC Bagging Classifier", "SVC Bagging Classifier",
     BaggingClassifier(SVC(kernel="linear"), random_state=0)),
  ]

  for detailed_name, name, model in models:
    # One multi-metric pass instead of five single-metric passes over the
    # same folds: identical splits, five times fewer model fits.
    cv_results = cross_validate(
      model, X_train, y_train,
      scoring=list(cv_scorers.values()), cv=cv, n_jobs=-1)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Append to every column only after evaluation succeeded, so the lists
    # stay the same length even if a fit fails or the cell is interrupted.
    dict_values_cv['sampling_rate'].append(rate)
    dict_values_cv['Algorithm_detailed_name'].append(detailed_name)
    dict_values_cv['Algorithm'].append(name)

    for column, scorer in cv_scorers.items():
      dict_values_cv[column].append(mean(cv_results[f'test_{scorer}']))

    dict_values_cv['Accuracy'].append(accuracy_score(y_test, y_pred))
    dict_values_cv['F1'].append(f1_score(y_test, y_pred, average='binary'))
    dict_values_cv['Precision'].append(precision_score(y_test, y_pred, average='binary'))
    dict_values_cv['Recall'].append(recall_score(y_test, y_pred, average='binary'))



In [None]:
# Turn the per-column result lists into one table: one row per
# (sampling rate, algorithm) pair.
sampling_comparison_cv = pd.DataFrame(data=dict_values_cv)
sampling_comparison_cv

In [None]:
# Rank the models evaluated at undersampling rates below 0.6:
# best CV recall first, then best CV accuracy, then the smallest rate.
ranking_view_columns = ['sampling_rate', 'Algorithm_detailed_name', 'Algorithm',
                        'Accuracy_cv', 'Recall_cv', 'Accuracy', 'Recall']
low_rate_results = sampling_comparison_cv[sampling_comparison_cv['sampling_rate'] < 0.6]
low_rate_ranked = low_rate_results.sort_values(
    by=['Recall_cv', 'Accuracy_cv', 'sampling_rate'],
    ascending=[False, False, True],
)
low_rate_ranked[ranking_view_columns].reset_index(drop=True)

Cross Validation Accuracy plot

In [None]:

# One line per algorithm: mean cross-validation accuracy against the
# undersampling rate.
fig, ax = plt.subplots(figsize=(10, 8), dpi=80)
sampling_cv_plot_df = sampling_comparison_cv.set_index('sampling_rate')

sampling_cv_plot_df.groupby('Algorithm')['Accuracy_cv'].plot(ax=ax, marker='o', legend=True)
ax.set_title('Cross validation Accuracy comparisons for different algorithms for different UnderSampled data', color='black')
ax.set_ylabel('Accuracy cross validation')
ax.set_xlabel('Undersampling rate')
# Park the legend outside the axes, to the right of the plot area.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

Cross Validation Recall plot

In [None]:

# One line per algorithm: mean cross-validation recall against the
# undersampling rate.
fig, ax = plt.subplots(figsize=(10, 8), dpi=80)
sampling_cv_plot_df = sampling_comparison_cv.set_index('sampling_rate')

sampling_cv_plot_df.groupby('Algorithm')['Recall_cv'].plot(ax=ax, marker='o', legend=True)
ax.set_title('Cross validation Recall comparisons for different algorithms for different UnderSampled data', color='black')
ax.set_ylabel('Recall cross validation')
ax.set_xlabel('Undersampling rate')
# Park the legend outside the axes, to the right of the plot area.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

Test Accuracy plot

In [None]:

# One line per algorithm: hold-out (test split) accuracy against the
# undersampling rate.
fig, ax = plt.subplots(figsize=(10, 8), dpi=80)
sampling_cv_plot_df = sampling_comparison_cv.set_index('sampling_rate')

sampling_cv_plot_df.groupby('Algorithm')['Accuracy'].plot(ax=ax, marker='o', legend=True)
ax.set_title('Test Accuracy comparisons for different algorithms for different UnderSampled data', color='black')
ax.set_ylabel('Test Accuracy')
ax.set_xlabel('Undersampling rate')
# Park the legend outside the axes, to the right of the plot area.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

Test Recall plot

In [None]:

# One line per algorithm: hold-out (test split) recall against the
# undersampling rate.
fig, ax = plt.subplots(figsize=(10, 8), dpi=80)
sampling_cv_plot_df = sampling_comparison_cv.set_index('sampling_rate')

sampling_cv_plot_df.groupby('Algorithm')['Recall'].plot(ax=ax, marker='o', legend=True)
ax.set_title('Test Recall comparisons for different algorithms for different UnderSampled data', color='black')
ax.set_ylabel('Test Recall')
ax.set_xlabel('Undersampling rate')
# Park the legend outside the axes, to the right of the plot area.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()