In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

import imblearn
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Filter out ConvergenceWarning explicitly
warnings.filterwarnings('ignore', category=ConvergenceWarning)

## Dataset

In [3]:
df_processed = pd.read_csv("data_ml.csv", index_col = 0)
df_processed.head()

Unnamed: 0_level_0,Customer Lifetime Value,Response,Coverage,Education,Gender,Income,Monthly Premium Auto,Number of Policies,Total Claim Amount,Vehicle Size,...,Sales Channel_Agent,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,2763.519279,0,0,2,1,56274,69,1,384.811147,1,...,1,0,0,0,0,0,0,0,0,1
QZ44356,6979.535903,0,1,2,1,0,94,8,1131.464935,1,...,1,0,0,0,1,0,0,0,0,0
AI49188,12887.43165,0,2,2,1,48767,108,2,566.472247,1,...,1,0,0,0,0,0,0,0,0,1
WW63253,7645.861827,0,0,2,0,0,106,7,529.881344,1,...,0,0,1,0,0,0,0,1,0,0
HB64268,2813.692575,0,0,2,0,43836,73,1,138.130879,1,...,1,0,0,0,1,0,0,0,0,0


# Splitting Data

In [4]:
x = df_processed.drop(columns = ['Response'] , axis = 1)
y = df_processed['Response']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, stratify= y, random_state = 42)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(7236, 48) (1809, 48) (7236,) (1809,)


# Handling Imbalance Data

In [6]:
y_train.value_counts()

Response
0    6194
1    1042
Name: count, dtype: int64

#### Random Over Sampling

In [7]:
df_train = pd.concat([x_train, y_train], axis=1)

not_renewal = df_train[df_train['Response'] == 0]
renewal = df_train[df_train['Response'] == 1]

renewal_oversample = resample(renewal, replace=True, n_samples = len(not_renewal), random_state = 42)

df_OverSampled = pd.concat([not_renewal, renewal_oversample])
df_OverSampled['Response'].value_counts()

Response
0    6194
1    6194
Name: count, dtype: int64

In [8]:
x_train_os = df_OverSampled.drop(columns = ['Response'])
y_train_os = df_OverSampled['Response']

#### Smote

In [9]:
sm = SMOTE(random_state=42)
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

x = pd.DataFrame(data = x_train_sm, columns = x_train.columns)
y = pd.DataFrame(data = y_train_sm, columns = ['Response'])

df_smote = x.join(y)
# df_smote = pd.concat([x_train_sm, y_train_sm], axis = 1)
df_smote['Response'].value_counts()

Response
0    6194
1    6194
Name: count, dtype: int64

In [10]:
columns_continuous = ['Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Policies', 
                      'Total Claim Amount']

In [11]:
std_scale = StandardScaler()
mm_scale = MinMaxScaler()
rb_scale = RobustScaler()

# Tuning Model

## SVM

In [12]:
svm = SVC(max_iter = 1000)

param_svm = {'C' : np.linspace(0.00001, 10, 1000),
             'kernel' : ['linear', 'rbf'],
             "gamma" : np.arange(0.0001, 100)}

#### Random Over Sampling without Scaling

In [13]:
svm_os = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                            verbose = 1, scoring = 'f1')

svm_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [14]:
svm_os_tuned = svm_os.best_estimator_

pred_train_os = svm_os_tuned.predict(x_train_os)
pred_test_os = svm_os_tuned.predict(x_test)

svm_os_tuned

In [15]:
svm_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
svm_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)

svm_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
svm_recall_tuned_test_os = recall_score(y_test, pred_test_os)

svm_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
svm_prec_tuned_test_os = precision_score(y_test, pred_test_os)

svm_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
svm_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [16]:
cm_svm_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])

cm_svm_tuned_os = pd.DataFrame(data=cm_svm_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,221,39
Akt 0,0,1549


In [17]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1549
           1       1.00      0.85      0.92       260

    accuracy                           0.98      1809
   macro avg       0.99      0.93      0.95      1809
weighted avg       0.98      0.98      0.98      1809



In [18]:
tp_svm_os = cm_svm_tuned_os['Pred 1'][0]
tn_svm_os = cm_svm_tuned_os['Pred 0'][1]

fp_svm_os = cm_svm_tuned_os['Pred 1'][1]
fn_svm_os = cm_svm_tuned_os['Pred 0'][0]

#### Smote without Scaling

In [19]:
svm_sm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                            verbose = 1, scoring = 'f1')

svm_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [20]:
svm_sm_tuned = svm_sm.best_estimator_

pred_train_sm = svm_sm_tuned.predict(x_train_sm)
pred_test_sm = svm_sm_tuned.predict(x_test)

svm_sm_tuned

In [21]:
svm_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
svm_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)

svm_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
svm_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)

svm_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
svm_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)

svm_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
svm_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [22]:
cm_svm_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])

cm_svm_tuned_sm = pd.DataFrame(data=cm_svm_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,255,5
Akt 0,25,1524


In [23]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1549
           1       0.91      0.98      0.94       260

    accuracy                           0.98      1809
   macro avg       0.95      0.98      0.97      1809
weighted avg       0.98      0.98      0.98      1809



In [24]:
tp_svm_sm = cm_svm_tuned_sm['Pred 1'][0]
tn_svm_sm = cm_svm_tuned_sm['Pred 0'][1]

fp_svm_sm = cm_svm_tuned_sm['Pred 1'][1]
fn_svm_sm = cm_svm_tuned_sm['Pred 0'][0]

#### Random Over Sampling with Standard Scaler

In [25]:
svm_os_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                                verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])

svm_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [26]:
svm_os_std_tuned = svm_os_std.best_estimator_

pred_train_os_std = svm_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = svm_os_std_tuned.predict(x_test)

svm_os_std_tuned

In [27]:
svm_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
svm_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)

svm_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
svm_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)

svm_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
svm_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)

svm_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
svm_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [28]:
cm_svm_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])

cm_svm_tuned_os_std = pd.DataFrame(data=cm_svm_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,213,47
Akt 0,7,1542


In [29]:
print(classification_report(y_test, pred_test_os_std))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1549
           1       0.97      0.82      0.89       260

    accuracy                           0.97      1809
   macro avg       0.97      0.91      0.94      1809
weighted avg       0.97      0.97      0.97      1809



In [30]:
tp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][0]
tn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][1]

fp_svm_os_std = cm_svm_tuned_os_std['Pred 1'][1]
fn_svm_os_std = cm_svm_tuned_os_std['Pred 0'][0]

#### Random Over Sampling with MinMax Scaler

In [31]:
svm_os_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                               verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])

svm_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [32]:
svm_os_mm_tuned = svm_os_mm.best_estimator_

pred_train_os_mm = svm_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = svm_os_mm_tuned.predict(x_test)

svm_os_mm_tuned

In [33]:
svm_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
svm_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)

svm_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
svm_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)

svm_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
svm_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)

svm_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
svm_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [34]:
cm_svm_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])

cm_svm_tuned_os_mm = pd.DataFrame(data=cm_svm_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,57,203
Akt 0,0,1549


In [35]:
print(classification_report(y_test, pred_test_os_mm))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1549
           1       1.00      0.22      0.36       260

    accuracy                           0.89      1809
   macro avg       0.94      0.61      0.65      1809
weighted avg       0.90      0.89      0.86      1809



In [36]:
tp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][0]
tn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][1]

fp_svm_os_mm = cm_svm_tuned_os_mm['Pred 1'][1]
fn_svm_os_mm = cm_svm_tuned_os_mm['Pred 0'][0]

#### Random Over Sampling with Robust Scaler

In [37]:
svm_os_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                               verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])

svm_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [38]:
svm_os_rb_tuned = svm_os_rb.best_estimator_

pred_train_os_rb = svm_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = svm_os_rb_tuned.predict(x_test)

svm_os_rb_tuned

In [39]:
svm_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
svm_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)

svm_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
svm_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)

svm_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
svm_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)

svm_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
svm_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [40]:
cm_svm_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])

cm_svm_tuned_os_rb = pd.DataFrame(data=cm_svm_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,209,51
Akt 0,10,1539


In [41]:
print(classification_report(y_test, pred_test_os_rb))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1549
           1       0.95      0.80      0.87       260

    accuracy                           0.97      1809
   macro avg       0.96      0.90      0.93      1809
weighted avg       0.97      0.97      0.97      1809



In [42]:
tp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][0]
tn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][1]

fp_svm_os_rb = cm_svm_tuned_os_rb['Pred 1'][1]
fn_svm_os_rb = cm_svm_tuned_os_rb['Pred 0'][0]

#### Smote with Standard Scaler

In [43]:
svm_sm_std = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                                verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])

svm_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [44]:
svm_sm_std_tuned = svm_sm_std.best_estimator_

pred_train_sm_std = svm_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = svm_sm_std_tuned.predict(x_test)

svm_sm_std_tuned

In [45]:
svm_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
svm_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)

svm_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
svm_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)

svm_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
svm_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)

svm_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
svm_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [46]:
cm_svm_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])

cm_svm_tuned_sm_std = pd.DataFrame(data=cm_svm_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,227,33
Akt 0,17,1532


In [47]:
print(classification_report(y_test, pred_test_sm_std))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1549
           1       0.93      0.87      0.90       260

    accuracy                           0.97      1809
   macro avg       0.95      0.93      0.94      1809
weighted avg       0.97      0.97      0.97      1809



In [48]:
tp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][0]
tn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][1]

fp_svm_sm_std = cm_svm_tuned_sm_std['Pred 1'][1]
fn_svm_sm_std = cm_svm_tuned_sm_std['Pred 0'][0]

#### Smote with MinMax Scaler

In [49]:
svm_sm_mm = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                               verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])

svm_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [50]:
svm_sm_mm_tuned = svm_sm_mm.best_estimator_

pred_train_sm_mm = svm_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = svm_sm_mm_tuned.predict(x_test)

svm_sm_mm_tuned

In [51]:
svm_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
svm_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)

svm_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
svm_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)

svm_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
svm_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)

svm_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
svm_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [52]:
cm_svm_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])

cm_svm_tuned_sm_mm = pd.DataFrame(data=cm_svm_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,80,180
Akt 0,16,1533


In [53]:
print(classification_report(y_test, pred_test_sm_mm))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94      1549
           1       0.83      0.31      0.45       260

    accuracy                           0.89      1809
   macro avg       0.86      0.65      0.69      1809
weighted avg       0.89      0.89      0.87      1809



In [54]:
tp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][0]
tn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][1]

fp_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 1'][1]
fn_svm_sm_mm = cm_svm_tuned_sm_mm['Pred 0'][0]

#### Smote with Robust Scaler

In [55]:
svm_sm_rb = RandomizedSearchCV(estimator = svm, param_distributions = param_svm, cv = 3, n_jobs = -1 , n_iter=300, 
                               verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])

svm_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [56]:
svm_sm_rb_tuned = svm_sm_rb.best_estimator_

pred_train_sm_rb = svm_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = svm_sm_rb_tuned.predict(x_test)

svm_sm_rb_tuned

In [57]:
svm_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
svm_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)

svm_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
svm_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)

svm_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
svm_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)

svm_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
svm_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [58]:
cm_svm_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])

cm_svm_tuned_sm_rb = pd.DataFrame(data=cm_svm_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,224,36
Akt 0,29,1520


In [59]:
print(classification_report(y_test, pred_test_sm_rb))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1549
           1       0.89      0.86      0.87       260

    accuracy                           0.96      1809
   macro avg       0.93      0.92      0.93      1809
weighted avg       0.96      0.96      0.96      1809



In [60]:
tp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][0]
tn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][1]

fp_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 1'][1]
fn_svm_sm_rb = cm_svm_tuned_sm_rb['Pred 0'][0]

## Logistic Regression

In [61]:
logreg = LogisticRegression()

In [62]:
param_logreg = {'C' : np.linspace(0.00001, 10, 1000),
                'penalty' : ['l1', 'l2', 'elasticnet', None],
                'class_weight': [None, 'weight'],
                'fit_intercept' : [True, False]}

#### Random Over Sampling without Scaling

In [63]:
logreg_os = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                               verbose = 1, scoring = 'f1')
logreg_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


675 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
417 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [64]:
logreg_os_tuned = logreg_os.best_estimator_

pred_train_os = logreg_os_tuned.predict(x_train_os)
pred_test_os = logreg_os_tuned.predict(x_test)

logreg_os_tuned

In [65]:
logreg_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
logreg_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)

logreg_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
logreg_recall_tuned_test_os = recall_score(y_test, pred_test_os)

logreg_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
logreg_prec_tuned_test_os = precision_score(y_test, pred_test_os)

logreg_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
logreg_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [66]:
cm_logreg_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])

cm_logreg_tuned_os = pd.DataFrame(data=cm_logreg_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,190,70
Akt 0,452,1097


In [67]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1549
           1       0.30      0.73      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.61      1809
weighted avg       0.85      0.71      0.75      1809



In [68]:
tp_logreg_os = cm_logreg_tuned_os['Pred 1'][0]
tn_logreg_os = cm_logreg_tuned_os['Pred 0'][1]

fp_logreg_os = cm_logreg_tuned_os['Pred 1'][1]
fn_logreg_os = cm_logreg_tuned_os['Pred 0'][0]

#### Smote Sampling without Scaling

In [69]:
logreg_sm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                               verbose = 1, scoring = 'f1')
logreg_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


666 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
438 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [70]:
logreg_sm_tuned = logreg_sm.best_estimator_

pred_train_sm = logreg_sm_tuned.predict(x_train_sm)
pred_test_sm = logreg_sm_tuned.predict(x_test)

logreg_sm_tuned

In [71]:
logreg_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
logreg_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)

logreg_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
logreg_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)

logreg_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
logreg_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)

logreg_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
logreg_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [72]:
cm_logreg_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])

cm_logreg_tuned_sm = pd.DataFrame(data=cm_logreg_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,47,213
Akt 0,29,1520


In [73]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.62      0.18      0.28       260

    accuracy                           0.87      1809
   macro avg       0.75      0.58      0.60      1809
weighted avg       0.84      0.87      0.83      1809



In [74]:
tp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][0]
tn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][1]

fp_logreg_sm = cm_logreg_tuned_sm['Pred 1'][1]
fn_logreg_sm = cm_logreg_tuned_sm['Pred 0'][0]

#### Standard Scaling with Random Over Sampling

In [75]:
logreg_os_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                                   verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = std_scale.fit_transform(x_train_os[columns_continuous])

logreg_os_std.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


678 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
141 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages

In [76]:
logreg_os_std_tuned = logreg_os_std.best_estimator_

pred_train_os_std = logreg_os_std_tuned.predict(x_train_os)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_os_std = logreg_os_std_tuned.predict(x_test)

logreg_os_std_tuned

In [77]:
logreg_acc_tuned_train_os_std = accuracy_score(y_train_os, pred_train_os_std)
logreg_acc_tuned_test_os_std = accuracy_score(y_test, pred_test_os_std)

logreg_recall_tuned_train_os_std = recall_score(y_train_os, pred_train_os_std)
logreg_recall_tuned_test_os_std = recall_score(y_test, pred_test_os_std)

logreg_prec_tuned_train_os_std = precision_score(y_train_os, pred_train_os_std)
logreg_prec_tuned_test_os_std = precision_score(y_test, pred_test_os_std)

logreg_f1_tuned_train_os_std = f1_score(y_train_os, pred_train_os_std)
logreg_f1_tuned_test_os_std = f1_score(y_test, pred_test_os_std)

In [78]:
cm_logreg_tuned_os_std = confusion_matrix(y_test, pred_test_os_std, labels=[1, 0])

cm_logreg_tuned_os_std = pd.DataFrame(data=cm_logreg_tuned_os_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,194,66
Akt 0,458,1091


In [79]:
print(classification_report(y_test, pred_test_os_std))

              precision    recall  f1-score   support

           0       0.94      0.70      0.81      1549
           1       0.30      0.75      0.43       260

    accuracy                           0.71      1809
   macro avg       0.62      0.73      0.62      1809
weighted avg       0.85      0.71      0.75      1809



In [80]:
tp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][0]
tn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][1]

fp_logreg_os_std = cm_logreg_tuned_os_std['Pred 1'][1]
fn_logreg_os_std = cm_logreg_tuned_os_std['Pred 0'][0]

#### MinMax Scaling with Random Over Sampling

In [81]:
logreg_os_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                                  verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = mm_scale.fit_transform(x_train_os[columns_continuous])

logreg_os_mm.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


696 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
435 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [82]:
logreg_os_mm_tuned = logreg_os_mm.best_estimator_

pred_train_os_mm = logreg_os_mm_tuned.predict(x_train_os)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_os_mm = logreg_os_mm_tuned.predict(x_test)

logreg_os_mm_tuned

In [83]:
logreg_acc_tuned_train_os_mm = accuracy_score(y_train_os, pred_train_os_mm)
logreg_acc_tuned_test_os_mm = accuracy_score(y_test, pred_test_os_mm)

logreg_recall_tuned_train_os_mm = recall_score(y_train_os, pred_train_os_mm)
logreg_recall_tuned_test_os_mm = recall_score(y_test, pred_test_os_mm)

logreg_prec_tuned_train_os_mm = precision_score(y_train_os, pred_train_os_mm)
logreg_prec_tuned_test_os_mm = precision_score(y_test, pred_test_os_mm)

logreg_f1_tuned_train_os_mm = f1_score(y_train_os, pred_train_os_mm)
logreg_f1_tuned_test_os_mm = f1_score(y_test, pred_test_os_mm)

In [84]:
cm_logreg_tuned_os_mm = confusion_matrix(y_test, pred_test_os_mm, labels=[1, 0])

cm_logreg_tuned_os_mm = pd.DataFrame(data=cm_logreg_tuned_os_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,183,77
Akt 0,423,1126


In [85]:
print(classification_report(y_test, pred_test_os_mm))

              precision    recall  f1-score   support

           0       0.94      0.73      0.82      1549
           1       0.30      0.70      0.42       260

    accuracy                           0.72      1809
   macro avg       0.62      0.72      0.62      1809
weighted avg       0.84      0.72      0.76      1809



In [86]:
tp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][0]
tn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][1]

fp_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 1'][1]
fn_logreg_os_mm = cm_logreg_tuned_os_mm['Pred 0'][0]

#### Robust Scaling with Random Over Sampling

In [87]:
logreg_os_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                                  verbose = 1, scoring = 'f1')
x_train_os[columns_continuous] = rb_scale.fit_transform(x_train_os[columns_continuous])

logreg_os_rb.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


705 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
438 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [88]:
logreg_os_rb_tuned = logreg_os_rb.best_estimator_

pred_train_os_rb = logreg_os_rb_tuned.predict(x_train_os)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_os_rb = logreg_os_rb_tuned.predict(x_test)

logreg_os_rb_tuned

In [89]:
logreg_acc_tuned_train_os_rb = accuracy_score(y_train_os, pred_train_os_rb)
logreg_acc_tuned_test_os_rb = accuracy_score(y_test, pred_test_os_rb)

logreg_recall_tuned_train_os_rb = recall_score(y_train_os, pred_train_os_rb)
logreg_recall_tuned_test_os_rb = recall_score(y_test, pred_test_os_rb)

logreg_prec_tuned_train_os_rb = precision_score(y_train_os, pred_train_os_rb)
logreg_prec_tuned_test_os_rb = precision_score(y_test, pred_test_os_rb)

logreg_f1_tuned_train_os_rb = f1_score(y_train_os, pred_train_os_rb)
logreg_f1_tuned_test_os_rb = f1_score(y_test, pred_test_os_rb)

In [90]:
cm_logreg_tuned_os_rb = confusion_matrix(y_test, pred_test_os_rb, labels=[1, 0])

cm_logreg_tuned_os_rb = pd.DataFrame(data=cm_logreg_tuned_os_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_os_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,190,70
Akt 0,453,1096


In [91]:
print(classification_report(y_test, pred_test_os_rb))

              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1549
           1       0.30      0.73      0.42       260

    accuracy                           0.71      1809
   macro avg       0.62      0.72      0.61      1809
weighted avg       0.85      0.71      0.75      1809



In [92]:
tp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][0]
tn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][1]

fp_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 1'][1]
fn_logreg_os_rb = cm_logreg_tuned_os_rb['Pred 0'][0]

#### Standard Scaling with Smote

In [93]:
logreg_sm_std = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                                   verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = std_scale.fit_transform(x_train_sm[columns_continuous])

logreg_sm_std.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


708 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
483 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [94]:
logreg_sm_std_tuned = logreg_sm_std.best_estimator_

pred_train_sm_std = logreg_sm_std_tuned.predict(x_train_sm)
x_test[columns_continuous] = std_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_std = logreg_sm_std_tuned.predict(x_test)

logreg_sm_std_tuned

In [95]:
logreg_acc_tuned_train_sm_std = accuracy_score(y_train_sm, pred_train_sm_std)
logreg_acc_tuned_test_sm_std = accuracy_score(y_test, pred_test_sm_std)

logreg_recall_tuned_train_sm_std = recall_score(y_train_sm, pred_train_sm_std)
logreg_recall_tuned_test_sm_std = recall_score(y_test, pred_test_sm_std)

logreg_prec_tuned_train_sm_std = precision_score(y_train_sm, pred_train_sm_std)
logreg_prec_tuned_test_sm_std = precision_score(y_test, pred_test_sm_std)

logreg_f1_tuned_train_sm_std = f1_score(y_train_sm, pred_train_sm_std)
logreg_f1_tuned_test_sm_std = f1_score(y_test, pred_test_sm_std)

In [96]:
cm_logreg_tuned_sm_std = confusion_matrix(y_test, pred_test_sm_std, labels=[1, 0])

cm_logreg_tuned_sm_std = pd.DataFrame(data=cm_logreg_tuned_sm_std, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_std

Unnamed: 0,Pred 1,Pred 0
Akt 1,52,208
Akt 0,30,1519


In [97]:
print(classification_report(y_test, pred_test_sm_std))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.63      0.20      0.30       260

    accuracy                           0.87      1809
   macro avg       0.76      0.59      0.62      1809
weighted avg       0.84      0.87      0.84      1809



In [98]:
tp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][0]
tn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][1]

fp_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 1'][1]
fn_logreg_sm_std = cm_logreg_tuned_sm_std['Pred 0'][0]

#### MinMax Scaling with Smote

In [99]:
logreg_sm_mm = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                                  verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = mm_scale.fit_transform(x_train_sm[columns_continuous])

logreg_sm_mm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


636 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
69 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\

In [100]:
logreg_sm_mm_tuned = logreg_sm_mm.best_estimator_

pred_train_sm_mm = logreg_sm_mm_tuned.predict(x_train_sm)
x_test[columns_continuous] = mm_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_mm = logreg_sm_mm_tuned.predict(x_test)

logreg_sm_mm_tuned

In [101]:
logreg_acc_tuned_train_sm_mm = accuracy_score(y_train_sm, pred_train_sm_mm)
logreg_acc_tuned_test_sm_mm = accuracy_score(y_test, pred_test_sm_mm)

logreg_recall_tuned_train_sm_mm = recall_score(y_train_sm, pred_train_sm_mm)
logreg_recall_tuned_test_sm_mm = recall_score(y_test, pred_test_sm_mm)

logreg_prec_tuned_train_sm_mm = precision_score(y_train_sm, pred_train_sm_mm)
logreg_prec_tuned_test_sm_mm = precision_score(y_test, pred_test_sm_mm)

logreg_f1_tuned_train_sm_mm = f1_score(y_train_sm, pred_train_sm_mm)
logreg_f1_tuned_test_sm_mm = f1_score(y_test, pred_test_sm_mm)

In [102]:
cm_logreg_tuned_sm_mm = confusion_matrix(y_test, pred_test_sm_mm, labels=[1, 0])

cm_logreg_tuned_sm_mm = pd.DataFrame(data=cm_logreg_tuned_sm_mm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_mm

Unnamed: 0,Pred 1,Pred 0
Akt 1,46,214
Akt 0,24,1525


In [103]:
print(classification_report(y_test, pred_test_sm_mm))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.66      0.18      0.28       260

    accuracy                           0.87      1809
   macro avg       0.77      0.58      0.60      1809
weighted avg       0.85      0.87      0.83      1809



In [104]:
tp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][0]
tn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][1]

fp_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 1'][1]
fn_logreg_sm_mm = cm_logreg_tuned_sm_mm['Pred 0'][0]

#### Robust Scaling with Smote

In [105]:
logreg_sm_rb = RandomizedSearchCV(estimator = logreg, param_distributions = param_logreg, cv = 3, n_jobs = -1 , n_iter=300, 
                                  verbose = 1, scoring = 'f1')
x_train_sm[columns_continuous] = rb_scale.fit_transform(x_train_sm[columns_continuous])

logreg_sm_rb.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


708 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
465 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [106]:
logreg_sm_rb_tuned = logreg_sm_rb.best_estimator_

pred_train_sm_rb = logreg_sm_rb_tuned.predict(x_train_sm)
x_test[columns_continuous] = rb_scale.fit_transform(x_test[columns_continuous])
pred_test_sm_rb = logreg_sm_rb_tuned.predict(x_test)

logreg_sm_rb_tuned

In [107]:
logreg_acc_tuned_train_sm_rb = accuracy_score(y_train_sm, pred_train_sm_rb)
logreg_acc_tuned_test_sm_rb = accuracy_score(y_test, pred_test_sm_rb)

logreg_recall_tuned_train_sm_rb = recall_score(y_train_sm, pred_train_sm_rb)
logreg_recall_tuned_test_sm_rb = recall_score(y_test, pred_test_sm_rb)

logreg_prec_tuned_train_sm_rb = precision_score(y_train_sm, pred_train_sm_rb)
logreg_prec_tuned_test_sm_rb = precision_score(y_test, pred_test_sm_rb)

logreg_f1_tuned_train_sm_rb = f1_score(y_train_sm, pred_train_sm_rb)
logreg_f1_tuned_test_sm_rb = f1_score(y_test, pred_test_sm_rb)

In [108]:
cm_logreg_tuned_sm_rb = confusion_matrix(y_test, pred_test_sm_rb, labels=[1, 0])

cm_logreg_tuned_sm_rb = pd.DataFrame(data=cm_logreg_tuned_sm_rb, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned_sm_rb

Unnamed: 0,Pred 1,Pred 0
Akt 1,47,213
Akt 0,29,1520


In [109]:
print(classification_report(y_test, pred_test_sm_rb))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1549
           1       0.62      0.18      0.28       260

    accuracy                           0.87      1809
   macro avg       0.75      0.58      0.60      1809
weighted avg       0.84      0.87      0.83      1809



In [110]:
tp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][0]
tn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][1]

fp_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 1'][1]
fn_logreg_sm_rb = cm_logreg_tuned_sm_rb['Pred 0'][0]

### Evaluation For Logistic Regression and Support Vector Classifier

In [111]:
distance_tuned = {
    "SVM_OS_Train": [svm_acc_tuned_train_os, svm_recall_tuned_train_os, svm_prec_tuned_train_os, svm_f1_tuned_train_os],
    "SVM_OS_Test" : [svm_acc_tuned_test_os, svm_recall_tuned_test_os, svm_prec_tuned_test_os, svm_f1_tuned_test_os],
    
    "SVM_SM_Train": [svm_acc_tuned_train_sm, svm_recall_tuned_train_sm, svm_prec_tuned_train_sm, svm_f1_tuned_train_sm],
    "SVM_SM_Test" : [svm_acc_tuned_test_sm, svm_recall_tuned_test_sm, svm_prec_tuned_test_sm, svm_f1_tuned_test_sm],
    
    "Logreg_OS_Train": [logreg_acc_tuned_train_os, logreg_recall_tuned_train_os, logreg_prec_tuned_train_os, logreg_f1_tuned_train_os],
    "Logreg_OS_Test" : [logreg_acc_tuned_test_os, logreg_recall_tuned_test_os, logreg_prec_tuned_test_os, logreg_f1_tuned_test_os],
    
    "Logreg_SM_Train": [logreg_acc_tuned_train_sm, logreg_recall_tuned_train_sm, logreg_prec_tuned_train_sm, logreg_f1_tuned_train_sm],
    "Logreg_SM_Test" : [logreg_acc_tuned_test_sm, logreg_recall_tuned_test_sm, logreg_prec_tuned_test_sm, logreg_f1_tuned_test_sm]
    }
tuned_matrix = pd.DataFrame(data = distance_tuned, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])


dictance_tuned_os = {
    "SVM Standard_OS_Train": [svm_acc_tuned_train_os_std, svm_recall_tuned_train_os_std, svm_prec_tuned_train_os_std, svm_f1_tuned_train_os_std],
    "SVM Standard_OS_Test" : [svm_acc_tuned_test_os_std, svm_recall_tuned_test_os_std, svm_prec_tuned_test_os_std, svm_f1_tuned_test_os_std],
    
    "SVM MinMax_OS_Train": [svm_acc_tuned_train_os_mm, svm_recall_tuned_train_os_mm, svm_prec_tuned_train_os_mm, svm_f1_tuned_train_os_mm],
    "SVM MinMax_OS_Test" : [svm_acc_tuned_test_os_mm, svm_recall_tuned_test_os_mm, svm_prec_tuned_test_os_mm, svm_f1_tuned_test_os_mm],
    
    "SVM Robust_OS_Train": [svm_acc_tuned_train_os_rb, svm_recall_tuned_train_os_rb, svm_prec_tuned_train_os_rb, svm_f1_tuned_train_os_rb],
    "SVM Robust_OS_Test" : [svm_acc_tuned_test_os_rb, svm_recall_tuned_test_os_rb, svm_prec_tuned_test_os_rb, svm_f1_tuned_test_os_rb],
    
    "Logreg Standard_OS_Train": [logreg_acc_tuned_train_os_std, logreg_recall_tuned_train_os_std, logreg_prec_tuned_train_os_std, logreg_f1_tuned_train_os_std],
    "Logreg Standard_OS_Test" : [logreg_acc_tuned_test_os_std, logreg_recall_tuned_test_os_std, logreg_prec_tuned_test_os_std, logreg_f1_tuned_test_os_std],
    
    "Logreg MinMax_OS_Train": [logreg_acc_tuned_train_os_mm, logreg_recall_tuned_train_os_mm, logreg_prec_tuned_train_os_mm, logreg_f1_tuned_train_os_mm],
    "Logreg MinMax_OS_Test" : [logreg_acc_tuned_test_os_mm, logreg_recall_tuned_test_os_mm, logreg_prec_tuned_test_os_mm, logreg_f1_tuned_test_os_mm],
    
    "Logreg Robust_OS_Train": [logreg_acc_tuned_train_os_rb, logreg_recall_tuned_train_os_rb, logreg_prec_tuned_train_os_rb, logreg_f1_tuned_train_os_rb],
    "Logreg Robust_OS_Test" : [logreg_acc_tuned_test_os_rb, logreg_recall_tuned_test_os_rb, logreg_prec_tuned_test_os_rb, logreg_f1_tuned_test_os_rb]
    }
distance_tuned_os_matrix = pd.DataFrame(data = dictance_tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])


distance_tuned_sm = {
    "SVM Standard_SM_Train": [svm_acc_tuned_train_sm_std, svm_recall_tuned_train_sm_std, svm_prec_tuned_train_sm_std, svm_f1_tuned_train_sm_std],
    "SVM Standard_SM_Test" : [svm_acc_tuned_test_sm_std, svm_recall_tuned_test_sm_std, svm_prec_tuned_test_sm_std, svm_f1_tuned_test_sm_std],
    
    "SVM MinMax_SM_Train": [svm_acc_tuned_train_sm_mm, svm_recall_tuned_train_sm_mm, svm_prec_tuned_train_sm_mm, svm_f1_tuned_train_sm_mm],
    "SVM MinMax_SM_Test" : [svm_acc_tuned_test_sm_mm, svm_recall_tuned_test_sm_mm, svm_prec_tuned_test_sm_mm, svm_f1_tuned_test_sm_mm],
    
    "SVM Robust_SM_Train": [svm_acc_tuned_train_sm_rb, svm_recall_tuned_train_sm_rb, svm_prec_tuned_train_sm_rb, svm_f1_tuned_train_sm_rb],
    "SVM Robust_SM_Test" : [svm_acc_tuned_test_sm_rb, svm_recall_tuned_test_sm_rb, svm_prec_tuned_test_sm_rb, svm_f1_tuned_test_sm_rb],
    
    "Logreg Standard_SM_Train": [logreg_acc_tuned_train_sm_std, logreg_recall_tuned_train_sm_std, logreg_prec_tuned_train_sm_std, logreg_f1_tuned_train_sm_std],
    "Logreg Standard_SM_Test" : [logreg_acc_tuned_test_sm_std, logreg_recall_tuned_test_sm_std, logreg_prec_tuned_test_sm_std, logreg_f1_tuned_test_sm_std],
    
    "Logreg MinMax_SM_Train": [logreg_acc_tuned_train_sm_mm, logreg_recall_tuned_train_sm_mm, logreg_prec_tuned_train_sm_mm, logreg_f1_tuned_train_sm_mm],
    "Logreg MinMax_SM_Test" : [logreg_acc_tuned_test_sm_mm, logreg_recall_tuned_test_sm_mm, logreg_prec_tuned_test_sm_mm, logreg_f1_tuned_test_sm_mm],
    
    "Logreg Robust_SM_Train": [logreg_acc_tuned_train_sm_rb, logreg_recall_tuned_train_sm_rb, logreg_prec_tuned_train_sm_rb, logreg_f1_tuned_train_sm_rb],
    "Logreg Robust_SM_Test" : [logreg_acc_tuned_test_sm_rb, logreg_recall_tuned_test_sm_rb, logreg_prec_tuned_test_sm_rb, logreg_f1_tuned_test_sm_rb]
    }
distance_tuned_sm_matrix = pd.DataFrame(data = distance_tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

In [112]:
tuned_matrix

Unnamed: 0,SVM_OS_Train,SVM_OS_Test,SVM_SM_Train,SVM_SM_Test,Logreg_OS_Train,Logreg_OS_Test,Logreg_SM_Train,Logreg_SM_Test
Accuracy,1.0,0.978441,0.991201,0.983416,0.74217,0.711443,0.909509,0.866224
Recall,1.0,0.85,0.995964,0.980769,0.783662,0.730769,0.837423,0.180769
Precision,1.0,1.0,0.986566,0.910714,0.723614,0.29595,0.978495,0.618421
F1 Score,1.0,0.918919,0.991243,0.944444,0.752441,0.421286,0.902479,0.279762


In [113]:
distance_tuned_os_matrix

Unnamed: 0,SVM Standard_OS_Train,SVM Standard_OS_Test,SVM MinMax_OS_Train,SVM MinMax_OS_Test,SVM Robust_OS_Train,SVM Robust_OS_Test,Logreg Standard_OS_Train,Logreg Standard_OS_Test,Logreg MinMax_OS_Train,Logreg MinMax_OS_Test,Logreg Robust_OS_Train,Logreg Robust_OS_Test
Accuracy,0.999919,0.970149,0.999919,0.887783,0.999919,0.96628,0.742977,0.710337,0.742896,0.723604,0.742896,0.71089
Recall,1.0,0.819231,1.0,0.219231,1.0,0.803846,0.78463,0.746154,0.785115,0.703846,0.78463,0.730769
Precision,0.999839,0.968182,0.999839,1.0,0.999839,0.954338,0.724292,0.297546,0.723984,0.30198,0.724184,0.29549
F1 Score,0.999919,0.8875,0.999919,0.359621,0.999919,0.872651,0.753255,0.425439,0.753311,0.422633,0.753196,0.420819


In [114]:
distance_tuned_sm_matrix

Unnamed: 0,SVM Standard_SM_Train,SVM Standard_SM_Test,SVM MinMax_SM_Train,SVM MinMax_SM_Test,SVM Robust_SM_Train,SVM Robust_SM_Test,Logreg Standard_SM_Train,Logreg Standard_SM_Test,Logreg MinMax_SM_Train,Logreg MinMax_SM_Test,Logreg Robust_SM_Train,Logreg Robust_SM_Test
Accuracy,1.0,0.97236,0.981595,0.891653,1.0,0.964069,0.909267,0.868436,0.909428,0.868436,0.909509,0.866224
Recall,1.0,0.873077,0.96319,0.307692,1.0,0.861538,0.835486,0.2,0.836778,0.176923,0.837423,0.180769
Precision,1.0,0.930328,1.0,0.833333,1.0,0.885375,0.980114,0.634146,0.979033,0.657143,0.978495,0.618421
F1 Score,1.0,0.900794,0.98125,0.449438,1.0,0.873294,0.902039,0.304094,0.902333,0.278788,0.902479,0.279762


## Decision Tree

In [115]:
decision_tree = DecisionTreeClassifier()

In [116]:
param_dt =  {"criterion" : ['gini', 'entropy'],
             
             "max_depth": [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
                           27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             
             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
                                   28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             
             "min_samples_leaf":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
                                 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 
             
             "max_features" : ['auto', 'sqrt', 'log2']}

### Random Over Sampling

In [117]:
dt_os = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, 
                           verbose = 1, scoring = 'f1')

dt_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


297 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
128 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [118]:
dt_tuned_os = dt_os.best_estimator_

pred_train_os = dt_tuned_os.predict(x_train_os)
pred_test_os = dt_tuned_os.predict(x_test)

dt_tuned_os

In [119]:
dt_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
dt_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)

dt_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
dt_recall_tuned_test_os = recall_score(y_test, pred_test_os)

dt_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
dt_prec_tuned_test_os = precision_score(y_test, pred_test_os)

dt_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
dt_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [120]:
cm_dt_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])

cm_dt_tuned_os = pd.DataFrame(data=cm_dt_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,171,89
Akt 0,97,1452


In [121]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1549
           1       0.64      0.66      0.65       260

    accuracy                           0.90      1809
   macro avg       0.79      0.80      0.79      1809
weighted avg       0.90      0.90      0.90      1809



In [122]:
tp_dt_os = cm_dt_tuned_os['Pred 1'][0]
tn_dt_os = cm_dt_tuned_os['Pred 0'][1]

fp_dt_os = cm_dt_tuned_os['Pred 1'][1]
fn_dt_os = cm_dt_tuned_os['Pred 0'][0]

### Smote

In [123]:
dt_sm = RandomizedSearchCV(estimator = decision_tree, param_distributions = param_dt, cv = 3, n_jobs = -1 , n_iter=300, 
                           verbose = 1, scoring = 'f1')
dt_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


297 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
181 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [124]:
dt_tuned_sm = dt_sm.best_estimator_

pred_train_sm = dt_tuned_sm.predict(x_train_sm)
pred_test_sm = dt_tuned_sm.predict(x_test)

dt_tuned_sm

In [125]:
dt_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
dt_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)

dt_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
dt_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)

dt_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
dt_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)

dt_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
dt_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [126]:
cm_dt_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])

cm_dt_tuned_sm = pd.DataFrame(data=cm_dt_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_dt_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,97,163
Akt 0,205,1344


In [127]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1549
           1       0.32      0.37      0.35       260

    accuracy                           0.80      1809
   macro avg       0.61      0.62      0.61      1809
weighted avg       0.81      0.80      0.80      1809



In [128]:
tp_dt_sm = cm_dt_tuned_sm['Pred 1'][0]
tn_dt_sm = cm_dt_tuned_sm['Pred 0'][1]

fp_dt_sm = cm_dt_tuned_sm['Pred 1'][1]
fn_dt_sm = cm_dt_tuned_sm['Pred 0'][0]

## Random Forest

In [129]:
random_forest = RandomForestClassifier()

In [130]:
param_rf =  {"n_estimators":np.arange(100, 2000),
             
             "criterion" : ['gini', 'entropy'],
             
             "max_depth": [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                           28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             
             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                                   28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             
             "min_samples_leaf":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
                                 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
             
             "max_features" : ['auto', 'sqrt', 'log2']}

### Random Over Sampling

In [131]:
rf_os = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, 
                           verbose = 1, scoring = 'f1')
rf_os.fit(x_train_os, y_train_os)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


345 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
234 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [132]:
rf_tuned_os = rf_os.best_estimator_

pred_train_os = rf_tuned_os.predict(x_train_os)
pred_test_os = rf_tuned_os.predict(x_test)

rf_tuned_os

In [133]:
rf_acc_tuned_train_os = accuracy_score(y_train_os, pred_train_os)
rf_acc_tuned_test_os = accuracy_score(y_test, pred_test_os)

rf_recall_tuned_train_os = recall_score(y_train_os, pred_train_os)
rf_recall_tuned_test_os = recall_score(y_test, pred_test_os)

rf_prec_tuned_train_os = precision_score(y_train_os, pred_train_os)
rf_prec_tuned_test_os = precision_score(y_test, pred_test_os)

rf_f1_tuned_train_os = f1_score(y_train_os, pred_train_os)
rf_f1_tuned_test_os = f1_score(y_test, pred_test_os)

In [134]:
cm_rf_tuned_os = confusion_matrix(y_test, pred_test_os, labels=[1, 0])

cm_rf_tuned_os = pd.DataFrame(data=cm_rf_tuned_os, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_os

Unnamed: 0,Pred 1,Pred 0
Akt 1,215,45
Akt 0,11,1538


In [135]:
print(classification_report(y_test, pred_test_os))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1549
           1       0.95      0.83      0.88       260

    accuracy                           0.97      1809
   macro avg       0.96      0.91      0.93      1809
weighted avg       0.97      0.97      0.97      1809



In [136]:
tp_rf_os = cm_rf_tuned_os['Pred 1'][0]
tn_rf_os = cm_rf_tuned_os['Pred 0'][1]

fp_rf_os = cm_rf_tuned_os['Pred 1'][1]
fn_rf_os = cm_rf_tuned_os['Pred 0'][0]

### Smote

In [137]:
rf_sm = RandomizedSearchCV(estimator = random_forest, param_distributions = param_rf, cv = 3, n_jobs = -1 , n_iter=300, 
                           verbose = 1, scoring = 'f1')
rf_sm.fit(x_train_sm, y_train_sm)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


330 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\pb69930\Anaconda3\envs\myenv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in va

In [138]:
rf_tuned_sm = rf_sm.best_estimator_

pred_train_sm = rf_tuned_sm.predict(x_train_sm)
pred_test_sm = rf_tuned_sm.predict(x_test)

rf_tuned_sm

In [139]:
rf_acc_tuned_train_sm = accuracy_score(y_train_sm, pred_train_sm)
rf_acc_tuned_test_sm = accuracy_score(y_test, pred_test_sm)

rf_recall_tuned_train_sm = recall_score(y_train_sm, pred_train_sm)
rf_recall_tuned_test_sm = recall_score(y_test, pred_test_sm)

rf_prec_tuned_train_sm = precision_score(y_train_sm, pred_train_sm)
rf_prec_tuned_test_sm = precision_score(y_test, pred_test_sm)

rf_f1_tuned_train_sm = f1_score(y_train_sm, pred_train_sm)
rf_f1_tuned_test_sm = f1_score(y_test, pred_test_sm)

In [140]:
cm_rf_tuned_sm = confusion_matrix(y_test, pred_test_sm, labels=[1, 0])

cm_rf_tuned_sm = pd.DataFrame(data=cm_rf_tuned_sm, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_rf_tuned_sm

Unnamed: 0,Pred 1,Pred 0
Akt 1,83,177
Akt 0,21,1528


In [141]:
print(classification_report(y_test, pred_test_sm))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1549
           1       0.80      0.32      0.46       260

    accuracy                           0.89      1809
   macro avg       0.85      0.65      0.70      1809
weighted avg       0.88      0.89      0.87      1809



In [142]:
tp_rf_sm = cm_rf_tuned_sm['Pred 1'][0]
tn_rf_sm = cm_rf_tuned_sm['Pred 0'][1]

fp_rf_sm = cm_rf_tuned_sm['Pred 1'][1]
fn_rf_sm = cm_rf_tuned_sm['Pred 0'][0]

## Evaluation Matrix For Decision Tree and Random Forest

In [143]:
tuned_os = {
    "DT OS Train": [dt_acc_tuned_train_os, dt_recall_tuned_train_os, dt_prec_tuned_train_os, dt_f1_tuned_train_os],
    "DT OS Test" : [dt_acc_tuned_test_os, dt_recall_tuned_test_os, dt_prec_tuned_test_os, dt_f1_tuned_test_os],
    
    "RF OS Train": [rf_acc_tuned_train_os, rf_recall_tuned_train_os, rf_prec_tuned_train_os, rf_f1_tuned_train_os],
    "RF OS Test" : [rf_acc_tuned_test_os, rf_recall_tuned_test_os, rf_prec_tuned_test_os, rf_f1_tuned_test_os]}

tuned_os_matrix = pd.DataFrame(data = tuned_os, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])


tuned_sm = {
    "DT SM Train": [dt_acc_tuned_train_sm, dt_recall_tuned_train_sm, dt_prec_tuned_train_sm, dt_f1_tuned_train_sm],
    "DT SM Test" : [dt_acc_tuned_test_sm, dt_recall_tuned_test_sm, dt_prec_tuned_test_sm, dt_f1_tuned_test_sm],
    
    "RF SM Train": [rf_acc_tuned_train_sm, rf_recall_tuned_train_sm, rf_prec_tuned_train_sm, rf_f1_tuned_train_sm],
    "RF SM Test" : [rf_acc_tuned_test_sm, rf_recall_tuned_test_sm, rf_prec_tuned_test_sm, rf_f1_tuned_test_sm]}

tuned_sm_matrix = pd.DataFrame(data = tuned_sm, index = ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

In [144]:
tuned_os_matrix

Unnamed: 0,DT OS Train,DT OS Test,RF OS Train,RF OS Test
Accuracy,0.994026,0.897181,0.999031,0.969044
Recall,0.996933,0.657692,1.0,0.826923
Precision,0.991172,0.63806,0.998066,0.951327
F1 Score,0.994044,0.647727,0.999032,0.884774


In [145]:
tuned_sm_matrix

Unnamed: 0,DT SM Train,DT SM Test,RF SM Train,RF SM Test
Accuracy,0.920326,0.796573,0.985793,0.890547
Recall,0.917178,0.373077,0.975137,0.319231
Precision,0.922989,0.321192,0.996371,0.798077
F1 Score,0.920074,0.345196,0.98564,0.456044


In [146]:
cm = {
    "True Positive" : [tp_svm_os, tp_svm_os_std, tp_svm_os_mm, tp_svm_os_rb, 
                       tp_svm_sm, tp_svm_sm_std, tp_svm_sm_mm, tp_svm_sm_rb,
                       tp_logreg_os, tp_logreg_os_std, tp_logreg_os_mm, tp_logreg_os_rb, 
                       tp_logreg_sm, tp_logreg_sm_std, tp_logreg_sm_mm, tp_logreg_sm_rb,
                       tp_dt_os, tp_dt_sm, tp_rf_os, tp_rf_sm],
    
    "True Negative" : [tn_svm_os, tn_svm_os_std, tn_svm_os_mm, tn_svm_os_rb, 
                       tn_svm_sm, tn_svm_sm_std, tn_svm_sm_mm, tn_svm_sm_rb,
                       tn_logreg_os, tn_logreg_os_std, tn_logreg_os_mm, tn_logreg_os_rb, 
                       tn_logreg_sm, tn_logreg_sm_std, tn_logreg_sm_mm, tn_logreg_sm_rb,
                       tn_dt_os, tn_dt_sm, tn_rf_os, tn_rf_sm],
    
    "False Positive": [fp_svm_os, fp_svm_os_std, fp_svm_os_mm, fp_svm_os_rb, 
                       fp_svm_sm, fp_svm_sm_std, fp_svm_sm_mm, fp_svm_sm_rb,
                       fp_logreg_os, fp_logreg_os_std, fp_logreg_os_mm, fp_logreg_os_rb, 
                       fp_logreg_sm, fp_logreg_sm_std, fp_logreg_sm_mm, fp_logreg_sm_rb,
                       fp_dt_os, fp_dt_sm, fp_rf_os, fp_rf_sm],
    
    "False Negative": [fn_svm_os, fn_svm_os_std, fn_svm_os_mm, fn_svm_os_rb, 
                       fn_svm_sm, fn_svm_sm_std, fn_svm_sm_mm, fn_svm_sm_rb,
                       fn_logreg_os, fn_logreg_os_std, fn_logreg_os_mm, fn_logreg_os_rb, 
                       fn_logreg_sm, fn_logreg_sm_std, fn_logreg_sm_mm, fn_logreg_sm_rb,
                       fn_dt_os, fn_dt_sm, fn_rf_os, fn_rf_sm]
}
    
cm_matrix = pd.DataFrame(data = cm, index = ['SVM OS', 'SVM OS Standard', 'SVM OS MinMax', 'SVM OS Robust',
                                             'SVM SM', 'SVM SM Standard', 'SVM SM MinMax', 'SVM SM Robust',
                                             'LogReg OS', 'Logreg OS Standard', 'Logreg OS MinMax', 'Logreg OS Robust',
                                             'LogReg SM', 'Logreg SM Standard', 'Logreg SM MinMax', 'Logreg SM Robust',
                                             'Decision Tree OS', 'Decision Tree SM', 'Random Forest OS', 'Random Forest SM'])

In [147]:
cm_matrix.sort_values('False Negative')

Unnamed: 0,True Positive,True Negative,False Positive,False Negative
SVM SM,255,1524,25,5
SVM SM Standard,227,1532,17,33
SVM SM Robust,224,1520,29,36
SVM OS,221,1549,0,39
Random Forest OS,215,1538,11,45
SVM OS Standard,213,1542,7,47
SVM OS Robust,209,1539,10,51
Logreg OS Standard,194,1091,458,66
LogReg OS,190,1097,452,70
Logreg OS Robust,190,1096,453,70


In [148]:
x = rf_os.predict_proba([[3826, 0, 2, 1, 25000, 54, 2, 5200, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
                          1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

x



array([[0.54382353, 0.45617647]])

In [149]:
y = rf_os.predict([[3826, 0, 2, 1, 25000, 54, 2, 5200, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 
                    0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

y



array([0], dtype=int64)

In [150]:
import joblib

joblib.dump(rf_os, 'Support Vector Machine with Smote')

['Support Vector Machine with Smote']