In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# German

## Load Dataset

In [None]:
# Access to google drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read dataset from google drive
# The file has no header row and its fields are separated by runs of whitespace.
# `sep=r'\s+'` is the documented equivalent of the deprecated `delim_whitespace=True`.
df_ger = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Pattern Project/german.data', sep=r'\s+', header=None)
df_ger.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [None]:
df_ger.shape

(1000, 21)

## Preprocess

In [None]:
# Convert column names to string values
df_ger.columns = df_ger.columns.astype(str)

In [None]:
# separate features from class label
def separate(df):
  """Split a DataFrame into a feature frame and a label column.

  The last column of `df` is taken as the class label; every column
  before it is kept as a feature.

  Returns:
    (X, y): `X` is a DataFrame holding all but the last column, `y` is the
    last column as a Series.
  """
  label_pos = df.shape[1] - 1
  features = df.iloc[:, :label_pos]
  target = df.iloc[:, label_pos]
  return features, target

In [None]:
X, y = separate(df_ger)

In [None]:
# Change categories to One Hot
X = pd.get_dummies(X, columns=['0', '2', '3', '5', '6', '8', '9', '11', '13', '14', '16', '18', '19'])

In [None]:
X.shape

(1000, 61)

In [None]:
# split data to test and train
def split(X, y, test_size=0.2, random_state=7, stratify=None):
  """Split features and labels into train and test partitions.

  Args:
    X: feature DataFrame.
    y: label Series, row-aligned with X.
    test_size: share of the rows held out for testing (forwarded to
      train_test_split). Defaults to 0.2.
    random_state: seed forwarded to train_test_split so the split is
      repeatable. Defaults to 7.
    stratify: optional labels forwarded to train_test_split; pass `y` to keep
      the class proportions equal in both partitions. Defaults to None
      (plain shuffled split).

  Returns:
    X_train, X_test, y_train, y_test, each re-indexed from 0.

  Side effects:
    Prints the shape of the four partitions.
  """
  parts = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=stratify)

  # Re-index every partition from 0. New objects are returned instead of
  # mutating the split results in place.
  X_train, X_test, y_train, y_test = [part.reset_index(drop=True) for part in parts]

  print("X_train dataset: ", X_train.shape)
  print("y_train dataset: ", y_train.shape)
  print("X_test dataset: ", X_test.shape)
  print("y_test dataset: ", y_test.shape)

  return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = split(X, y)

X_train dataset:  (800, 61)
y_train dataset:  (800,)
X_test dataset:  (200, 61)
y_test dataset:  (200,)


In [None]:
print('missed values in train:', X_train.isnull().sum().sum())
print('missed values in test:', X_test.isnull().sum().sum())

missed values in train: 0
missed values in test: 0


In [None]:
# Standardize the numeric columns. The scaler is fitted on the training split
# only and then applied to both splits.
indexes = ['1', '4', '7', '10', '12', '15', '17']
norm = StandardScaler().fit(X_train[indexes])

# transform training data
# The ndarray returned by transform() is assigned directly: values are written
# positionally, so the result does not depend on the frame having a 0..n-1
# index (a DataFrame on the right-hand side would be aligned on the index).
X_train[indexes] = norm.transform(X_train[indexes])

# transform testing data
X_test[indexes] = norm.transform(X_test[indexes])

In [None]:
print('missed values in train:', X_train.isnull().sum().sum())
print('missed values in test:', X_test.isnull().sum().sum())

missed values in train: 0
missed values in test: 0


## Evaluation Metrics

In [None]:
metrics_dict = {}

In [None]:
def evaluate(y_true, y_pred, clf_name, y_score=None):
  """Compute MCC, AUC and cost of failure for one classifier and record them.

  Args:
    y_true: ground-truth binary labels.
    y_pred: predicted hard labels.
    clf_name: key under which the metrics are stored in the global
      `metrics_dict`.
    y_score: optional continuous scores for the class with the greater label
      (e.g. `predict_proba(X)[:, 1]` or `decision_function(X)`). When given,
      the AUC is computed from these scores. When omitted, the AUC is computed
      from the hard predictions in `y_pred`, which is the previous behaviour.

  Returns:
    The list `[mcc, auc, cof]`, which is also stored in `metrics_dict[clf_name]`.
  """
  # Matthews Correlation Coefficient/ Phi coefficient
  mcc = matthews_corrcoef(y_true, y_pred)

  # Area Under the Curve
  auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)

  # Cost of failure
  # For two classes ravel() yields tn, fp, fn, tp with the labels in sorted
  # order, i.e. the greater label is treated as the positive class.
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
  # A false negative is weighted ten times as heavily as a false positive.
  cof = (fn * 1000) + (fp * 100)

  scores = [mcc, auc, cof]
  metrics_dict[clf_name] = scores
  return scores

## Models

In [None]:
def clf_grid(X_train, y_train, param_grid, clf):
  """Grid-search `clf` over `param_grid` and print the score of every setting.

  The search uses 5-fold cross-validation scored by ROC AUC and is run with
  `n_jobs=-1`. The best configuration is printed first, followed by the mean
  and standard deviation of the test score for each configuration.
  Nothing is returned.
  """
  search = GridSearchCV(
      estimator=clf,
      param_grid=param_grid,
      n_jobs=-1,
      cv=5,
      scoring='roc_auc',
  )
  search.fit(X_train, y_train)

  # best configuration found by the search
  print("Best: %f using %s" % (search.best_score_, search.best_params_))

  # every configuration, in the order stored in cv_results_
  results = search.cv_results_
  for idx, param in enumerate(results['params']):
      mean = results['mean_test_score'][idx]
      stdev = results['std_test_score'][idx]
      print("%f (%f) with: %r" % (mean, stdev, param))

### KNN

In [None]:
# create a new knn model
clf = KNeighborsClassifier()

# create a dictionary of all values we want to test for n_neighbors
# np.arange(30, 40) yields k = 30..39 (the stop value is excluded)
param_grid = {'n_neighbors': np.arange(30, 40)}

# grid-search k on the training split
clf_grid(X_train, y_train, param_grid, clf)

Best: 0.739354 using {'n_neighbors': 33}
0.737230 (0.012745) with: {'n_neighbors': 30}
0.737595 (0.015526) with: {'n_neighbors': 31}
0.737745 (0.018343) with: {'n_neighbors': 32}
0.739354 (0.017992) with: {'n_neighbors': 33}
0.736915 (0.019513) with: {'n_neighbors': 34}
0.738105 (0.018438) with: {'n_neighbors': 35}
0.736492 (0.015529) with: {'n_neighbors': 36}
0.735732 (0.017036) with: {'n_neighbors': 37}
0.735811 (0.015580) with: {'n_neighbors': 38}
0.737983 (0.017797) with: {'n_neighbors': 39}


In [None]:
knn = KNeighborsClassifier(n_neighbors=33)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           1       0.78      0.99      0.87       149
           2       0.83      0.20      0.32        51

    accuracy                           0.79       200
   macro avg       0.81      0.59      0.59       200
weighted avg       0.80      0.79      0.73       200

0.785


In [None]:
evaluate(y_test, predictions, 'knn')
metrics_dict

{'knn': [0.33522908850925615, 0.5913278062903013, 41200]}

### SVM

In [None]:
clf = SVC()

param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.01, 0.001, 0.0001]}]

clf_grid(X_train, y_train, param_grid, clf)

Best: 0.757746 using {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.744063 (0.024787) with: {'C': 1, 'kernel': 'linear'}
0.741767 (0.022061) with: {'C': 10, 'kernel': 'linear'}
0.741840 (0.021828) with: {'C': 100, 'kernel': 'linear'}
0.741586 (0.021414) with: {'C': 1000, 'kernel': 'linear'}
0.756674 (0.018169) with: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.756294 (0.027549) with: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.757708 (0.030905) with: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.755228 (0.031247) with: {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.721838 (0.032915) with: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.752762 (0.025720) with: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.757746 (0.030638) with: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.756980 (0.031122) with: {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.719169 (0.031786) with: {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.721118 (0.029976) with: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.749813 (0.026677) with: {'C

In [None]:
# NOTE(review): the grid search output above reports the rbf configuration on the
# next line as the best one, yet the linear kernel is what gets fitted here.
# Confirm that this choice was not made by comparing test-set scores.
# svm = SVC(C=10, gamma=0.001, kernel='rbf')
svm = SVC(C=1, kernel='linear')
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
# overall accuracy on the test split
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           1       0.87      0.91      0.89       149
           2       0.69      0.61      0.65        51

    accuracy                           0.83       200
   macro avg       0.78      0.76      0.77       200
weighted avg       0.82      0.83      0.83       200

0.83


In [None]:
evaluate(y_test, predictions, 'svm')
metrics_dict

{'knn': [0.33522908850925615, 0.5913278062903013, 41200],
 'svm': [0.5363776954330505, 0.7569417028556389, 21400]}

### Random Forest

In [None]:
# Seed the forest so the cross-validated scores are the same on every run
# (7 matches the seed used for the train/test split).
clf = RandomForestClassifier(random_state=7)

# candidate values for tree depth, forest size and features tried per split
param_grid = {'max_depth':[3, 5, None],
              'n_estimators':[3, 5, 10],
              'max_features':[5, 6, 7, 8]}

clf_grid(X_train, y_train, param_grid, clf)

Best: 0.748200 using {'max_depth': 5, 'max_features': 8, 'n_estimators': 10}
0.655139 (0.060552) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 3}
0.710282 (0.038706) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 5}
0.743641 (0.026778) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 10}
0.678316 (0.033422) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 3}
0.705551 (0.018649) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 5}
0.736237 (0.023708) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 10}
0.620490 (0.049648) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 3}
0.693859 (0.023197) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 5}
0.732557 (0.029249) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 10}
0.677245 (0.059739) with: {'max_depth': 3, 'max_features': 8, 'n_estimators': 3}
0.702189 (0.038679) with: {'max_depth': 3, 'max_features': 8, 'n_estimators': 5}
0.720921 (0.028687) with: {'m

In [None]:
# Seed the forest so the fitted model and its test metrics are repeatable
# (7 matches the seed used for the train/test split).
rf = RandomForestClassifier(max_depth=5, max_features=8, n_estimators=10, random_state=7)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           1       0.79      0.96      0.86       149
           2       0.67      0.24      0.35        51

    accuracy                           0.78       200
   macro avg       0.73      0.60      0.61       200
weighted avg       0.76      0.78      0.73       200

0.775


In [None]:
evaluate(y_test, predictions, 'rf')
metrics_dict

{'knn': [0.33522908850925615, 0.5913278062903013, 41200],
 'svm': [0.5363776954330505, 0.7569417028556389, 21400],
 'rf': [0.2970284403632074, 0.59751283063561, 39600]}

### Logistic Regression

In [None]:
clf = LogisticRegression()

# param_grid = [{'penalty':['l1']},
#               {'penalty':['l2'], 'C':[1, 10, 100, 1000]}]

param_grid = [{'C': 10**np.linspace(-3,3,20)}]

clf_grid(X_train, y_train, param_grid, clf)

Best: 0.767839 using {'C': 0.0379269019073225}
0.734689 (0.035527) with: {'C': 0.001}
0.740770 (0.035343) with: {'C': 0.00206913808111479}
0.748995 (0.032777) with: {'C': 0.004281332398719396}
0.756899 (0.032893) with: {'C': 0.008858667904100823}
0.764993 (0.032130) with: {'C': 0.018329807108324356}
0.767839 (0.033262) with: {'C': 0.0379269019073225}
0.767295 (0.033313) with: {'C': 0.07847599703514611}
0.764346 (0.033705) with: {'C': 0.1623776739188721}
0.760335 (0.032256) with: {'C': 0.3359818286283781}
0.756509 (0.028702) with: {'C': 0.6951927961775606}
0.753928 (0.026021) with: {'C': 1.438449888287663}
0.751014 (0.022669) with: {'C': 2.976351441631316}
0.747555 (0.019925) with: {'C': 6.158482110660261}
0.745554 (0.017881) with: {'C': 12.742749857031322}
0.744826 (0.017043) with: {'C': 26.366508987303554}
0.744317 (0.017376) with: {'C': 54.555947811685144}
0.744134 (0.017478) with: {'C': 112.88378916846884}
0.744025 (0.017410) with: {'C': 233.57214690901213}
0.743843 (0.017449) with:

In [None]:
lr = LogisticRegression(C=0.037)
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           1       0.82      0.95      0.88       149
           2       0.72      0.41      0.53        51

    accuracy                           0.81       200
   macro avg       0.77      0.68      0.70       200
weighted avg       0.80      0.81      0.79       200

0.81


In [None]:
evaluate(y_test, predictions, 'lr')
metrics_dict

{'knn': [0.33522908850925615, 0.5913278062903013, 41200],
 'svm': [0.5363776954330505, 0.7569417028556389, 21400],
 'rf': [0.2970284403632074, 0.59751283063561, 39600],
 'lr': [0.4432546143591779, 0.6790367153572837, 30800]}

### Naive Bayes

In [None]:
gaussion_nb = GaussianNB()
gaussion_nb.fit(X_train, y_train)
predictions = gaussion_nb.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           1       0.91      0.77      0.83       149
           2       0.53      0.78      0.63        51

    accuracy                           0.77       200
   macro avg       0.72      0.77      0.73       200
weighted avg       0.82      0.77      0.78       200

0.77


In [None]:
evaluate(y_test, predictions, 'nb')
metrics_dict

{'knn': [0.33522908850925615, 0.5913278062903013, 41200],
 'svm': [0.5363776954330505, 0.7569417028556389, 21400],
 'rf': [0.2970284403632074, 0.59751283063561, 39600],
 'lr': [0.4432546143591779, 0.6790367153572837, 30800],
 'nb': [0.49464385635835584, 0.7747071983155678, 14500]}

### Ensemble Classifier

In [None]:
# metrics_dict = {'knn': [0.33522908850925615, 0.5913278062903013, 41200],
#                 'svm': [0.5363776954330505, 0.7569417028556389, 21400],
#                 'rf': [0.2970284403632074, 0.59751283063561, 39600],
#                 'lr': [0.4432546143591779, 0.6790367153572837, 30800],
#                 'nb': [0.49464385635835584, 0.7747071983155678, 14500]}

In [None]:
ensemble = VotingClassifier(estimators=[('svm', svm), ('lr', lr), ('nb', gaussion_nb)], voting='hard')
ensemble.fit(X_train, y_train)
predictions = ensemble.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           1       0.88      0.91      0.89       149
           2       0.71      0.63      0.67        51

    accuracy                           0.84       200
   macro avg       0.79      0.77      0.78       200
weighted avg       0.84      0.84      0.84       200

0.84


In [None]:
evaluate(y_test, predictions, 'ensemble')
metrics_dict

{'knn': [0.33522908850925615, 0.5913278062903013, 41200],
 'svm': [0.5363776954330505, 0.7569417028556389, 21400],
 'rf': [0.2970284403632074, 0.59751283063561, 39600],
 'lr': [0.4432546143591779, 0.6790367153572837, 30800],
 'nb': [0.49464385635835584, 0.7747071983155678, 14500],
 'ensemble': [0.5638490242644487, 0.7701013291222529, 20300]}

# Australian

## Load Dataset

In [None]:
# Read dataset from google drive
# The file has no header row and its fields are separated by runs of whitespace.
# `sep=r'\s+'` is the documented equivalent of the deprecated `delim_whitespace=True`.
df_aus = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Pattern Project/australian.dat', sep=r'\s+', header=None)
df_aus.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,22.08,11.46,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.0,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.75,1,4,4,1.25,0,0,0,1,2,280,1,0
3,0,21.67,11.5,1,5,3,0.0,1,1,11,1,2,0,1,1
4,1,20.17,8.17,2,6,4,1.96,1,1,14,0,2,60,159,1


In [None]:
df_aus.shape

(690, 15)

## Preprocess

In [None]:
# Convert column names to string values
df_aus.columns = df_aus.columns.astype(str)

X, y = separate(df_aus)

# Change categories to One Hot
X = pd.get_dummies(X, columns=['0', '3', '4', '5', '7', '8', '10', '11'])

X_train, X_test, y_train, y_test = split(X, y)

print('missed values in train:', X_train.isnull().sum().sum())
print('missed values in test:', X_test.isnull().sum().sum())

X_train dataset:  (552, 42)
y_train dataset:  (552,)
X_test dataset:  (138, 42)
y_test dataset:  (138,)
missed values in train: 0
missed values in test: 0


In [None]:
X_train.head()

Unnamed: 0,1,2,6,9,12,13,0_0,0_1,3_1,3_2,...,5_9,7_0,7_1,8_0,8_1,10_0,10_1,11_1,11_2,11_3
0,29.5,2.0,2.0,0,256,18,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
1,40.92,0.835,0.0,0,130,2,0,1,0,1,...,0,0,1,1,0,1,0,0,1,0
2,20.08,0.125,1.0,1,240,769,1,0,0,1,...,0,1,0,0,1,1,0,0,1,0
3,20.33,10.0,1.0,4,50,1466,1,0,0,1,...,0,0,1,0,1,1,0,0,1,0
4,27.83,1.5,2.0,11,434,36,0,1,0,1,...,0,0,1,0,1,0,1,0,1,0


In [None]:
# Normalization
# The scaler is fitted on the training split only and then applied to both splits.
indexes = ['1', '2', '6', '9', '12', '13']
norm = StandardScaler().fit(X_train[indexes])
# Assign the ndarray returned by transform() directly: values are written
# positionally, so the result does not depend on the frame having a 0..n-1
# index (a DataFrame on the right-hand side would be aligned on the index).
X_train[indexes] = norm.transform(X_train[indexes])
X_test[indexes] = norm.transform(X_test[indexes])

In [None]:
print('missed values in train:', X_train.isnull().sum().sum())
print('missed values in test:', X_test.isnull().sum().sum())

missed values in train: 0
missed values in test: 0


## Models

In [None]:
metrics_dict = {}

### KNN

In [None]:
# create a new knn model
clf = KNeighborsClassifier()

# create a dictionary of all values we want to test for n_neighbors
# np.arange(35, 50) yields k = 35..49 (the stop value is excluded)
param_grid = {'n_neighbors': np.arange(35, 50)}

# grid-search k on the training split
clf_grid(X_train, y_train, param_grid, clf)

Best: 0.917889 using {'n_neighbors': 42}
0.915293 (0.039637) with: {'n_neighbors': 35}
0.914865 (0.039225) with: {'n_neighbors': 36}
0.915507 (0.037683) with: {'n_neighbors': 37}
0.914808 (0.038496) with: {'n_neighbors': 38}
0.916835 (0.038888) with: {'n_neighbors': 39}
0.916898 (0.038337) with: {'n_neighbors': 40}
0.916361 (0.039310) with: {'n_neighbors': 41}
0.917889 (0.037917) with: {'n_neighbors': 42}
0.916995 (0.037983) with: {'n_neighbors': 43}
0.916723 (0.038168) with: {'n_neighbors': 44}
0.917029 (0.036622) with: {'n_neighbors': 45}
0.917355 (0.037166) with: {'n_neighbors': 46}
0.916468 (0.037021) with: {'n_neighbors': 47}
0.915334 (0.037479) with: {'n_neighbors': 48}
0.915925 (0.036840) with: {'n_neighbors': 49}


In [None]:
knn = KNeighborsClassifier(n_neighbors=42)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86        77
           1       0.85      0.77      0.81        61

    accuracy                           0.84       138
   macro avg       0.84      0.83      0.84       138
weighted avg       0.84      0.84      0.84       138

0.8405797101449275


In [None]:
evaluate(y_test, predictions, 'knn')
metrics_dict

{'knn': [0.6761645481060181, 0.8332978496912923, 14800]}

### SVM

In [None]:
clf = SVC()

param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.01, 0.001, 0.0001]}]

clf_grid(X_train, y_train, param_grid, clf)

Best: 0.928368 using {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.922951 (0.034243) with: {'C': 1, 'kernel': 'linear'}
0.918639 (0.034864) with: {'C': 10, 'kernel': 'linear'}
0.918184 (0.034098) with: {'C': 100, 'kernel': 'linear'}
0.919896 (0.033224) with: {'C': 1000, 'kernel': 'linear'}
0.919157 (0.038929) with: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.928368 (0.037256) with: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.911785 (0.038787) with: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.905676 (0.036389) with: {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.901917 (0.030344) with: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.919877 (0.034818) with: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.927806 (0.034364) with: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.912121 (0.038677) with: {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.868738 (0.029419) with: {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.914256 (0.028959) with: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.922790 (0.030197) with: {'C':

In [None]:
svm = SVC(C=1, gamma=0.01, kernel='rbf')
# svm = SVC(C=1, kernel='linear')
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.77      0.84        77
           1       0.76      0.92      0.83        61

    accuracy                           0.83       138
   macro avg       0.84      0.84      0.83       138
weighted avg       0.85      0.83      0.83       138

0.8333333333333334


In [None]:
evaluate(y_test, predictions, 'svm')
metrics_dict

{'knn': [0.6761645481060181, 0.8332978496912923, 14800],
 'svm': [0.6814433307568912, 0.8421332765595062, 6800]}

### Random Forest

In [None]:
# Seed the forest so the cross-validated scores are the same on every run
# (7 matches the seed used for the train/test split).
clf = RandomForestClassifier(random_state=7)

# candidate values for tree depth, forest size and features tried per split
param_grid = {'max_depth':[3, 5, None],
              'n_estimators':[3, 5, 10],
              'max_features':[5, 6, 7, 8]}

clf_grid(X_train, y_train, param_grid, clf)

Best: 0.937113 using {'max_depth': 5, 'max_features': 6, 'n_estimators': 10}
0.900909 (0.033672) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 3}
0.919177 (0.042262) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 5}
0.913386 (0.051330) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 10}
0.898449 (0.047081) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 3}
0.907742 (0.028959) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 5}
0.926751 (0.039740) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 10}
0.924303 (0.018945) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 3}
0.911479 (0.040750) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 5}
0.918541 (0.032798) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 10}
0.912897 (0.032186) with: {'max_depth': 3, 'max_features': 8, 'n_estimators': 3}
0.921537 (0.034300) with: {'max_depth': 3, 'max_features': 8, 'n_estimators': 5}
0.932548 (0.026484) with: {'m

In [None]:
# Seed the forest so the fitted model and its test metrics are repeatable
# (7 matches the seed used for the train/test split).
rf = RandomForestClassifier(max_depth=5, max_features=6, n_estimators=10, random_state=7)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88        77
           1       0.85      0.84      0.84        61

    accuracy                           0.86       138
   macro avg       0.86      0.86      0.86       138
weighted avg       0.86      0.86      0.86       138

0.8623188405797102


In [None]:
evaluate(y_test, predictions, 'rf')
metrics_dict

{'knn': [0.6761645481060181, 0.8332978496912923, 14800],
 'svm': [0.6814433307568912, 0.8421332765595062, 6800],
 'rf': [0.720487480297988, 0.8595912284436874, 10900]}

### Logistic Regression

In [None]:
clf = LogisticRegression()

# param_grid = [{'penalty':['l1']},
#               {'penalty':['l2'], 'C':[1, 10, 100, 1000]}]

param_grid = [{'C': 10**np.linspace(-3,3,20)}]

clf_grid(X_train, y_train, param_grid, clf)

Best: 0.931204 using {'C': 0.6951927961775606}
0.903441 (0.036484) with: {'C': 0.001}
0.906758 (0.036383) with: {'C': 0.00206913808111479}
0.911669 (0.036909) with: {'C': 0.004281332398719396}
0.916712 (0.037657) with: {'C': 0.008858667904100823}
0.919831 (0.036329) with: {'C': 0.018329807108324356}
0.923553 (0.035868) with: {'C': 0.0379269019073225}
0.927076 (0.035315) with: {'C': 0.07847599703514611}
0.928611 (0.034000) with: {'C': 0.1623776739188721}
0.929868 (0.034241) with: {'C': 0.3359818286283781}
0.931204 (0.033558) with: {'C': 0.6951927961775606}
0.930549 (0.032681) with: {'C': 1.438449888287663}
0.929951 (0.032911) with: {'C': 2.976351441631316}
0.927761 (0.033521) with: {'C': 6.158482110660261}
0.925707 (0.032310) with: {'C': 12.742749857031322}
0.924126 (0.030667) with: {'C': 26.366508987303554}
0.922607 (0.029882) with: {'C': 54.555947811685144}
0.920566 (0.028831) with: {'C': 112.88378916846884}
0.918064 (0.028801) with: {'C': 233.57214690901213}
0.916813 (0.027835) with:

In [None]:
lr = LogisticRegression(C=0.695)
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85        77
           1       0.79      0.87      0.83        61

    accuracy                           0.84       138
   macro avg       0.84      0.84      0.84       138
weighted avg       0.84      0.84      0.84       138

0.8405797101449275


In [None]:
evaluate(y_test, predictions, 'lr')
metrics_dict

{'knn': [0.6761645481060181, 0.8332978496912923, 14800],
 'svm': [0.6814433307568912, 0.8421332765595062, 6800],
 'rf': [0.720487480297988, 0.8595912284436874, 10900],
 'lr': [0.6826877493179975, 0.8435171385991057, 9400]}

### Naive Bayes

In [None]:
gaussion_nb = GaussianNB()
gaussion_nb.fit(X_train, y_train)
predictions = gaussion_nb.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84        77
           1       0.80      0.80      0.80        61

    accuracy                           0.83       138
   macro avg       0.82      0.82      0.82       138
weighted avg       0.83      0.83      0.83       138

0.8260869565217391


In [None]:
evaluate(y_test, predictions, 'nb')
metrics_dict

{'knn': [0.6761645481060181, 0.8332978496912923, 14800],
 'svm': [0.6814433307568912, 0.8421332765595062, 6800],
 'rf': [0.720487480297988, 0.8595912284436874, 10900],
 'lr': [0.6826877493179975, 0.8435171385991057, 9400],
 'nb': [0.6474345326804343, 0.8237172663402171, 13200]}

### Ensemble Classifier

In [None]:
# Snapshot of the metrics recorded in an earlier run: [mcc, auc, cost of failure].
# It only fills in classifiers that have no entry yet, so values computed in
# this session are never overwritten by the stored ones.
_metrics_snapshot = {'knn': [0.6761645481060181, 0.8332978496912923, 14800],
                     'svm': [0.6814433307568912, 0.8421332765595062, 6800],
                     'rf': [0.720487480297988, 0.8595912284436874, 10900],
                     'lr': [0.6826877493179975, 0.8435171385991057, 9400],
                     'nb': [0.6474345326804343, 0.8237172663402171, 13200]}

try:
  metrics_dict
except NameError:
  # no metrics dictionary exists yet in this session
  metrics_dict = {}

for _clf_name, _clf_metrics in _metrics_snapshot.items():
  metrics_dict.setdefault(_clf_name, _clf_metrics)

In [None]:
ensemble = VotingClassifier(estimators=[('svm', svm), ('lr', lr), ('rf', rf)], voting='hard')
ensemble.fit(X_train, y_train)
predictions = ensemble.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86        77
           1       0.79      0.89      0.84        61

    accuracy                           0.85       138
   macro avg       0.85      0.85      0.85       138
weighted avg       0.85      0.85      0.85       138

0.8478260869565217


In [None]:
evaluate(y_test, predictions, 'ensemble')
metrics_dict

{'knn': [0.6761645481060181, 0.8332978496912923, 14800],
 'svm': [0.6814433307568912, 0.8421332765595062, 6800],
 'rf': [0.720487480297988, 0.8595912284436874, 10900],
 'lr': [0.6826877493179975, 0.8435171385991057, 9400],
 'nb': [0.6474345326804343, 0.8237172663402171, 13200],
 'ensemble': [0.6987571779654349, 0.8517138599105812, 8400]}

# European

## Load Dataset

In [None]:
# Access to google drive
# NOTE(review): the German section already mounts Drive at '/content/gdrive'.
# This second mount uses a different mount point, so the European dataset path
# differs from the paths of the other two datasets. Confirm which is intended.
from google.colab import drive
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [None]:
# Load the European dataset from the Drive mounted at '/content/MyDrive'.
# The commented-out line is an alternative location under the '/content/gdrive' mount.
#df_eu = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Pattern Project/European.csv')
df_eu = pd.read_csv('/content/MyDrive/MyDrive/Pattern Project/European.csv')

# Preview the first five rows.
df_eu.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df_eu.shape

(284807, 31)

## Preprocess

In [None]:
# Convert column names to string values
df_eu.columns = df_eu.columns.astype(str)

# `separate` and `split` are helpers defined earlier in the notebook; they are
# used here to obtain the feature/label pair and the train/test partitions.
X, y = separate(df_eu)

X_train, X_test, y_train, y_test = split(X, y)

# Missing-value counts before scaling, as a baseline for the check below.
print('missed values in train:', X_train.isnull().sum().sum())
print('missed values in test:', X_test.isnull().sum().sum())

# Standardise only 'Time' and 'Amount'. The scaler is fitted on the training
# partition alone and then applied to both partitions.
indexes = ['Time', 'Amount']
norm = StandardScaler().fit(X_train[indexes])
# Assign the transformed ndarray directly so values are written positionally,
# row for row. Wrapping it in pd.DataFrame first gives it a fresh 0..n-1 index,
# and pandas then aligns on labels, silently producing NaNs whenever the
# partition's index is not exactly 0..n-1.
X_train[indexes] = norm.transform(X_train[indexes])
X_test[indexes] = norm.transform(X_test[indexes])

# Re-count missing values to confirm that scaling introduced none.
print('missed values in train:', X_train.isnull().sum().sum())
print('missed values in test:', X_test.isnull().sum().sum())

X_train dataset:  (227845, 30)
y_train dataset:  (227845,)
X_test dataset:  (56962, 30)
y_test dataset:  (56962,)
missed values in train: 0
missed values in test: 0
missed values in train: 0
missed values in test: 0


In [None]:
# Preview the first five training rows after the 'Time'/'Amount' scaling above.
X_train.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.108763,-0.557458,1.231524,0.573268,-0.263955,0.94612,-1.203171,1.057426,-0.130107,-1.042645,...,-0.012924,0.025811,0.023363,-0.230233,0.224216,-0.088442,0.156625,0.071094,0.154123,-0.363884
1,1.010436,-0.344935,1.385742,0.68668,1.035083,0.256868,-0.460239,0.963755,-0.294085,-0.565671,...,0.385026,-0.211719,-0.288454,-0.367644,-0.007358,0.396272,0.465707,-0.002034,0.097146,-0.197146
2,1.552282,-2.638648,0.975973,0.897377,0.156363,0.096289,0.208338,-0.150398,0.298436,0.245605,...,-0.772491,-0.117961,-0.781257,-0.637791,0.722013,0.267284,-1.026775,-1.288352,-0.249419,-0.215394
3,-0.05885,-0.036151,1.360794,-0.991409,1.003604,0.865763,-0.693477,0.926102,0.141968,0.198633,...,-0.339327,0.225501,0.76763,-0.018212,-0.493617,-0.878398,-0.557764,0.115271,0.188096,-0.264474
4,1.047226,2.143097,-1.129054,-0.673094,-1.093833,-0.882792,0.115427,-1.267968,0.267143,0.143981,...,-0.118375,-0.041227,-0.264483,0.394481,0.207038,-0.541776,-0.502798,-0.006515,-0.052021,-0.324387


## Models

In [None]:
# Start the European section with an empty results table.
metrics_dict = dict()

### KNN

In [None]:
# Base k-nearest-neighbours estimator handed to the grid search.
clf = KNeighborsClassifier()

# Candidate neighbourhood sizes: every integer from 35 through 44.
param_grid = dict(n_neighbors=np.arange(35, 45))

# `clf_grid` is the grid-search helper defined earlier in the notebook.
clf_grid(X_train, y_train, param_grid, clf)

Best: 0.939352 using {'n_neighbors': 42}
0.934339 (0.017511) with: {'n_neighbors': 35}
0.934328 (0.017513) with: {'n_neighbors': 36}
0.935588 (0.018701) with: {'n_neighbors': 37}
0.935576 (0.018699) with: {'n_neighbors': 38}
0.936846 (0.017493) with: {'n_neighbors': 39}
0.938114 (0.016596) with: {'n_neighbors': 40}
0.938099 (0.016599) with: {'n_neighbors': 41}
0.939352 (0.015636) with: {'n_neighbors': 42}
0.939341 (0.015638) with: {'n_neighbors': 43}
0.939325 (0.015643) with: {'n_neighbors': 44}


In [None]:
# Fit KNN with the neighbourhood size reported as best by the grid search (42),
# then predict the held-out test partition.
knn = KNeighborsClassifier(n_neighbors=42).fit(X_train, y_train)
predictions = knn.predict(X_test)

# Per-class report first, overall accuracy on the following line.
print(
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.88      0.78      0.83       100

    accuracy                           1.00     56962
   macro avg       0.94      0.89      0.91     56962
weighted avg       1.00      1.00      1.00     56962

0.999420666409185


In [None]:
# Score the KNN test-set predictions under the 'knn' key and display the
# results collected so far. `evaluate` is defined earlier in the notebook.
evaluate(y_test, predictions, 'knn')
metrics_dict

{'knn': [0.8265133479830457, 0.8899032745946327, 23100]}

### SVM

In [None]:
# Base support-vector classifier handed to the grid search.
clf = SVC()

# Search the regularisation strength C over four decades, linear kernel only.
# NOTE(review): the model fitted in the next cell uses kernel='rbf' with
# gamma=0.01, which this grid does not cover, and no output is recorded for
# this cell — confirm where those settings came from.
param_grid = dict(C=[1, 10, 100, 1000], kernel=['linear'])

# `clf_grid` is the grid-search helper defined earlier in the notebook.
clf_grid(X_train, y_train, param_grid, clf)



In [None]:
# RBF-kernel SVM with C=1 and gamma=0.01, fitted on the training partition and
# applied to the held-out test partition.
svm = SVC(kernel='rbf', C=1, gamma=0.01).fit(X_train, y_train)
predictions = svm.predict(X_test)

# Per-class report first, overall accuracy on the following line.
print(
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.94      0.81      0.87       100

    accuracy                           1.00     56962
   macro avg       0.97      0.90      0.94     56962
weighted avg       1.00      1.00      1.00     56962

0.9995786664794073


In [None]:
# Score the SVM test-set predictions under the 'svm' key and display the
# results collected so far. `evaluate` is defined earlier in the notebook.
# NOTE(review): the recorded output holds only the 'svm' entry, so
# metrics_dict appears to have been reset between the KNN and SVM runs.
evaluate(y_test, predictions, 'svm')
metrics_dict

{'svm': [0.8732431491970408, 0.9049560339066512, 19500]}

### Random Forest

In [None]:
# Base random-forest estimator handed to the grid search.
clf = RandomForestClassifier()

# 3 depths x 3 forest sizes x 4 feature-subset sizes = 36 candidate settings.
param_grid = dict(
    max_depth=[3, 5, None],
    n_estimators=[3, 5, 10],
    max_features=[5, 6, 7, 8],
)

# `clf_grid` is the grid-search helper defined earlier in the notebook.
clf_grid(X_train, y_train, param_grid, clf)

Best: 0.947769 using {'max_depth': 5, 'max_features': 6, 'n_estimators': 10}
0.902573 (0.029840) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 3}
0.907548 (0.024625) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 5}
0.915655 (0.016654) with: {'max_depth': 3, 'max_features': 5, 'n_estimators': 10}
0.892514 (0.023654) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 3}
0.913896 (0.024226) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 5}
0.915102 (0.019984) with: {'max_depth': 3, 'max_features': 6, 'n_estimators': 10}
0.909935 (0.015403) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 3}
0.907192 (0.022122) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 5}
0.926467 (0.021901) with: {'max_depth': 3, 'max_features': 7, 'n_estimators': 10}
0.912934 (0.020977) with: {'max_depth': 3, 'max_features': 8, 'n_estimators': 3}
0.918824 (0.010770) with: {'max_depth': 3, 'max_features': 8, 'n_estimators': 5}
0.928850 (0.015555) with: {'m

In [None]:
# Random forest with the settings reported as best by the grid search, fitted
# on the training partition and applied to the held-out test partition.
# NOTE(review): no random_state is set, so repeated runs may give slightly
# different scores.
rf = RandomForestClassifier(
    n_estimators=10,
    max_features=6,
    max_depth=5,
).fit(X_train, y_train)
predictions = rf.predict(X_test)

# Per-class report first, overall accuracy on the following line.
print(
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.94      0.73      0.82       100

    accuracy                           1.00     56962
   macro avg       0.97      0.86      0.91     56962
weighted avg       1.00      1.00      1.00     56962

0.9994382219725431


In [None]:
# Score the random-forest test-set predictions under the 'rf' key and display
# the results collected so far. `evaluate` is defined earlier in the notebook.
# NOTE(review): the recorded output holds only the 'rf' entry, so metrics_dict
# appears to have been reset before this run.
evaluate(y_test, predictions, 'rf')
metrics_dict

{'rf': [0.826302837834568, 0.8649560339066511, 27500]}

### Logistic Regression

In [None]:
# Base logistic-regression estimator handed to the grid search.
clf = LogisticRegression()

# Twenty values of C spaced evenly on a log10 scale from 1e-3 to 1e3.
param_grid = [dict(C=np.logspace(-3, 3, 20))]

# `clf_grid` is the grid-search helper defined earlier in the notebook.
clf_grid(X_train, y_train, param_grid, clf)

Best: 0.982064 using {'C': 0.004281332398719396}
0.981174 (0.012704) with: {'C': 0.001}
0.981741 (0.012410) with: {'C': 0.00206913808111479}
0.982064 (0.012030) with: {'C': 0.004281332398719396}
0.981956 (0.011406) with: {'C': 0.008858667904100823}
0.981432 (0.010629) with: {'C': 0.018329807108324356}
0.980729 (0.009875) with: {'C': 0.0379269019073225}
0.979912 (0.009320) with: {'C': 0.07847599703514611}
0.979220 (0.008977) with: {'C': 0.1623776739188721}
0.978723 (0.008794) with: {'C': 0.3359818286283781}
0.978453 (0.008692) with: {'C': 0.6951927961775606}
0.978301 (0.008636) with: {'C': 1.438449888287663}
0.978228 (0.008610) with: {'C': 2.976351441631316}
0.978188 (0.008599) with: {'C': 6.158482110660261}
0.978173 (0.008589) with: {'C': 12.742749857031322}
0.978165 (0.008586) with: {'C': 26.366508987303554}
0.978159 (0.008586) with: {'C': 54.555947811685144}
0.978158 (0.008584) with: {'C': 112.88378916846884}
0.978158 (0.008582) with: {'C': 233.57214690901213}
0.978158 (0.008582) wit

In [None]:
# Logistic regression with the C value reported as best by the grid search,
# fitted on the training partition and applied to the held-out test partition.
lr = LogisticRegression(C=0.004281332398719396).fit(X_train, y_train)
predictions = lr.predict(X_test)

# Per-class report first, overall accuracy on the following line.
print(
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.84      0.53      0.65       100

    accuracy                           1.00     56962
   macro avg       0.92      0.76      0.82     56962
weighted avg       1.00      1.00      1.00     56962

0.9989993328885924


In [None]:
# Score the logistic-regression test-set predictions under the 'lr' key and
# display the results collected so far. `evaluate` is defined earlier in the
# notebook.
evaluate(y_test, predictions, 'lr')
metrics_dict

{'rf': [0.826302837834568, 0.8649560339066511, 27500],
 'lr': [0.6672985976795321, 0.7649120678133025, 48000]}

### Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the training partition
# and applied to the held-out test partition.
gaussion_nb = GaussianNB().fit(X_train, y_train)
predictions = gaussion_nb.predict(X_test)

# Per-class report first, overall accuracy on the following line.
print(
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56862
           1       0.06      0.87      0.12       100

    accuracy                           0.98     56962
   macro avg       0.53      0.92      0.55     56962
weighted avg       1.00      0.98      0.99     56962

0.9772655454513536


In [None]:
# Score the Naive Bayes test-set predictions under the 'nb' key and display the
# results collected so far. `evaluate` is defined earlier in the notebook.
evaluate(y_test, predictions, 'nb')
metrics_dict

{'rf': [0.826302837834568, 0.8649560339066511, 27500],
 'lr': [0.6672985976795321, 0.7649120678133025, 48000],
 'nb': [0.23164104524818194, 0.9237270936653653, 141200]}

### Ensemble Classifier

In [None]:
# Hand-entered snapshot of the per-model scores; each value matches what the
# individual model sections above displayed. Presumably needed because the
# running dictionary no longer held the 'knn' and 'svm' entries — confirm.
metrics_dict = dict([
    ('knn', [0.8265133479830457, 0.8899032745946327, 23100]),
    ('svm', [0.8732431491970408, 0.9049560339066512, 19500]),
    ('rf', [0.826302837834568, 0.8649560339066511, 27500]),
    ('lr', [0.6672985976795321, 0.7649120678133025, 48000]),
    ('nb', [0.23164104524818194, 0.9237270936653653, 141200]),
])

In [None]:
# Fresh, unfitted instances of the three ensemble members, configured with the
# same hyperparameters as in their individual sections above.
knn = KNeighborsClassifier(n_neighbors=42)
svm = SVC(kernel='rbf', C=1, gamma=0.01)
rf = RandomForestClassifier(
    n_estimators=10,
    max_features=6,
    max_depth=5,
)

In [None]:
# Hard-voting ensemble over the SVM, KNN and random-forest models, fitted on
# the training partition and applied to the held-out test partition.
ensemble = VotingClassifier(
    estimators=[('svm', svm), ('knn', knn), ('rf', rf)],
    voting='hard',
).fit(X_train, y_train)
predictions = ensemble.predict(X_test)

# Per-class report first, overall accuracy on the following line.
print(
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.92      0.82      0.87       100

    accuracy                           1.00     56962
   macro avg       0.96      0.91      0.93     56962
weighted avg       1.00      1.00      1.00     56962

0.9995611109160493


In [None]:
# Score the ensemble's test-set predictions under the 'ensemble' key and
# display every model's results side by side. `evaluate` is defined earlier in
# the notebook.
evaluate(y_test, predictions, 'ensemble')
metrics_dict

{'knn': [0.8265133479830457, 0.8899032745946327, 23100],
 'svm': [0.8732431491970408, 0.9049560339066512, 19500],
 'rf': [0.826302837834568, 0.8649560339066511, 27500],
 'lr': [0.6672985976795321, 0.7649120678133025, 48000],
 'nb': [0.23164104524818194, 0.9237270936653653, 141200],
 'ensemble': [0.8689837217630202, 0.9099384474693116, 18700]}