### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scikitplot as skplt

import random
random.seed(777)

In [None]:
# Sklearn modules
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_curve, roc_auc_score

In [None]:
# XGboost (Extreme Gradient Boosting)
import xgboost as xgb

In [None]:
# save models
import pickle

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn_extra.cluster import KMedoids

In [None]:
# Setting np random seed
np.random.seed(777)

### Loading data

The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions. 
However, the positive class has a clear pattern where all the samples are in a corner if we make a scatter plot of the first 3 components. 

In [None]:
# read in data
df = pd.read_csv('./data_creditCardFraud.csv')

df.head()

In [None]:
df.describe()

In [None]:
print("Class distribution")
print(df.Class.value_counts())

In [None]:
# Bar chart of the class balance (0 = not fraud, 1 = fraud).
# The counts are computed once and the figure is labelled so it can be read on its own.
class_counts = df.Class.value_counts()
plt.bar(range(2), [class_counts[0], class_counts[1]])
plt.xticks(range(2), ["Not Fraud (0)", "Fraud (1)"])
plt.ylabel("Number of transactions")
plt.title("Class distribution")
plt.show()

In [None]:
Fraud_transacation = df[df["Class"]==1]
Normal_transacation= df[df["Class"]==0]
plt.figure(figsize=(10,6))
plt.subplot(121)
Fraud_transacation[Fraud_transacation["Amount"]<= 2500].Amount.plot.hist(title="Fraud")
plt.subplot(122)
Normal_transacation[Normal_transacation["Amount"]<=2500].Amount.plot.hist(title="Non-Fraud")

Each fraud transaction can represent a very significant expense, which together can represent billions of dollars of lost revenue each year.

### Creating Models

If we had a classifier that always predicted "not fraud", it would reach an accuracy of 99.8% on this dataset, which is why accuracy alone is a misleading metric here.
We will train a Logistic Regression, a Decision Tree, a Random Forest, a GBM and a Neural Network.

In [None]:
# Splitting Data to Train and Test
# We also drop the Time feature because it is different for every transaction.
# NOTE(review): the original comment was cut off after "and does not" -- presumably
# "does not help the models"; confirm the intended rationale.
# NOTE(review): no `stratify=` argument is passed, so the fraud share of the test
# split is not guaranteed to match the full dataset.
y = df.Class
X = df.drop(['Class','Time'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)

In [None]:
X_train.head()

In [None]:
# Keep untouched copies of the held-out data so every experiment is scored on the same test set.
y_test_solution = y_test.copy()
X_test_solution = X_test.copy()

# Re-attach the labels to a copy of the training features (pandas aligns them on the row index).
training_set = X_train.assign(Class=y_train)
training_set.head()

In [None]:
len(training_set[training_set.Class == 1])

#### Logistic Regression

In [None]:
logistic_regression = LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [None]:
lr_pred = logistic_regression.predict(X_test)

In [None]:
logistic_regression.coef_[0]

In [None]:
training_set.columns

In [None]:
plt.figure(figsize=(12,8))
plt.barh(X_train.columns,logistic_regression.coef_[0])
plt.title("Coefficient values")
plt.show()

#### Decision Tree

In [None]:
decision_tree = tree.DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
dt_pred = decision_tree.predict(X_test)

#### Random Forest

In [None]:
random_forest = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

In [None]:
rf_pred = random_forest.predict(X_test)

#### GBM

In [None]:
gbm = xgb.XGBClassifier()
gbm.fit(X_train, y_train)

In [None]:
gbm_pred = gbm.predict(X_test)

### Results

#### Accuracies

In [None]:
print("Accuracy Logistic Regression:", str(accuracy_score(y_test, lr_pred)*100) + "%")
print("Accuracy Decision Tree:", str(accuracy_score(y_test, dt_pred)*100) + "%")
print("Accuracy Random Forest:", str(accuracy_score(y_test, rf_pred)*100) + "%")
print("Accuracy GBM:", str(accuracy_score(y_test, gbm_pred)*100) + "%")

#### F1 Score

In [None]:
print("F1 Score Logistic Regression:", str(f1_score(y_test, lr_pred)*100) + "%")
print("F1 Score Decision Tree:", str(f1_score(y_test, dt_pred)*100) + "%")
print("F1 Score Random Forest:", str(f1_score(y_test, rf_pred)*100) + "%")
print("F1 Score GBM:", str(f1_score(y_test, gbm_pred)*100) + "%")

#### Precision

In [None]:
print("Precision Logistic Regression:", str(precision_score(y_test, lr_pred)*100) + "%")
print("Precision Decision Tree:", str(precision_score(y_test, dt_pred)*100) + "%")
print("Precision Random Forest:", str(precision_score(y_test, rf_pred)*100) + "%")
print("Precision GBM:", str(precision_score(y_test, gbm_pred)*100) + "%")

#### Recall

In [None]:
print("Recall Logistic Regression:", str(recall_score(y_test, lr_pred)*100) + "%")
print("Recall Decision Tree:", str(recall_score(y_test, dt_pred)*100) + "%")
print("Recall Random Forest:", str(recall_score(y_test, rf_pred)*100) + "%")
print("Recall GBM:", str(recall_score(y_test, gbm_pred)*100) + "%")

#### Confusion Matrix

In [None]:
print("Confusion Matrix Logistic Regression")
lr_cm = pd.DataFrame(confusion_matrix(y_test, lr_pred))
lr_cm.columns = ["Predicted Not Fraud","Predicted Fraud"]
lr_cm

In [None]:
skplt.metrics.plot_confusion_matrix(y_test, lr_pred)

In [None]:
print("Confusion Matrix Decision Tree")
dt_cm = pd.DataFrame(confusion_matrix(y_test, dt_pred))
dt_cm.columns = ["Predicted Not Fraud","Predicted Fraud"]
dt_cm

In [None]:
print("Confusion Matrix Random Forest")
rf_cm = pd.DataFrame(confusion_matrix(y_test, rf_pred))
rf_cm.columns = ["Predicted Not Fraud","Predicted Fraud"]
rf_cm

In [None]:
print("Confusion Matrix GBM")
gbm_cm = pd.DataFrame(confusion_matrix(y_test, gbm_pred))
gbm_cm.columns = ["Predicted Not Fraud","Predicted Fraud"]
gbm_cm

##### ROC/AUC

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
lr_proba = logistic_regression.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(y_test, lr_proba)
auc_lr = roc_auc_score(y_test, lr_proba)
plt.plot(fpr_lr,tpr_lr,label="Logistic Regression, auc="+str(auc_lr))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
dt_proba = decision_tree.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, dt_proba)
auc_dt = roc_auc_score(y_test, dt_proba)
plt.plot(fpr_dt,tpr_dt,label="Decision Tree, auc="+str(auc_dt))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
rf_proba = random_forest.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_proba)
auc_rf = roc_auc_score(y_test, rf_proba)
plt.plot(fpr_rf,tpr_rf,label="Random Forest, auc="+str(auc_rf))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

### Saving models

In [None]:
# Use a context manager so the file handle is flushed and closed right after the dump.
with open("logisticRegression.sav", 'wb') as model_file:
    pickle.dump(logistic_regression, model_file)

In [None]:
# Use a context manager so the file handle is flushed and closed right after the dump.
with open("decisionTree.sav", 'wb') as model_file:
    pickle.dump(decision_tree, model_file)

In [None]:
# Use a context manager so the file handle is flushed and closed right after the dump.
with open("randomForest.sav", 'wb') as model_file:
    pickle.dump(random_forest, model_file)

In [None]:
# Use a context manager so the file handle is flushed and closed right after the dump.
with open("gbm.sav", 'wb') as model_file:
    pickle.dump(gbm, model_file)

### Techniques to solve the imbalanced datasets problem

In [None]:
# Keep untouched copies of the held-out data so every resampling technique is scored on the same test set.
y_test_solution = y_test.copy()
X_test_solution = X_test.copy()

# Re-attach the labels to a copy of the training features (pandas aligns them on the row index).
training_set = X_train.assign(Class=y_train)
training_set.head()

In [None]:
len(training_set[training_set.Class == 1])

### 1. Undersampling

We reduce the number of training samples of the majority class in order to create a balanced dataset with a 1:1 class ratio.

In [None]:
fraud_indices = training_set[training_set.Class == 1].index
normal_indices = training_set[training_set.Class == 0].index

# Draw as many normal transactions as there are frauds, without replacement (1:1 balance).
under_sample_indices = np.random.choice(normal_indices, len(fraud_indices), replace=False)
train_undersampled = training_set.loc[np.concatenate([fraud_indices, under_sample_indices]),:]
# Select the features by name instead of a hard-coded column count, so the label
# can never leak into X and no feature is silently dropped if the columns change.
X_train_undersampled = train_undersampled.drop(columns=["Class"])
Y_train_undersampled = train_undersampled.Class

In [None]:
X_train_undersampled.head()

In [None]:
print("Length training set", len(X_train_undersampled))

#### Training undersampled models

##### Logistic Regression

In [None]:
logistic_regression_undersampled = LogisticRegression(solver='liblinear').fit(X_train_undersampled, Y_train_undersampled)

In [None]:
lr_pred_undersampled = logistic_regression_undersampled.predict(X_test)

##### Decision Tree

In [None]:
decision_tree_undersampled = tree.DecisionTreeClassifier().fit(X_train_undersampled, Y_train_undersampled)

In [None]:
dt_pred_undersampled = decision_tree_undersampled.predict(X_test)

##### Random Forest

In [None]:
random_forest_undersampled = RandomForestClassifier(n_estimators=10).fit(X_train_undersampled, Y_train_undersampled)

In [None]:
rf_pred_undersampled = random_forest_undersampled.predict(X_test)

##### GBM

In [None]:
gbm_undersampled = xgb.XGBClassifier()
gbm_undersampled.fit(X_train_undersampled, Y_train_undersampled)

In [None]:
gbm_pred_undersampled = gbm_undersampled.predict(X_test)

#### Saving models

In [None]:
# Persist every undersampled model; the context manager closes each file handle
# as soon as its dump finishes instead of leaving it open.
for model_path, fitted_model in [
    ("logisticRegression_undersampled.sav", logistic_regression_undersampled),
    ("decisionTree_undersampled.sav", decision_tree_undersampled),
    ("randomForest_undersampled.sav", random_forest_undersampled),
    ("gbm_undersampled.sav", gbm_undersampled),
]:
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

##### Evaluating Models

##### Accuracy

In [None]:
print("Accuracy Logistic Regression:", str(accuracy_score(y_test, lr_pred_undersampled)*100) + "%")
print("Accuracy Decision Tree:", str(accuracy_score(y_test, dt_pred_undersampled)*100) + "%")
print("Accuracy Random Forest:", str(accuracy_score(y_test, rf_pred_undersampled)*100) + "%")
print("Accuracy GBM:", str(accuracy_score(y_test, gbm_pred_undersampled)*100) + "%")

##### F1 Score

In [None]:
print("F1 Score Logistic Regression:", str(f1_score(y_test, lr_pred_undersampled)*100) + "%")
print("F1 Score Decision Tree:", str(f1_score(y_test, dt_pred_undersampled)*100) + "%")
print("F1 Score Random Forest:", str(f1_score(y_test, rf_pred_undersampled)*100) + "%")
print("F1 Score GBM:", str(f1_score(y_test, gbm_pred_undersampled)*100) + "%")

##### Precision

In [None]:
print("Precision Logistic Regression:", str(precision_score(y_test, lr_pred_undersampled)*100) + "%")
print("Precision Decision Tree:", str(precision_score(y_test, dt_pred_undersampled)*100) + "%")
print("Precision Random Forest:", str(precision_score(y_test, rf_pred_undersampled)*100) + "%")
print("Precision GBM:", str(precision_score(y_test, gbm_pred_undersampled)*100) + "%")

##### Recall

In [None]:
print("Recall Logistic Regression:", str(recall_score(y_test, lr_pred_undersampled)*100) + "%")
print("Recall Decision Tree:", str(recall_score(y_test, dt_pred_undersampled)*100) + "%")
print("Recall Random Forest:", str(recall_score(y_test, rf_pred_undersampled)*100) + "%")
print("Recall GBM:", str(recall_score(y_test, gbm_pred_undersampled)*100) + "%")

##### Confusion Matrix

In [None]:
print("Confusion Matrix Logistic Regression")
lr_cm_undersampled = pd.DataFrame(confusion_matrix(y_test, lr_pred_undersampled))
lr_cm_undersampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
lr_cm_undersampled
# We are missclassifing 6 

In [None]:
print("Confusion Matrix Decision Tree")
dt_cm_undersampled = pd.DataFrame(confusion_matrix(y_test, dt_pred_undersampled))
dt_cm_undersampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
dt_cm_undersampled
# We are missclassifing 6

In [None]:
print("Confusion Matrix Random Forest")
rf_cm_undersampled = pd.DataFrame(confusion_matrix(y_test, rf_pred_undersampled))
rf_cm_undersampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
rf_cm_undersampled
# We are missclassifing 5

In [None]:
print("Confusion Matrix GBM")
gbm_cm_undersampled = pd.DataFrame(confusion_matrix(y_test, gbm_pred_undersampled))
gbm_cm_undersampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
gbm_cm_undersampled
# We are missclassifing 6

##### ROC/AUC

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
lr_proba_undersampled = logistic_regression_undersampled.predict_proba(X_test)[:, 1]
fpr_lr_undersampled, tpr_lr_undersampled, _ = roc_curve(y_test, lr_proba_undersampled)
auc_lr_undersampled = roc_auc_score(y_test, lr_proba_undersampled)
plt.plot(fpr_lr_undersampled,tpr_lr_undersampled,label="Logistic Regression, auc="+str(auc_lr_undersampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
dt_proba_undersampled = decision_tree_undersampled.predict_proba(X_test)[:, 1]
fpr_dt_undersampled, tpr_dt_undersampled, _ = roc_curve(y_test, dt_proba_undersampled)
auc_dt_undersampled = roc_auc_score(y_test, dt_proba_undersampled)
plt.plot(fpr_dt_undersampled,tpr_dt_undersampled,label="Decision Tree, auc="+str(auc_dt_undersampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
rf_proba_undersampled = random_forest_undersampled.predict_proba(X_test)[:, 1]
fpr_rf_undersampled, tpr_rf_undersampled, _ = roc_curve(y_test, rf_proba_undersampled)
auc_rf_undersampled = roc_auc_score(y_test, rf_proba_undersampled)
plt.plot(fpr_rf_undersampled,tpr_rf_undersampled,label="Random Forest, auc="+str(auc_rf_undersampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
gbm_proba_undersampled = gbm_undersampled.predict_proba(X_test)[:, 1]
fpr_gbm_undersampled, tpr_gbm_undersampled, _ = roc_curve(y_test, gbm_proba_undersampled)
auc_gbm_undersampled = roc_auc_score(y_test, gbm_proba_undersampled)
plt.plot(fpr_gbm_undersampled,tpr_gbm_undersampled,label="GBM, auc="+str(auc_gbm_undersampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

### 2.1 Oversampling (brute force)

We are going to create copies from the minority class in order to balance the dataset

In [None]:
normal_samples = training_set[training_set.Class == 0]
fraud_samples = training_set[training_set.Class == 1]
fraud_indices_len = len(fraud_samples)
normal_indices_len = len(normal_samples)

# Number of whole copies of the fraud rows needed to (approximately) match the majority class.
total_copies = int(normal_indices_len/fraud_indices_len)

# Build the oversampled frame with a single concat: all normal rows followed by
# `total_copies` copies of the fraud rows. Concatenating inside a loop re-copies the
# growing frame on every pass; this yields the same rows in the same order in one step.
frames = [normal_samples] + [fraud_samples] * total_copies
train_oversampled = pd.concat(frames)

In [None]:
# Returning all rows in random order
train_oversampled = train_oversampled.sample(frac=1)

In [None]:
# Select the features by name instead of a hard-coded column count, so the label
# can never leak into X and no feature is silently dropped if the columns change.
X_train_oversampled = train_oversampled.drop(columns=["Class"])
Y_train_oversampled = train_oversampled.Class

In [None]:
X_train_oversampled.head()

In [None]:
# Checking same number of samples per class
train_oversampled.Class.value_counts()

#### Training oversampled models

#### Logistic Regression

In [None]:
logistic_regression_oversampled = LogisticRegression(solver='liblinear').fit(X_train_oversampled, Y_train_oversampled)

In [None]:
lr_pred_oversampled = logistic_regression_oversampled.predict(X_test)

##### Decision Tree

In [None]:
decision_tree_oversampled = tree.DecisionTreeClassifier().fit(X_train_oversampled, Y_train_oversampled)

In [None]:
dt_pred_oversampled = decision_tree_oversampled.predict(X_test)

##### Random Forest

In [None]:
random_forest_oversampled = RandomForestClassifier(n_estimators=10).fit(X_train_oversampled, Y_train_oversampled)

In [None]:
rf_pred_oversampled = random_forest_oversampled.predict(X_test)

##### GBM

In [None]:
gbm_oversampled = xgb.XGBClassifier()
gbm_oversampled.fit(X_train_oversampled, Y_train_oversampled)

In [None]:
gbm_pred_oversampled = gbm_oversampled.predict(X_test)

##### Saving Models

In [None]:
# Persist every brute-force oversampled model; the context manager closes each file
# handle as soon as its dump finishes instead of leaving it open.
for model_path, fitted_model in [
    ("logisticRegression_oversampled_bruteForce.sav", logistic_regression_oversampled),
    ("decisionTree_oversampled_bruteForce.sav", decision_tree_oversampled),
    ("randomForest_oversampled_bruteForce.sav", random_forest_oversampled),
    ("gbm_oversampled_bruteForce.sav", gbm_oversampled),
]:
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

##### Evaluating Models

##### Accuracy

In [None]:
print("Accuracy Logistic Regression:", str(accuracy_score(y_test, lr_pred_oversampled)*100) + "%")
print("Accuracy Decision Tree:", str(accuracy_score(y_test, dt_pred_oversampled)*100) + "%")
print("Accuracy Random Forest:", str(accuracy_score(y_test, rf_pred_oversampled)*100) + "%")
print("Accuracy GBM:", str(accuracy_score(y_test, gbm_pred_oversampled)*100) + "%")

##### F1 Score

In [None]:
print("F1 Score Logistic Regression:", str(f1_score(y_test, lr_pred_oversampled)*100) + "%")
print("F1 Score Decision Tree:", str(f1_score(y_test, dt_pred_oversampled)*100) + "%")
print("F1 Score Random Forest:", str(f1_score(y_test, rf_pred_oversampled)*100) + "%")
print("F1 Score GBM:", str(f1_score(y_test, gbm_pred_oversampled)*100) + "%")

##### Precision

In [None]:
print("Precision Logistic Regression:", str(precision_score(y_test, lr_pred_oversampled)*100) + "%")
print("Precision Decision Tree:", str(precision_score(y_test, dt_pred_oversampled)*100) + "%")
print("Precision Random Forest:", str(precision_score(y_test, rf_pred_oversampled)*100) + "%")
print("Precision GBM:", str(precision_score(y_test, gbm_pred_oversampled)*100) + "%")

##### Recall

In [None]:
print("Recall Logistic Regression:", str(recall_score(y_test, lr_pred_oversampled)*100) + "%")
print("Recall Decision Tree:", str(recall_score(y_test, dt_pred_oversampled)*100) + "%")
print("Recall Random Forest:", str(recall_score(y_test, rf_pred_oversampled)*100) + "%")
print("Recall GBM:", str(recall_score(y_test, gbm_pred_oversampled)*100) + "%")

In [None]:
print("Confusion Matrix Logistic Regression")
lr_cm_oversampled = pd.DataFrame(confusion_matrix(y_test, lr_pred_oversampled))
lr_cm_oversampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
lr_cm_oversampled

In [None]:
print("Confusion Matrix Decision Tree")
dt_cm_oversampled = pd.DataFrame(confusion_matrix(y_test, dt_pred_oversampled))
dt_cm_oversampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
dt_cm_oversampled
# We are missclassifing 6

In [None]:
print("Confusion Matrix Random Forest")
rf_cm_oversampled = pd.DataFrame(confusion_matrix(y_test, rf_pred_oversampled))
rf_cm_oversampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
rf_cm_oversampled
# We are missclassifing 5

In [None]:
print("Confusion Matrix GBM")
gbm_cm_oversampled = pd.DataFrame(confusion_matrix(y_test, gbm_pred_oversampled))
gbm_cm_oversampled.columns = ["Predicted Not Fraud","Predicted Fraud"]
gbm_cm_oversampled
# We are missclassifing 6

##### ROC/AUC

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
lr_proba_oversampled = logistic_regression_oversampled.predict_proba(X_test)[:, 1]
fpr_lr_oversampled, tpr_lr_oversampled, _ = roc_curve(y_test, lr_proba_oversampled)
auc_lr_oversampled = roc_auc_score(y_test, lr_proba_oversampled)
plt.plot(fpr_lr_oversampled,tpr_lr_oversampled,label="Logistic Regression, auc="+str(auc_lr_oversampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
dt_proba_oversampled = decision_tree_oversampled.predict_proba(X_test)[:, 1]
fpr_dt_oversampled, tpr_dt_oversampled, _ = roc_curve(y_test, dt_proba_oversampled)
auc_dt_oversampled = roc_auc_score(y_test, dt_proba_oversampled)
plt.plot(fpr_dt_oversampled,tpr_dt_oversampled,label="Decision Tree, auc="+str(auc_dt_oversampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
rf_proba_oversampled = random_forest_oversampled.predict_proba(X_test)[:, 1]
fpr_rf_oversampled, tpr_rf_oversampled, _ = roc_curve(y_test, rf_proba_oversampled)
auc_rf_oversampled = roc_auc_score(y_test, rf_proba_oversampled)
plt.plot(fpr_rf_oversampled,tpr_rf_oversampled,label="Random Forest, auc="+str(auc_rf_oversampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
gbm_proba_oversampled = gbm_oversampled.predict_proba(X_test)[:, 1]
fpr_gbm_oversampled, tpr_gbm_oversampled, _ = roc_curve(y_test, gbm_proba_oversampled)
auc_gbm_oversampled = roc_auc_score(y_test, gbm_proba_oversampled)
plt.plot(fpr_gbm_oversampled,tpr_gbm_oversampled,label="GBM, auc="+str(auc_gbm_oversampled))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

### 2.2 Oversampling v2 (sklearn)

We are going to create copies from the minority class in order to balance the dataset with sklearn resample module

In [None]:
fraud_df = training_set[training_set.Class == 1]
normal_df = training_set[training_set.Class == 0]

# upsample minority
fraud_oversampled_df = resample(fraud_df,
                          replace=True, # sample with replacement
                          n_samples=len(normal_df), # match number in majority class
                          random_state=777)

# Join both fraud and normal dataframes
train_oversampled_v2 = pd.concat([normal_df, fraud_oversampled_df])

In [None]:
# Checking same number of samples per class
train_oversampled_v2.Class.value_counts()

In [None]:
# Returning all rows in random order
train_oversampled_v2 = train_oversampled_v2.sample(frac=1)

In [None]:
# Select the features by name instead of a hard-coded column count, so the label
# can never leak into X and no feature is silently dropped if the columns change.
X_train_oversampled_v2 = train_oversampled_v2.drop(columns=["Class"])
Y_train_oversampled_v2 = train_oversampled_v2.Class

In [None]:
X_train_oversampled_v2.head()

#### Training oversampled models

#### Logistic Regression

In [None]:
logistic_regression_oversampled_v2 = LogisticRegression(solver='liblinear').fit(X_train_oversampled_v2, Y_train_oversampled_v2)

In [None]:
lr_pred_oversampled_v2 = logistic_regression_oversampled_v2.predict(X_test)

##### Decision Tree

In [None]:
decision_tree_oversampled_v2 = tree.DecisionTreeClassifier().fit(X_train_oversampled_v2, Y_train_oversampled_v2)

In [None]:
dt_pred_oversampled_v2 = decision_tree_oversampled_v2.predict(X_test)

##### Random Forest

In [None]:
random_forest_oversampled_v2 = RandomForestClassifier(n_estimators=10).fit(X_train_oversampled_v2, Y_train_oversampled_v2)

In [None]:
rf_pred_oversampled_v2 = random_forest_oversampled_v2.predict(X_test)

##### GBM

In [None]:
gbm_oversampled_v2 = xgb.XGBClassifier()
gbm_oversampled_v2.fit(X_train_oversampled_v2, Y_train_oversampled_v2)

In [None]:
gbm_pred_oversampled_v2 = gbm_oversampled_v2.predict(X_test)

##### Saving Models

In [None]:
# Persist every sklearn-resampled model; the context manager closes each file
# handle as soon as its dump finishes instead of leaving it open.
for model_path, fitted_model in [
    ("logisticRegression_oversampled_v2.sav", logistic_regression_oversampled_v2),
    ("decisionTree_oversampled_v2.sav", decision_tree_oversampled_v2),
    ("randomForest_oversampled_v2.sav", random_forest_oversampled_v2),
    ("gbm_oversampled_v2.sav", gbm_oversampled_v2),
]:
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

##### Evaluating Models

##### Accuracy

In [None]:
print("Accuracy Logistic Regression:", str(accuracy_score(y_test, lr_pred_oversampled_v2)*100) + "%")
print("Accuracy Decision Tree:", str(accuracy_score(y_test, dt_pred_oversampled_v2)*100) + "%")
print("Accuracy Random Forest:", str(accuracy_score(y_test, rf_pred_oversampled_v2)*100) + "%")
print("Accuracy GBM:", str(accuracy_score(y_test, gbm_pred_oversampled_v2)*100) + "%")

##### F1 Score

In [None]:
print("F1 Score Logistic Regression:", str(f1_score(y_test, lr_pred_oversampled_v2)*100) + "%")
print("F1 Score Decision Tree:", str(f1_score(y_test, dt_pred_oversampled_v2)*100) + "%")
print("F1 Score Random Forest:", str(f1_score(y_test, rf_pred_oversampled_v2)*100) + "%")
print("F1 Score GBM:", str(f1_score(y_test, gbm_pred_oversampled_v2)*100) + "%")

##### Precision

In [None]:
print("Precision Logistic Regression:", str(precision_score(y_test, lr_pred_oversampled_v2)*100) + "%")
print("Precision Decision Tree:", str(precision_score(y_test, dt_pred_oversampled_v2)*100) + "%")
print("Precision Random Forest:", str(precision_score(y_test, rf_pred_oversampled_v2)*100) + "%")
print("Precision GBM:", str(precision_score(y_test, gbm_pred_oversampled_v2)*100) + "%")

##### Recall

In [None]:
print("Recall Logistic Regression:", str(recall_score(y_test, lr_pred_oversampled_v2)*100) + "%")
print("Recall Decision Tree:", str(recall_score(y_test, dt_pred_oversampled_v2)*100) + "%")
print("Recall Random Forest:", str(recall_score(y_test, rf_pred_oversampled_v2)*100) + "%")
print("Recall GBM:", str(recall_score(y_test, gbm_pred_oversampled_v2)*100) + "%")

In [None]:
print("Confusion Matrix Logistic Regression")
lr_cm_oversampled_v2 = pd.DataFrame(confusion_matrix(y_test, lr_pred_oversampled_v2))
lr_cm_oversampled_v2.columns = ["Predicted Not Fraud","Predicted Fraud"]
lr_cm_oversampled_v2

In [None]:
print("Confusion Matrix Decision Tree")
dt_cm_oversampled_v2 = pd.DataFrame(confusion_matrix(y_test, dt_pred_oversampled_v2))
dt_cm_oversampled_v2.columns = ["Predicted Not Fraud","Predicted Fraud"]
dt_cm_oversampled_v2
# We are missclassifing 6

In [None]:
print("Confusion Matrix Random Forest")
rf_cm_oversampled_v2 = pd.DataFrame(confusion_matrix(y_test, rf_pred_oversampled_v2))
rf_cm_oversampled_v2.columns = ["Predicted Not Fraud","Predicted Fraud"]
rf_cm_oversampled_v2
# We are missclassifing 5

In [None]:
print("Confusion Matrix GBM")
gbm_cm_oversampled_v2 = pd.DataFrame(confusion_matrix(y_test, gbm_pred_oversampled_v2))
gbm_cm_oversampled_v2.columns = ["Predicted Not Fraud","Predicted Fraud"]
gbm_cm_oversampled_v2
# We are missclassifing 6

##### ROC/AUC

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
lr_proba_oversampled_v2 = logistic_regression_oversampled_v2.predict_proba(X_test)[:, 1]
fpr_lr_oversampled_v2, tpr_lr_oversampled_v2, _ = roc_curve(y_test, lr_proba_oversampled_v2)
auc_lr_oversampled_v2 = roc_auc_score(y_test, lr_proba_oversampled_v2)
plt.plot(fpr_lr_oversampled_v2,tpr_lr_oversampled_v2,label="Logistic Regression, auc="+str(auc_lr_oversampled_v2))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
dt_proba_oversampled_v2 = decision_tree_oversampled_v2.predict_proba(X_test)[:, 1]
fpr_dt_oversampled_v2, tpr_dt_oversampled_v2, _ = roc_curve(y_test, dt_proba_oversampled_v2)
auc_dt_oversampled_v2 = roc_auc_score(y_test, dt_proba_oversampled_v2)
plt.plot(fpr_dt_oversampled_v2,tpr_dt_oversampled_v2,label="Decision Tree, auc="+str(auc_dt_oversampled_v2))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
rf_proba_oversampled_v2 = random_forest_oversampled_v2.predict_proba(X_test)[:, 1]
fpr_rf_oversampled_v2, tpr_rf_oversampled_v2, _ = roc_curve(y_test, rf_proba_oversampled_v2)
auc_rf_oversampled_v2 = roc_auc_score(y_test, rf_proba_oversampled_v2)
plt.plot(fpr_rf_oversampled_v2,tpr_rf_oversampled_v2,label="Random Forest, auc="+str(auc_rf_oversampled_v2))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
gbm_proba_oversampled_v2 = gbm_oversampled_v2.predict_proba(X_test)[:, 1]
fpr_gbm_oversampled_v2, tpr_gbm_oversampled_v2, _ = roc_curve(y_test, gbm_proba_oversampled_v2)
auc_gbm_oversampled_v2 = roc_auc_score(y_test, gbm_proba_oversampled_v2)
plt.plot(fpr_gbm_oversampled_v2,tpr_gbm_oversampled_v2,label="GBM, auc="+str(auc_gbm_oversampled_v2))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

### 2.3 Oversampling with SMOTE

We are going to create synthetic samples of the minority class in order to balance the dataset. This technique is called SMOTE (Synthetic Minority Oversampling Technique).

In [None]:
# Increasing the training set
# SMOTE is fitted on the training split only (X_train / y_train); the test split is not resampled here.
sm = SMOTE(random_state=777, sampling_strategy=1.0)
# `fit_sample` was a deprecated alias that newer imbalanced-learn releases removed;
# `fit_resample` is the supported method and returns the same (X, y) pair.
X_train_oversampled_syntethic, Y_train_oversampled_syntethic = sm.fit_resample(X_train, y_train)

In [None]:
# Checking same number of samples per class
pd.DataFrame(Y_train_oversampled_syntethic).Class.value_counts()

In [None]:
X_train_oversampled_syntethic.head()

#### Training oversampled models

#### Logistic Regression

In [None]:
logistic_regression_oversampled_syntethic = LogisticRegression(solver='liblinear').fit(X_train_oversampled_syntethic, Y_train_oversampled_syntethic)

In [None]:
lr_pred_oversampled_syntethic = logistic_regression_oversampled_syntethic.predict(X_test)

##### Decision Tree

In [None]:
decision_tree_oversampled_syntethic = tree.DecisionTreeClassifier().fit(X_train_oversampled_syntethic, Y_train_oversampled_syntethic)

In [None]:
dt_pred_oversampled_syntethic = decision_tree_oversampled_syntethic.predict(X_test)

##### Random Forest

In [None]:
random_forest_oversampled_syntethic = RandomForestClassifier(n_estimators=10).fit(X_train_oversampled_syntethic, Y_train_oversampled_syntethic)

In [None]:
rf_pred_oversampled_syntethic = random_forest_oversampled_syntethic.predict(X_test)

##### GBM

In [None]:
gbm_oversampled_syntethic = xgb.XGBClassifier()
gbm_oversampled_syntethic.fit(X_train_oversampled_syntethic, Y_train_oversampled_syntethic)

In [None]:
gbm_pred_oversampled_syntethic = gbm_oversampled_syntethic.predict(X_test)

##### Saving Models

In [None]:
# Persist every SMOTE-trained model; the context manager closes each file handle
# as soon as its dump finishes instead of leaving it open.
for model_path, fitted_model in [
    ("logisticRegression_oversampled_syntethic.sav", logistic_regression_oversampled_syntethic),
    ("decisionTree_oversampled_syntethic.sav", decision_tree_oversampled_syntethic),
    ("randomForest_oversampled_syntethic.sav", random_forest_oversampled_syntethic),
    ("gbm_oversampled_syntethic.sav", gbm_oversampled_syntethic),
]:
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

##### Evaluating Models

##### Accuracy

In [None]:
print("Accuracy Logistic Regression:", str(accuracy_score(y_test, lr_pred_oversampled_syntethic)*100) + "%")
print("Accuracy Decision Tree:", str(accuracy_score(y_test, dt_pred_oversampled_syntethic)*100) + "%")
print("Accuracy Random Forest:", str(accuracy_score(y_test, rf_pred_oversampled_syntethic)*100) + "%")
print("Accuracy GBM:", str(accuracy_score(y_test, gbm_pred_oversampled_syntethic)*100) + "%")

##### F1 Score

In [None]:
print("F1 Score Logistic Regression:", str(f1_score(y_test, lr_pred_oversampled_syntethic)*100) + "%")
print("F1 Score Decision Tree:", str(f1_score(y_test, dt_pred_oversampled_syntethic)*100) + "%")
print("F1 Score Random Forest:", str(f1_score(y_test, rf_pred_oversampled_syntethic)*100) + "%")
print("F1 Score GBM:", str(f1_score(y_test, gbm_pred_oversampled_syntethic)*100) + "%")

##### Precision

In [None]:
print("Precision Logistic Regression:", str(precision_score(y_test, lr_pred_oversampled_syntethic)*100) + "%")
print("Precision Decision Tree:", str(precision_score(y_test, dt_pred_oversampled_syntethic)*100) + "%")
print("Precision Random Forest:", str(precision_score(y_test, rf_pred_oversampled_syntethic)*100) + "%")
print("Precision GBM:", str(precision_score(y_test, gbm_pred_oversampled_syntethic)*100) + "%")

##### Recall

In [None]:
print("Recall Logistic Regression:", str(recall_score(y_test, lr_pred_oversampled_syntethic)*100) + "%")
print("Recall Decision Tree:", str(recall_score(y_test, dt_pred_oversampled_syntethic)*100) + "%")
print("Recall Random Forest:", str(recall_score(y_test, rf_pred_oversampled_syntethic)*100) + "%")
print("Recall GBM:", str(recall_score(y_test, gbm_pred_oversampled_syntethic)*100) + "%")

In [None]:
print("Confusion Matrix Logistic Regression")
lr_cm_oversampled_syntethic = pd.DataFrame(confusion_matrix(y_test, lr_pred_oversampled_syntethic))
lr_cm_oversampled_syntethic.columns = ["Predicted Not Fraud","Predicted Fraud"]
lr_cm_oversampled_syntethic

In [None]:
print("Confusion Matrix Decision Tree")
dt_cm_oversampled_syntethic = pd.DataFrame(confusion_matrix(y_test, dt_pred_oversampled_syntethic))
dt_cm_oversampled_syntethic.columns = ["Predicted Not Fraud","Predicted Fraud"]
dt_cm_oversampled_syntethic
# We are missclassifing 6

In [None]:
print("Confusion Matrix Random Forest")
rf_cm_oversampled_syntethic = pd.DataFrame(confusion_matrix(y_test, rf_pred_oversampled_syntethic))
rf_cm_oversampled_syntethic.columns = ["Predicted Not Fraud","Predicted Fraud"]
rf_cm_oversampled_syntethic
# We are missclassifing 5

In [None]:
print("Confusion Matrix GBM")
gbm_cm_oversampled_syntethic = pd.DataFrame(confusion_matrix(y_test, gbm_pred_oversampled_syntethic))
gbm_cm_oversampled_syntethic.columns = ["Predicted Not Fraud","Predicted Fraud"]
gbm_cm_oversampled_syntethic
# We are missclassifing 6

##### ROC/AUC

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
lr_proba_oversampled_syntethic = logistic_regression_oversampled_syntethic.predict_proba(X_test)[:, 1]
fpr_lr_oversampled_syntethic, tpr_lr_oversampled_syntethic, _ = roc_curve(y_test, lr_proba_oversampled_syntethic)
auc_lr_oversampled_syntethic = roc_auc_score(y_test, lr_proba_oversampled_syntethic)
plt.plot(fpr_lr_oversampled_syntethic,tpr_lr_oversampled_syntethic,label="Logistic Regression, auc="+str(auc_lr_oversampled_syntethic))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
dt_proba_oversampled_syntethic = decision_tree_oversampled_syntethic.predict_proba(X_test)[:, 1]
fpr_dt_oversampled_syntethic, tpr_dt_oversampled_syntethic, _ = roc_curve(y_test, dt_proba_oversampled_syntethic)
auc_dt_oversampled_syntethic = roc_auc_score(y_test, dt_proba_oversampled_syntethic)
plt.plot(fpr_dt_oversampled_syntethic,tpr_dt_oversampled_syntethic,label="Decision Tree, auc="+str(auc_dt_oversampled_syntethic))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
rf_proba_oversampled_syntethic = random_forest_oversampled_syntethic.predict_proba(X_test)[:, 1]
fpr_rf_oversampled_syntethic, tpr_rf_oversampled_syntethic, _ = roc_curve(y_test, rf_proba_oversampled_syntethic)
auc_rf_oversampled_syntethic = roc_auc_score(y_test, rf_proba_oversampled_syntethic)
plt.plot(fpr_rf_oversampled_syntethic,tpr_rf_oversampled_syntethic,label="Random Forest, auc="+str(auc_rf_oversampled_syntethic))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

In [None]:
# ROC/AUC need continuous scores: hard 0/1 predictions reduce the curve to a single
# operating point, so we score with the predicted probability of the fraud class.
gbm_proba_oversampled_syntethic = gbm_oversampled_syntethic.predict_proba(X_test)[:, 1]
fpr_gbm_oversampled_syntethic, tpr_gbm_oversampled_syntethic, _ = roc_curve(y_test, gbm_proba_oversampled_syntethic)
auc_gbm_oversampled_syntethic = roc_auc_score(y_test, gbm_proba_oversampled_syntethic)
plt.plot(fpr_gbm_oversampled_syntethic,tpr_gbm_oversampled_syntethic,label="GBM, auc="+str(auc_gbm_oversampled_syntethic))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

### 3.1 Ensembling L different models with a resampled dataset

We are going to build N models (in this example N = 5, 10, 20 and 50), where every model is trained on all the training data from the minority class plus a different 1/N slice of the majority class. The final prediction is then the vote of all the models, each of them having the same weight.

In [None]:
def calculate_weighted_prediction(lista):
    """Combine the predictions of several models with an equally weighted vote.

    Every entry of ``lista`` carries the same weight ``1/len(lista)``. A sample is
    labelled 1 only when its averaged prediction is strictly greater than 0.5, so
    an exact tie resolves to 0.

    Parameters
    ----------
    lista : sequence of array-like
        One prediction array per model; all entries must have the same shape.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels with the shape of a single entry of ``lista``.

    Raises
    ------
    ValueError
        If ``lista`` is empty (there is nothing to average).
    """
    n_models = len(lista)
    if n_models == 0:
        raise ValueError("calculate_weighted_prediction needs at least one prediction array")
    # Sum the votes first and divide once: for 0/1 votes the sum is exact, so the
    # comparison with 0.5 does not depend on rounding accumulated over many additions.
    votes = np.sum(np.asarray(lista, dtype=float), axis=0)
    return (votes / n_models > 0.5).astype(int)

In [None]:
def ensembleMethod(N, training_set):
    """Train N under-sampled models per algorithm and print their combined test metrics.

    The normal (Class == 0) rows of ``training_set`` are cut into N consecutive
    slices of equal length (rows left over after the division are not used).
    Each slice is joined with every fraud (Class == 1) row and shuffled, and a
    Logistic Regression, Decision Tree, Random Forest and XGBoost classifier is
    fitted on it. The four lists of N fitted models are pickled under
    ``models_2080/``. Every model then predicts the notebook-level ``X_test``,
    the N predictions of each algorithm are combined with
    ``calculate_weighted_prediction`` and the scores against the notebook-level
    ``y_test`` are printed.

    Parameters
    ----------
    N : int
        Number of models per algorithm (and number of majority-class slices).
    training_set : pandas.DataFrame
        Training rows containing a ``Class`` column.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer.")

    # Split the training set by class so that different datasets can be assembled.
    fraud_samples = training_set[training_set.Class == 1]
    normal_samples = training_set[training_set.Class == 0]

    # Phase 1: build the N training subsets (all fraud rows + one slice of normal rows).
    new_length = len(normal_samples) // N
    X_train_ensemble = []
    Y_train_ensemble = []
    for i in range(N):
        normal_slice = normal_samples.iloc[i*new_length:(i+1)*new_length]
        # Shuffle the rows so the fraud rows are not all at the top.
        subset = pd.concat([fraud_samples, normal_slice]).sample(frac=1)
        # NOTE(review): assumes the first 29 columns are the features and that
        # `Class` is not among them - confirm against how training_set is built.
        X_train_ensemble.append(subset.iloc[:, 0:29])
        Y_train_ensemble.append(subset.Class)

    # Phase 2: fit one model of each kind on every subset.
    # Display name -> file-name stem used when pickling.
    file_stems = {
        "Logistic Regression": "logisticRegression",
        "Decision Tree": "decisionTree",
        "Random Forest": "randomForest",
        "GBM": "gbm",
    }
    model_names = list(file_stems)
    ensembles = {name: [] for name in model_names}
    for X_part, Y_part in zip(X_train_ensemble, Y_train_ensemble):
        ensembles["Logistic Regression"].append(
            LogisticRegression(solver='liblinear').fit(X_part, Y_part))
        ensembles["Decision Tree"].append(
            tree.DecisionTreeClassifier().fit(X_part, Y_part))
        ensembles["Random Forest"].append(
            RandomForestClassifier(n_estimators=10).fit(X_part, Y_part))
        gbm_model = xgb.XGBClassifier()
        gbm_model.fit(X_part, Y_part)
        ensembles["GBM"].append(gbm_model)

    # Saving models; `with` guarantees every file handle is closed again.
    for name, stem in file_stems.items():
        with open("models_2080/" + stem + "_ensemble1_" + str(N) + ".sav", 'wb') as model_file:
            pickle.dump(ensembles[name], model_file)

    # Predict with each model and combine the N predictions per algorithm.
    voted = {
        name: calculate_weighted_prediction([model.predict(X_test) for model in models])
        for name, models in ensembles.items()
    }

    # Results
    print("************************* RESULTS *************************")
    metrics = [
        ("Accuracy", accuracy_score),
        ("F1 Score", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    ]
    for metric_name, metric in metrics:
        for name in model_names:
            print(metric_name + " " + name + ":", str(metric(y_test, voted[name])*100) + "%")
        print("")
    for name in model_names:
        print("Confusion Matrix " + name + " Ensemble " + str(N))
        cm_ensemble = pd.DataFrame(confusion_matrix(y_test, voted[name]))
        cm_ensemble.columns = ["Predicted Not Fraud","Predicted Fraud"]
        print(cm_ensemble)
        # AUC is computed from the combined 0/1 predictions, not from scores.
        print("AUC:", roc_auc_score(y_test, voted[name]))
        print("")

In [None]:
# Sweep over the ensemble size N, one cell per value (N = 2 here, up to 1000 in the cells below).
ensembleMethod(2, training_set)

In [None]:
ensembleMethod(3, training_set)

In [None]:
ensembleMethod(5, training_set)

In [None]:
ensembleMethod(7, training_set)

In [None]:
ensembleMethod(10, training_set)

In [None]:
ensembleMethod(15, training_set)

In [None]:
ensembleMethod(20, training_set)

In [None]:
ensembleMethod(27, training_set)

In [None]:
ensembleMethod(35, training_set)

In [None]:
ensembleMethod(50, training_set)

In [None]:
ensembleMethod(100, training_set)

In [None]:
ensembleMethod(250, training_set)

In [None]:
ensembleMethod(500, training_set)

In [None]:
ensembleMethod(750, training_set)

In [None]:
ensembleMethod(1000, training_set)

### 3.2 Ensembling L different models with different ratios of a resampled dataset with an equal weight

We are going to build N models where every model has all the training data from the minority class (duplicated or triplicated for the 2:1 and 3:1 ratios) and a different minority:majority ratio taken from [(1,1),(1,1.5),(1,2),(1,3),(1,4),(1,5),(1,7.5),(1,10),(2,1),(3,1)]. For instance, with 5 models we could use the ratios 1:1, 1:10, 2:1, 1:5 and 3:1. The ratios are picked at random in order to validate the hypothesis (using all of them, or a distribution over them, would take too much time).
Then, the final prediction is decided by a majority vote among all the models, each of them having the same weight. 

In [None]:
# NOTE: this re-defines the function of the same name from section 3.1.
def calculate_weighted_prediction(lista, weights=None):
    """Combine several prediction arrays into one 0/1 prediction by (weighted) averaging.

    Parameters
    ----------
    lista : sequence of array-like
        One prediction array per model; all arrays must have the same length.
    weights : sequence of float, optional
        One weight per entry of ``lista``. ``None`` (the default) gives every
        model the same weight.

    Returns
    -------
    numpy.ndarray of int
        1 where the averaged prediction is strictly greater than 0.5, else 0
        (so an exact tie resolves to 0).

    Raises
    ------
    ValueError
        If ``lista`` is empty.
    """
    if len(lista) == 0:
        raise ValueError("lista must contain at least one prediction array.")
    # Average in a single step instead of accumulating 1/N terms in a loop, so the
    # mean of 0/1 votes is not distorted by accumulated rounding error.
    averaged = np.average(np.asarray(lista), axis=0, weights=weights)
    return (averaged > 0.5).astype(int)

In [None]:
def ensembleMethod_2(N, training_set):
    """Train N models per algorithm on randomly chosen class ratios and print their combined test metrics.

    N (rare, abundant) ratios are drawn at random from a fixed list. For each
    drawn ratio a training subset is assembled from the fraud (Class == 1) rows,
    repeated ``rare`` times, plus the next ``abundant * number_of_fraud_rows``
    normal (Class == 0) rows, so consecutive models use consecutive,
    non-overlapping slices of the normal rows. A Logistic Regression, Decision
    Tree, Random Forest and XGBoost classifier is fitted on every subset, the
    four lists of N models are pickled under ``models_2080/``, every model
    predicts the notebook-level ``X_test``, the N predictions per algorithm are
    combined with ``calculate_weighted_prediction`` and the scores against the
    notebook-level ``y_test`` are printed.

    Parameters
    ----------
    N : int
        Number of models per algorithm; must be between 2 and 10.
    training_set : pandas.DataFrame
        Training rows containing a ``Class`` column.

    Returns
    -------
    None, or an explanatory message string when ``N`` is outside 2..10
    (nothing is trained in that case).
    """
    if N > 10 or N < 2:
        return "Maximum N value is 10 and minimum is 2."

    # (rare/abundant)
    ratios = np.array(([1,1],[1,1.5],[1,2],[1,3],[1,4],[1,5],[1,7.5],[1,10],[2,1],[3,1]))
    # NOTE(review): random.choices draws WITH replacement, so a ratio can repeat;
    # use a without-replacement draw if N distinct ratios are intended.
    random_samples = random.choices(ratios, k=N)

    # Split the training set by class so that different datasets can be assembled.
    fraud_samples = training_set[training_set.Class == 1]
    normal_samples = training_set[training_set.Class == 0]

    # With this level of imbalance the slices are not expected to run past the end
    # of the majority class. If that ever happened, majority rows would have to be
    # reused across models (wrapping around like a wheel).

    # Phase 1: build the N training subsets.
    X_train_ensemble = []
    Y_train_ensemble = []
    start_majority_class = 0
    for rare_factor, abundant_factor in random_samples:
        # Use the randomly drawn ratio (previously the first N entries of `ratios`
        # were always used and the random draw was ignored).
        end_majority_class = start_majority_class + int(abundant_factor*len(fraud_samples))
        # Fraud rows are only ever used once, duplicated or triplicated.
        tmp_fraud_samples = pd.concat([fraud_samples] * int(rare_factor))
        subset = pd.concat(
            [tmp_fraud_samples, normal_samples.iloc[start_majority_class:end_majority_class]])
        # Shuffle the rows so the fraud rows are not all at the top.
        subset = subset.sample(frac=1)
        # NOTE(review): assumes the first 29 columns are the features and that
        # `Class` is not among them - confirm against how training_set is built.
        X_train_ensemble.append(subset.iloc[:, 0:29])
        Y_train_ensemble.append(subset.Class)
        # The slice end is exclusive, so the next slice starts exactly there
        # (starting at end + 1 skipped one normal row per model).
        start_majority_class = end_majority_class

    # Phase 2: fit one model of each kind on every subset.
    # Display name -> file-name stem used when pickling.
    file_stems = {
        "Logistic Regression": "logisticRegression",
        "Decision Tree": "decisionTree",
        "Random Forest": "randomForest",
        "GBM": "gbm",
    }
    model_names = list(file_stems)
    ensembles = {name: [] for name in model_names}
    for X_part, Y_part in zip(X_train_ensemble, Y_train_ensemble):
        ensembles["Logistic Regression"].append(
            LogisticRegression(solver='liblinear').fit(X_part, Y_part))
        ensembles["Decision Tree"].append(
            tree.DecisionTreeClassifier().fit(X_part, Y_part))
        ensembles["Random Forest"].append(
            RandomForestClassifier(n_estimators=10).fit(X_part, Y_part))
        gbm_model = xgb.XGBClassifier()
        gbm_model.fit(X_part, Y_part)
        ensembles["GBM"].append(gbm_model)

    # Saving models; `with` guarantees every file handle is closed again.
    for name, stem in file_stems.items():
        with open("models_2080/" + stem + "_ensemble2_" + str(N) + ".sav", 'wb') as model_file:
            pickle.dump(ensembles[name], model_file)

    # Predict with each model and combine the N predictions per algorithm.
    voted = {
        name: calculate_weighted_prediction([model.predict(X_test) for model in models])
        for name, models in ensembles.items()
    }

    # Results
    print("************************* RESULTS *************************")
    metrics = [
        ("Accuracy", accuracy_score),
        ("F1 Score", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    ]
    for metric_name, metric in metrics:
        for name in model_names:
            print(metric_name + " " + name + ":", str(metric(y_test, voted[name])*100) + "%")
        print("")
    for name in model_names:
        print("Confusion Matrix " + name + " Ensemble2 " + str(N))
        cm_ensemble = pd.DataFrame(confusion_matrix(y_test, voted[name]))
        cm_ensemble.columns = ["Predicted Not Fraud","Predicted Fraud"]
        print(cm_ensemble)
        # AUC is computed from the combined 0/1 predictions, not from scores.
        print("AUC:", roc_auc_score(y_test, voted[name]))
        print("")

In [None]:
# Sweep over the ensemble size N, one cell per value (N = 2 here, up to the maximum of 10 in the cells below).
ensembleMethod_2(2, training_set)

In [None]:
ensembleMethod_2(3, training_set)

In [None]:
ensembleMethod_2(4, training_set)

In [None]:
ensembleMethod_2(5, training_set)

In [None]:
ensembleMethod_2(6, training_set)

In [None]:
ensembleMethod_2(7, training_set)

In [None]:
ensembleMethod_2(8, training_set)

In [None]:
ensembleMethod_2(9, training_set)

In [None]:
ensembleMethod_2(10, training_set)

### 3.3 Ensembling L different models with different ratios of a resampled dataset with different weights (Suggestion)

In [None]:
# Maybe this can be done, although we have only validated our hypothesis with equal weights. 

### 4 Clustering the majority class into R groups and using their centers (medoids) in the training set

We are going to cluster the majority class and use the cluster medoids as its training samples. The number of medoids is a multiple R of the number of minority-class samples: R = 1 gives as many medoids as minority samples (like undersampling, but with medoids), and R goes up to 500. 
(1×, 2×, 3×, ... 500× the number of minority-class samples)

This design was chosen because of computational cost: running KMedoids on a large dataset consumes a lot of resources.

In [None]:
def clusteringMethod(R, training_set):
    """Replace the majority class by KMedoids medoids, train four models and print their test metrics.

    The normal (Class == 0) rows of ``training_set`` are cut into as many
    consecutive chunks as there are fraud (Class == 1) rows. Every chunk is
    clustered with KMedoids into R clusters and only the R medoids are kept.
    The medoids are joined with the fraud rows and shuffled, and a Logistic
    Regression, Decision Tree, Random Forest and XGBoost classifier is fitted on
    the result. The fitted models are pickled under ``models_2080/`` and their
    predictions for the notebook-level ``X_test`` are scored against the
    notebook-level ``y_test``; the scores are printed.

    Parameters
    ----------
    R : int
        Number of clusters (medoids) per chunk; must be between 1 and 500.
    training_set : pandas.DataFrame
        Training rows containing a ``Class`` column.

    Returns
    -------
    None, or an explanatory message string when ``R`` is outside 1..500
    (nothing is trained in that case).
    """
    if R > 500 or R < 1:
        return "R must be between 1 and 500."

    fraud_samples = training_set[training_set.Class == 1]
    normal_samples = training_set[training_set.Class == 0]

    # One chunk of normal rows per fraud row; each chunk contributes R medoids.
    size = int(len(normal_samples)/len(fraud_samples))
    medoid_frames = []
    for chunk_idx in range(len(fraud_samples)):
        start = chunk_idx * size
        # The slice end is exclusive, so `start + size` keeps all `size` rows
        # (the former `start + size - 1` dropped the last row of every chunk).
        chunk = normal_samples.iloc[start:start + size]
        kmedoids = KMedoids(n_clusters=R, random_state=777).fit(chunk)
        medoid_frames.append(
            pd.DataFrame(kmedoids.cluster_centers_, columns=list(normal_samples)))
    # Build the medoid frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    new_normal_samples_medoid = pd.concat(medoid_frames, ignore_index=True)

    # All medoids are available; join them with the fraud rows and shuffle.
    training_set_cluster = pd.concat([fraud_samples, new_normal_samples_medoid]).sample(frac=1)
    # NOTE(review): assumes the first 29 columns are the features and that
    # `Class` is not among them - confirm against how training_set is built.
    X_train_clustering = training_set_cluster.iloc[:,0:29]
    Y_train_clustering = training_set_cluster.Class

    # (display name, file-name stem, estimator), in training order.
    # NOTE(review): only the logistic-regression stem ends in "R"; kept as is so
    # the written file names do not change - confirm whether that is intended.
    estimators = [
        ("Logistic Regression", "logisticRegression_clusteredR", LogisticRegression(solver='liblinear')),
        ("Decision Tree", "decisionTree_clustered", tree.DecisionTreeClassifier()),
        ("Random Forest", "randomForest_clustered", RandomForestClassifier(n_estimators=10)),
        ("GBM", "gbm_clustered", xgb.XGBClassifier()),
    ]

    # Train every model and predict the test set.
    predictions = {}
    for name, _, estimator in estimators:
        estimator.fit(X_train_clustering, Y_train_clustering)
        predictions[name] = estimator.predict(X_test)

    # Saving models; `with` guarantees every file handle is closed again.
    for _, stem, estimator in estimators:
        with open("models_2080/" + stem + str(R) + ".sav", 'wb') as model_file:
            pickle.dump(estimator, model_file)

    # Results
    print("************************* RESULTS *************************")
    metrics = [
        ("Accuracy", accuracy_score),
        ("F1 Score", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    ]
    for metric_name, metric in metrics:
        for name, _, _ in estimators:
            print(metric_name + " " + name + ":", str(metric(y_test, predictions[name])*100) + "%")
        print("")
    for name, _, _ in estimators:
        print("Confusion Matrix " + name + " Clustering " + str(R))
        cm = pd.DataFrame(confusion_matrix(y_test, predictions[name]))
        cm.columns = ["Predicted Not Fraud","Predicted Fraud"]
        print(cm)
        # AUC is computed from the predicted labels, not from scores.
        print("AUC:", roc_auc_score(y_test, predictions[name]))
        print("")

In [None]:
# Run the clustering experiment for R = 1 .. 10.
for R in range(1, 11):
    clusteringMethod(R, training_set)

In [None]:
# Run the clustering experiment for the larger R values, in increasing order.
for R in (15, 25, 50, 75, 100, 200, 300, 400, 500):
    clusteringMethod(R, training_set)