### 1. Import libraries:

In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive', force_remount=True)

In [None]:
# Imported Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import time
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections

from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold

### 2. Import data:

In [None]:
# Location of the credit-card CSV on the mounted Google Drive.
# The drive is mounted at '/content/gdrive' at the top of this notebook, so the
# path has to start with that mount point; the former '/content/drive/...'
# prefix pointed at a directory that is never mounted (FileNotFoundError).
dataset_path = "/content/gdrive/MyDrive/UIT Projects/Khai thác dữ liệu/source/creditcard_dataset.csv"

In [None]:
# Read the CSV at `dataset_path` into a DataFrame and preview the first 5 rows.
df = pd.read_csv(filepath_or_buffer=dataset_path)
df.head(n=5)

FileNotFoundError: ignored

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

In [None]:
# Report what percentage of the rows belongs to each value of 'Class'.
class_value_groups = df['Class'].value_counts()
number_of_rows = len(df)

# (printed label, value of 'Class') pairs, in the order they are reported.
for caption, class_value in (('No Frauds', 0), ('Frauds', 1)):
    percentage = class_value_groups[class_value] / number_of_rows * 100
    print(caption, round(percentage, 2), '% of the dataset')

### 3. Pre-processing data:

#### 3.1. Scaling and Distributing:

Scale the `Time` and `Amount` columns:

In [None]:
# Replace the raw 'Amount' and 'Time' columns with RobustScaler-transformed
# versions, placed as the first two columns of the frame.
rob_scaler = RobustScaler()

# The same scaler object is re-fitted for each column, so after this cell it
# holds the statistics of 'Time' (the last column it was fitted on).
scaled_amount = rob_scaler.fit_transform(df[['Amount']].to_numpy())
scaled_time = rob_scaler.fit_transform(df[['Time']].to_numpy())

df.drop(columns=['Time', 'Amount'], inplace=True)

df.insert(loc=0, column='scaled_amount', value=scaled_amount)
df.insert(loc=1, column='scaled_time', value=scaled_time)

df.head(n=5)

Split the data into train and test sets

In [None]:
# Separate features from the label.
X = df.drop(columns='Class')
y = df['Class']

sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Every fold is printed, but the variables are overwritten on each iteration,
# so only the split produced by the LAST fold is kept after the loop.
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain = X.iloc[train_index]
    original_Xtest = X.iloc[test_index]
    original_ytrain = y.iloc[train_index]
    original_ytest = y.iloc[test_index]

# Work with plain NumPy arrays from here on.
original_Xtrain = original_Xtrain.to_numpy()
original_Xtest = original_Xtest.to_numpy()
original_ytrain = original_ytrain.to_numpy()
original_ytest = original_ytest.to_numpy()

# Check that the label proportions are similar in the train and test parts.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label / len(original_ytrain))
print(test_counts_label / len(original_ytest))

### 4. Training data:

#### 4.1 Simple training with SMOTE:

In [None]:
# Show how many samples ended up in the train and test parts.
print(f'Length of X (train): {len(original_Xtrain)} | Length of y (train): {len(original_ytrain)}')
print(f'Length of X (test): {len(original_Xtest)} | Length of y (test): {len(original_ytest)}')

In [None]:
# Cross-validate a SMOTE + randomized-search decision tree on the training part
# and report the mean of each metric over the folds.

# Per-fold scores; averaged after the loop.
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

# Search space sampled by RandomizedSearchCV for the decision tree.
tree_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": list(range(2, 10, 1)),
    "min_samples_leaf": list(range(2, 10, 1))
}
rand_grid_tree = RandomizedSearchCV(DecisionTreeClassifier(), tree_params)

# SMOTE is a step of the pipeline, so resampling happens inside each fit() call
# on the training fold only -- not on the whole data set before splitting.
# The pipeline does not depend on the fold, so it is built once and re-fitted.
pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_grid_tree)

for train, test in sss.split(original_Xtrain, original_ytrain):
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])

    # Best tree found by the randomized search on this fold.
    best_est = rand_grid_tree.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])

    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    # NOTE(review): AUC is computed from hard class predictions, not from
    # predicted probabilities -- confirm this is intended.
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
# The AUC scores were collected per fold but never reported before.
print("auc: {}".format(np.mean(auc_lst)))
print('---' * 45)

#### 4.2. Training with a genetic algorithm (GA):





In [None]:
import random  # needed by Individual; not imported in the notebook's import cell


class Individual(object):
    """One candidate solution (a feature subset) of the genetic algorithm.

    A chromosome is a list of genes. When the fitness is computed, the
    chromosome is used as the list of column labels selected from ``new_df``.

    Relies on two module-level names that are defined outside this class:
    ``ALL_GENEs`` (the pool genes are drawn from) and ``new_df`` (the
    DataFrame holding the features and a 'Class' column).
    """

    def __init__(self, chromosome):
        """Store ``chromosome`` and immediately compute its fitness."""
        self.chromosome = chromosome
        self.fitness = self.calculate_fitness()

    @classmethod
    def create_random_gene(cls):
        """Return one gene picked at random from the global ``ALL_GENEs``."""
        return random.choice(ALL_GENEs)

    @classmethod
    def create_random_chromosome(cls, chro_len):
        """Return a list of ``chro_len`` independently drawn random genes.

        Genes are drawn with replacement, so the result may contain repeats.
        """
        return [cls.create_random_gene() for _ in range(chro_len)]

    def crossover(self, individual_2):
        """Mate with ``individual_2`` and return the resulting child.

        For every gene position the child takes the gene of this individual
        (probability 0.45), the gene of ``individual_2`` (probability 0.45),
        or a fresh random gene as a mutation (probability 0.10).
        """
        child_chromosome = []
        for gene_of_ind1, gene_of_ind2 in zip(self.chromosome, individual_2.chromosome):
            prob = random.random()

            if prob < 0.45:
                child_chromosome.append(gene_of_ind1)
            elif prob < 0.9:
                child_chromosome.append(gene_of_ind2)
            else:
                # Mutation: ignore both parents for this position.
                child_chromosome.append(self.create_random_gene())
        return Individual(child_chromosome)

    def calculate_fitness(self):
        """Return the accuracy of a small random forest on the chosen columns.

        The forest is trained and scored on a random subsample of ``new_df``,
        so repeated calls on the same chromosome can give different values.
        """
        rf = RandomForestClassifier(criterion='entropy', max_depth=6, max_features='log2', n_estimators=10)

        columns_to_train = self.chromosome
        X = new_df[columns_to_train].values
        Y = new_df['Class'].values

        # Keep only the 20% "test" portion of a first split as the working
        # subset, then split that subset 80/20 into train and evaluation data.
        _, F, _, y = train_test_split(X, Y, test_size=0.2)
        F_train, F_test, y_train, y_test = train_test_split(F, y, test_size=0.2)

        rf.fit(F_train, y_train)

        # Fitness is the accuracy on the held-out part of the subset.
        y_pred_test = rf.predict(F_test)
        return accuracy_score(y_test, y_pred_test)

### 5. Evaluating: