In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. Import libraries:

In [25]:
import time
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import collections
from collections import Counter
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

### 2. Import data:

In [78]:
dataset_path = "/content/drive/MyDrive/UIT Projects/Khai thác dữ liệu/source/datasets/creditcard_dataset.csv"

In [82]:
df = pd.read_csv(dataset_path)
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [83]:
df.keys()

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [84]:
class_value_groups = df['Class'].value_counts()
number_of_rows = len(df)

print('No Frauds', round(class_value_groups[0]/number_of_rows * 100, 2), '% of the dataset')
print('Frauds', round(class_value_groups[1]/number_of_rows * 100, 2), '% of the dataset')

No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset


### 3. Pre-processing data:

#### 3.1. Scaling and Distributing:

Scale columns: Time, Amount

In [85]:
rob_scaler = RobustScaler()

scaled_amount = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
scaled_time = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))

df.drop(['Time','Amount'], axis=1, inplace=True)

df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

df.head(5)

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


Split data to train and test

In [86]:
X = df.drop('Class', axis=1)
y = df['Class']

sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

Train: [ 30473  30496  31002 ... 284804 284805 284806] Test: [    0     1     2 ... 57017 57018 57019]
Train: [     0      1      2 ... 284804 284805 284806] Test: [ 30473  30496  31002 ... 113964 113965 113966]
Train: [     0      1      2 ... 284804 284805 284806] Test: [ 81609  82400  83053 ... 170946 170947 170948]
Train: [     0      1      2 ... 284804 284805 284806] Test: [150654 150660 150661 ... 227866 227867 227868]
Train: [     0      1      2 ... 227866 227867 227868] Test: [212516 212644 213092 ... 284804 284805 284806]
----------------------------------------------------------------------------------------------------
Label Distributions: 

[0.99827076 0.00172924]
[0.99827952 0.00172048]


#### 3.2. Random Under-Sampling

In [87]:
# randomize data
df = df.sample(frac=1)

In [88]:
fraud_df = df.loc[df['Class'] == 1]
len_of_fraud = fraud_df.shape[0]

non_fraud_df = df.loc[df['Class'] == 0][:len_of_fraud]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

new_df = normal_distributed_df.sample(frac=1, random_state=42)
new_df.head(5)

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
51292,-0.167819,-0.467757,1.194213,0.418758,0.697577,1.175778,-0.470077,-1.056185,0.195679,-0.254059,...,-0.072757,-0.19311,-0.528938,0.125077,0.68961,0.357252,-0.654699,0.030478,0.036326,0
124087,-0.29344,-0.088359,1.11856,1.291858,-1.298805,2.135772,0.772204,-1.147291,0.390578,-0.107072,...,-0.023576,-0.346374,-0.663588,-0.102326,0.017911,0.650302,-0.332366,0.105949,0.128124,1
36562,-0.090827,-0.541606,1.370664,-0.582927,0.071668,-0.751498,-0.88819,-0.65293,-0.500641,-0.05136,...,-0.519505,-0.367228,-0.682153,3.1e-05,0.006577,0.2453,1.046648,-0.079435,-0.010155,0
261473,-0.238944,0.885137,-2.34934,1.512604,-2.647497,1.753792,0.406328,-2.188494,-0.686935,-0.547984,...,-0.093421,-0.088519,-0.595178,0.258148,0.061901,-0.35418,-1.152671,-0.736073,0.733703,1
10630,10.401174,-0.785418,-5.187878,6.967709,-13.510931,8.617895,-11.214422,0.672248,-9.462533,5.328704,...,-0.623737,2.086083,0.76019,0.716806,-0.646743,-1.617043,0.172347,0.626647,-0.169726,1


- V14 Removing Outliers (Highest Negative Correlated with Labels)

In [89]:
v14_fraud = new_df['V14'].loc[new_df['Class'] == 1].values
q25, q75 = np.percentile(v14_fraud, 25), np.percentile(v14_fraud, 75)
print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
v14_iqr = q75 - q25
print('iqr: {}'.format(v14_iqr))

v14_cut_off = v14_iqr * 1.5
v14_lower, v14_upper = q25 - v14_cut_off, q75 + v14_cut_off
print('Cut Off: {}'.format(v14_cut_off))
print('V14 Lower: {}'.format(v14_lower))
print('V14 Upper: {}'.format(v14_upper))

outliers = [x for x in v14_fraud if x < v14_lower or x > v14_upper]
print('Feature V14 Outliers for Fraud Cases: {}'.format(len(outliers)))
print('V10 outliers:{}'.format(outliers))

new_df = new_df.drop(new_df[(new_df['V14'] > v14_upper) | (new_df['V14'] < v14_lower)].index)
print('----' * 44)

Quartile 25: -9.692722964972386 | Quartile 75: -4.282820849486865
iqr: 5.409902115485521
Cut Off: 8.114853173228282
V14 Lower: -17.807576138200666
V14 Upper: 3.8320323237414167
Feature V14 Outliers for Fraud Cases: 4
V10 outliers:[-18.8220867423816, -18.0499976898594, -19.2143254902614, -18.4937733551053]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


- V12 removing outliers from fraud transactions

In [90]:
v12_fraud = new_df['V12'].loc[new_df['Class'] == 1].values
q25, q75 = np.percentile(v12_fraud, 25), np.percentile(v12_fraud, 75)
v12_iqr = q75 - q25

v12_cut_off = v12_iqr * 1.5
v12_lower, v12_upper = q25 - v12_cut_off, q75 + v12_cut_off
print('V12 Lower: {}'.format(v12_lower))
print('V12 Upper: {}'.format(v12_upper))
outliers = [x for x in v12_fraud if x < v12_lower or x > v12_upper]
print('V12 outliers: {}'.format(outliers))
print('Feature V12 Outliers for Fraud Cases: {}'.format(len(outliers)))
new_df = new_df.drop(new_df[(new_df['V12'] > v12_upper) | (new_df['V12'] < v12_lower)].index)
print('Number of Instances after outliers removal: {}'.format(len(new_df)))
print('----' * 44)

V12 Lower: -17.3430371579634
V12 Upper: 5.776973384895937
V12 outliers: [-18.6837146333443, -18.5536970096458, -18.0475965708216, -18.4311310279993]
Feature V12 Outliers for Fraud Cases: 4
Number of Instances after outliers removal: 976
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


- Removing outliers V10 Feature

In [91]:
v10_fraud = new_df['V10'].loc[new_df['Class'] == 1].values
q25, q75 = np.percentile(v10_fraud, 25), np.percentile(v10_fraud, 75)
v10_iqr = q75 - q25

v10_cut_off = v10_iqr * 1.5
v10_lower, v10_upper = q25 - v10_cut_off, q75 + v10_cut_off
print('V10 Lower: {}'.format(v10_lower))
print('V10 Upper: {}'.format(v10_upper))
outliers = [x for x in v10_fraud if x < v10_lower or x > v10_upper]
print('V10 outliers: {}'.format(outliers))
print('Feature V10 Outliers for Fraud Cases: {}'.format(len(outliers)))
new_df = new_df.drop(new_df[(new_df['V10'] > v10_upper) | (new_df['V10'] < v10_lower)].index)
print('Number of Instances after outliers removal: {}'.format(len(new_df)))

V10 Lower: -14.89885463232024
V10 Upper: 4.92033495834214
V10 outliers: [-15.2399619587112, -19.836148851696, -18.2711681738888, -16.6011969664137, -22.1870885620007, -15.3460988468775, -22.1870885620007, -20.9491915543611, -16.6496281595399, -24.5882624372475, -14.9246547735487, -15.1237521803455, -23.2282548357516, -22.1870885620007, -15.1241628144947, -18.9132433348732, -15.5637913387301, -24.4031849699728, -16.7460441053944, -16.2556117491401, -15.2318333653018, -17.1415136412892, -22.1870885620007, -14.9246547735487, -16.3035376590131, -15.5637913387301, -15.2399619587112]
Feature V10 Outliers for Fraud Cases: 27
Number of Instances after outliers removal: 946


### 4. Training data:

In [24]:
X = new_df.drop('Class', axis=1)
y = new_df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

#### 4.1 RF simple training:

- Training with default params:

In [19]:
# train
model = RandomForestClassifier()
model.fit(X_train, y_train)

# evaluating
training_score = cross_val_score(model, X_train, y_train, cv=5)
print("RandomForest training accuracy score: ", round(training_score.mean(), 2) * 100)

RandomForest training accuracy score:  93.0


- Use GridSearch to find best params and Train again:

In [21]:
tree_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": list(range(2, 10, 1)),
    "min_samples_leaf": list(range(2, 10, 1))
}
param_grid = {
    'n_estimators': [10, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid)
grid_rf.fit(X_train, y_train)

# tree best estimator
rf_best = grid_rf.best_estimator_
print('Best params:', rf_best)

tree_score = cross_val_score(rf_best, X_train, y_train, cv=5)
print('RandomForest cross validation score', round(tree_score.mean() * 100, 2).astype(str) + '%')

Best estimator: RandomForestClassifier(criterion='entropy', max_depth=6, max_features='log2',
                       n_estimators=10)
RandomForest cross validation score 93.4%


In [22]:
grid_rf.best_params_

{'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'log2',
 'n_estimators': 10}

#### 4.2. Prepare for GA algorithm:

In [92]:
POPULATION_SIZE = 100

ALL_GENEs = [
    'scaled_amount', 'scaled_time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
    'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
    'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'
]

In [93]:
class Individual(object):
    def __init__(self, chromosome):
        self.chromosome = chromosome
        self.fitness = self.calculate_fitness()

    @classmethod
    def create_random_gene(self):
        global ALL_GENEs
        gene = random.choice(ALL_GENEs)
        return gene

    @classmethod
    def create_random_chromosome(self, chro_len):
        global TARGET
        chromosome_len = chro_len
        chromosome = []
        for _ in range(chromosome_len):
            random_gene = self.create_random_gene()
            chromosome.append(random_gene)
        return chromosome

    def crossover(self, individual_2):
        child_chromosome = []
        for gene_of_ind1, gene_of_ind2 in zip(self.chromosome, individual_2.chromosome):
            prob = random.random()

            if prob < 0.45:
                child_chromosome.append(gene_of_ind1)
            elif prob < 0.9:
                child_chromosome.append(gene_of_ind2)
            else:
                random_gene = self.create_random_gene()
                child_chromosome.append(random_gene)
        child = Individual(child_chromosome)
        return child

    def calculate_fitness(self):
        global TARGET
        fitness = 0
        rf = RandomForestClassifier(criterion= 'entropy', max_depth=6, max_features='log2', n_estimators= 10)

        columns_to_train = self.chromosome
        X = new_df[columns_to_train].values
        Y = new_df['Class'].values

        _, F, _, y = train_test_split(X, Y, test_size=0.2)
        F_train, F_test, y_train, y_test = train_test_split(F, y, test_size=0.2)

        rf.fit(F_train, y_train)
        # evaluating
        y_pred_test = rf.predict(F_test)
        test_score = accuracy_score(y_test, y_pred_test)

        fitness = test_score
        return fitness

#### Find best columns with GA algorithms:

#### 4.2. Training with GA algorithm:

In [None]:
max_generation = 300
current_generation = 1

population = []

# initialize population
for _ in range(POPULATION_SIZE):
    random_chromosome = Individual.create_random_chromosome(chro_len=10)
    _individual = Individual(random_chromosome)
    population.append(_individual)

# genetic process
while current_generation < max_generation:
    population = sorted(population, key = lambda x:x.fitness, reverse=True)

    next_generation_of_population = []

    _10_percent_len = int(10*POPULATION_SIZE/100)
    next_generation_of_population.extend(population[:_10_percent_len])

    _50_percent_len = int(POPULATION_SIZE/2)
    _90_percent_len = int(90*POPULATION_SIZE/100)
    for _ in range(_90_percent_len):
        # crossover top 50% individuals
        parent1 = random.choice(population[:_50_percent_len])
        parent2 = random.choice(population[:_50_percent_len])
        child = parent1.crossover(parent2)

        next_generation_of_population.append(child)

    population = next_generation_of_population
    current_generation += 1

    print("Current Generation: {0} -- Top 1 String: {1} -- Fitness: {2}".format(
                            current_generation,
                            "".join(population[0].chromosome),
                            population[0].fitness))

### 5. Evaluating: