## Notes:
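- In Chapter 2, we used supervised machine learning. Here, we use unsupervised ML techniques for fraud detection.
- Supervised models need labeled examples of fraud, and in practice there are rarely enough such labels available.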
- Moreover, fraud patterns change over time, so supervised systems that are built using fraud labels become stale, capturing historical patterns of fraud but failing to adapt to newly emerging patterns.
- For these reasons (the lack of sufficient labels and the need to adapt to newly emerging patterns of fraud as quickly as possible), unsupervised learning shines.

**We'll be using the same credit card transactions dataset as in Chapter 2.**

# 1. Import the necessary libraries

In [4]:
# Main: standard library plus the core numerical / tabular stack
import os
import numpy as np
import pandas as pd

# Allow up to 50 columns to be shown when a DataFrame is displayed
pd.set_option("display.max_columns", 50)

# Data Visualization
import seaborn as sns
# seaborn's current colour palette; presumably reused by later plotting cells — verify
color = sns.color_palette()

import matplotlib.pyplot as plt
# Apply the "fivethirtyeight" style sheet to all matplotlib figures
plt.style.use("fivethirtyeight")

# Data-Preparation
from scipy import stats
from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection

# ML-Algorithms
# NOTE(review): these supervised estimators are not used in the visible cells of this
# unsupervised-learning notebook — confirm whether later cells need them.
import lightgbm as lgb
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Simple confirmation that every import above succeeded
print("Libraries imported!")

Libraries imported!


# 2. Load the data

In [6]:
# Read the credit card CSV stored under the Chapter 2 input folder into a DataFrame.
credit_data = pd.read_csv(filepath_or_buffer="../Ch-2/input/creditcard.csv.gz")

# Report the size of the loaded frame, then preview its first five rows.
print(f"The Dataset has {credit_data.shape[0]} rows and {credit_data.shape[1]} columns")
display(credit_data.head(n=5))

The Dataset has 284807 rows and 31 columns


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# 3. Data preparation

In [11]:
# Feature matrix: every column except the target ("Class") and "Time".
# `drop` returns a new DataFrame, so `credit_data` itself is left untouched.
credit_dataX = credit_data.drop(columns=["Class", "Time"])

# Target vector: an independent copy of the "Class" column.
credit_dataY = credit_data.loc[:, "Class"].copy()

In [13]:
# All remaining feature columns are standardized.
featuresToScale = credit_dataX.columns
print(featuresToScale, len(featuresToScale))  # Class and Time removed

## Standardization
# Learn per-column mean / standard deviation, then rescale the features in place.
# NOTE(review): the scaler is fitted on the full feature frame, i.e. before the
# train/test split performed in a later cell, so test rows influence the scaling
# statistics — confirm this is acceptable for the evaluation.
std_scl = preprocessing.StandardScaler().fit(credit_dataX)
credit_dataX.loc[:, featuresToScale] = std_scl.transform(credit_dataX)
print("Standardization completed...")

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object') 29
Standardization completed...


In [14]:
# Hold out 33% of the rows as a test set. The split is shuffled with a fixed seed
# and stratified on the target so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    credit_dataX,
    credit_dataY,
    test_size=0.33,
    stratify=credit_dataY,
    shuffle=True,
    random_state=42,
)

# Sanity check: shapes of the feature frames and target series for each part.
print(f"TRAINING INFO: {X_train.shape} {y_train.shape}")
print(f"TEST INFO: {X_test.shape} {y_test.shape}")

TRAINING INFO: (190820, 29) (190820,)
TEST INFO: (93987, 29) (93987,)


# Define Anomaly Score Function