# Un-Supervised Learning

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import scipy as sci
import seaborn as sns
import sklearn as skl

In [None]:
data = pd.read_csv("creditcard.csv")

In [None]:
data.describe()

In [None]:
data.columns

In [None]:
# 0 means normal transaction
# 1 means fraudulent transaction
data = data.sample(frac = 0.1, random_state = 1)
data.shape

In [None]:
data.hist(figsize = (20, 20))

In [None]:
fraud = data[data["Class"] == 1]
valid = data[data["Class"] == 0]

outlier_fraction = len(fraud)/float(len(valid))
print(outlier_fraction)
print(len(fraud))
print(len(valid))

In [13]:
corr_matrix = data.corr()
corr_matrix["Class"].sort_values(ascending = False)

Class     1.000000
V11       0.140513
V4        0.122631
V2        0.069598
V21       0.037570
V19       0.025784
V8        0.024896
V27       0.024421
V28       0.014344
Amount    0.012804
V25       0.011958
V20       0.005640
V22      -0.001683
V26      -0.001884
V13      -0.003380
V24      -0.003727
V15      -0.003760
Time     -0.005087
V23      -0.005856
V6       -0.035085
V5       -0.073519
V1       -0.079820
V9       -0.079962
V18      -0.098311
V7       -0.134247
V3       -0.160051
V16      -0.175216
V10      -0.191189
V12      -0.244444
V17      -0.293225
V14      -0.296297
Name: Class, dtype: float64

In [14]:
columns = data.columns.tolist()
#filter the columns to remove data we don't want
columns = [c for c in columns if c not in ["Class"]]
#store the variable we'll be predicting
target = "Class"
X = data[columns]
y = data[target] 

print(X.shape)
print(y.shape)

(28481, 30)
(28481,)


In [21]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor  #localoutlierfactor is an un-supervised method

In [25]:
state = 1
classifiers = {
    "Isolation Forest" : IsolationForest(max_samples = len(X),contamination = outlier_fraction, 
                                        random_state = state),
    "local outlier factor" : LocalOutlierFactor(n_neighbors = 20, contamination = outlier_fraction,
                                               )
}

In [31]:
 #fit the model
n_outliers = len(fraud)
for i,(clf_name, clf) in enumerate(classifiers.items()):
    if clf_name == "local outlier factor":
        y_predict = clf.fit_predict(X)
        scores_predict = clf.negative_outlier_factor_
        
    else:
        clf.fit(X)
        scores_predict = clf.decision_function(X)
        y_predict = clf.predict(X)
        
# reshape the predictions to 0 to valid, 1 to fraud
    y_predict[y_predict == 1] = 0
    y_predict[y_predict == -1] = 1
    n_errors = (y_predict != y).sum()

#running classification metrics
    print(clf_name, n_errors)
    print(accuracy_score(y, y_predict))
    print(classification_report(y, y_predict))   #comparing "y" to "y_predict"

Isolation Forest 71
0.99750711000316
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.28      0.29      0.28        49

    accuracy                           1.00     28481
   macro avg       0.64      0.64      0.64     28481
weighted avg       1.00      1.00      1.00     28481

local outlier factor 97
0.9965942207085425
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.02      0.02      0.02        49

    accuracy                           1.00     28481
   macro avg       0.51      0.51      0.51     28481
weighted avg       1.00      1.00      1.00     28481

