In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
import gif
import time
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_curve, auc, accuracy_score, classification_report
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
from sklearn.utils import resample

In [126]:
# Read data and preprocessing
df = pd.read_csv('creditcard.csv')
df.dropna(inplace= True)
df.head()


b=df.Class.value_counts()
fraction = b[1]/b[0]

print (f'actual fraction of fraud to valid cases= {round(fraction,3)}')

# downsample majority and upsample minority in train set\

df_train,df_test = train_test_split(df,test_size=0.3,random_state=42) # split dataset into train test df and operate resampling only in train
majority_class = df_train[df_train.Class==0]
minority_class = df_train[df_train.Class==1]

df_majority_ds = resample(majority_class, replace=False, n_samples=int(len(minority_class)*20), random_state=42) # a fraction of class 0 only to help aid the imbalance

df_minority_us = resample(minority_class, replace=True, n_samples=len(minority_class)*2, random_state=42) 


df_s = pd.concat([df_majority_ds,df_minority_us]).sample(frac=1)

a = df_s.Class.value_counts()
newfrac= a[1]/a[0]

print ('new fraction after sampling=',newfrac)

actual fraction of fraud to valid cases= 0.002
new fraction after sampling= 0.1


# Visualization 

In [127]:
def make_gif():
    df["Class"] = df["Class"].astype(str)
    columns = df.columns.tolist()
    mycols = [c for c in columns if c not in ["Class","Amount"]]



    gif.options.matplotlib["dpi"] = 300
    @gif.frame
    def get_plot(colname):
        fig, ax = plt.subplots(figsize=(8,6))
 
        sns.scatterplot( data=df, x=str(colname), y="Amount", hue="Class").set(title=f"Amount spent vs feature {colname}(Normal=0, Fraud=1")
    
    frames = []
    for i in mycols:
        
        frame = get_plot(i)
        frames.append(frame)
    
    
    gif.save(frames, 'example.gif', duration=29, unit="s", between="startend")

    


In [128]:
# train test dataset
columns = df_s.columns.tolist()
mycols = [c for c in columns if c not in ["Class"]]
X_train, X_test = df_s[mycols], df_test[mycols]
y_train,y_test = df_s["Class"],df_test["Class"]

# One class SVM for classification

In [157]:
#### train  one class SVM model ######

# Instantiate normalisation 
nrm = MinMaxScaler()
svm = OneClassSVM(nu=newfrac, kernel = 'rbf')
svm_pl = Pipeline([('Normalise', nrm), ('SVM', svm)])


start = time.time()
svm_pl.fit(X_train)
end = time.time()

print (f'Time taken {end-start} s')





Time taken 0.4677550792694092 s


In [158]:
# predict the anomaly scores for the test data
y_scores1 = svm.score_samples(nrm.fit_transform(X_test))

y_pred = svm.predict(nrm.fit_transform(X_test))
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

print(classification_report(y_test,y_pred))
print ('Number of Fraud cases in test set',len(y_test[y_test==1]))
print ('Number of predicted Fraud cases',len(y_pred[y_pred==1]))

              precision    recall  f1-score   support

           0       1.00      0.18      0.30     85307
           1       0.00      0.92      0.00       136

    accuracy                           0.18     85443
   macro avg       0.50      0.55      0.15     85443
weighted avg       1.00      0.18      0.30     85443

Number of Fraud cases in test set 136
Number of predicted Fraud cases 70322


# Isolation forest

In [154]:
start = time.time()
clf2 = IsolationForest(n_estimators=100, max_samples=len(X_train),contamination=newfrac,random_state=42, verbose=0).fit(X_train.values)
end = time.time()
print (f'time taken {end-start} s') 

time taken 0.4902820587158203 s


In [155]:
# predict the anomaly scores for the test data
y_scores2 = clf2.score_samples(X_test.values)#clf2.decision_function(X_test.values)
y_pred = clf2.predict(X_test.values)
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1
print('accuracy',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print ('Number of Fraud cases in test set',len(y_test[y_test==1]))
print ('Number of predicted Fraud cases',len(y_pred[y_pred==1]))

accuracy 0.9648771695750383
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     85307
           1       0.03      0.79      0.07       136

    accuracy                           0.96     85443
   macro avg       0.52      0.88      0.52     85443
weighted avg       1.00      0.96      0.98     85443

Number of Fraud cases in test set 136
Number of predicted Fraud cases 3079
