### Phase A : Load Required Packages and Dataset

In [1]:
#Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import telepot

In [2]:
#Selective import of modules
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.dummy import DummyClassifier
from sklearn.externals import joblib
from sklearn import metrics

In [3]:
#Load cell magics
%matplotlib inline
token="559325635:AAH0eBs-GJ0-iFouJ-LZAHtRfhcmYjgFqKo"
chat_id="507809613"

In [4]:
#Function to send message via Telegram to inform code execution is done
def send_telegram_message(text, chat_id, token):
    bot = telepot.Bot(token=token)
    bot.sendMessage(chat_id=chat_id, text=text)

In [5]:
#Send message to denote code has started running
send_telegram_message("Code started executing..", chat_id, token)

In [6]:
#Load dataset
df = pd.read_csv('/Users/SandyPC/Documents/Python/Projects/kaggle_fraud/data/raw/creditcard.csv')

### Part B : Data Manipulation

In [7]:
#Print shape before dropping duplicates
print('Shape of dataframe before dropping duplicates is : ', df.shape)

#Drop duplicates
df = df.drop_duplicates()

#Print shape after dropping duplicates
print('Shape of dataframe after dropping duplicates is : ', df.shape)

Shape of dataframe before dropping duplicates is :  (284807, 31)
Shape of dataframe after dropping duplicates is :  (283726, 31)


In [8]:
#Scale amount
scaler = StandardScaler()
amt_std = np.array(df['Amount'])
amt_std = amt_std.reshape(-1, 1)
df['Amount_Std'] = scaler.fit_transform(amt_std)

In [9]:
#Scale time
scaler = StandardScaler()
time_mnmx = np.array(df['Time'])
time_mnmx = time_mnmx.reshape(-1, 1)
df['Time_Mnmx'] = scaler.fit_transform(time_mnmx)

### Part C : Dummy Model

In [10]:
#Specify target and features
y = df['Class']
x = df.copy().drop(['Class', 'Time', 'Amount'], axis = 1)

In [11]:
#Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)

In [12]:
#Fit classifier to check feature importances
clf_dummy = DummyClassifier(strategy='stratified')
clf_dummy.fit(x_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='stratified')

In [13]:
#Scoring the model on testing data
clf_dummy.score(x_test, y_test)

0.996616461852957

In [14]:
#Looking at the confusion matrix
pd.crosstab(clf_dummy.predict(x_test), y_test)

Class,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,84865,125
1,128,0


### Part D : Baseline Model

In [15]:
#Specify target and features
y = df['Class']
x = df.copy().drop(['Class', 'Time', 'Amount'], axis = 1)

In [16]:
#Model normal cases as 1 and fraud cases as -1
y = np.where(y == 1, -1, 1)
outliers = y[y == -1]

In [17]:
#Print shape of outliers and percentage of outliers within dataset
print('Shape of outliers is : ', outliers.shape)
print('Percentage of outliers is : ', outliers.shape[0]/y.shape[0])

Shape of outliers is :  (473,)
Percentage of outliers is :  0.001667101358352777


In [18]:
#Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [19]:
#Distribution in train
pd.crosstab(y_train, y_train)

col_0,-1,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,348,0
1,0,198260


In [20]:
#Distribution in test
pd.crosstab(y_test, y_test)

col_0,-1,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,125,0
1,0,84993


In [21]:
#Nu parameter of svm
nu = outliers.shape[0]/y.shape[0]
gamma = 0.0005

In [22]:
#Fit classifier to check feature importances
clf_svm = OneClassSVM(kernel='rbf', nu=0.1, gamma=0.1)
clf_svm.fit(x_train)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.1, kernel='rbf',
      max_iter=-1, nu=0.1, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [23]:
#Accuracy metrics for training
preds = clf_svm.predict(x_train)  
targs = y_train

print("accuracy: ", metrics.accuracy_score(targs, preds))  
print("precision: ", metrics.precision_score(targs, preds))  
print("recall: ", metrics.recall_score(targs, preds))  
print("f1: ", metrics.f1_score(targs, preds))  
print("area under curve (auc): ", metrics.roc_auc_score(targs, preds))  

accuracy:  0.9014843309433658
precision:  0.9998545421995457
recall:  0.9014425501866237
f1:  0.9481016217248532
area under curve (auc):  0.9133649532542314


In [24]:
#Looking at the confusion matrix
pd.crosstab(targs, preds)

col_0,-1,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,322,26
1,19540,178720


In [25]:
#Accuracy metrics for testing
preds = clf_svm.predict(x_test)  
targs = y_test

print("accuracy: ", metrics.accuracy_score(targs, preds))  
print("precision: ", metrics.precision_score(targs, preds))  
print("recall: ", metrics.recall_score(targs, preds))  
print("f1: ", metrics.f1_score(targs, preds))  
print("area under curve (auc): ", metrics.roc_auc_score(targs, preds))  

accuracy:  0.8983763716252732
precision:  0.9998035955115028
recall:  0.8984033979268881
f1:  0.9463951513949654
area under curve (auc):  0.8892016989634439


In [26]:
#Looking at the confusion matrix
pd.crosstab(targs, preds)

col_0,-1,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,110,15
1,8635,76358


In [27]:
#Pickle the model for future use
joblib.dump(clf_svm, '/Users/SandyPC/Documents/Python/Projects/kaggle_fraud/models/oneclasssvm.pkl') 

['/Users/SandyPC/Documents/Python/Projects/kaggle_fraud/models/oneclasssvm.pkl']

In [28]:
#Send message to denote code has finished running
send_telegram_message("Code finished executing..", chat_id, token)