# Fraud Prediction

In this project, we will train different models and evaluate how well they can predict instances of fraud.

In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import recall_score, precision_score
from sklearn.svm import SVC

In [25]:
# Load the dataset from the CSV file next to the notebook and preview the first rows.
df = pd.read_csv(filepath_or_buffer='fraud_data.csv')
df.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1.176563,0.323798,0.536927,1.047002,-0.368652,-0.728586,0.084678,-0.069246,-0.266389,0.155315,...,-0.109627,-0.341365,0.057845,0.49918,0.415211,-0.581949,0.015472,0.018065,4.67,0
1,0.681109,-3.934776,-3.801827,-1.147468,-0.73554,-0.501097,1.038865,-0.626979,-2.274423,1.527782,...,0.652202,0.272684,-0.982151,0.1659,0.360251,0.195321,-0.256273,0.056501,912.0,0
2,1.140729,0.453484,0.24701,2.383132,0.343287,0.432804,0.09338,0.17331,-0.808999,0.775436,...,-0.003802,0.058556,-0.121177,-0.304215,0.645893,0.1226,-0.012115,-0.005945,1.0,0
3,-1.107073,-3.298902,-0.184092,-1.795744,2.137564,-1.684992,-2.015606,-0.007181,-0.16576,0.869659,...,0.130648,0.329445,0.927656,-0.04956,-1.892866,-0.575431,0.266573,0.414184,62.1,0
4,-0.314818,0.866839,-0.124577,-0.627638,2.651762,3.428128,0.194637,0.670674,-0.442658,0.133499,...,-0.312774,-0.799494,-0.064488,0.953062,-0.42955,0.158225,0.076943,-0.015051,2.67,0


In [26]:
# Keep only the rows whose `Class` value is 1 and preview them.
fraud = df[df['Class'].eq(1)]
fraud.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
15,-1.927453,1.827621,-7.019495,5.348303,-2.739188,-2.107219,-5.015848,1.205868,-4.382713,-8.337707,...,1.376938,-0.792017,-0.771414,-0.379574,0.718717,1.111151,1.277707,0.819081,512.25,1
149,0.432554,1.861373,-4.310353,2.44808,4.574094,-2.979912,-2.792379,-2.719867,-0.276704,-2.314747,...,-1.384477,-0.348904,-3.979948,-0.828156,-2.419446,-0.76707,0.387039,0.319402,1.0,1
182,0.908637,2.849024,-5.647343,6.009415,0.216656,-2.397014,-1.819308,0.338527,-2.819883,-4.063098,...,0.40726,-0.397435,-0.080006,-0.168597,0.465058,0.21051,0.648705,0.360224,1.18,1
255,-22.341889,15.536133,-22.865228,7.043374,-14.183129,-0.463145,-28.215112,-14.607791,-9.481456,-20.949192,...,-9.110423,4.158895,1.412928,0.382801,0.447154,-0.632816,-4.380154,-0.467863,1.0,1
296,-2.880042,5.225442,-11.06333,6.689951,-5.759924,-2.244031,-11.199975,4.014722,-3.429304,-11.56195,...,2.002883,0.351102,0.795255,-0.778379,-1.646815,0.487539,1.427713,0.583172,1.0,1


In [27]:
# Every column except the last is a feature; the last column (`Class`) is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Split with the default train/test proportions; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [28]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,1.176563,0.323798,0.536927,1.047002,-0.368652,-0.728586,0.084678,-0.069246,-0.266389,0.155315,...,-0.137258,-0.109627,-0.341365,0.057845,0.49918,0.415211,-0.581949,0.015472,0.018065,4.67
1,0.681109,-3.934776,-3.801827,-1.147468,-0.73554,-0.501097,1.038865,-0.626979,-2.274423,1.527782,...,1.341809,0.652202,0.272684,-0.982151,0.1659,0.360251,0.195321,-0.256273,0.056501,912.0
2,1.140729,0.453484,0.24701,2.383132,0.343287,0.432804,0.09338,0.17331,-0.808999,0.775436,...,-0.232185,-0.003802,0.058556,-0.121177,-0.304215,0.645893,0.1226,-0.012115,-0.005945,1.0
3,-1.107073,-3.298902,-0.184092,-1.795744,2.137564,-1.684992,-2.015606,-0.007181,-0.16576,0.869659,...,0.348269,0.130648,0.329445,0.927656,-0.04956,-1.892866,-0.575431,0.266573,0.414184,62.1
4,-0.314818,0.866839,-0.124577,-0.627638,2.651762,3.428128,0.194637,0.670674,-0.442658,0.133499,...,0.402329,-0.312774,-0.799494,-0.064488,0.953062,-0.42955,0.158225,0.076943,-0.015051,2.67


In [29]:
# Preview the first five target labels.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

#### Train a dummy classifier, a baseline classifier that makes predictions using simple rules rather than learning from the features. We will use the `most_frequent` strategy, which always predicts the most frequent class label in the training dataset.

In [30]:
# Baseline model: always predicts the most frequent label seen in the training targets.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)

# Mean accuracy of the baseline on the held-out split.
accuracy_score = dummy.score(X=X_test, y=y_test)
accuracy_score

0.9852507374631269

In [31]:
# Baseline predictions for the held-out features.
pred = dummy.predict(X=X_test)
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [33]:
# Recall of the baseline on the test split.
# The result gets its own name on purpose: rebinding `recall_score` would replace
# the imported sklearn function with a float, and the next call to it would raise
# "TypeError: ... object is not callable" on a fresh Restart & Run All.
dummy_recall = recall_score(y_test, pred)
dummy_recall

0.0

### Support Vector Classification (SVC)

In [35]:
# Support vector classifier with default hyperparameters, fitted on the training split.
svc = SVC()
svc.fit(X_train, y_train)

# Predictions for the held-out features.
pred = svc.predict(X=X_test)
pred



array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [36]:
# Recall of the SVC on the test split: the share of actual positives it predicted as positive.
# Stored under its own name so the imported `recall_score` function is not
# overwritten by a float and stays callable in later cells.
svc_recall = recall_score(y_test, pred)
svc_recall

0.375

In [37]:
# Precision of the SVC on the test split: the share of predicted positives that are actual positives.
# Stored under its own name so the imported `precision_score` function is not
# overwritten by a float and stays callable in later cells.
svc_precision = precision_score(y_test, pred)
svc_precision

1.0

In [38]:
# Mean accuracy of the SVC on the held-out split.
accuracy_score = svc.score(X=X_test, y=y_test)
accuracy_score

0.9907817109144543