In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### Principle Component Analysis
 1. Dimensionality Reduction
 2. Feature Encoding

In [2]:
datasets_frud = pd.read_csv("Datasets/SyntheticFinancialDatasetForFraudDetection/train.csv")

In [3]:
datasets_frud.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud
0,619,CASH_IN,386385.08,C421351828,4669568.85,5055953.92,C1977099364,506035.06,119649.98,0,0
1,164,CASH_IN,212458.78,C83569848,234635.0,447093.78,C1690589535,806037.88,593579.1,0,0
2,382,PAYMENT,19967.6,C852995095,3634.0,0.0,M1695416333,0.0,0.0,0,0
3,180,CASH_OUT,527616.51,C61761046,180216.0,0.0,C577654587,92157.1,619773.61,0,0
4,36,TRANSFER,206067.85,C758004147,0.0,0.0,C2143015292,2131494.48,2337562.32,0,0


In [4]:
datasets_frud.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud',
       'isFraud'],
      dtype='object')

In [5]:
datasets_frud["type"].value_counts()

CASH_OUT    1790588
PAYMENT     1720696
CASH_IN     1119184
TRANSFER     426436
DEBIT         33192
Name: type, dtype: int64

In [6]:
print("Total Number of is Flagged Fraud :{}".format(datasets_frud["isFlaggedFraud"].value_counts().sum()))

Total Number of is Flagged Fraud :5090096


In [7]:
datasets_frud["isFraud"].value_counts()

0    5083503
1       6593
Name: isFraud, dtype: int64

In [8]:
# Figure out where our transaction faceing the fraud conditions
datasets_frud.loc[(datasets_frud.isFraud == 1)].type.value_counts()

CASH_OUT    3305
TRANSFER    3288
Name: type, dtype: int64

In [9]:
# Figure out where our transaction not faceing the fraud conditions
datasets_frud.loc[(datasets_frud.isFraud ==0)].type.value_counts()

CASH_OUT    1787283
PAYMENT     1720696
CASH_IN     1119184
TRANSFER     423148
DEBIT         33192
Name: type, dtype: int64

In [10]:
fraudDatasets = datasets_frud.loc[(datasets_frud.type == "TRANSFER") | (datasets_frud.type == "CASH_OUT")] 

In [11]:
fraudDatasets.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud
3,180,CASH_OUT,527616.51,C61761046,180216.0,0.0,C577654587,92157.1,619773.61,0,0
4,36,TRANSFER,206067.85,C758004147,0.0,0.0,C2143015292,2131494.48,2337562.32,0,0
6,10,CASH_OUT,15241.42,C1606904496,45285.0,30043.58,C36437323,355775.87,652863.91,0,0
10,353,CASH_OUT,460159.75,C1847016170,0.0,0.0,C1970087686,1602337.92,2062497.67,0,0
11,185,CASH_OUT,39570.59,C1933638935,30591.0,0.0,C977120564,0.0,39570.59,0,0


In [12]:
fraudDatasets.type.value_counts()

CASH_OUT    1790588
TRANSFER     426436
Name: type, dtype: int64

In [13]:
fraudDatasets.isFraud.value_counts()

0    2210431
1       6593
Name: isFraud, dtype: int64

In [14]:
fraudDatasets.info("Dtype")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2217024 entries, 3 to 5090095
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFlaggedFraud  int64  
 10  isFraud         int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 203.0+ MB


In [15]:
fraudDatasets.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFlaggedFraud      int64
isFraud             int64
dtype: object

In [16]:
labelEncoderFeature = LabelEncoder()

In [17]:
replicaDatasets = pd.DataFrame.copy(fraudDatasets)

In [18]:
replicaDatasets.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud
3,180,CASH_OUT,527616.51,C61761046,180216.0,0.0,C577654587,92157.1,619773.61,0,0
4,36,TRANSFER,206067.85,C758004147,0.0,0.0,C2143015292,2131494.48,2337562.32,0,0
6,10,CASH_OUT,15241.42,C1606904496,45285.0,30043.58,C36437323,355775.87,652863.91,0,0
10,353,CASH_OUT,460159.75,C1847016170,0.0,0.0,C1970087686,1602337.92,2062497.67,0,0
11,185,CASH_OUT,39570.59,C1933638935,30591.0,0.0,C977120564,0.0,39570.59,0,0


In [19]:
replicaDatasets["type"] = labelEncoderFeature.fit_transform(replicaDatasets["type"])
replicaDatasets["nameOrig"] = labelEncoderFeature.fit_transform(replicaDatasets["nameOrig"])
replicaDatasets["nameDest"] = labelEncoderFeature.fit_transform(replicaDatasets["nameDest"])

In [20]:
replicaDatasets.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud
3,180,0,527616.51,1776784,180216.0,0.0,375185,92157.1,619773.61,0,0
4,36,1,206067.85,1937898,0.0,0.0,284025,2131494.48,2337562.32,0,0
6,10,0,15241.42,694725,45285.0,30043.58,322325,355775.87,652863.91,0,0
10,353,0,460159.75,970140,0.0,0.0,240819,1602337.92,2062497.67,0,0
11,185,0,39570.59,1069608,30591.0,0.0,474544,0.0,39570.59,0,0


In [21]:
replicaDatasets.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFlaggedFraud    0
isFraud           0
dtype: int64

In [22]:
x = replicaDatasets.drop(["isFlaggedFraud", "isFraud"], axis=1)
y = replicaDatasets.isFraud

In [23]:
# datasets = pd.read_csv("Datasets/creditcard.csv")
# datasets.head()

In [24]:
pca_model = PCA(n_components=9)
pca_model = pca_model.fit(x)
pca = pca_model.transform(x)

In [25]:
components_feat_columns= ["V"+str(i+1) for i in range(pca.shape[1])]
components_datasets  = pd.DataFrame(data = pca,
                                   columns=components_feat_columns)

In [26]:
components_datasets.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9'], dtype='object')

In [27]:
components_datasets.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9
0,-2120860.0,415379.235429,668622.279378,100046.809463,53338.798423,-134953.488293,-70963.736918,-63.592865,0.219348
1,496167.6,-189740.417195,830034.551952,-28523.086445,-35643.911301,-43755.838646,6562.583495,-205.817829,-0.822592
2,-1950837.0,-88336.417999,-413406.261842,-148991.155207,96442.994842,-82186.241888,5112.603739,-226.151067,0.137885
3,-45915.59,162449.015347,-137853.919149,5398.452309,-65676.830162,-510.304572,12011.345683,110.16412,0.218114
4,-2642077.0,-140553.235861,-38502.768299,-33142.69449,-4772.447869,-234444.375916,-8459.330359,-53.866809,0.152314


In [28]:
model_name = "Datasets/componentsDatasets.sav"
pickle.dump(pca_model, open(model_name, 'wb'))

In [29]:
model_loaded = pickle.load(open(model_name, 'rb'))

In [30]:
components_datasets.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9
0,-2120860.0,415379.235429,668622.279378,100046.809463,53338.798423,-134953.488293,-70963.736918,-63.592865,0.219348
1,496167.6,-189740.417195,830034.551952,-28523.086445,-35643.911301,-43755.838646,6562.583495,-205.817829,-0.822592
2,-1950837.0,-88336.417999,-413406.261842,-148991.155207,96442.994842,-82186.241888,5112.603739,-226.151067,0.137885
3,-45915.59,162449.015347,-137853.919149,5398.452309,-65676.830162,-510.304572,12011.345683,110.16412,0.218114
4,-2642077.0,-140553.235861,-38502.768299,-33142.69449,-4772.447869,-234444.375916,-8459.330359,-53.866809,0.152314


In [31]:
components_datasets.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)
finalComponentsDatasets = pd.concat([components_datasets, y], axis=1)

In [32]:
finalComponentsDatasets.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,isFraud
0,-2120860.0,415379.235429,668622.279378,100046.809463,53338.798423,-134953.488293,-70963.736918,-63.592865,0.219348,0
1,496167.6,-189740.417195,830034.551952,-28523.086445,-35643.911301,-43755.838646,6562.583495,-205.817829,-0.822592,0
2,-1950837.0,-88336.417999,-413406.261842,-148991.155207,96442.994842,-82186.241888,5112.603739,-226.151067,0.137885,0
3,-45915.59,162449.015347,-137853.919149,5398.452309,-65676.830162,-510.304572,12011.345683,110.16412,0.218114,0
4,-2642077.0,-140553.235861,-38502.768299,-33142.69449,-4772.447869,-234444.375916,-8459.330359,-53.866809,0.152314,0


In [33]:
finalComponentsDatasets.isFraud.value_counts()

0    2210431
1       6593
Name: isFraud, dtype: int64

In [34]:
print("Shape of the Components Datasets is: {}".format(components_datasets.shape))
print("Shape of the Y Datasets is: {}".format(y.shape))
print("Shape of the final Components Datasets is: {}".format(finalComponentsDatasets.shape))

Shape of the Components Datasets is: (2217024, 9)
Shape of the Y Datasets is: (2217024,)
Shape of the final Components Datasets is: (2217024, 10)


In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                   test_size=0.2, 
                                                   random_state=42)

In [36]:
randomForestModel = RandomForestClassifier(n_estimators=20)
randomForestModel.fit(x_train, y_train)

RandomForestClassifier(n_estimators=20)

In [37]:
randomForestModelPredict = randomForestModel.predict(x_test)

In [38]:
print("The accuracy_score Score is : {}%".format(round(accuracy_score(y_test, randomForestModelPredict), 2)))
print("The precision_score Score is : {}%".format(round(precision_score(y_test, randomForestModelPredict), 2)))
print("The f1_score Score is : {}%".format(round(f1_score(y_test, randomForestModelPredict),2)))
print("The recall_score Score is : {}%".format(round(recall_score(y_test, randomForestModelPredict), 2)))

The accuracy_score Score is : 1.0%
The precision_score Score is : 0.98%
The f1_score Score is : 0.87%
The recall_score Score is : 0.78%


In [47]:
def classIdentify(predicted):
    if predicted == 1:
        return "Fraud"
    else:
        return "Real"

In [54]:
np.unique(randomForestModelPredict)

array([0, 1])

In [50]:
classIdentify(np.max(randomForestModelPredict))

'Fraud'

In [53]:
list(randomForestModelPredict).index(1)

96