In [2]:
import numpy as np
import seaborn as sn
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [3]:
# Load the raw transaction log and report its (rows, columns) shape.
DATA_PATH = 'transaction_fraud/PS_20174392719_1491204439457_log.csv'
df1 = pd.read_csv(DATA_PATH)
df1.shape

(6362620, 11)

In [4]:
# Preview the first 20 rows of the raw frame.
df1.head(n=20)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [5]:
# Number of distinct values in each column.
df1.nunique()

step                  743
type                    5
amount            5316900
nameOrig          6353307
oldbalanceOrg     1845844
newbalanceOrig    2682586
nameDest          2722362
oldbalanceDest    3614697
newbalanceDest    3555499
isFraud                 2
isFlaggedFraud          2
dtype: int64

# Data Exploration

In [6]:
# Count the missing values in each column.
df1.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [7]:
# Print the column dtypes and the memory footprint of the raw frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [8]:
# Number of transactions of each type.
df1.type.value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [None]:
# Bar chart of the number of transactions per type.
sn.countplot(data=df1, x='type')

### Cash out is the most common transaction type

### How much fraud is there?

In [None]:
# Number of rows in each isFraud class.
df1.isFraud.value_counts()

In [None]:
# Row counts per isFraud class, split by transaction type.
sn.countplot(data=df1, x='isFraud', hue='type')

## What is the most profitable type of fraud?

In [None]:
# Average every numeric column per transaction type. numeric_only=True keeps
# the string columns (nameOrig, nameDest) out of the aggregation; pandas >= 2.0
# raises a TypeError on them otherwise.
# NOTE(review): this averages over all transactions, not only the fraudulent
# ones -- filter on isFraud == 1 first if the question is about fraud only.
df1.groupby('type').mean(numeric_only=True)

In [None]:
# Bar chart of the transaction amount for each type.
sn.barplot(data=df1, x='type', y='amount')

## Cash out is the most common transaction type, but transfers have the highest average amount.

Drop the account-name columns (`nameOrig`, `nameDest`), which are not used as model features.

In [None]:
# Remove the origin/destination account-name columns.
df2 = df1.drop(columns=['nameDest', 'nameOrig'])

In [None]:
# One-hot encode the transaction type, dropping the first level.
dummies = pd.get_dummies(df2['type'], drop_first=True)

In [None]:
# Append the dummy columns to the frame, side by side.
df3 = pd.concat([df2, dummies], axis='columns')

In [None]:
# Preview the first five rows after encoding.
df3.head(n=5)

In [None]:
# The original string column is no longer needed once the dummies are in place.
df4 = df3.drop(columns='type')

 ## Split data into training and test data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the fraud label; features: every remaining column.
y = df4['isFraud']
X = df4.drop(columns='isFraud')

In [None]:
# Hold out 30% of the rows for testing. Stratify on the label so the rare
# fraud class keeps the same proportion in both the training and test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

## Check whether the target classes are balanced

In [None]:
# Class counts of the label in the training split.
y_train.value_counts()

## We have an extremely imbalanced dataset, so we will generate synthetic minority-class samples using SMOTE.

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Fix the seed so the synthetic minority samples are reproducible across runs.
# NOTE(review): the name `os` shadows the standard-library module of the same
# name; it is kept because the next cell calls `os.fit_resample`.
os = SMOTE(random_state=0)

In [None]:
# Oversample the training split only; X_test / y_test are not passed in here.
X_train_resample, y_train_resample =os.fit_resample(X_train,y_train)

In [None]:
# Class counts of the label after resampling.
y_train_resample.value_counts()

## We now have a balanced training set.

# Which ML model works best? We can try a few, pick the best one, and then optimize that model.

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

In [27]:
# Five random 70/30 shuffles, seeded so the folds are repeatable.
cv = ShuffleSplit(
    random_state=0,
    test_size=0.3,
    n_splits=5,
)

In [28]:
# def find_best_model_using_gridsearchcv(X,y):
#     algos = {
#         'linear_regression' : {
#             'model': LinearRegression(),
#             'params': {
#                 'normalize': [True, False]
#             }
#         },
#         'lasso': {
#             'model': Lasso(),
#             'params': {
#                 'alpha': [0,.5,1],
#                 'selection': ['random', 'cyclic']
#             }
#         },
#         'decision_tree': {
#             'model': DecisionTreeRegressor(),
#             'params': {
#                 'criterion' : ['mse','friedman_mse'],
#                 'splitter': ['best','random']
#             }
#         },
#         'random_forest' : {
#             'model' : RandomForestClassifier(),
#             'params' : {
#                 'n_estimators': [50],
#             }
#         },
#         'logistic_regression' :{
#             'model' : LogisticRegression(),
#             'params' : {
#
#             }
#         }
#     }
#     scores = []
#     for algo_name, config in algos.items():
#         gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False, n_jobs=-1)
#         gs.fit(X,y)
#         scores.append({
#             'model': algo_name,
#             'best_score': gs.best_score_,
#             'best_params': gs.best_params_
#         })
#
#     return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [29]:
# find_best_model_using_gridsearchcv(X_train_resample,y_train_resample)

![results](results.jpg)

In [1]:
# Grid-search a random forest over split criterion and max_features, using the
# ShuffleSplit folds in `cv`. random_state pins the forest's internal
# randomness so the search is reproducible.
# NOTE(review): the folds are drawn from the SMOTE-resampled data, so synthetic
# points can land in validation folds and best_score_ may be optimistic.
rf_param_grid = {
    'n_estimators': [100],
    'n_jobs': [-1],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}
rf_clf = GridSearchCV(
    RandomForestClassifier(random_state=0),
    rf_param_grid,
    cv=cv,
    return_train_score=False,
)
rf_clf.fit(X_train_resample, y_train_resample)

NameError: name 'GridSearchCV' is not defined

In [None]:
# Show the estimator selected by the grid search.
rf_clf.best_estimator_

In [None]:
# Parameter combination that achieved the best cross-validation score.
rf_clf.best_params_


In [None]:
# Mean cross-validated score of the best parameter combination.
rf_clf.best_score_

In [None]:
# Predict on the held-out test split using the fitted grid search.
y_pred =rf_clf.predict(X_test)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2 and was never used
# here; importing it makes the whole cell fail on current releases.
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# NOTE(review): same expression as an earlier cell; shows the selected estimator again.
rf_clf.best_estimator_

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out test split.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# The non-fraud class is predicted perfectly, but precision on the fraud class is only 65%.
## Tuning the random forest's hyperparameters may improve that precision.