# Random Forest with Sampling - Credit Card Fraud

### Import Libraries

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
#https://www.kaggle.com/datasets/kelvinkelue/credit-card-fraud-prediction/data

In [15]:
import opendatasets as od
import pandas as pd
import plotly as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from datetime import datetime

### Import Dataset

In [16]:
data = pd.read_csv('.././csv/fraud_test.csv')
df = pd.DataFrame(data)

In [17]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,21/06/2020 12:14,2291160000000000.0,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,19/03/1968,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,21/06/2020 12:14,3573030000000000.0,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",17/01/1990,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,21/06/2020 12:14,3598220000000000.0,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",21/10/1970,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,21/06/2020 12:15,3591920000000000.0,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,25/07/1987,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,21/06/2020 12:15,3526830000000000.0,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,06/07/1955,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


### Basic Validation Check

In [18]:
df.isna().sum()
df.duplicated().sum()
df.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

### Checking the class balance of the target Variable

In [19]:
df.is_fraud.value_counts()

is_fraud
0    553574
1      2145
Name: count, dtype: int64

### Feature Extraction
As Age might be important feature for prediction. We can extract Age from DOB and drop the column

In [20]:
def calculate_age(dob):
    today = datetime.today()
    dob = datetime.strptime(dob, '%d/%m/%Y')  
    age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
    return age

df['Age'] = df['dob'].apply(calculate_age)

### Droping Unwanted Columns
    1. Dropping the Names and Transaction ID columns
    2. As age is calulated, the DOB column can be dropped
    3. As street, city and state columns as there the latitude and longitude columns can be dropped
    4. City population is not good predictor for predicting fraud therefore sropping that column as well

In [21]:
df.drop(labels=['Unnamed: 0','trans_date_trans_time','cc_num','first', 'last','trans_num',
                'dob','lat','long','merch_lat','merch_long','unix_time','city_pop'],axis=1,inplace=True)

### Statistical Description

In [22]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
amt,555719.0,69.39281,156.745941,1.0,9.63,47.29,83.01,22768.11
zip,555719.0,48842.628015,26855.283328,1257.0,26292.0,48174.0,72011.0,99921.0
is_fraud,555719.0,0.00386,0.062008,0.0,0.0,0.0,0.0,1.0
Age,555719.0,50.007279,17.439908,19.0,37.0,48.0,61.0,99.0


### Correlation

In [23]:
df.corr(numeric_only=True) *100

Unnamed: 0,amt,zip,is_fraud,Age
amt,100.0,0.231037,18.226707,-1.288385
zip,0.231037,100.0,-0.227092,0.85754
is_fraud,18.226707,-0.227092,100.0,0.763011
Age,-1.288385,0.85754,0.763011,100.0


### Encoding
    1. Changing Zip code to object for encoding it
    2. Label Encode all the object columns

In [24]:
df['zip'] = df['zip'].astype('object')

In [25]:
encode_cols = ['merchant', 'category','gender','job','street','state','zip','city']
lbl_en = LabelEncoder()
for col in encode_cols:
    df[col] = lbl_en.fit_transform(df[col])  

### The Amount column shows more magnitude therefore applying standard scaler to it

In [26]:
scaler_cols  = ['amt']
scaler = StandardScaler()
df[scaler_cols] = scaler.fit_transform(df[scaler_cols])

### Split Data to Train and Test

In [27]:
X_cols = ['merchant', 'category', 'amt', 'gender', 'street', 'city', 'state',
       'zip', 'job',  'Age']
X = df[X_cols]
y = df['is_fraud']
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Model 1: Basic Decision Tree Model

In [28]:
base_model = DecisionTreeClassifier()
base_model_DT = base_model.fit(X_train, y_train)
y_pred = base_model_DT.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110724
           1       0.62      0.65      0.64       420

    accuracy                           1.00    111144
   macro avg       0.81      0.83      0.82    111144
weighted avg       1.00      1.00      1.00    111144



### Balancing the taget class using SMOTE which is an oversampling technique

In [29]:
sm = SMOTE(random_state=7)
X_train_sm , y_train_sm = sm.fit_resample(X_train, y_train)

In [30]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_sm.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_sm.shape)) 
print("After OverSampling, counts of label '1': {}".format(sum(y_train_sm == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_sm == 0))) 

After OverSampling, the shape of train_X: (885700, 10)
After OverSampling, the shape of train_y: (885700,) 

After OverSampling, counts of label '1': 442850
After OverSampling, counts of label '0': 442850


# Model 2:  Decision Tree Model with balanced taget using SMOTE

In [31]:
sm_model = DecisionTreeClassifier()
sm_model_DT = sm_model.fit(X_train_sm, y_train_sm)
y_pred_sm = sm_model_DT.predict(X_test)
print(classification_report(y_test, y_pred_sm))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    110724
           1       0.18      0.69      0.28       420

    accuracy                           0.99    111144
   macro avg       0.59      0.84      0.64    111144
weighted avg       1.00      0.99      0.99    111144



### Balancing the taget class using NearMiss which is an undersampling technique

In [32]:
nm = NearMiss()
X_train_nm , y_train_nm = nm.fit_resample(X_train, y_train)

In [33]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_nm.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_nm.shape)) 
print("After OverSampling, counts of label '1': {}".format(sum(y_train_nm == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_nm == 0))) 

After OverSampling, the shape of train_X: (3450, 10)
After OverSampling, the shape of train_y: (3450,) 

After OverSampling, counts of label '1': 1725
After OverSampling, counts of label '0': 1725


# Model 3:  Decision Tree Model with balanced taget using NearMiss

In [34]:
nm_model = DecisionTreeClassifier()
nm_model_DT = nm_model.fit(X_train_nm, y_train_nm)
y_pred_nm = nm_model_DT.predict(X_test)
print(classification_report(y_test, y_pred_nm))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96    110724
           1       0.04      0.90      0.08       420

    accuracy                           0.92    111144
   macro avg       0.52      0.91      0.52    111144
weighted avg       1.00      0.92      0.95    111144



In [35]:
nm_model_DT.feature_importances_
nm_model_DT.criterion

'gini'

# Model 4:  Random Forest Model with balanced taget using SMOTE

In [36]:
rf_model = RandomForestClassifier()
rf_model_DT = rf_model.fit(X_train_sm, y_train_sm)
y_pred_rf = rf_model_DT.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    110724
           1       0.29      0.71      0.41       420

    accuracy                           0.99    111144
   macro avg       0.64      0.85      0.70    111144
weighted avg       1.00      0.99      0.99    111144



# Model 4:  Random Forest Model with balanced taget using NearMiss

In [37]:
rf_model = RandomForestClassifier()
rf_model_DT = rf_model.fit(X_train_nm, y_train_nm)
y_pred_rf = rf_model_DT.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       1.00      0.77      0.87    110724
           1       0.02      0.95      0.03       420

    accuracy                           0.77    111144
   macro avg       0.51      0.86      0.45    111144
weighted avg       1.00      0.77      0.87    111144

