This is a preliminary analyis of the dataset.

### Imports

In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

from pprint import pprint
from sklearn.model_selection import GridSearchCV

### Custom Functions

In [2]:
def cust_age_in_years(end):
    r = relativedelta(pd.to_datetime('now'), end) 
    return r.years

In [3]:
def acc_age_in_days(end):
    r = relativedelta(pd.to_datetime('now'), end) 
    return r.days

### Read dataset

In [4]:
df_csv = pd.read_csv('DataSet.csv');
df_csv.head()

Unnamed: 0,scheme,beneficiary_identifier,name_of_client,gender,dob,is_employed,income,amount,txn_date,label
0,Cyclone Relief fund,1,lilly,FEMALE,12-Nov-90,YES,180000,1000,12-Nov-20,FRAUD
1,Cyclone Relief fund,1,lilly,FEMALE,12-Nov-90,YES,180000,500,25-Nov-20,FRAUD
2,Cyclone Relief fund,2,rose,FEMALE,13-May-79,YES,275000,6000,12-Nov-20,NORMAL
3,Cyclone Relief fund,3,peter,MALE,13-Feb-91,YES,300000,1000,25-Nov-20,NORMAL
4,EarthQuake Relief fund,4,julie,FEMALE,13-May-88,YES,600000,2000,12-May-20,COULD BE FRAUD


### Number of possible status of fraud (labels in dataset)    
Here we are looking into the count of each label

In [5]:
df_csv.groupby('label').count()

Unnamed: 0_level_0,scheme,beneficiary_identifier,name_of_client,gender,dob,is_employed,income,amount,txn_date
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
COULD BE FRAUD,56,56,56,56,56,56,56,56,56
FRAUD,109,109,109,109,109,109,109,109,109
NORMAL,1819,1819,1819,1819,1819,1819,1819,1819,1819
SUSPECT,57,57,57,57,57,57,57,57,57


### Datetime conversion and feature creation

In [6]:
df_csv.dob = pd.to_datetime(df_csv.dob)
df_csv.txn_date = pd.to_datetime(df_csv.txn_date)

df_csv['age'] = df_csv["dob"].apply(cust_age_in_years)
##df_csv['acc_age'] = df_csv["txn_date"].apply(acc_age_in_days)

##df_csv['age_quantile_rank'] = pd.qcut(df_csv['age'], 4, labels = False, duplicates='drop')
##df_csv['acc_age_quantile_rank'] = pd.qcut(df_csv['acc_age'], 4, labels = False, duplicates='drop')

##df_csv['age_decile_rank'] = pd.qcut(df_csv['age'], 10, labels = False, duplicates='drop')
##df_csv['acc_age_decile_rank'] = pd.qcut(df_csv['acc_age'], 10, labels = False, duplicates='drop')

### Drop some features

In [7]:
df_csv = df_csv.drop('name_of_client', axis = 1)
df_csv = df_csv.drop('txn_date', axis = 1)
df_csv = df_csv.drop('dob', axis = 1)
df_csv = df_csv.drop('scheme', axis = 1)
df_csv = df_csv.drop('beneficiary_identifier', axis = 1)

### Encode features for classification   
Here we are encoding catagoriacal data and dropping the original catagories.

In [8]:
df_csv = pd.concat([df_csv, pd.get_dummies(df_csv['gender'], prefix='gender')], axis=1)
df_csv = df_csv.drop('gender', axis = 1)

df_csv = pd.concat([df_csv, pd.get_dummies(df_csv['is_employed'], prefix='is_employed')], axis=1)
df_csv = df_csv.drop('is_employed', axis = 1)

In [9]:
## Checking null values
print('\nNaN',len(df_csv) - df_csv.count(),'\n\n\n')


NaN income             0
amount             0
label              0
age                0
gender_FEMALE      0
gender_MALE        0
is_employed_NO     0
is_employed_YES    0
dtype: int64 





In [10]:
df_csv.head()

Unnamed: 0,income,amount,label,age,gender_FEMALE,gender_MALE,is_employed_NO,is_employed_YES
0,180000,1000,FRAUD,30,1,0,0,1
1,180000,500,FRAUD,30,1,0,0,1
2,275000,6000,NORMAL,41,1,0,0,1
3,300000,1000,NORMAL,29,0,1,0,1
4,600000,2000,COULD BE FRAUD,32,1,0,0,1


In [11]:
df_csv.shape

(2041, 8)

### Create training and testing datasets

In [12]:
y = df_csv['label'] 
X = df_csv.drop('label', axis = 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) 

### Create Randomforest classifier

In [13]:
clf = RandomForestClassifier(n_estimators = 3000, random_state = 42) 
model = clf.fit(X_train,y_train)

In [14]:
pprint(clf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 3000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [15]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

In [16]:
df_res = pd.DataFrame(data = y_pred_prob, columns = ["COULD BE FRAUD", "FRAUD", "NORMAL", "SUSPECT"])
df_res['PREDICTED_STATUS'] = y_pred

### Various matricies for accuracy

In [17]:
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[  7   0   7   0]
 [  0  33   9   0]
 [  5   4 530   2]
 [  0   0   6  10]]


In [18]:
base_accuracy = accuracy_score(y_test, y_pred)
print("Base accuracy: ", base_accuracy)

Base accuracy:  0.9461663947797716


In [19]:
report_dict = classification_report(y_test, y_pred, output_dict = True)
pd.DataFrame(report_dict)

Unnamed: 0,COULD BE FRAUD,FRAUD,NORMAL,SUSPECT,accuracy,macro avg,weighted avg
precision,0.583333,0.891892,0.960145,0.833333,0.946166,0.817176,0.943553
recall,0.5,0.785714,0.979667,0.625,0.946166,0.722595,0.946166
f1-score,0.538462,0.835443,0.969808,0.714286,0.946166,0.7645,0.944081
support,14.0,42.0,541.0,16.0,0.946166,613.0,613.0


### Save model

In [20]:
joblib.dump(model, "model.pkl");
rnd_columns = list(X_train.columns)
joblib.dump(rnd_columns, 'rnd_columns.pkl');