In [1]:
# Import necessary ibraries

import pandas as pd
import numpy as np

In [3]:
# # Loading the data files
# df_1 = pd.read_csv("mnt/data/reference.csv")
# df_2 = pd.read_csv("mnt/data/analysis.csv")
#
# # merging the analysis and reference files
# df = pd.concat([df_1, df_2])

df = pd.read_csv("mnt/data/reference.csv")

# printing the no.of rows and no.of columns
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

# converting the timestamp into date and adding the date column
df['date'] = pd.to_datetime(df['timestamp']).dt.date

print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

df['timestamp'] = pd.to_datetime(df['timestamp'])

Number of rows:  50207
Number of columns:  7
Number of rows:  50207
Number of columns:  8


In [4]:
# Creation of time-based features
df['transaction_hour'] = df.timestamp.dt.hour
df['transaction_day'] = df.timestamp.dt.dayofweek

# Creation of Log based features
df['amount_log'] = np.log1p(df['transaction_amount'])

# Creation binary flags
df['high_amount_flag'] = (df['transaction_amount'] > df['transaction_amount'].median()).astype(int)

# Interaction Feature creation
df['amount_x_time'] = df['transaction_amount'] * df['time_since_login_min']

In [5]:
# checking null values and duplicates

df.duplicated().sum(), df.isnull().sum()

(np.int64(0),
 timestamp                  0
 time_since_login_min       0
 transaction_amount         0
 transaction_type        3052
 is_first_transaction       0
 user_tenure_months         0
 is_fraud                   0
 date                       0
 transaction_hour           0
 transaction_day            0
 amount_log                 0
 high_amount_flag           0
 amount_x_time              0
 dtype: int64)

In [6]:
df.transaction_type.value_counts()

transaction_type
PAYMENT     18927
CASH-OUT    12534
CASH-IN     12504
TRANSFER     3190
Name: count, dtype: int64

In [22]:
df1 = df[df.transaction_type.notnull()].copy()
features_ = ['amount_log', 'time_since_login_min', 'user_tenure_months', 'transaction_type', 'is_first_transaction', 'transaction_hour','is_fraud']
df1.isnull().sum().sum()

np.int64(0)

In [23]:
df1[features_].head(3)

Unnamed: 0,amount_log,time_since_login_min,user_tenure_months,transaction_type,is_first_transaction,transaction_hour,is_fraud
0,8.289565,1.56175,0.31898,PAYMENT,False,0,1.0
1,7.145906,1.658074,7.391323,PAYMENT,False,0,0.0
2,7.593727,2.454287,0.781225,CASH-IN,False,0,1.0


In [24]:
df1['transaction_type'] = df1['transaction_type'].map({'PAYMENT':1, 'CASH-OUT':2, 'CASH-IN':3, 'TRANSFER':4})
df1['is_first_transaction'] = df1['is_first_transaction'].map({False:0, True:1})

In [25]:
df1[features_].head(10)

Unnamed: 0,amount_log,time_since_login_min,user_tenure_months,transaction_type,is_first_transaction,transaction_hour,is_fraud
0,8.289565,1.56175,0.31898,1,0,0,1.0
1,7.145906,1.658074,7.391323,1,0,0,0.0
2,7.593727,2.454287,0.781225,3,0,0,1.0
3,7.72586,2.392085,0.680473,2,0,0,1.0
4,7.662844,2.189806,8.542895,3,0,0,1.0
5,7.205784,2.253766,2.535341,3,0,0,1.0
6,8.429127,1.405208,8.912317,1,0,0,0.0
8,8.639252,2.198534,3.497238,2,0,1,1.0
9,7.621195,2.343364,10.381912,4,0,1,0.0
10,8.347875,1.476141,5.326502,1,0,1,0.0


In [36]:
# Logistic Model Training
from sklearn.model_selection import train_test_split

X = df1[features_].drop('is_fraud', axis=1)
y = df1['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
print("Logistic Regression : ")
print(classification_report(y_test, y_pred))

y_prob = model_lr.predict_proba(X_test)[:,1]

roc_auc = roc_auc_score(y_test, y_pred)
print(roc_auc)
print("------------------------------------------------------")

Logistic Regression : 
              precision    recall  f1-score   support

         0.0       0.65      0.80      0.72      4723
         1.0       0.74      0.57      0.64      4708

    accuracy                           0.68      9431
   macro avg       0.70      0.68      0.68      9431
weighted avg       0.70      0.68      0.68      9431

0.6846837526225628
------------------------------------------------------


In [33]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=400)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

print(classification_report(y_test, y_pred))

y_prob = model_rf.predict_proba(X_test)[:,1]

roc_auc = roc_auc_score(y_test, y_pred)
print(roc_auc)


              precision    recall  f1-score   support

         0.0       0.66      0.81      0.73      4754
         1.0       0.75      0.58      0.66      4677

    accuracy                           0.70      9431
   macro avg       0.71      0.70      0.69      9431
weighted avg       0.71      0.70      0.69      9431

0.6978242060139267
