In [1]:
import os
os.environ["GIT_PYTHON_REFRESH"] = "quiet"
import git
import pickle

In [2]:
n_estimators = 50
n_estimators1 = 50

In [3]:
# Parameters
n_estimators = 200
n_estimators1 = 150


In [4]:
# Necessary imports for this notebook
import os

import numpy as np
import pandas as pd

import datetime
import time

import random

# For plotting
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

import mlflow
import logging
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE


In [5]:
# Reading the file

# Load a set of pickle files, put them together in a single DataFrame, and order them by time
# It takes as input the folder DIR_INPUT where the files are stored, and the BEGIN_DATE and END_DATE
def read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE):
    
    files = [os.path.join(DIR_INPUT, f) for f in os.listdir(DIR_INPUT) if f>=BEGIN_DATE+'.pkl' and f<=END_DATE+'.pkl']

    frames = []
    for f in files:
        df = pd.read_pickle(f)
        frames.append(df)
        del df
    df_final = pd.concat(frames)
    
    df_final=df_final.sort_values('TRANSACTION_ID')
    df_final.reset_index(drop=True,inplace=True)
    #  Note: -1 are missing values for real world data 
    df_final=df_final.replace([-1],0)
    
    return df_final

In [6]:
DIR_INPUT='C:/Users/RUPESH/Desktop/Datascience_new/My_code/Fraud_Model' 
# Loading 2 month data
BEGIN_DATE = "2018-04-01"
END_DATE = "2018-06-01"

print("Load  files")
%time transactions_df=read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)

transactions_df.head()

transactions_df.shape

transactions_df.info()

transactions_df.TX_FRAUD_SCENARIO.nunique()

transactions_df.TX_TIME_DAYS.unique()

transactions_df.columns

Load  files


Wall time: 1.29 s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594735 entries, 0 to 594734
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   TRANSACTION_ID     594735 non-null  int64         
 1   TX_DATETIME        594735 non-null  datetime64[ns]
 2   CUSTOMER_ID        594735 non-null  int64         
 3   TERMINAL_ID        594735 non-null  int64         
 4   TX_AMOUNT          594735 non-null  float64       
 5   TX_TIME_SECONDS    594735 non-null  int64         
 6   TX_TIME_DAYS       594735 non-null  int64         
 7   TX_FRAUD           594735 non-null  int64         
 8   TX_FRAUD_SCENARIO  594735 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(7)
memory usage: 40.8 MB


Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TX_FRAUD',
       'TX_FRAUD_SCENARIO'],
      dtype='object')

In [7]:
#### creating weekend feature by date time feature

from datetime import datetime
%time transactions_df['TX_DURING_WEEKEND'] = transactions_df['TX_DATETIME'].apply(lambda x : x.weekday())

transactions_df.head()

Wall time: 1.93 s


Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DURING_WEEKEND
0,0,2018-04-01 00:00:31,596,3156,57.16,31,0,0,0,6
1,1,2018-04-01 00:02:10,4961,3412,81.51,130,0,0,0,6
2,2,2018-04-01 00:07:56,2,1365,146.0,476,0,0,0,6
3,3,2018-04-01 00:09:29,4128,8737,64.49,569,0,0,0,6
4,4,2018-04-01 00:10:34,927,9906,50.99,634,0,0,0,6


In [8]:
#### creating night or day feature with date time time feature

%time transactions_df['TX_DURING_NIGHT'] = transactions_df.TX_DATETIME.apply(lambda X : 1 if(X.hour >= 18 or X.hour <= 6)else 0)

transactions_df.head()

transactions_df.shape

transactions_df.CUSTOMER_ID.nunique()


transactions_df = transactions_df.set_index('TX_DATETIME').sort_values('TX_DATETIME')

transactions_df.head()

Wall time: 2 s


Unnamed: 0_level_0,TRANSACTION_ID,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DURING_WEEKEND,TX_DURING_NIGHT
TX_DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-04-01 00:00:31,0,596,3156,57.16,31,0,0,0,6,1
2018-04-01 00:02:10,1,4961,3412,81.51,130,0,0,0,6,1
2018-04-01 00:07:56,2,2,1365,146.0,476,0,0,0,6,1
2018-04-01 00:09:29,3,4128,8737,64.49,569,0,0,0,6,1
2018-04-01 00:10:34,4,927,9906,50.99,634,0,0,0,6,1


In [9]:
#### average money spent by each customer in 30 day 

def get_customer_spending_behaviour_features(customer_transactions):
        
    # Compute the sum of the transaction amounts and the number of transactions for 30 day
    SUM_AMOUNT_TX_WINDOW=customer_transactions['TX_AMOUNT'].rolling(str(30)+'d').sum()
    NB_TX_WINDOW=customer_transactions['TX_AMOUNT'].rolling(str(30)+'d').count()

    # Compute the average transaction amount for the given window size
    # NB_TX_WINDOW is always >0 since current transaction is always included
    AVG_AMOUNT_TX_WINDOW=SUM_AMOUNT_TX_WINDOW/NB_TX_WINDOW

    # Save feature values
    customer_transactions['CUSTOMER_ID_NB_TX_'+str(30)+'DAY_WINDOW']=list(NB_TX_WINDOW)
    customer_transactions['CUSTOMER_ID_AVG_AMOUNT_'+str(30)+'DAY_WINDOW']=list(AVG_AMOUNT_TX_WINDOW)
    
        
    # And return the dataframe with the new features
    return customer_transactions

%time transactions_df=transactions_df.groupby('CUSTOMER_ID').apply(lambda x: get_customer_spending_behaviour_features(x))


Wall time: 10.2 s


In [10]:
#####Calculating risk score average number of fraudulent transactions that occurred on a terminal ID over 30 days before delay of 7 day

In [11]:
def get_count_risk_rolling_window(terminal_transactions, delay_period=7, feature="TERMINAL_ID"):
    
    
    NB_FRAUD_DELAY=terminal_transactions['TX_FRAUD'].rolling(str(delay_period)+'d').sum()
    NB_TX_DELAY=terminal_transactions['TX_FRAUD'].rolling(str(delay_period)+'d').count()
    
    
    NB_FRAUD_DELAY_WINDOW=terminal_transactions['TX_FRAUD'].rolling(str(delay_period + 30)+'d').sum()
    NB_TX_DELAY_WINDOW=terminal_transactions['TX_FRAUD'].rolling(str(delay_period + 30)+'d').count()

    NB_FRAUD_WINDOW=NB_FRAUD_DELAY_WINDOW-NB_FRAUD_DELAY
    NB_TX_WINDOW=NB_TX_DELAY_WINDOW-NB_TX_DELAY

    RISK_WINDOW=NB_FRAUD_WINDOW/NB_TX_WINDOW

    terminal_transactions[feature+'_NB_TX_'+str(30)+'DAY_WINDOW']=list(NB_TX_WINDOW)
    terminal_transactions[feature+'_RISK_'+str(30)+'DAY_WINDOW']=list(RISK_WINDOW)
        
    
    # Replace NA values with 0 (all undefined risk scores where NB_TX_WINDOW is 0) 
    terminal_transactions.fillna(0,inplace=True)
    
    return terminal_transactions

%time transactions_df=transactions_df.groupby('TERMINAL_ID').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, feature="TERMINAL_ID"))


Wall time: 26.5 s


In [12]:
transactions_df.tail()

transactions_df=transactions_df.reset_index()

print(transactions_df.head())

transactions_df.isnull().sum()

          TX_DATETIME  TRANSACTION_ID  CUSTOMER_ID  TERMINAL_ID  TX_AMOUNT  \
0 2018-04-01 00:00:31               0          596         3156      57.16   
1 2018-04-01 00:02:10               1         4961         3412      81.51   
2 2018-04-01 00:07:56               2            2         1365     146.00   
3 2018-04-01 00:09:29               3         4128         8737      64.49   
4 2018-04-01 00:10:34               4          927         9906      50.99   

   TX_TIME_SECONDS  TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  \
0               31             0         0                  0   
1              130             0         0                  0   
2              476             0         0                  0   
3              569             0         0                  0   
4              634             0         0                  0   

   TX_DURING_WEEKEND  TX_DURING_NIGHT  CUSTOMER_ID_NB_TX_30DAY_WINDOW  \
0                  6                1                             1

TX_DATETIME                            0
TRANSACTION_ID                         0
CUSTOMER_ID                            0
TERMINAL_ID                            0
TX_AMOUNT                              0
TX_TIME_SECONDS                        0
TX_TIME_DAYS                           0
TX_FRAUD                               0
TX_FRAUD_SCENARIO                      0
TX_DURING_WEEKEND                      0
TX_DURING_NIGHT                        0
CUSTOMER_ID_NB_TX_30DAY_WINDOW         0
CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW    0
TERMINAL_ID_NB_TX_30DAY_WINDOW         0
TERMINAL_ID_RISK_30DAY_WINDOW          0
dtype: int64

In [13]:
print(transactions_df.TX_FRAUD.value_counts())

#### highly inbalance data

transactions_df.columns

output_feature="TX_FRAUD"

input_features=['TX_AMOUNT','TX_DURING_WEEKEND', 'TX_DURING_NIGHT', 'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW', 'TERMINAL_ID_NB_TX_30DAY_WINDOW',
       'TERMINAL_ID_RISK_30DAY_WINDOW']

y=transactions_df['TX_FRAUD']
x=transactions_df[input_features]

0    590326
1      4409
Name: TX_FRAUD, dtype: int64


In [14]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0, stratify=y)

# over sampling with SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train.astype('float'), y_train)

In [15]:
from collections import Counter
print('Before SMOTE:' , Counter(y_train))
print('After SMOTE:' , Counter(y_train_smote))

Before SMOTE: Counter({0: 472261, 1: 3527})
After SMOTE: Counter({0: 472261, 1: 472261})


In [16]:
with mlflow.start_run():
    rf = RandomForestClassifier(n_estimators=n_estimators)
    rf.fit(X_train_smote, y_train_smote)
    y_pred = rf.predict(X_test)
    
    #output_path = 'C:/Users/RUPESH/Desktop/Datascience_new/My_code/Fraud_Model'
    with open('model_pkl', 'wb') as files:
        pickle.dump(rf, files)


    print(confusion_matrix(y_test, y_pred))
    print(roc_auc_score(y_test, y_pred))
    print(roc_curve(y_test, y_pred))
    print(classification_report(y_test,y_pred))


    xg= xgb.XGBClassifier(n_estimators=n_estimators1)
    xg.fit(X_train_smote, y_train_smote)
    y_pred1 = xg.predict(X_test)
    
    with open('model_pkl1', 'wb') as files:
        pickle.dump(xg, files)


    print(confusion_matrix(y_test, y_pred1))
    print(roc_auc_score(y_test, y_pred1))
    print(roc_curve(y_test, y_pred1))
    print(classification_report(y_test,y_pred1))
    recall = confusion_matrix(y_test, y_pred1)[1,1]/(confusion_matrix(y_test, y_pred1)[1,1]+confusion_matrix(y_test, y_pred1)[1,0])
    mlflow.log_metric("roc_auc_score", roc_auc_score(y_test, y_pred1))
    mlflow.log_metric("recall", recall)

[[117834    231]
 [   347    535]]
0.8023097071802082
(array([0.        , 0.00195655, 1.        ]), array([0.        , 0.60657596, 1.        ]), array([2, 1, 0], dtype=int64))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    118065
           1       0.70      0.61      0.65       882

    accuracy                           1.00    118947
   macro avg       0.85      0.80      0.82    118947
weighted avg       0.99      1.00      0.99    118947







[[117947    118]
 [   362    520]]
0.7942848557709622
(array([0.00000000e+00, 9.99449456e-04, 1.00000000e+00]), array([0.        , 0.58956916, 1.        ]), array([2, 1, 0], dtype=int64))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    118065
           1       0.82      0.59      0.68       882

    accuracy                           1.00    118947
   macro avg       0.91      0.79      0.84    118947
weighted avg       1.00      1.00      1.00    118947



In [17]:
# loading random forest model and testing 
with open('model_pkl' , 'rb') as f:
    rf = pickle.load(f)

In [18]:
y_pred = rf.predict(X_test)
print(roc_auc_score(y_test, y_pred))

0.8023097071802082


In [19]:
# loading xg boost model and testing 
with open('model_pkl1' , 'rb') as f:
    xg = pickle.load(f)

In [20]:
y_pred = xg.predict(X_test)
print(roc_auc_score(y_test, y_pred))

0.7942848557709622
