# Introduction

This dataset has been generated using BankSim, a bank simulator for a Spanish bank.

## Data generation method

BankSim was run several times for 180 steps (approx. six months), and its parameters were calibrated in order to obtain a distribution close enough to the original data to be reliable for testing. Several log files were collected and the most accurate one was selected. Thieves were injected that aim to steal an average of three cards per step and perform about two fraudulent transactions per day. In total, 594643 records were produced, of which 587443 are normal payments and 7200 are fraudulent transactions. Since this is a randomised simulation, the values are of course not identical to the original data.

## Original paper

Original paper:

Lopez-Rojas, Edgar Alonso ; Axelsson, Stefan
BankSim: A bank payments simulator for fraud detection research. In: Proceedings of the
26th European Modeling and Simulation Symposium, EMSS 2014, Bordeaux, France, pp. 144–152, Dime University of Genoa, 2014, ISBN: 9788897999324.
https://www.researchgate.net/publication/265736405_BankSim_A_Bank_Payment_Simulation_for_Fraud_Detection_Research



# Analysis preparation

## Load packages

In [None]:
import numpy as np 
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import seaborn as sns
%matplotlib inline 
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


## Load the data

In [None]:
# Main payments table: explored in the plots below and used to train the model.
data_df = pd.read_csv(filepath_or_buffer="/kaggle/input/banksim1/bs140513_032310.csv")
# "NET" variant of the same run; later cells read its Source, Target, Weight,
# typeTrans and fraud columns.
data_red_df = pd.read_csv(filepath_or_buffer="/kaggle/input/banksim1/bsNET140513_032310.csv")

## Glimpse the data

In [None]:
# Dimensions (rows, columns) of the NET table.
print(data_red_df.shape)

In [None]:
# Dimensions (rows, columns) of the main payments table.
print(data_df.shape)

In [None]:
# Preview the first rows of the main payments table.
data_df.head()

In [None]:
# Preview the first rows of the NET table.
data_red_df.head()

In [None]:
# Number of distinct values in selected columns of the main payments table.
# Each pair is (label shown in the output, column name in data_df).
for label, column in (
    ("customers", "customer"),
    ("merchants", "merchant"),
    ("age", "age"),
    ("gender", "gender"),
    ("zipCode Origin", "zipcodeOri"),
    ("zipCode Merchant", "zipMerchant"),
    ("category", "category"),
    ("amount", "amount"),
    ("fraud", "fraud"),
):
    print(f"unique {label}: {data_df[column].nunique()}")

In [None]:
# Number of distinct values in each inspected column of the NET table;
# the printed label is the column name itself.
for column in ("Source", "Target", "Weight", "typeTrans", "fraud"):
    print(f"unique {column}: {data_red_df[column].nunique()}")

# Data Exploration


We define a few utility plotting functions.  


We will explore all the features, as well as the interactions between features.

In [None]:
def plot_count(df, feature, title='', size=2):
    """Draw a frequency-ordered count plot of ``df[feature]`` with percentage labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that holds the column to plot.
    feature : str
        Name of the column whose values are counted.
    title : str, optional
        Title placed above the plot.
    size : int or float, optional
        Scale factor: the figure is ``3*size`` by ``2*size`` inches, and for
        ``size > 2`` the x tick labels are rotated by 90 degrees and shrunk.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    f, ax = plt.subplots(1, 1, figsize=(3 * size, 2 * size))
    total = float(len(df))
    # Pass the column through the ``x`` keyword: recent seaborn releases do not
    # interpret a positional argument as ``x``. Binding the plot to ``ax``
    # guarantees the bars are drawn on the axes that are annotated below.
    sns.countplot(x=df[feature], order=df[feature].value_counts().index,
                  palette='Set3', ax=ax)
    plt.title(title)
    if size > 2:
        plt.xticks(rotation=90, size=8)
    # An empty frame has no bars to label, and the percentage would divide by zero.
    if total > 0:
        for p in ax.patches:
            height = p.get_height()
            # Centre the label horizontally on the bar, slightly above its top.
            ax.text(p.get_x() + p.get_width() / 2.,
                    height + 3,
                    '{:1.2f}%'.format(100 * height / total),
                    ha="center")
    plt.show()

In [None]:
# Age groups; size=2.5 is above the size > 2 threshold at which plot_count rotates the x tick labels.
plot_count(data_df, 'age', 'Distribution of age (count & percent)', size=2.5)

In [None]:
# Gender values, drawn at the default figure size.
plot_count(data_df, 'gender', 'Distribution of gender (count & percent)')

In [None]:
# Payment categories; size=4 gives a larger figure with rotated x tick labels.
plot_count(data_df, 'category', 'Distribution of category (count & percent)', size=4)

In [None]:
# Class balance of the target: one bar per value of `fraud`, labelled with its count.
temp = data_df["fraud"].value_counts()
# Two-column frame: the label value (`fraud`) and how often it occurs (`values`).
df = temp.rename_axis('fraud').reset_index(name='values')

trace = go.Bar(
    x=df['fraud'],
    y=df['values'],
    text=df['values'],
    name="Payments fraud - data unbalance (Not fraud = 0, Fraud = 1)",
    marker={'color': "Red"},
)
data = [trace]
layout = {
    'title': 'Payments Fraud - data unbalance (Not fraud = 0, Fraud = 1)',
    'xaxis': {'title': 'Fraud', 'showticklabels': True},
    'yaxis': {'title': 'Number of transactions'},
    'hovermode': 'closest',
    'width': 600,
}
fig = {'data': data, 'layout': layout}
iplot(fig, filename='class')

In [None]:
def plot_distplot_grouped(df, feature, time_feature="step"):
    """Plot one density curve of ``time_feature`` per distinct value of ``feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that holds both columns.
    feature : str
        Column whose distinct values define the groups (one curve each).
    time_feature : str, optional
        Column whose distribution is plotted on the x axis; defaults to
        ``"step"``, the column used by the existing calls.

    Raises
    ------
    ValueError
        If no group has enough data for a density estimate.

    The distinct values of ``feature`` are printed, and the figure is rendered
    with ``iplot``; nothing is returned.
    """
    classes = list(df[feature].unique())
    print(classes)
    group_labels = []
    hist_data = []
    for item in classes:
        crt_class = df.loc[df[feature] == item, time_feature]
        # The density curve is a kernel density estimate, which cannot be
        # computed from fewer than two distinct observations (this also skips
        # the empty selection produced by a NaN group value).
        if crt_class.nunique() < 2:
            continue
        group_labels.append(f"{item}")
        hist_data.append(crt_class)
    if not hist_data:
        raise ValueError(
            f"no group of `{feature}` has at least two distinct `{time_feature}` values"
        )
    fig = ff.create_distplot(hist_data, group_labels, show_hist=False, show_rug=False)
    fig['layout'].update(
        title=f'Payments Transactions Time Density Plot - grouped by `{feature}`',
        xaxis=dict(title=f'Time [{time_feature}]'),
    )
    iplot(fig, filename='dist_only')

In [None]:
# Density of transactions over the simulation steps, one curve per fraud label.
plot_distplot_grouped(data_df, 'fraud')

In [None]:
# Density of transactions over the simulation steps, one curve per age value.
plot_distplot_grouped(data_df, 'age')

In [None]:
# Density of transactions over the simulation steps, one curve per gender value.
plot_distplot_grouped(data_df, 'gender')

In [None]:
# Density of transactions over the simulation steps, one curve per payment category.
plot_distplot_grouped(data_df, 'category')

In [None]:
# Amount by fraud label, drawn twice side by side:
# first panel with outlier points, second panel without them.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
for axis, with_fliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        x="fraud", y="amount", hue="fraud", data=data_df,
        palette="PRGn", showfliers=with_fliers, ax=axis,
    )
plt.show()

In [None]:
# Amount by gender, drawn twice side by side:
# first panel with outlier points, second panel without them.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
for axis, with_fliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        x="gender", y="amount", hue="gender", data=data_df,
        palette="PRGn", showfliers=with_fliers, ax=axis,
    )
plt.show()

In [None]:
# Amount by age value, drawn twice side by side:
# first panel with outlier points, second panel without them.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16,6))
for axis, with_fliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        x="age", y="amount", hue="age", data=data_df,
        palette="PRGn", showfliers=with_fliers, ax=axis,
    )
plt.show()

In [None]:
# Amount by payment category, drawn twice in stacked rows:
# first panel with outlier points, second panel without them.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16,12))
for axis, with_fliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        x="category", y="amount", hue="category", data=data_df,
        palette="PRGn", showfliers=with_fliers, ax=axis,
    )
plt.show()

# Model

From the data, we will use 70% for training and 30% for validation.  

The CatBoostClassifier (CatBoost) algorithm is used.  

ROC-AUC is the chosen metric.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

Prepare the features (X) and labels (y).

In [None]:
# Labels: the `fraud` column. Features: every remaining column of data_df.
y = data_df['fraud']
X = data_df.drop(columns=['fraud'])

Perform train-validation split.

In [None]:
# 70% of the rows go to training, the rest to validation; the fixed random_state makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)

Specify the categorical features. CatBoost handles categorical features natively, so no special preprocessing (label encoding, one-hot encoding, binary encoding) is needed.


In [None]:
# Positions of the columns passed to CatBoost as categorical: every column whose
# dtype is not float (this includes integer columns as well as string columns).
# The builtin `float` is used because the `np.float` alias for it was removed
# in NumPy 1.24 and now raises AttributeError.
categorical_features_indices = np.where(X.dtypes != float)[0]


Initialize the algorithm with a few tuned hyperparameters.

In [None]:
# Classifier configuration, one keyword per line and grouped by purpose.
clf = CatBoostClassifier(
    # Boosting schedule, tree depth and randomisation.
    iterations=500,
    learning_rate=0.02,
    depth=12,
    bagging_temperature=0.2,
    random_seed=42,
    # Metric tracked on the evaluation set and the interval (in iterations) between evaluations.
    eval_metric='AUC',
    metric_period=20,
    # Overfitting-detector settings.
    od_type='Iter',
    od_wait=25,
)

Fit the training data.

In [None]:
# Train on the training split; the validation split is supplied as the evaluation
# set and plot=True requests CatBoost's training plot.
clf.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    plot=True,
)

Perform prediction for validation data.

In [None]:
# Predicted labels for the validation rows.
preds = clf.predict(X_validation)

Show the confusion matrix.

In [None]:
# Confusion matrix: counts of actual (rows) vs. predicted (columns) labels
# on the validation split.
cm = pd.crosstab(y_validation.values, preds, rownames=['Actual'], colnames=['Predicted'])
fig, (ax1) = plt.subplots(ncols=1, figsize=(5,5))
sns.heatmap(cm, 
            xticklabels=['Not Fraud', 'Fraud'],
            yticklabels=['Not Fraud', 'Fraud'],
            # fmt='d' prints each cell as a whole count; the default '.2g'
            # format renders large counts in scientific notation (e.g. 1.8e+05).
            annot=True, fmt='d', ax=ax1,
            linewidths=.2,linecolor="Darkblue", cmap="Blues")
plt.title('Confusion Matrix', fontsize=14)
plt.show()

Validation ROC-AUC.

In [None]:
# ROC-AUC measures how well the scores rank positives above negatives, so it
# needs the predicted probability of the positive class (column 1); hard 0/1
# labels reduce the ROC curve to a single operating point and distort the score.
validation_scores = clf.predict_proba(X_validation)[:, 1]
print(f"ROC-AUC score: {roc_auc_score(y_validation.values, validation_scores)}")