In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Show every column and the full contents of each cell when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Row-limit override is left disabled.
#pd.set_option('display.max_rows', None)

import gc

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, average_precision_score
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, auc

from sklearn.preprocessing import StandardScaler, LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

from plotly import tools
from plotly.offline import iplot
import plotly.graph_objs as go
# cufflinks links plotly to pandas; the `.iplot(...)` calls on Series/DataFrames
# further down in this notebook rely on it.
import cufflinks as cf
cf.go_offline()
# NOTE(review): go_offline() is immediately followed by offline = False here;
# confirm which mode is actually intended.
cf.set_config_file(offline = False, world_readable = True)

from IPython.display import display

# Global plotting defaults: large figures, larger titles, white-grid style, Set2 palette.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# matplotlib renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names. Try the new name first and fall back to the legacy one
# so the cell runs on both old and new matplotlib versions.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
sns.set_palette('Set2')

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

from time import time, strftime, gmtime
# Wall-clock start; the last cell subtracts this from the finish time to report total runtime.
start = time()
import datetime
# Print the timestamp at which this run started.
print(str(datetime.datetime.now()))

import warnings
# Silence all warnings for the rest of the notebook.
warnings.simplefilter('ignore')

In [None]:
# Load the transaction table and show its shape plus the first rows.
bank_trans = pd.read_csv('/kaggle/input/banksim1/bs140513_032310.csv')
print(bank_trans.shape)
bank_trans.head()

In [None]:
# Load the second (network) CSV and show its shape plus the first rows.
# NOTE(review): bank_network is not referenced again in the visible notebook.
bank_network = pd.read_csv('/kaggle/input/banksim1/bsNET140513_032310.csv')
print(bank_network.shape)
bank_network.head()

In [None]:
# Column dtypes and non-null counts.
bank_trans.info()

__Check for Missing values__

In [None]:
# Per-column flag: True if the column contains at least one missing value.
bank_trans.isna().any()

# Data Cleaning
Before proceeding, it's better to remove the quotes from the strings in the columns.

In [None]:
# Every column except 'step', 'amount' and 'fraud' is selected for quote stripping in the next cell.
cols_to_change = [c for c in bank_trans.columns if c not in ['step', 'amount', 'fraud']]
cols_to_change

In [None]:
# Strip the single quotes wrapped around the string values.
# Uses the vectorised, literal (non-regex) string replace per column instead of
# an element-wise applymap: faster, tolerant of missing values, and avoids the
# deprecated DataFrame.applymap.
bank_trans[cols_to_change] = bank_trans[cols_to_change].apply(
    lambda column: column.str.replace("'", "", regex = False))

# EDA

In [None]:
# Report how many distinct values each column holds.
for col, n_unique in bank_trans.nunique().items():
    print(f"Feature {col.upper()} has {n_unique} items")

The 'zipcodeOri' and 'zipMerchant' features each have only one unique value, so we can drop these two features from the dataset.

In [None]:
# Drop the two single-valued columns. errors = 'ignore' keeps the cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
bank_trans = bank_trans.drop(['zipcodeOri', 'zipMerchant'], axis = 1, errors = 'ignore')
bank_trans.head(2)

In [None]:
#Viz Utils

def countplot_helper(data: pd.DataFrame, col: str, title: str = None, hue: str = None):
    """Draw a count plot of `col` and annotate each bar with its share of all rows.

    Parameters
    ----------
    data : DataFrame holding the column(s) to plot.
    col : column whose values are counted; bars are ordered by descending frequency.
    title : optional figure title.
    hue : optional column used to split every bar into sub-bars.

    Each label is the bar height as a percentage of `len(data)` (not of the
    bar's own group). X tick labels are rotated when `col` has more than
    5 distinct values.
    """
    plt.figure(figsize = (16, 12))
    plt.title(title)
    ax = sns.countplot(data = data, x = col, order = data[col].value_counts().index,
                       hue = hue or None)
    n_rows = len(data)
    for p in ax.patches:
        height = p.get_height()
        # Skip patches with NaN or zero height (e.g. a hue level that never occurs
        # for a category, or placeholder patches); they would otherwise be
        # labelled "nan%" / "0.0%".
        if not height > 0:
            continue
        ax.text(p.get_x() + p.get_width() / 2.0, height + 3,
                f"{round(100 * height / n_rows, 2)}%",
                ha = 'center')
    if data[col].nunique() > 5:
        plt.xticks(rotation = 45)
    plt.show()

In [None]:
# Class balance of the target column.
countplot_helper(bank_trans, 'fraud', title = 'No. of Geniune/Fraud Transactions')

- Fraudulent transactions make up 1.21% of the total transactions in the dataset
- Heavily imbalanced dataset

In [None]:
# Transactions per age value, split by the fraud flag.
countplot_helper(bank_trans, 'age', title = 'Feature - Age Distribution', hue = 'fraud')

In [None]:
# Transactions per gender value, split by the fraud flag.
countplot_helper(bank_trans, 'gender', title = 'Feature - Gender Distribution', hue = 'fraud')

In [None]:
# Transactions per category (all transactions together).
countplot_helper(bank_trans, 'category', title = 'Feature - Category Distribution')

- Most of the transactions have happened in the 'es_transportation' category
- It would be interesting to find out in which categories the fraudulent transactions happen

In [None]:
# Same category plot, now split by the fraud flag.
countplot_helper(bank_trans, 'category', title = 'Feature - Category Distribution', hue = 'fraud')

- There is no Fraud transaction in 'es_transportation'
- Let's remove the es_transportation category and check the distribution

In [None]:
# Categories that contain at least one fraudulent transaction.
# fraud_cats is reused later when plotting per-category merchant pivots.
fraud_cats = list(bank_trans[bank_trans['fraud'] == 1]['category'].unique())
print("The categories where fraud has been detected:")
print(*fraud_cats, sep = '\n')

In [None]:
# Re-plot the category distribution with the 'es_transportation' rows filtered out.
temp = bank_trans[bank_trans['category'] != 'es_transportation']
countplot_helper(temp, 'category', title = 'Feature - Category Distribution without ES_TRANSPORTATION', hue = 'fraud')

In [None]:
# Left panel: amount box plot per class. Right panel: log1p(amount) distribution per class.
fig, (ax1, ax2) = plt.subplots(ncols = 2, figsize = (16, 8))

ax1.set_title('Boxplot of Amount')
sns.boxplot(data = bank_trans, x = 'fraud', y = 'amount', hue = 'fraud', ax = ax1)

ax2.set_title('Log Distribution of Amount')
for flag, label in ((0, 'No Fraud'), (1, 'Fraud')):
    class_amounts = bank_trans.loc[bank_trans['fraud'] == flag, 'amount']
    sns.distplot(np.log1p(class_amounts), label = label, ax = ax2)
ax2.legend()
plt.show()

In [None]:
# Max / min / mean / median transaction amount for each class.
# The median for the non-fraud class previously filtered on fraud == 9, which
# matches no rows and therefore printed NaN; both classes now share one code
# path so the filter value cannot drift again.
for header, flag in (('Fraud Transaction:', 1), ('\nNo Fraud Transaction:', 0)):
    amounts = bank_trans.loc[bank_trans['fraud'] == flag, 'amount']
    print(header)
    print(f"Max. Transaction Amount: {amounts.max()}")
    print(f"Min. Transaction Amount: {amounts.min()}")
    print(f"Avg. Transaction Amount: {amounts.mean()}")
    print(f"Median Transaction Amount: {amounts.median()}")

- The average amount of transaction is higher in Fraud compared to No Fraud

In [None]:
def bivariate_plots(df, x, y):
    """Scatter plot of column `y` against column `x` of `df`.

    Points are drawn in semi-transparent blue without edges; the axis labels
    are the column names and the title is '<x> vs. <y>'.
    """
    fig, ax = plt.subplots()
    ax.scatter(df[x], df[y], c = 'blue', edgecolors = 'none', alpha = 0.5)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_title(f'{x} vs. {y}')
    plt.show()

In [None]:
# Density of 'step' for each class, overlaid on one axes.
plt.title('Density Distribution plot of Step')
for flag, label in ((0, 'No Fraud'), (1, 'Fraud')):
    sns.kdeplot(bank_trans.loc[bank_trans['fraud'] == flag, 'step'], shade = True, label = label)
plt.legend()
plt.show()

In [None]:
# Independent copies of the fraud / non-fraud rows. They are taken here, before
# the encoding cells further down overwrite columns of bank_trans, and are used
# by the plotting helpers defined below.
fraud_df = bank_trans[bank_trans['fraud'] == 1].copy()
nofraud_df = bank_trans[bank_trans['fraud'] == 0].copy()
fraud_df.shape, nofraud_df.shape

In [None]:
# Mean amount per step: fraud on the left panel, non-fraud on the right.
fig, (ax1, ax2) = plt.subplots(ncols = 2, figsize = (20, 8))
panels = (
    (ax1, fraud_df, 'Fraud', 'violet'),
    (ax2, nofraud_df, 'No Fraud', 'blue'),
)
for axis, frame, label, line_color in panels:
    avg_by_step = frame.groupby('step')['amount'].mean()
    avg_by_step.plot(kind = 'line', label = label, color = line_color, legend = True, ax = axis)
    axis.set_ylabel('Avg. Amount')
plt.suptitle('Step Vs Avg. Transaction Amount', fontsize = 16)
plt.show()

- The average amount spent during every 'step' is higher for fraudulent transactions than for normal ones
- Let's check how gender and age influence the transactions

In [None]:
def subplots_helper(df: pd.DataFrame, col: str):
    """Plot mean amount per step for each distinct value of `col`, fraud vs. non-fraud.

    For every unique value of `df[col]` one figure with two panels is drawn:
    the left panel uses the rows with fraud == 1, the right panel the rows with
    fraud == 0. Each line takes the next colour from the active matplotlib
    colour cycle.

    Parameters
    ----------
    df : DataFrame with at least the columns `col`, 'fraud', 'step' and 'amount'.
    col : column whose values define the per-figure subsets.

    Returns
    -------
    None
    """
    colors = plt.rcParams['axes.prop_cycle']()
    # Split the frame that was passed in, instead of silently reading the
    # notebook-level fraud_df / nofraud_df and ignoring the `df` argument.
    panels = (
        (df[df['fraud'] == 1], 'Fraud'),
        (df[df['fraud'] == 0], 'No Fraud'),
    )
    for i, value in enumerate(df[col].unique()):
        fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize = (20, 8))
        for ax, (rows, label) in zip(axes, panels):
            c = next(colors)['color']
            avg_by_step = rows[rows[col] == value].groupby('step')['amount'].mean()
            # A value may have no rows in one class; leave that panel empty
            # rather than asking pandas to plot an empty Series.
            if not avg_by_step.empty:
                avg_by_step.plot(kind = 'line', label = label, color = c, legend = True, ax = ax)
            ax.set_ylabel('Avg. Amount')
        plt.suptitle(f'Step Vs Avg. Transaction Amount for {col}: {value}', y = 0.95, fontsize = 16)
        plt.figtext(0.5, 0.01, f"Figure {i + 1}", ha = 'center', fontsize = 16, bbox = {"facecolor": "grey", "alpha": 0.5, "pad": 5})
        plt.show()
    return None

In [None]:
# One fraud / non-fraud figure per gender value.
subplots_helper(bank_trans, 'gender')

In [None]:
# One fraud / non-fraud figure per age value.
subplots_helper(bank_trans, 'age')

- From the above plots it's clear that, across figures/features, the amount transacted is higher in fraudulent cases
- The bank could take special interest in high-value transactions and check them before they are approved (most banks do this now)

In [None]:
def cols_to_dict(col: str, kind: str = None):
    """Plot how many non-fraud transactions exist for each `col` value seen in fraud rows.

    Reads the notebook-level `fraud_df` and `nofraud_df`. For every distinct
    value of `fraud_df[col]` (in order of first appearance) the number of
    matching rows in `nofraud_df` is counted and plotted with cufflinks.

    Parameters
    ----------
    col : column to analyse (e.g. 'merchant' or 'customer').
    kind : plot kind passed to `.iplot`; a bar chart is drawn when omitted.
        Previously any non-empty value produced a line chart, so an explicit
        kind = 'bar' was ignored.

    Returns
    -------
    None
    """
    fraud_values = fraud_df[col].unique()
    # One value_counts pass replaces a full-frame boolean filter per value.
    counts = nofraud_df[col].value_counts().reindex(fraud_values, fill_value = 0)
    temp = counts.rename_axis(None).to_frame(col)
    temp.iplot(kind = kind if kind else 'bar',
              xTitle = col.title(),
              yTitle = 'No. of Transactions',
              title = f'No. of Normal Transactions done by {col.title()} in Fraudulent Transaction')
    return None

In [None]:
# Non-fraud transaction counts for merchants that appear in fraud rows.
cols_to_dict('merchant')

In [None]:
# Non-fraud transaction counts for customers that appear in fraud rows.
cols_to_dict('customer', kind = 'line')

__Which merchants see the highest number of fraudulent transactions?__

In [None]:
# Fraud rows only: count per merchant (no title is passed).
countplot_helper(fraud_df, col = 'merchant')

- __Of all fraudulent transactions, 43% happen with merchants M480139044 and M980657600__

- Which categories of items do customers buy from these merchants, and how much was transacted?

In [None]:
# For each of the two merchants, list the categories of its fraud rows and the summed amount.
for merchant_id in ('M480139044', 'M980657600'):
    temp = fraud_df.loc[fraud_df['merchant'] == merchant_id, ['category', 'amount']]
    print(f"Top Fraud category: {temp['category'].unique()}, Total amount transacted: {temp['amount'].sum()}")

- __For these two merchants, the fraudulent transactions fall in the es_health and es_sportandtoys categories, with total amounts transacted of 664,804 and 505,311 respectively.__
- What is the amount transacted in each category?

In [None]:
# Total fraud amount per category, sorted ascending, as an interactive bar chart.
fraud_df.groupby('category')['amount'].sum().sort_values().iplot(kind = 'bar', 
                                                  xTitle = 'Category', 
                                                  yTitle = 'Amount', 
                                                  title = 'Total Fradulent Amount Transacted in each Category', 
                                                  color = 'green')

- es_travel is the category with the largest total fraudulent amount (the chart above sums amounts; it does not count transactions)
- Which merchants were affected?

In [None]:
# Merchants that appear in fraud rows of the 'es_travel' category.
fraud_df[fraud_df['category'] == 'es_travel']['merchant'].unique()

In [None]:
# Total fraud amount per merchant within the 'es_travel' category.
es_travel_fraud = fraud_df[fraud_df['category'] == 'es_travel']
es_travel_fraud.groupby('merchant')['amount'].sum().iplot(
    kind = 'bar',
    xTitle = 'Merchants',
    yTitle = 'Amount',
    title = 'Total Fraudulent Amount Transacted in es_travel Category by Merchants',
    color = 'red')

In [None]:
def pivotplot_helper(df: pd.DataFrame, cat: str = None):
    """Bar chart of the summed amount per (merchant, fraud) pair inside one category.

    Parameters
    ----------
    df : DataFrame with 'category', 'merchant', 'amount' and 'fraud' columns.
    cat : category to restrict to. When omitted, a reminder is printed and
        nothing is plotted.

    The bar colour is drawn at random from a fixed palette on every call.
    """
    if cat is not None:
        palette = ['brown', 'pink', 'red', 'blue', 'green', 'orange', 'purple']
        in_category = df.loc[df['category'] == cat, ['merchant', 'amount', 'fraud']]
        # Columns become (merchant, fraud) pairs; missing combinations are filled with 0.
        summed = pd.pivot_table(data = in_category,
                                columns = ['merchant', 'fraud'],
                                values = ['amount'],
                                aggfunc = 'sum',
                                fill_value = 0)
        summed.T.iplot(kind = 'bar',
                       xTitle = 'Merchant - Fraud',
                       yTitle = 'Amount',
                       title = f'Total Amount Transacted by Merchant-Fraud/noFraud for {cat} Category',
                       color = np.random.choice(palette, 1))
        return None
    print('Please enter the category to plot')
    return

In [None]:
# One merchant/fraud pivot chart for every category that contains fraud.
for cat in fraud_cats:
    pivotplot_helper(bank_trans, cat = cat)

- __From the above plots we can see which merchants are affected by fraudulent transactions by looking at the total amount transacted; maybe the bank should flag these merchants and monitor their transactions closely__
- __We can do the same analysis for customer feature but flagging customers would be a bad idea for PR__

# Data Pre-processing

#### Cardinality is high for the features 'customer' and 'merchant', we use Frequency Encoding for this

In [None]:
#Frequency Encoding customer and merchant features
# Each value is replaced by the number of rows it occurs in. Series.map with the
# value_counts Series performs the lookup in one vectorised pass instead of a
# Python-level lambda per row.
# NOTE(review): the counts are computed on the full dataset, before any
# train/validation split; confirm that this is acceptable for the evaluation below.
for col in ['customer', 'merchant']:
    print(f"Frequency Encoding: {col} - {bank_trans[col].nunique()}")
    freq = bank_trans[col].value_counts()
    bank_trans[col] = bank_trans[col].map(freq)

In [None]:
#Label Encoding age, gender, category
# Every listed column is replaced by integer codes from a freshly fitted encoder.
for col in ('gender', 'age', 'category'):
    n_levels = bank_trans[col].nunique()
    print(f"Label Encoding: {col} - {n_levels}")
    le = LabelEncoder()
    le.fit(bank_trans[col])
    bank_trans[col] = le.transform(bank_trans[col])

In [None]:
# Standardise 'amount' in place; reshape(-1, 1) gives the scaler the 2-D input it expects.
# NOTE(review): the scaler is fitted on the full dataset, before the cross-validation
# split below; confirm that this is acceptable for the evaluation.
bank_trans['amount'] = StandardScaler().fit_transform(np.array(bank_trans['amount']).reshape(-1, 1))

# Training and Prediction

In [None]:
def plot_confusion(mat):
    """Render a confusion matrix as an annotated blue heatmap.

    `mat` is wrapped in a DataFrame before plotting; rows are labelled
    'Actual' and columns 'Predicted'.
    """
    fig, ax = plt.subplots(figsize = (8, 4))
    sns.heatmap(pd.DataFrame(mat), annot = True, annot_kws = {"size": 25}, cmap = 'Blues', fmt = 'g', ax = ax)
    ax.set_title('Confusion matrix', y = 1.1, fontsize = 22)
    ax.set_ylabel('Actual', fontsize = 18)
    ax.set_xlabel('Predicted', fontsize = 18)
    plt.show()

In [None]:
# Shuffle all rows and renumber the index 0..n-1.
# A fixed random_state makes the shuffle, and therefore the CV folds built on
# top of it, reproducible across Restart & Run All.
sample = bank_trans.sample(frac = 1, random_state = 2021).reset_index(drop = True)
sample.shape

In [None]:
# Features: every column except the target 'fraud' and 'step'. Target: 'fraud'.
X = sample.drop(['fraud', 'step'], axis = 1)
y = sample['fraud'].copy()

In [None]:
#scale_pos_weight - sample Weights
# Count each class explicitly. value_counts() is ordered by frequency, so
# picking .values[1] / .values[0] only meant pos / neg while the positive class
# happened to be the rarer one.
num_pos_samples = (y == 1).sum()
num_neg_samples = (y == 0).sum()
num_neg_samples / num_pos_samples, np.sqrt(num_neg_samples / num_pos_samples)

## When to use PR AUC?

#### When two classes are equally important

ROC AUC would be the metric to use if the goal of the model is to perform equally well on both classes. Image classification between cats and dogs is a good example, because the performance on cats is just as important as the performance on dogs.

#### When minority class is more important

PR AUC would be the metric to use if the focus of the model is to identify correctly as many positive samples as possible.

# Precision-Recall Curve

Precision-Recall is a useful measure of success of prediction when the classes are very imbalanced.

The precision-recall curve shows the tradeoff between precision and recall for different thresholds. __A high area under the curve represents both high recall and high precision, where high precision relates to a low false positive rate, and high recall relates to a low false negative rate.__ High scores for both show that the classifier is returning accurate results (high precision), as well as returning a majority of all positive results (high recall).

A system with high recall but low precision returns many results, but most of its predicted labels are incorrect when compared to the training labels. A system with high precision but low recall is just the opposite, returning very few results, but most of its predicted labels are correct when compared to the training labels. An ideal system with high precision and high recall will return many results, with all results labeled correctly.

[Ref](https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html)

In [None]:
def plot_pr_curve(precision, recall, avg_precision):
    """Plot precision against recall, with the average precision in the legend.

    The view is fixed to recall in [0.55, 1.0] and precision in [0.5, 1.05],
    so parts of the curve outside that window are not visible.
    """
    fig, ax = plt.subplots(figsize = (8, 6))
    legend_label = f"Avg. PR: {round(avg_precision, 2)}"
    ax.plot(recall, precision, label = legend_label)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_ylim([0.5, 1.05])
    ax.set_xlim([0.55, 1.0])
    ax.set_title('Precision-Recall Curve')
    ax.legend()
    plt.show()

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb

# Parameters for the native xgb.train API used in the CV loops below.
# 'seed' is the only RNG setting kept: the dict previously also carried
# 'random_state': 2020, a second and conflicting seed value, which left it
# ambiguous which one the booster actually used.
xgb_params = {
         'objective': 'binary:logistic',
         'lambda': 0.0030282073258141168,   # L2 regularisation
         'alpha': 0.01563845128469084,      # L1 regularisation
         'colsample_bytree': 0.55,
         'subsample': 0.7,
         'learning_rate': 0.01,
         'max_depth': 9,
         'min_child_weight': 257,
         'eval_metric': 'aucpr',
         'seed': 2021,
         # sqrt(neg/pos) is used here instead of the plain neg/pos ratio
         'scale_pos_weight': np.sqrt(num_neg_samples / num_pos_samples) #np.sqrt()?
         }

In [None]:
# Stratified 4-fold cross-validation on the full (shuffled) dataset.
# For every fold: train with early stopping on the validation split, then report
# average precision, the PR curve, the confusion matrix and the classification
# report at a 0.5 probability threshold.
n_folds = 4
preds_xg = []  # validation-fold probability arrays, one entry per fold

skf = StratifiedKFold(n_splits = n_folds)

for i, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold: {i + 1}")
    # skf.split yields positional indices, so both X and y are indexed with
    # .iloc. Plain y[trn_idx] is label-based and was only correct because the
    # index happened to be a fresh 0..n-1 range.
    Xtrain, ytrain = X.iloc[trn_idx], y.iloc[trn_idx]
    Xvalid, yvalid = X.iloc[val_idx], y.iloc[val_idx]

    xg_train = xgb.DMatrix(Xtrain, label = ytrain)
    xg_valid = xgb.DMatrix(Xvalid, label = yvalid)

    # Up to 10000 boosting rounds; early stopping watches the last entry of
    # `evals`, i.e. the validation split.
    clf = xgb.train(xgb_params,
                    xg_train,
                    10000,
                    verbose_eval = 200,
                    evals = [(xg_train, 'train'), (xg_valid, 'valid')],
                    early_stopping_rounds = 100)

    # NOTE(review): whether predict() uses the best iteration or all trained
    # rounds depends on the xgboost version; confirm for the installed one.
    valid_preds = clf.predict(xg_valid)

    avg_precision = average_precision_score(yvalid, valid_preds)
    print(f'\nAvg. Precision-Recall Score: {avg_precision}')

    precision, recall, _ = precision_recall_curve(yvalid, valid_preds)
    plot_pr_curve(precision, recall, avg_precision)

    # Hard labels at the 0.5 probability threshold.
    valid_labels = valid_preds > 0.5
    conf_mat = confusion_matrix(yvalid, valid_labels)
    plot_confusion(conf_mat)

    print(f"Classification Report: \n{classification_report(yvalid, valid_labels, labels = [0, 1])}")

    preds_xg.append(valid_preds)
    print()

In [None]:
# Feature importance of `clf`, i.e. the booster trained in the last CV fold only.
xgb.plot_importance(clf);

- __Our XGBoost model does a good job of keeping the False Negatives low (high Recall), which is preferred in a Fraud/Spam/Churn detection setup__
- __When using *'scale_pos_weight' = sqrt(neg/pos)*, the precision has increased (fewer FP) but the recall has decreased a bit (more FN) compared to using *'scale_pos_weight' = neg/pos*__
- From the feature importance plot, merchant and amount are the most important ones, which confirms what we observed from the plots in the EDA part
    <font color = 'green'>
    
    - Flagging the merchants where high fraudulent transactions happens
    - Checking whenever a high amount is transacted
    
    </font>

- We will attempt undersampling below, to check whether we can improve our model performance

# UnderSampling Majority Class

- The original dataset has positive/negative samples in the ratio 1:80
-  We will undersample the majority so that we get a ratio of 1:30

In [None]:
# Keep every fraud row and a random subset of 216000 non-fraud rows, then stack them.
# A fixed random_state makes the undersampled set, and the results built on it,
# reproducible across runs.
temp_fraud = bank_trans[bank_trans['fraud'] == 1].reset_index(drop = True)
temp_nofraud = bank_trans[bank_trans['fraud'] == 0].reset_index(drop = True)
temp_nofraud = temp_nofraud.sample(n = 216000, random_state = 2021).reset_index(drop = True)
df = pd.concat([temp_fraud, temp_nofraud]).reset_index(drop = True)
df.shape

In [None]:
# Class balance after undersampling.
countplot_helper(df, 'fraud', title = 'Target Countplot after UnderSampling')

In [None]:
# Rebuild features / target from the undersampled frame (overwrites the earlier X and y).
X = df.drop(['fraud', 'step'], axis = 1)
y = df['fraud'].copy()

In [None]:
#scale_pos_weight - sample Weights
# Count each class explicitly instead of relying on the frequency ordering of
# value_counts() to decide which entry is the positive class.
num_pos_samples = (y == 1).sum()
num_neg_samples = (y == 0).sum()
xgb_params['scale_pos_weight'] = np.sqrt(num_neg_samples / num_pos_samples) #set the new class weights

num_neg_samples / num_pos_samples, np.sqrt(num_neg_samples / num_pos_samples)

In [None]:
# Stratified 4-fold cross-validation on the undersampled dataset.
# For every fold: train with early stopping on the validation split, then report
# average precision, the PR curve, the confusion matrix and the classification
# report at a 0.5 probability threshold.
n_folds = 4
preds_xg = []  # validation-fold probability arrays, one entry per fold

skf = StratifiedKFold(n_splits = n_folds)

for i, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold: {i + 1}")
    # skf.split yields positional indices, so both X and y are indexed with
    # .iloc. Plain y[trn_idx] is label-based and was only correct because the
    # index happened to be a fresh 0..n-1 range.
    Xtrain, ytrain = X.iloc[trn_idx], y.iloc[trn_idx]
    Xvalid, yvalid = X.iloc[val_idx], y.iloc[val_idx]

    xg_train = xgb.DMatrix(Xtrain, label = ytrain)
    xg_valid = xgb.DMatrix(Xvalid, label = yvalid)

    # Up to 10000 boosting rounds; early stopping watches the last entry of
    # `evals`, i.e. the validation split.
    clf = xgb.train(xgb_params,
                    xg_train,
                    10000,
                    verbose_eval = 200,
                    evals = [(xg_train, 'train'), (xg_valid, 'valid')],
                    early_stopping_rounds = 100)

    # NOTE(review): whether predict() uses the best iteration or all trained
    # rounds depends on the xgboost version; confirm for the installed one.
    valid_preds = clf.predict(xg_valid)

    avg_precision = average_precision_score(yvalid, valid_preds)
    print(f'\nAvg. Precision-Recall Score: {avg_precision}')

    precision, recall, _ = precision_recall_curve(yvalid, valid_preds)
    plot_pr_curve(precision, recall, avg_precision)

    # Hard labels at the 0.5 probability threshold.
    valid_labels = valid_preds > 0.5
    conf_mat = confusion_matrix(yvalid, valid_labels)
    plot_confusion(conf_mat)

    print(f"Classification Report: \n{classification_report(yvalid, valid_labels, labels = [0, 1])}")

    preds_xg.append(valid_preds)
    print()

In [None]:
# Feature importance of `clf`, i.e. the booster trained in the last CV fold only.
xgb.plot_importance(clf);

- __Recall has improved well after undersampling and also the Precision__
- __Average Precision-Recall Score has also improved well__
- We can further improve the performance by doing feature engineering (create more features)
- Also, if only high Recall is desired, we can do so by setting the model parameter *scale_pos_weight* to just neg/pos instead of sqrt(neg/pos)

In [None]:
# Total notebook runtime as HH:MM:SS, measured from `start` in the first cell.
finish = time()
print(strftime("%H:%M:%S", gmtime(finish - start)))