In [None]:
# Input data files are available in the read-only "../input/" directory
import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from scipy import stats
from cycler import cycler
import math
import matplotlib

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn.metrics import roc_curve, roc_auc_score

#from sklearn import svm, tree, linear_model, neighbors, ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import gc

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the four competition CSV files (identity and transaction tables for
# the train and test splits) into DataFrames, in the order they are unpacked.
(train_identity,
 test_identity,
 train_transaction,
 test_transaction) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ieee-fraud-detection/train_identity.csv',
        '/kaggle/input/ieee-fraud-detection/test_identity.csv',
        '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
        '/kaggle/input/ieee-fraud-detection/test_transaction.csv',
    )
)

In [None]:
# Left-join the identity table onto the transaction table by TransactionID,
# so every transaction row is kept whether or not it has identity data.
train_set = pd.merge(train_transaction, train_identity, how="left", on="TransactionID")
test_set = pd.merge(test_transaction, test_identity, how="left", on="TransactionID")

In [None]:
# The merged frames are all that is needed from here on: drop the four raw
# tables and ask the garbage collector to reclaim their memory.
del train_transaction, train_identity, test_transaction, test_identity
gc.collect()

#### Transaction Table & Identity table

    TransactionDT: timedelta from a given reference datetime (not an actual timestamp)  
    TransactionAmt: transaction payment amount in USD  
    ProductCD: product code, the product for each transaction  
    card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.  
    addr: address  
    dist: distance  
    P_emaildomain and R_emaildomain: purchaser and recipient email domain  
    C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.  
    D1-D15: timedelta, such as days between previous transaction, etc.  
    M1-M9: match, such as names on card and address, etc.  
    Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.  
    
    Categorical Features:
    ProductCD
    card1 - card6
    addr1, addr2
    P_emaildomain
    R_emaildomain
    M1 - M9


    Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
    
    They're collected by Vesta’s fraud protection system and digital security partners.
    (The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)

    Categorical Features:
    DeviceType
    DeviceInfo
    id_12 - id_38
    
https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203



In [None]:
# Seaborn plot options
# Initialise seaborn styling for the plots that follow; font_scale=1 is passed
# explicitly so the font scaling factor is stated in one place.
sns.set(font_scale=1) 

In [None]:
### Response variable : isFraud

# Class Distribution of Fraudulent vs Legitimate Transactions 

fig, ax = plt.subplots(1, 1, figsize=(6,4))
plt.rcParams.update({'font.size': 10})

# One row per isFraud value with the transaction count, its share of all
# transactions (in percent, 2 decimals) and the bar colour.
barplot_isfraud = (train_set
 .groupby("isFraud")
 .isFraud
 .count()
 .to_frame(name= "total")
 # Work on the "total" column explicitly: the share is a Series divided by a
 # scalar, instead of a whole DataFrame divided by float(<Series>).
 .assign(percent = lambda df_: (100 * df_["total"] / df_["total"].sum()).round(2))
 .reset_index()
 # Two colours are supplied, so exactly two isFraud classes are expected here.
 .assign(Color = ['#66c2a5','#fc8d62'])
)

# Draw on the Axes created above and keep `ax` bound to it (the bar container
# returned by bar() is not needed).
ax.bar(barplot_isfraud['isFraud'],barplot_isfraud['percent'],width = 0.8,color=barplot_isfraud['Color'])
ax.set_title('Distribution of Fraudulent vs Legitimate Transactions')

# Label every bar with "<count> (<percent>%)", placed slightly above the bar top.
for isFraud,total,pct in zip(list(barplot_isfraud['isFraud']),
                       list(barplot_isfraud['total']),
                       list(barplot_isfraud['percent'])):
    ax.annotate( str(total) + ' (' + str(pct) +'%'+ ')',
                 xy= (isFraud,pct),
                 xytext=(isFraud,pct+1),
                 size = 8,horizontalalignment='center')

del barplot_isfraud 
gc.collect()

In [None]:
##################################################################################################
####### Quantitative Variables : TransactionAmt, dist1, dist2, TransactionDT, C1 ~ C14, D1 ~ D15 , 
####### Vxxx and id_01~id_11
##################################################################################################

# TransactionDT : timedelta from a given reference datetime (not an actual timestamp)
# Visualizing days from origin and associated total transaction amount

transactiondt_plot_train =( train_set[['TransactionDT','isFraud','TransactionAmt']]
# 86400 seconds per day. Floor division makes every bin a full [k, k+1) day
# counted from the origin; rounding would shift the boundaries to noon and
# leave a half-width first bin.
.assign(Day = lambda df_: (df_.TransactionDT // 86400).astype('int32'))
# Sum only the amount column, selected by name. Passing the name to sum() would
# bind it to `numeric_only` and also add up the TransactionDT offsets.
.groupby(['isFraud','Day'])['TransactionAmt']
.sum()
# Express the daily totals in thousands, to 2 decimals.
.div(1000).round(2)
.reset_index()
)
g = sns.FacetGrid(transactiondt_plot_train, col="isFraud", sharey=False)
g.map_dataframe(sns.scatterplot, x="Day", y="TransactionAmt")
g.set_axis_labels("24-hour timedelta \n Relative from Origin", "Total Transaction Amount\n (in Thousands)")
g.fig.suptitle("Visualizing TransactionDT", fontsize=15,va="bottom")


del transactiondt_plot_train
del g 
gc.collect()

In [None]:
### Transaction Amount : transaction payment amount in USD
# Distribution of Transaction amount by fraudulent vs legitimate transactions

g = sns.FacetGrid(train_set[['isFraud','TransactionAmt']], col="isFraud", sharey=False)
g.map(sns.kdeplot,"TransactionAmt")
# set_axis_labels takes (x label, y label). The amount is on the x axis and the
# estimated density on the y axis; amounts are plotted unscaled, not in thousands.
g.set_axis_labels("Transaction Amount", "Density")
g.fig.suptitle("Visualizing transaction amount using Density plot", fontsize=15,va="bottom")


def ecdf(x, **kwargs):
    """Plot the normalized empirical CDF of ``x`` on the current matplotlib axes.

    Intended as a plotting callback for ``FacetGrid.map``.

    Parameters
    ----------
    x : array-like of numbers
        Sample to summarise. NaN entries are ignored, so the curve always ends
        at a cumulative frequency of 1 for the observed values.
    **kwargs
        Forwarded to ``plt.plot`` (e.g. the ``color`` and ``label`` that
        ``FacetGrid.map`` supplies).

    Returns
    -------
    None. Nothing is drawn when ``x`` holds no non-NaN value.
    """
    values = np.asarray(x, dtype=float)
    # Drop missing values before sorting; otherwise they would be counted in
    # the denominator and the curve would stop below 1.
    values = np.sort(values[~np.isnan(values)])
    if values.size == 0:
        return
    # i-th smallest value -> i / n, i.e. the share of observations <= that value.
    cumulative = np.arange(1, values.size + 1) / values.size
    plt.plot(values, cumulative, **kwargs)

# Normalized ecdf of transaction amount of Fraudulent transactions vs Legitimate transactions
g = sns.FacetGrid(train_set[['isFraud','TransactionAmt']], col="isFraud", sharey=False)
g.map(ecdf,"TransactionAmt")
# set_axis_labels takes (x label, y label). The amount is on the x axis and the
# cumulative frequency on the y axis; amounts are plotted unscaled, not in thousands.
g.set_axis_labels("Transaction Amount", "Cumulative Frequency")
g.fig.suptitle("Visualizing transaction amount using normalized ecdf", fontsize=15,va="bottom")

del g
gc.collect()

In [None]:
### dist1 and dist2 : Distance
# Visualizing dist1 and dist2 values 
# Long format: one row per (transaction, distance column), so the grid can use
# one row of facets per distance variable and one column per isFraud value.
g = sns.FacetGrid(train_set[["isFraud","dist1","dist2"]].melt(id_vars="isFraud"),
                  col="isFraud", row="variable",sharey=False)
g.map(ecdf,"value")
# set_axis_labels takes (x label, y label): the distance is on the x axis and
# the cumulative frequency on the y axis.
g.set_axis_labels("Distance", "Cumulative Frequency")
g.fig.suptitle("Visualizing distance using normalized ECDF", fontsize=15,va="bottom")

del g
gc.collect()

In [None]:
### C1~C14 variables dataframe : counting, such as how many addresses are found to be associated with the payment card,etc.
# Pairwise Pearson correlation heatmaps of the columns whose name starts with "C",
# first for the train set and then for the test set.

# --- train set ---
Cx_plot_train = train_set.filter(regex=("^C.*"))
corr_train = Cx_plot_train.corr(method="pearson")

fig, ax = plt.subplots(figsize=(7, 4))
cmap = sns.diverging_palette(0, 250, as_cmap=True)  # diverging colormap for the train plot
sns.heatmap(
    corr_train,
    ax=ax,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
ax.set_title("Pairwise Correlation heatmap \n(Cx variables: train set)")

# --- test set ---
Cx_plot_test = test_set.filter(regex=("^C.*"))
corr_test = Cx_plot_test.corr(method="pearson")

fig, ax = plt.subplots(figsize=(7, 4))
cmap = sns.diverging_palette(0, 145, as_cmap=True)  # different hue pair for the test plot
sns.heatmap(
    corr_test,
    ax=ax,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
ax.set_title("Pairwise Correlation heatmap \n(Cx variables: test set)")

# The column subsets are no longer needed once the correlations are drawn.
del Cx_plot_test, Cx_plot_train
gc.collect()

In [None]:
### D1 ~ D15 : timedelta, such as days between previous transaction, etc.
# D1 ~ D15 variables dataframe
# Match "D" followed only by digits. A bare "^D.*" also selects DeviceType and
# DeviceInfo, which hold strings and cannot take part in a Pearson correlation.
Dx_plot_train = (train_set
.filter(regex=(r"^D\d+$"))
)

corr_train = Dx_plot_train.corr(method="pearson")
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 250, as_cmap=True)

# Draw the heatmap on the axes created above, with square cells
sns.heatmap(corr_train, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Pairwise Correlation heatmap \n(Dx variables: train set)")


## Correlation heatmap for test set
Dx_plot_test=(test_set
.filter(regex=(r"^D\d+$"))
)

corr_test = Dx_plot_test.corr(method="pearson")
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 145, as_cmap=True)

# Draw the heatmap on the axes created above, with square cells
sns.heatmap(corr_test, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Pairwise Correlation heatmap \n(Dx variables: test set)")


del Dx_plot_test
del Dx_plot_train
gc.collect()

In [None]:
### Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.
# PCA of the Vxxx columns of the train set
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Keep only the V columns and replace missing values with the -999 sentinel.
# The frame is NOT passed through reset_index(): that would add the row number
# as an extra "index" column and feed it to the PCA as if it were a feature.
# NOTE(review): the inputs are neither standardized nor properly imputed, so the
# sentinel and the raw column scales drive the variance — confirm this is intended.
Vxxx_plot_train = (train_set
.filter(regex=("^V.*"))
.fillna(-999)
)

# random_state pins the result in case scikit-learn selects a randomized solver.
pca = PCA(n_components=20, random_state=0)
pc = pca.fit_transform(Vxxx_plot_train)
print(pca.explained_variance_ratio_)

del Vxxx_plot_train
gc.collect()

In [None]:
# Scree plot: proportion of variance explained by each principal component of
# the most recently fitted `pca`, numbered from 1.
PC_values = np.arange(1, pca.n_components_ + 1)
plt.plot(PC_values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
plt.gca().set(
    title='Scree Plot',
    xlabel='Principal Component',
    ylabel='Proportion of Variance Explained',
)
plt.show()

In [None]:
### Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.
# PCA of the Vxxx columns of the test set
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Keep only the V columns and replace missing values with the -999 sentinel.
# The frame is NOT passed through reset_index(): that would add the row number
# as an extra "index" column and feed it to the PCA as if it were a feature.
# NOTE(review): the inputs are neither standardized nor properly imputed, so the
# sentinel and the raw column scales drive the variance — confirm this is intended.
Vxxx_plot_test = (test_set
.filter(regex=("^V.*"))
.fillna(-999)
)

# This rebinds `pca`/`pc`, replacing the objects fitted on the train set.
# random_state pins the result in case scikit-learn selects a randomized solver.
pca = PCA(n_components=20, random_state=0)
pc = pca.fit_transform(Vxxx_plot_test)
print(pca.explained_variance_ratio_)

del Vxxx_plot_test
gc.collect()

In [None]:
# Scree plot: proportion of variance explained by each principal component of
# the most recently fitted `pca`, numbered from 1.
PC_values = np.arange(1, pca.n_components_ + 1)
plt.plot(PC_values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
plt.gca().set(
    title='Scree Plot',
    xlabel='Principal Component',
    ylabel='Proportion of Variance Explained',
)
plt.show()

In [None]:
### id_01 ~ id11
# Columns id_01 ... id_09, id_10 and id_11 of the train set
idx_plot_train = (train_set
.filter(regex=(r'(^id_0.*|id_10|id_11)'))
)

# Share of missing values per column. It is printed explicitly because a bare
# expression in the middle of a cell is evaluated and then thrown away.
print(idx_plot_train.isnull().mean())

# NOTE(review): id_07 and id_08 are not in the list below — presumably they are
# the ">99% NA" variables the original comment referred to; confirm against the
# report printed above.
idx_corr_columns = ["id_01","id_02","id_03","id_04","id_05","id_06","id_09","id_10","id_11"]
corr_train = idx_plot_train[idx_corr_columns].corr(method="pearson")
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 250, as_cmap=True)

# Draw the heatmap on the axes created above, with square cells
sns.heatmap(corr_train, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Pairwise Correlation heatmap \n(idx variables: train set)")



In [None]:
## Correlation heatmap for test set
def clean_columns(col):
    """Return the column name ``col`` with every hyphen turned into an underscore."""
    return "_".join(col.split("-"))

# The filter below looks for hyphenated names ("id-01"); they are renamed to the
# underscore form so the same column list as for the train set can be used.
idx_plot_test=(test_set
.filter(regex=(r'(^id-0.*|id-10|id-11)'))
.rename(columns=clean_columns)
)

# Share of missing values per column. It is printed explicitly because a bare
# expression in the middle of a cell is evaluated and then thrown away.
print(idx_plot_test.isnull().mean())

idx_corr_columns = ["id_01","id_02","id_03","id_04","id_05","id_06","id_09","id_10","id_11"]
corr_test = idx_plot_test[idx_corr_columns].corr(method="pearson")
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 145, as_cmap=True)

# Draw the heatmap on the axes created above, with square cells
sns.heatmap(corr_test, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Pairwise Correlation heatmap \n(idx variables: test set)")

In [None]:
#########################################################################################################################################################
####### Categorical Features : ProductCD, card1 ~ card6, M1~M9, addr1 ~ addr2, P_emaildomain and R_emaildomain, Device Type, Device Info and id12 ~ id38
#########################################################################################################################################################

### ProductCD : product code, the product for each transaction  
# One row per (ProductCD, isFraud) pair holding the summed transaction amount
# and the number of transactions; named aggregation yields the final column
# names directly.
productcd_plt = (
    train_set
    .groupby(["ProductCD", "isFraud"])["TransactionAmt"]
    .agg(TotalAmount="sum", NumberofTransactions="count")
    .reset_index()
)

def gbarplot(x, y, color, **kwargs):
    """Draw a seaborn bar plot of ``y`` against ``x`` coloured with a named palette.

    Meant as a plotting callback for ``FacetGrid.map``, which calls it with the
    two data vectors positionally and ``color`` as a keyword.

    Parameters
    ----------
    x, y : data vectors, forwarded to ``sns.barplot`` as ``x=`` and ``y=``.
    color : palette specification handed to ``sns.color_palette``.
    **kwargs : forwarded unchanged to ``sns.barplot``.
    """
    cmap = sns.color_palette(color)
    # Pass the data by keyword: positional x/y arguments are deprecated in
    # seaborn 0.11 and rejected by later releases.
    sns.barplot(x=x, y=y, palette=cmap, **kwargs)
    
def boxplots(x, y, color, **kwargs):
    """Draw a seaborn box plot of ``y`` against ``x`` coloured with a named palette.

    Has the same calling convention as a ``FacetGrid.map`` callback: the two
    data vectors come first, then the palette specification.

    Parameters
    ----------
    x, y : data vectors, forwarded to ``sns.boxplot`` as ``x=`` and ``y=``.
    color : palette specification handed to ``sns.color_palette``.
    **kwargs : forwarded unchanged to ``sns.boxplot``.
    """
    cmap = sns.color_palette(color)
    # Pass the data by keyword: positional x/y arguments are deprecated in
    # seaborn 0.11 and rejected by later releases.
    sns.boxplot(x=x, y=y, palette=cmap, **kwargs)
        
# Two bar-chart grids built the same way, one facet per isFraud value:
# first the number of transactions per product code, then the total amount.
for measure, y_label, title in (
    ("NumberofTransactions",
     "Number of Transactions",
     "Visualizing Product Code and Number of Transactions"),
    ("TotalAmount",
     "Total Transaction Amount\n",
     "Visualizing Product Code and Total Transaction Amount"),
):
    g = sns.FacetGrid(productcd_plt, col="isFraud", sharey=False, hue="isFraud")
    g.map(gbarplot, "ProductCD", measure, color="Paired")
    g.set_axis_labels("Product Code", y_label)
    g.fig.suptitle(title, fontsize=12, va="bottom")

# Box plots of the individual transaction amounts per product code, split by
# isFraud; the y axis is limited to 0..1200.
g = sns.catplot(
    data=train_set[['ProductCD', 'TransactionAmt', 'isFraud']],
    kind="box",
    x="ProductCD",
    y="TransactionAmt",
    hue="isFraud",
    dodge=True,
)
plt.ylim(0, 1200)
g.fig.suptitle("Visualizing distribution of Transaction amount \n by product code", fontsize=12,va="bottom")

del g
gc.collect()

In [None]:

### card1 - card6 : payment card information, such as card type, card category, issue bank, country, etc.
# Long-format frame of card1, card2, card3 and card5: one row per
# (transaction, card column), keeping isFraud and TransactionAmt as identifiers.
card_plt = train_set.melt(
    id_vars=['isFraud', 'TransactionAmt'],
    value_vars=['card1', 'card2', 'card3', 'card5'],
)

# Box plot of each card column's values, fraudulent vs legitimate, one facet
# per card column with independent axes.
g = sns.catplot(
    data=card_plt,
    kind="box",
    x="isFraud",
    y="value",
    hue="isFraud",
    col="variable",
    sharex=False,
    sharey=False,
)

## Drop 2 Variables
card_drop_var = ['card1', 'card2']

In [None]:

# Visualizing variables card4 and card6 
# Number of transactions for each (isFraud, card6, card4) combination
card46 = (train_set[['isFraud','card6','card4']]
.value_counts()
# Name the count column explicitly so it does not depend on the default name
# pandas gives to value_counts() output.
.rename('numtxn')
.reset_index()
)

g = sns.FacetGrid(card46,col="card6",row="isFraud",sharey=False,hue="isFraud",sharex=False)
g.map(gbarplot,"numtxn","card4",color="Paired")
# The bars are horizontal (x = count, y = card4); set_axis_labels takes
# (x label, y label).
g.set_axis_labels("Number of Transactions", "card4")
g.fig.suptitle("Visualizing Number of transactions associated with \n Card type and Card Network\n", fontsize=12,va="bottom")

# Total transaction amount (in thousands) for each (isFraud, card6, card4) combination
card46_amt = (train_set[['isFraud','card6','card4','TransactionAmt']]
.groupby(['isFraud','card6','card4'])['TransactionAmt']
.sum()
.div(1000)
.rename('TotalTransactionAmt')
.reset_index()
)

g = sns.FacetGrid(card46_amt,col="card6",row="isFraud",sharey=False,hue="isFraud",sharex=False)
g.map(gbarplot,"TotalTransactionAmt","card4",color="Paired")
# Horizontal bars again: the amount is on the x axis.
g.set_axis_labels("Total Transaction Amount \n(in Thousands)", "card4")
g.fig.suptitle("Visualizing total transaction amount associated with \n Card type and Card Network\n", fontsize=12,va="bottom")

In [None]:

### M1 ~ M9 (logical) : match, such as names on card and address, etc.
# Visualizing M1~ M9 values and associated number of transactions
# M4 is deliberately not matched by the pattern (M1-M3 and M5-M9 only).
# Melt first, then count per (isFraud, flag, value). Counting whole rows with
# value_counts() would drop every transaction that has any of the flags missing
# and would count flag *combinations*, so the bar heights would be averages of
# combination counts rather than numbers of transactions.
Mx_plt = (train_set
.filter(regex=(r'^(M[1-35-9]|isFraud)$'))
.melt(id_vars=['isFraud'])
.groupby(['isFraud','variable','value'])
.size()
.rename('numtxn')
.reset_index())

g = sns.FacetGrid(Mx_plt,col="isFraud",row="value",sharey=False,hue="isFraud",sharex=False)
g.map(gbarplot,"variable","numtxn",color="Paired",ci=None)
g.set_axis_labels("", "Number of Transactions")
g.fig.suptitle("Visualizing Number of transactions associated with \n Match type\n", fontsize=12,va="bottom")


# Visualizing M1~M9 values and associated total transaction amount (in thousands)
Mx_plt_amt = (train_set
.filter(regex=(r'^(M[1-35-9]|isFraud|TransactionAmt)$'))
.melt(id_vars=['isFraud','TransactionAmt'])
.groupby(['isFraud','variable','value'])['TransactionAmt']
.sum()
.div(1000)
.rename('totalAmt')
.reset_index())

g = sns.FacetGrid(Mx_plt_amt,col="isFraud",row="value",sharey=False,hue="isFraud",sharex=False)
g.map(gbarplot,"variable","totalAmt",color="Paired",ci=None)
g.set_axis_labels("", "Total Transaction amount")
g.fig.suptitle("Visualizing total transaction amount associated with \n Match type\n", fontsize=12,va="bottom")



In [None]:

### addr1 and addr2 : Address
# Visualizing addr1 values and associated number of transactions
addr1_plt = (train_set[['addr1','isFraud']]
.value_counts()
# Name the count column explicitly so it does not depend on the default name
# pandas gives to value_counts() output.
.rename('numtxn')
.reset_index())

g = sns.FacetGrid(addr1_plt,col="isFraud",hue="isFraud",sharex=False,sharey=False)
g.map(sns.scatterplot,"addr1","numtxn")
g.set_axis_labels("addr1 values", "Number of Transactions")
g.fig.suptitle("Visualizing Number of transactions associated with \n addr1 values\n", fontsize=12,va="bottom")


# Visualizing addr1 values and associated total transaction amount (in thousands)
addr1_plt_amt = (train_set[['addr1','isFraud','TransactionAmt']]
.groupby(['isFraud','addr1'])['TransactionAmt']
.sum()
.div(1000)
.rename('totalAmt')
.reset_index())

g = sns.FacetGrid(addr1_plt_amt,col="isFraud",sharey=False,hue="isFraud",sharex=False)
g.map(sns.scatterplot,"addr1","totalAmt")
g.set_axis_labels("addr1 values", "Total Transaction amount")
g.fig.suptitle("Visualizing total transaction amount associated with \n addr1 values\n", fontsize=12,va="bottom")

# Visualizing addr2 values and associated number of transactions
addr2_plt = (train_set[['addr2','isFraud']]
.value_counts()
.rename('numtxn')
.reset_index())

g = sns.FacetGrid(addr2_plt,col="isFraud",hue="isFraud",sharex=False,sharey=False)
g.map(sns.scatterplot,"addr2","numtxn")
g.set_axis_labels("addr2 values", "Number of Transactions")
g.fig.suptitle("Visualizing Number of transactions associated with \n addr2 values\n", fontsize=12,va="bottom")


# Visualizing addr2 values and associated total transaction amount (in thousands)
addr2_plt_amt = (train_set[['addr2','isFraud','TransactionAmt']]
.groupby(['isFraud','addr2'])['TransactionAmt']
.sum()
.div(1000)
.rename('totalAmt')
.reset_index())

g = sns.FacetGrid(addr2_plt_amt,col="isFraud",sharey=False,hue="isFraud",sharex=False)
g.map(sns.scatterplot,"addr2","totalAmt")
g.set_axis_labels("addr2 values", "Total Transaction amount")
# This figure shows addr2, so the title names addr2.
g.fig.suptitle("Visualizing total transaction amount associated with \n addr2 values\n", fontsize=12,va="bottom")





In [None]:

### P_emaildomain: purchaser email domain
# Number of transactions per (isFraud, purchaser email domain) pair, keeping
# only pairs with more than 1000 transactions.
P_emaildomain_plt = (train_set[['isFraud','P_emaildomain']]
.value_counts()
# Name the count column explicitly so the query below does not depend on the
# default name pandas gives to value_counts() output.
.rename('numtxn')
.reset_index()
.query('numtxn>1000')
)

# Horizontal bars: the count is on the x axis, the domain on the y axis.
g = sns.FacetGrid(P_emaildomain_plt,col="isFraud",hue="isFraud",sharex=False,sharey=False)
g.map(sns.barplot,"numtxn","P_emaildomain")
g.set_axis_labels("Number of Transactions","Purchaser email domain")
g.fig.suptitle("Visualizing Number of transactions associated with Purchaser email domain", fontsize=12,va="bottom")


In [None]:

### R_emaildomain : recipient email domain
# Number of transactions per (isFraud, recipient email domain) pair, keeping
# only pairs with more than 1000 transactions.
R_emaildomain_plt = (train_set[['isFraud','R_emaildomain']]
.value_counts()
# Name the count column explicitly so the query below does not depend on the
# default name pandas gives to value_counts() output.
.rename('numtxn')
.reset_index()
.query('numtxn>1000')
)

# Horizontal bars: the count is on the x axis, the domain on the y axis.
g = sns.FacetGrid(R_emaildomain_plt,col="isFraud",hue="isFraud",sharex=False,sharey=False)
g.map(sns.barplot,"numtxn","R_emaildomain")
g.set_axis_labels("Number of Transactions","Recipient email domain")
g.fig.suptitle("Visualizing Number of transactions associated with Recipient email domain", fontsize=12,va="bottom")


In [None]:
### Device Type : type of device used for transaction (mobile/desktop/Unknown)
# Number of transactions per (isFraud, DeviceType) pair, keeping only pairs
# with more than 1000 transactions.
DeviceType_plt = (train_set[['isFraud','DeviceType']]
.value_counts()
# Name the count column explicitly so the query below does not depend on the
# default name pandas gives to value_counts() output.
.rename('numtxn')
.reset_index()
.query('numtxn>1000')
)

g = sns.FacetGrid(DeviceType_plt,col="isFraud",hue="isFraud",sharex=False,sharey=False)
g.map(sns.barplot,"DeviceType","numtxn")
g.set_axis_labels("Device Type","Number of Transactions")
g.fig.suptitle("Visualizing Number of transactions associated with Device Type", fontsize=12,va="bottom")


In [None]:

### Device Info 
# Number of transactions per (isFraud, DeviceInfo) pair, keeping only pairs
# with more than 1000 transactions (the threshold the query below applies).
# NOTE(review): an earlier comment mentioned "> 100" — confirm which threshold is intended.

DeviceInfo_plt = (train_set[['isFraud','DeviceInfo']]
.value_counts()
# Name the count column explicitly so the query below does not depend on the
# default name pandas gives to value_counts() output.
.rename('numtxn')
.reset_index()
.query('numtxn>1000')
)

# Horizontal bars: the count is on the x axis, the device info on the y axis.
g = sns.FacetGrid(DeviceInfo_plt,col="isFraud",hue="isFraud",sharex=False,sharey=False)
g.map(sns.barplot,"numtxn","DeviceInfo")
g.set_axis_labels("Number of Transactions","Device Info")
g.fig.suptitle("Visualizing Number of transactions associated with Device Info", fontsize=12,va="bottom")
