# Libraries

In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
# List every file found under the Kaggle input directory, one path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Let pandas display up to 50 columns before truncating wide tables.
pd.set_option('display.max_columns',50)

# Training and Test Data

In [None]:
# Load the train and test files and stack them into one frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; like append's default it keeps each file's own row labels.
data = pd.concat([
    pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv'),
    pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv'),
])
del data['Unnamed: 0']  # presumably the row index saved with the CSV files

# Remove only the literal 'fraud_' prefix. str.strip('fraud_') treats its
# argument as a SET of characters and trims them from both ends, so any name
# ending in one of those letters is mangled (a trailing 'Ltd' becomes 'Lt').
data['merchant'] = data['merchant'].str.replace(r'^fraud_', '', regex=True)

# Full name of the card holder, inserted at column position 7.
# Vectorised concatenation instead of a row-by-row apply().
fullname = data['first'] + ' ' + data['last']
data.insert(7,'fullname',fullname)

# Fraudulent and non-fraudulent subsets used throughout the analysis.
data_yes = data[data.is_fraud==1]
data_no = data[data.is_fraud==0]

## Attributes (or Features)

In [None]:
# Print the columns, non-null counts and dtypes of the combined data.
data.info()

In [None]:
# Number of distinct values in every column.
data.nunique()

In [None]:
# Overall number of transactions and its fraud / non-fraud split.
print(f'Total Transactions: {len(data)}')
print(f'   Fraudulent: {len(data_yes)}')
print(f'   Non-Fraudulent: {len(data_no)}')

# Exploratory Data Analysis (EDA)

## User-Defined Functions

### Sorted Database in Order of Probability

In [None]:
def table_summary(split_attrs, info_attrs):
    """Build a per-group fraud summary sorted by fraud probability.

    The module-level frames ``data``, ``data_yes`` and ``data_no`` are each
    grouped by ``split_attrs`` and aggregated into the transaction count
    (``is_fraud`` count), the number of distinct values of every column in
    ``info_attrs`` and the min/mean/max/sum of ``amt``. The three results are
    outer-merged on the group keys and missing values are filled with 0.

    Parameters
    ----------
    split_attrs : list of str
        Columns to group by.
    info_attrs : list of str
        Columns whose distinct-value count is reported for each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted by ``p_yes`` in descending order, where
        ``p_yes`` is ``is_fraud_yes_count / is_fraud_count``. Aggregated
        columns are named ``<column>_<statistic>``; those computed from the
        fraudulent / non-fraudulent subsets carry the ``_yes`` / ``_no``
        suffix after the column name (e.g. ``is_fraud_yes_count``). When more
        than one split attribute is given, an extra column named
        ``'-'.join(split_attrs)`` holds the group keys joined by ``', '``.
    """
    aggregations = {'is_fraud': 'count'}
    for attr in info_attrs:
        aggregations[attr] = 'nunique'
    aggregations['amt'] = ['min','mean','max','sum']

    summary_all = data.groupby(by=split_attrs).agg(aggregations)
    summary_yes = data_yes.groupby(by=split_attrs).agg(aggregations)
    summary_no = data_no.groupby(by=split_attrs).agg(aggregations)

    # Fraud and non-fraud statistics share column names, hence the suffixes;
    # the overall statistics keep the plain names.
    merged = summary_yes.merge(summary_no,on=split_attrs,how='outer',
                               suffixes=('_yes','_no'))
    merged = merged.merge(summary_all,on=split_attrs,how='outer')
    merged = merged.fillna(0)

    # Flatten the two-level (column, statistic) header into single names.
    merged.columns = ['_'.join(levels) for levels in merged.columns.values]

    merged.insert(0,'p_yes',merged['is_fraud_yes_count']/merged['is_fraud_count'])
    merged = merged.sort_values(by='p_yes',ascending=False).reset_index()

    if len(split_attrs)>1:
        # Single readable key for multi-attribute groups, placed right after
        # the split columns.
        combined = merged[split_attrs].apply(lambda row: ', '.join(row),axis=1)
        merged.insert(len(split_attrs),'-'.join(split_attrs),combined)

    return merged

### Database of Top Fraudulent Records

In [None]:
def top_summary(df, main_attrs, info_attrs, n):
    """Return the ``n`` groups with the most fraudulent transactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table containing ``p_yes``, ``is_fraud_yes_count``,
        ``is_fraud_count``, ``amt_yes_min``, ``amt_yes_mean``,
        ``amt_yes_max`` and, for every entry of ``info_attrs``, the columns
        ``<attr>_yes_nunique`` and ``<attr>_nunique``.
    main_attrs : sequence of str
        Identifying columns, shown first.
    info_attrs : sequence of str
        Attributes whose distinct-value counts are included.
    n : int
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is left untouched) with the selected columns,
        sorted by ``is_fraud_yes_count`` in descending order and cut to ``n``
        rows; the amount columns are rounded to two decimals.
    """
    count_cols = ['p_yes','is_fraud_yes_count','is_fraud_count']
    nunique_cols = [col for attr in info_attrs
                    for col in (attr+'_yes_nunique',attr+'_nunique')]
    amount_cols = ['amt_yes_min','amt_yes_mean','amt_yes_max']
    # list(...) so that a tuple of main attributes is accepted as well.
    selected = list(main_attrs)+count_cols+nunique_cols+amount_cols

    ranked = df[selected].sort_values(by='is_fraud_yes_count',ascending=False)
    # head() returns a slice of `ranked`; take an explicit copy before
    # assigning into it so pandas does not emit SettingWithCopyWarning.
    top_df = ranked.head(n).copy()
    top_df[amount_cols] = top_df[amount_cols].round(2)

    return top_df

### Information on Partial and Definite Frauds

In [None]:
def print_summary(df, attr):
    """Print how many groups show no, partial or definite fraud.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table with a ``p_yes`` column (fraud probability per group).
    attr : str
        Name of the grouping; parts separated by ``-`` are capitalised and
        joined with ``' and '`` to form the heading line.

    A group counts as "no fraud" when ``p_yes == 0``, "definite fraud" when
    ``p_yes == 1`` and "partial fraud" otherwise.
    """
    heading = ' and '.join(part.capitalize() for part in attr.split('-'))

    probs = df.p_yes
    n_groups = len(probs)
    n_clean = (probs==0).sum()
    n_definite = (probs==1).sum()
    n_fraud = n_groups-n_clean

    report = [
        heading,
        f'Total: {n_groups}',
        f'   No Fraud: {n_clean}',
        f'   Fraud: {n_fraud}',
        f'      Partial Fraud: {n_fraud-n_definite}',
        f'      Definite Fraud: {n_definite}',
    ]
    print('\n'.join(report))

### Graphs of Partial and Definite Frauds

In [None]:
def plot(df, attr, istick0, istick1):
    """Scatter-plot fraud probability for partial- and definite-fraud groups.

    Two stacked panels are drawn: the upper one shows the rows of ``df`` with
    ``0 < p_yes < 1``, the lower one the rows with ``p_yes == 1``. A panel
    whose subset is empty is removed from the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table with a ``p_yes`` column and a column named ``attr``.
    attr : str
        Column plotted on the x axis.
    istick0, istick1 : bool
        Whether to keep the x ticks of the upper / lower panel. ``istick0``
        also widens the gap between the panels to make room for the rotated
        tick labels.
    """
    hspace = 0.7 if istick0 else 0.25

    fig, axes = plt.subplots(nrows=2,ncols=1,
                             gridspec_kw={'hspace':hspace})
    fig.set_size_inches(15,10)

    # Explicit comparisons rather than between(0,1,inclusive=False): the
    # boolean form of `inclusive` is not accepted by newer pandas versions.
    is_partial = (df.p_yes>0) & (df.p_yes<1)
    panels = [
        ('Partial Fraud: 0 < P[Fraud] < 1',df[is_partial],'orchid',istick0),
        ('Definite Fraud: P[Fraud] = 1',df[df.p_yes==1],'orangered',istick1),
    ]

    for ax,(title,subset,color,show_ticks) in zip(axes,panels):
        if subset.empty:
            # Nothing to draw; drop the panel and skip its tick styling.
            fig.delaxes(ax)
            continue

        ax.set_title(title)
        sns.scatterplot(x=attr,y='p_yes',data=subset,
                        s=10,color=color,ax=ax)
        ax.tick_params(axis='x',rotation=90)
        if not show_ticks:
            # Removing the ticks removes their labels as well.
            ax.set_xticks([])

    plt.show()

## Histogram of Transaction Amount

In [None]:
# Largest single transaction amount among fraudulent and non-fraudulent rows.
# Series.max() is vectorised; the builtin max() walks the column in Python.
print('Maximum Transaction Amount')
print('Fraudulent: {}'.format(data_yes.amt.max()))
print('Non-Fraudulent: {}'.format(data_no.amt.max()))

In [None]:
# Histograms of the transaction amount: one column per amount range, with
# fraudulent transactions in the top row and non-fraudulent in the bottom row.
fig, axes = plt.subplots(nrows=2,ncols=4,
                         gridspec_kw={'hspace':0.3,'wspace': 0.3})
fig.set_size_inches(20,10)

title = ['Fraudulent','Non-Fraudulent']
color = ['lightcoral','lightseagreen']
# Range edges; the last edge is the overall maximum rounded up so the final
# range reaches the largest amount. Series.max() avoids a Python-level scan.
tick = [0,200,1000,4000,
        math.ceil(max(data_yes.amt.max(),data_no.amt.max()))]

for i,(lower,upper) in enumerate(zip(tick[:-1],tick[1:])):
    for k,subset in enumerate([data_yes,data_no]):
        amt = subset.amt
        # Ranges are (lower, upper], except the first which also includes its
        # lower edge. between() is inclusive on both sides and would draw an
        # amount lying exactly on an edge in two neighbouring panels.
        above = amt>=lower if i==0 else amt>lower
        in_range = amt[above & (amt<=upper)]

        if in_range.empty:
            fig.delaxes(axes[k,i])
        else:
            axes[k,i].set_title(title[k])
            sns.histplot(in_range,bins=50,color=color[k],ax=axes[k,i])

## Splitting Attributes

### Gender

In [None]:
# Distinct values overall for the columns summarised per gender below.
data[['fullname','job','category']].nunique()

In [None]:
# Fraud summary per gender: build it, save it to df_gender.csv and display it.
df_gender = table_summary(['gender'],['fullname','job','category'])
df_gender.to_csv('df_gender.csv')
df_gender

In [None]:
# Count the genders with no, partial and definite fraud.
print_summary(df_gender,'gender')

In [None]:
# Fraud probability per gender.
# sns.barplot returns a matplotlib Axes, not a Figure, and plt.show() does not
# take a figure or axes to display, so it is called without arguments.
ax = sns.barplot(x='gender',y='p_yes',data=df_gender,palette='pastel')
plt.show()

In [None]:
# The 2 genders ranked by number of fraudulent transactions, with job counts.
top_summary(df_gender,['gender'],['job'],2)

### Job

In [None]:
# Distinct values overall for the columns summarised per job below.
data[['fullname','category','merchant','city']].nunique()

In [None]:
# Fraud summary per job: build it, save it to df_job.csv and display it.
df_job = table_summary(['job'],['fullname','category','merchant','city'])
df_job.to_csv('df_job.csv')
df_job

In [None]:
# Count the jobs with no, partial and definite fraud.
print_summary(df_job,'job')

In [None]:
# Fraud probability per job; x ticks hidden on the partial-fraud panel and
# shown on the definite-fraud panel.
plot(df_job,'job',istick0=False,istick1=True)

In [None]:
# Top 5 jobs by number of fraudulent transactions, with fullname and merchant counts.
top_summary(df_job,['job'],['fullname','merchant'],5)

### Fullname

In [None]:
# Distinct values overall for the columns summarised per fullname below.
data[['city','merchant','category']].nunique()

In [None]:
# Fraud summary per fullname: build it, save it to df_fullname.csv and display it.
df_fullname = table_summary(['fullname'],['city','merchant','category'])
df_fullname.to_csv('df_fullname.csv')
df_fullname

In [None]:
# Count the fullnames with no, partial and definite fraud.
print_summary(df_fullname,'fullname')

In [None]:
# Fraud probability per fullname; x ticks hidden on the partial-fraud panel
# and shown on the definite-fraud panel.
plot(df_fullname,'fullname',istick0=False,istick1=True)

In [None]:
# Top 5 fullnames by number of fraudulent transactions, with merchant and category counts.
top_summary(df_fullname,['fullname'],['merchant','category'],5)

### City and State

In [None]:
# Distinct values overall for the columns summarised per city and state below.
data[['fullname','job','merchant','category']].nunique()

In [None]:
# Fraud summary per (city, state) pair: build it, save it to df_city_state.csv
# and display it.
df_city_state = table_summary(['city','state'],['fullname','job','merchant','category'])
df_city_state.to_csv('df_city_state.csv')
df_city_state

In [None]:
# Count the (city, state) pairs with no, partial and definite fraud.
print_summary(df_city_state,'city-state')

In [None]:
# Fraud probability per combined 'city-state' label; x ticks hidden on the
# partial-fraud panel and shown on the definite-fraud panel.
plot(df_city_state,'city-state',istick0=False,istick1=True)

In [None]:
# Top 5 (city, state) pairs by number of fraudulent transactions, with
# merchant and category counts.
top_summary(df_city_state,['city','state'],['merchant','category'],5)

### Merchant

In [None]:
# Distinct values overall for the columns summarised per merchant below.
data[['city','category','gender','fullname']].nunique()

In [None]:
# Fraud summary per merchant: build it, save it to df_merchant.csv and display it.
df_merchant = table_summary(['merchant'],['city','category','gender','fullname'])
df_merchant.to_csv('df_merchant.csv')
df_merchant

In [None]:
# Count the merchants with no, partial and definite fraud.
print_summary(df_merchant,'merchant')

In [None]:
# Fraud probability per merchant; x ticks hidden on the partial-fraud panel
# and shown on the definite-fraud panel.
plot(df_merchant,'merchant',istick0=False,istick1=True)

In [None]:
# Top 5 merchants by number of fraudulent transactions, with city and fullname counts.
top_summary(df_merchant,['merchant'],['city','fullname'],5)

### Merchant and City

In [None]:
# Distinct values overall for the columns summarised per merchant and city below.
data[['category','gender','fullname']].nunique()

In [None]:
# Fraud summary per (merchant, city) pair: build it, save it to
# df_merchant_city.csv and display it.
df_merchant_city = table_summary(['merchant','city'],['category','gender','fullname'])
df_merchant_city.to_csv('df_merchant_city.csv')
df_merchant_city

In [None]:
# Count the (merchant, city) pairs with no, partial and definite fraud.
print_summary(df_merchant_city,'merchant-city')

In [None]:
# Fraud probability per combined 'merchant-city' label; x ticks are hidden on
# both panels.
plot(df_merchant_city,'merchant-city',istick0=False,istick1=False)

In [None]:
# Top 5 (merchant, city) pairs by number of fraudulent transactions, with fullname counts.
top_summary(df_merchant_city,['merchant','city'],['fullname'],5)

### Category

In [None]:
# Distinct values overall for the columns summarised per category below.
data[['merchant','gender','job']].nunique()

In [None]:
# Fraud summary per category: build it, save it to df_category.csv and display it.
df_category = table_summary(['category'],['merchant','gender','job'])
df_category.to_csv('df_category.csv')
df_category

In [None]:
# Count the categories with no, partial and definite fraud.
print_summary(df_category,'category')

In [None]:
# Fraud probability per category; x ticks are shown on both panels.
plot(df_category,'category',istick0=True,istick1=True)

In [None]:
# Top 13 categories by number of fraudulent transactions, with merchant and job counts.
top_summary(df_category,['category'],['merchant','job'],13)