In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Credits for this notebook:

https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt

https://www.kaggle.com/artgor/eda-and-models

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from scipy import stats
from cycler import cycler
import math
import matplotlib

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC, LinearSVC
from sklearn import svm, tree, linear_model, neighbors, ensemble
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
import matplotlib
import xgboost as xgb
import gc


import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Fantastic helper functions borrowed from:

https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt

In [None]:
def tablestats(df):
    """Print the shape of ``df`` and return a per-column summary table.

    The returned frame has one row per column of ``df`` with these fields:
    ``Name``, ``dtypes``, ``Missing`` (null count), ``Uniques`` (distinct
    non-null values), the first mode with and without NaNs taken into
    account, and ``Entropy`` (base-2 Shannon entropy of the value
    frequencies, rounded to two decimals).
    """
    print(f"Dataset Shape: {df.shape}")

    # Start from the dtype listing and expose the column labels as 'Name'.
    overview = (
        pd.DataFrame(df.dtypes, columns=['dtypes'])
        .reset_index()
        .rename(columns={'index': 'Name'})
    )
    overview = overview[['Name', 'dtypes']]

    overview['Missing'] = df.isnull().sum().values
    overview['Uniques'] = df.nunique().values
    # mode() can return several rows (ties); only the first one is reported.
    overview['Mode (with NaNs)'] = df.mode(dropna=False).iloc[0].values
    overview['Mode (without NaNs)'] = df.mode().iloc[0].values

    def _entropy_bits(column_name):
        # Entropy of the normalised value frequencies, in bits.
        frequencies = df[column_name].value_counts(normalize=True)
        return round(stats.entropy(frequencies, base=2), 2)

    for column_name in overview['Name'].unique():
        overview.loc[overview['Name'] == column_name, 'Entropy'] = _entropy_bits(column_name)

    return overview
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; each is cast to the narrowest integer (int8..int64) or
    float (float16, float32, else float64) type whose representable range
    contains the column's min and max. Range limits are inclusive, so a
    column spanning exactly -128..127 becomes int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final size and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Only the value *range* is checked. float16 keeps roughly three
    significant decimal digits, so float columns that fit its range lose
    precision when narrowed.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First integer width whose inclusive range covers the column wins.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (empty or all-null column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero for a frame that occupies no memory.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def calcoutliers(df_num):
    """Print a three-sigma outlier summary for a numeric series.

    A value is an outlier when it lies strictly below ``mean - 3*std`` or
    strictly above ``mean + 3*std``; values on or between the two cuts are
    non-outliers. NaN values belong to neither group. The reported
    percentage is outliers divided by all counted (non-NaN) observations.

    Parameters
    ----------
    df_num : pandas.Series (or other array-like with ``astype``)
        Values to analyse.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    # The memory-reduction routine may have stored the values as float16;
    # sums, means and std overflow at that width, so work in float64.
    vals = df_num.astype('float64')

    data_mean = np.mean(vals)
    data_std = np.std(vals)
    data_min = np.min(vals)
    data_max = np.max(vals)

    # Width of the acceptance band on each side of the mean (3 sigma).
    cut = data_std * 3
    lower = data_mean - cut
    upper = data_mean + cut

    # Vectorised tallies instead of one Python-level pass per category.
    n_lower = int((vals < lower).sum())
    n_higher = int((vals > upper).sum())
    n_total = n_lower + n_higher
    n_kept = int(((vals >= lower) & (vals <= upper)).sum())

    # Share of outliers among all counted observations; 0 when nothing was counted.
    n_obs = n_total + n_kept
    pct_outliers = round((n_total / n_obs) * 100, 4) if n_obs else 0.0

    print('Mean: ', data_mean)
    print('StD: ', data_std)
    print('Min. value: ', data_min)
    print('Max. value: ', data_max)
    print('Identified lower outliers: %d' % n_lower)
    print('Identified upper outliers: %d' % n_higher)
    print('Total outlier observations: %d' % n_total)
    print('Non-outlier observations: %d' % n_kept)
    print("% outliers: ", pct_outliers)

In [None]:
def autolabel(rects, xpos='center', ax=None):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}.

    *ax* is the axes to draw the labels on. When omitted, each label is
    drawn on the axes its bar belongs to (``rect.axes``), so the function
    does not depend on a global ``ax`` variable.
    """

    xpos = xpos.lower()  # normalize the case of the parameter
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    for rect in rects:
        target = ax if ax is not None else rect.axes
        height = rect.get_height()
        # Place the text slightly (1%) above the top of the bar.
        target.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
                    '{}'.format(height), ha=ha[xpos], va='bottom')


### Reading Data

In [None]:
train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')


In [None]:
print('Train_identity: ', train_identity.shape)
print('Train_transaction: ', train_transaction.shape)
print('Test_identity: ', test_identity.shape)
print('Test_transaction: ', test_transaction.shape)

### Transaction Table

* TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
* TransactionAmt: transaction payment amount in USD
* ProductCD: product code, the product for each transaction
* card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
* addr: address
* dist: distance
* P_emaildomain and R_emaildomain: purchaser and recipient email domain
* C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
* D1-D15: timedelta, such as days between previous transaction, etc.
* M1-M9: match, such as names on card and address, etc.
* Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203


In [None]:
train_transaction.head()

### Categorical Features:

* ProductCD
* card1 - card6
* addr1, addr2
* P_emaildomain
* R_emaildomain
* M1 - M9

https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203

In [None]:
train_identity.head()

In [None]:
train_transaction.head()

In [None]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [None]:
print('Train: ', train.shape)
print('Test: ', test.shape)


In [None]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
del train_transaction
del train_identity
del test_transaction
del test_identity
gc.collect()

## Data Exploration

See this link for data description:

https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203

In [None]:
print('Columns with at least one null value (train):', train.isnull().any().sum())
print('Columns with at least one null value (test):', test.isnull().any().sum())
print('Columns with all null values (train):', train.isnull().all().sum())
print('Columns with all null values (test):', test.isnull().all().sum())


Unique Values in Columns

In [None]:
# List columns that hold exactly one distinct non-null value (nunique ignores NaN).
# The previous labels described null counts, which is not what is computed here.
print('Columns with a single unique value (train):', [col for col in train.columns if train[col].nunique() == 1])
print('Columns with a single unique value (test):', [col for col in test.columns if test[col].nunique() == 1])

Top Null Columns By Percentage

In [None]:
# Null count and null percentage per train column, most-null first.
train_stats = (
    train.isnull().sum()
    .rename_axis('cols')
    .reset_index(name='nulls')
    .assign(percent_nulls=lambda counts: 100.0 * counts['nulls'] / train.shape[0])
    .sort_values(by='nulls', ascending=False)
    .reset_index(drop=True)
)
train_stats.head(n=20)

In [None]:
tablestats(train)

### Target Label: isFraud

In [None]:
f, ax = plt.subplots(1, 1, figsize=(6,4))
plt.rcParams.update({'font.size': 10})

col = 'isFraud'
print(train[col].value_counts(dropna=False, normalize = True))

train[col].value_counts(dropna=False).plot(kind='bar', ax = ax)
ax.set_title(col)


## ID Columns

“id01 to id11 are numerical features for identity, which is collected by Vesta and security partners such as device rating, ip_domain rating, proxy rating, etc. Also it recorded behavioral fingerprint like account login times/failed to login times, how long an account stayed on the page, etc. All of these are not able to elaborate due to security partner T&C.”

In [None]:
# Numeric identity columns: name starts with 'id' and dtype is not object.
num_id_cols = [col for col in train.columns
               if train[col].dtype != 'O' and col[0:2] == 'id']

print('Numerical ID columns: ', len(num_id_cols))
print(num_id_cols)
print('='*60)
print('id_yy columns with at least one null =', train[num_id_cols].isnull().any().sum())

cols = 4
rows = math.ceil(len(num_id_cols)/cols)

font = {'weight' : 'normal',
        'size' : 18}

matplotlib.rc('font', **font)

# squeeze=False keeps `ax` two-dimensional even when a single row is enough.
f, ax = plt.subplots(rows, cols, figsize = (40,40), squeeze = False)
plt.rcParams.update({'font.size': 16})

for i, col in enumerate(num_id_cols):
    sns.distplot(a = train[col], ax = ax[i // cols][i % cols], kde = False)

# Hide only the panels that received no plot. Blindly hiding ax[-1][-1]
# would erase a real histogram whenever the grid is exactly full.
for j in range(len(num_id_cols), rows * cols):
    ax[j // cols][j % cols].axis('off')
plt.show()


Some columns that appear to be mostly single-valued in the plots above

In [None]:
cols = ['id_03', 'id_04', 'id_10']
for col in cols:
    print('Column:', col)
    print(train[col].value_counts(dropna=False, normalize=True).head(n = 10))
    print('='*60)


#### Categorical ID Columns with greater than 5 unique values

In [None]:
cat_id_cols = []
for col in train.columns:
    if train[col].dtype == 'O' and col[0:2] == 'id':
        if train[col].nunique() <= 5:
            cat_id_cols.append(col)
        else:
            print('Column:', col)
            print(train[col].value_counts(dropna=False, normalize=True).head(n=10))
            print('='*60)


#### Categorical ID Columns with <= 5 unique values

In [None]:
print('Categorical ID columns: ', len(cat_id_cols))
print(cat_id_cols)

cols = 3
rows = math.ceil(len(cat_id_cols)/cols)

# squeeze=False keeps `ax` two-dimensional even when everything fits in one row.
f, ax = plt.subplots(rows, cols, figsize = (30,15), squeeze = False)

for i, col in enumerate(cat_id_cols):
    panel = ax[i // cols][i % cols]
    # dropna=False so missing values get their own bar.
    train[col].value_counts(dropna=False).plot(kind='barh', ax = panel, fontsize = 18)
    panel.set_title('Counts of ' + col, fontsize = 18)

# Blank out grid cells that have no column to show.
for j in range(len(cat_id_cols), rows * cols):
    ax[j // cols][j % cols].axis('off')

f.tight_layout()
plt.show()

In [None]:
#Selecting crowded x-axis
cols = ['id_30', 'id_31', 'id_33']
for col in cols:
    print('Column:', col)
    print(train[col].value_counts(dropna=False, normalize=True).head(n=10))
    print('='*60)


### Card Columns

card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.



In [None]:
# Numeric card columns: name starts with 'card' and dtype is not object.
sel_cols = [col for col in train.columns
            if train[col].dtype != 'O' and col.startswith('card')]

print('Numerical card columns: ', len(sel_cols))
print(sel_cols)
print('='*60)

cols = 2
rows = math.ceil(len(sel_cols)/cols)

font = {'weight' : 'normal',
        'size' : 14}

matplotlib.rc('font', **font)

# squeeze=False keeps `ax` two-dimensional even when a single row is enough.
f, ax = plt.subplots(rows, cols, figsize = (15,10), squeeze = False)
plt.rcParams.update({'font.size': 14})

for i, col in enumerate(sel_cols):
    sns.distplot(a = train[col], ax = ax[i // cols][i % cols], kde = False)

# Blank out grid cells that have no column to show.
for j in range(len(sel_cols), rows * cols):
    ax[j // cols][j % cols].axis('off')

f.tight_layout()
plt.show()


In [None]:
tablestats(train[sel_cols])

In [None]:
train[sel_cols].quantile([.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).reset_index()

In [None]:
sel_cols = ['card4', 'card6']

f, ax = plt.subplots(1, 2, figsize = (20,4))

for i, col in enumerate(sel_cols):
    # Fill missing values once and use the same series for both layers, so the
    # "NA" category has a count bar to go with its fraud-rate point.
    filled = train[col].fillna("NA")

    # Row-normalised crosstab: percentage of fraud / non-fraud within each category.
    ct = pd.crosstab(filled, train['isFraud'], normalize='index', dropna=False) * 100
    ct = ct.reset_index()
    ct = ct.rename(columns={0:'NoFraud', 1:'Fraud'})

    sns.countplot(x=filled, hue=train['isFraud'], ax = ax[i], order = ct[col])

    ax[i].set_title('Counts of ' + col, fontsize = 14)

    # Secondary y-axis carries the fraud percentage per category.
    bx = ax[i].twinx()
    sns.pointplot(x=col, y='Fraud', data=ct, color='r', linestyles = '--', markers = 'x', ax = bx, order = ct[col])

# countplot already draws the isFraud legend on each panel. A trailing
# plt.legend() would target the last twin axis, which has no labelled artists.
f.tight_layout()
plt.show()

### Addr1 and Addr2

Addr1: Zip Code

Addr2: Country

In [None]:
train[['addr1', 'addr2']].quantile([.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).reset_index()

In [None]:
tablestats(train[['addr1', 'addr2']])

### M columns

> M1-M9: match, such as names on card and address, etc. Mx is attribute of matching check, e.g. is phone areacode matched with billing zipcode, purchaser and recipient first/or last name match, etc.

In [None]:
sel_cols = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

cols = 3
rows = math.ceil(len(sel_cols)/cols)

f, ax = plt.subplots(rows, cols, figsize = (30,25), squeeze = False)

for i, col in enumerate(sel_cols):
    panel = ax[i // cols][i % cols]

    # NOTE: this overwrites NaN with "Missing" in train and test themselves,
    # so every later cell sees the filled values.
    train[col] = train[col].fillna("Missing")
    test[col] = test[col].fillna("Missing")

    # Row-normalised crosstab: percentage of fraud / non-fraud within each category.
    ct = pd.crosstab(train[col], train['isFraud'], normalize='index', dropna=False) * 100
    ct = ct.reset_index()
    ct = ct.rename(columns={0:'NoFraud', 1:'Fraud'})

    sns.countplot(x=train[col], hue=train['isFraud'], ax = panel, order = ct[col])
    # Place the legend on this panel explicitly; plt.legend() acts on whichever
    # axes happens to be current, which is not the panel just drawn.
    panel.legend(loc='upper right')
    panel.set_title('Counts of ' + col, fontsize = 18)

    # Secondary y-axis carries the fraud percentage per category.
    bx = panel.twinx()
    sns.pointplot(x=col, y='Fraud', data=ct, color='r', linestyles = '--', markers = 'x', ax = bx, order = ct[col])

f.tight_layout()
plt.show()

### Transaction Amount

In [None]:
f, ax = plt.subplots(3, 2, figsize = (15,10))

# Row 1: raw amount histograms. Each histogram is labelled so that the
# legend() calls have an artist to show.
sns.distplot(train['TransactionAmt'], kde = False, ax = ax[0][0], label = 'Train')
ax[0][0].set_title('Transaction Amount (Train)')
ax[0][0].legend()

sns.distplot(test['TransactionAmt'], kde = False, ax = ax[0][1], label = 'Test')
ax[0][1].set_title('Transaction Amount (Test)')
ax[0][1].legend()

# Row 2: natural-log amounts, to compress the long right tail.
sns.distplot(np.log(train['TransactionAmt']), kde = False, ax = ax[1][0], label = 'Train')
ax[1][0].set_title('Log Transaction Amount (Train)')
ax[1][0].legend()

sns.distplot(np.log(test['TransactionAmt']), kde = False, ax = ax[1][1], label = 'Test')
ax[1][1].set_title('Log Transaction Amount (Test)')
ax[1][1].legend()

# Row 3: sorted amounts plotted against rank.
ax[2][0].plot(np.sort(train[train['isFraud'] == 0]['TransactionAmt'].values), 'bo', label='Normal', alpha=0.3)
ax[2][0].plot(np.sort(train[train['isFraud'] == 1]['TransactionAmt'].values), 'ro', label='Fraud', alpha=0.3)
ax[2][0].set_title('Distribution of Normal/Fraud Transaction Amounts (Train)')
ax[2][0].legend()

ax[2][1].plot(np.sort(train['TransactionAmt'].values), 'bo', label='Train', alpha=0.3)
ax[2][1].plot(np.sort(test['TransactionAmt'].values), 'ro', label='Test', alpha=0.3)
ax[2][1].set_title('Distribution of Transaction Amounts (Train v. Test)')
ax[2][1].legend()

f.tight_layout()

> The log transformation can be used to make highly skewed distributions less skewed. This can be valuable both for making patterns in the data more interpretable and for helping to meet the assumptions of inferential statistics.


In [None]:
# TransactionAmt quantiles for the non-fraud rows of train (isFraud == 0).
train_bins = train[train['isFraud'] == 0]['TransactionAmt'].quantile([.01, .1, .25, .5, .75, .9, .99]).reset_index()
# NOTE: despite its name, `test_bins` is also computed from train; it holds the
# same quantiles for the fraud rows (isFraud == 1). The test set is not used here.
test_bins = train[train['isFraud'] == 1]['TransactionAmt'].quantile([.01, .1, .25, .5, .75, .9, .99]).reset_index()

# Side-by-side table, keyed 'No Fraud' / 'Fraud'.
print(pd.concat([train_bins, test_bins], axis = 1, keys = ['No Fraud', 'Fraud']))

In [None]:
calcoutliers(train['TransactionAmt'])

### ProductCD

ProductCD: product code, the product for each transaction


In [None]:
pcd = pd.crosstab(train['ProductCD'], train['isFraud'], normalize='index') * 100
pcd = pcd.reset_index()
pcd = pcd.rename(columns={0:'NoFraud', 1:'Fraud'})
pcd
# tmp = tmp.reset_index()
# tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)


In [None]:
fontsize = 14
col = 'ProductCD'
# Category order shared by the count bars and the fraud-rate line.
product_order = ['W', 'C', "R", "H", "S"]
f, ax = plt.subplots(1, 2, figsize=(14,5))

# Left panel: overall counts, each bar annotated with its share of all rows.
g1 = train[col].value_counts(dropna=False).plot(kind='bar', fontsize = 14, ax = ax[0], rot = 0)
for p in g1.patches:
    g1.annotate(str(np.round(100.0 * p.get_height()/train.shape[0],decimals=2)) + '%', (p.get_x()+p.get_width()/2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
g1.set_xlabel(col, fontsize = fontsize)
g1.set_ylabel("Count", fontsize = fontsize)
g1.set_title(col + " Distribution", fontsize=fontsize+2)
g1.set_ylim(0,500000)

# Right panel: counts split by target, with the fraud percentage (from `pcd`)
# overlaid on a secondary y-axis.
sns.countplot(x=col, hue='isFraud', data=train, ax = ax[1], order=product_order)
# Address the panel directly instead of relying on pyplot's current axes.
ax[1].legend(title='Fraud', loc='best', labels=['No', 'Yes'])
ax[1].set_xlabel(col, fontsize = fontsize)
ax[1].set_ylabel("Count", fontsize = fontsize)
ax[1].set_title("Transaction by " + col + " and Fraud", fontsize=fontsize+2)
ax[1].set_ylim(0,500000)
g2 = ax[1].twinx()
# `linestyles` is pointplot's documented parameter name.
sns.pointplot(x=col, y='Fraud', data=pcd, color='r', linestyles = '--', markers = 'x', order=product_order, legend=False, ax = g2)
g2.set_ylabel("% of Fraud Transactions", fontsize=fontsize)

f.tight_layout()

In [None]:
f, ax = plt.subplots(2, 1, figsize=(15,10))

# Build the filtered frame once, keeping only the three columns the plots
# use, instead of copying the full-width frame for each plot.
amt_capped = train.loc[train['TransactionAmt'] <= 2000, ['ProductCD', 'TransactionAmt', 'isFraud']]

sns.boxplot(x='ProductCD', y='TransactionAmt', hue='isFraud',
            data=amt_capped, ax=ax[0])
ax[0].set_title("Transaction Amount Distribution by ProductCD and Target (boxplot)", fontsize=14)
ax[0].set_xlabel("ProductCD Name", fontsize=17)
ax[0].set_ylabel("Transaction Amt Values", fontsize=14)

sns.boxenplot(x='ProductCD', y='TransactionAmt', hue='isFraud',
              data=amt_capped, ax=ax[1])
ax[1].set_title("Transaction Amount Distribution by ProductCD and Target (boxenplot)", fontsize=14)
ax[1].set_xlabel("ProductCD Name", fontsize=17)
ax[1].set_ylabel("Transaction Amt Values", fontsize=14)

f.tight_layout()

[Boxenplots](https://vita.had.co.nz/papers/letter-value-plot.pdf)

Source: [Interpreting Boxplots](https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51)

![Comparison of a boxplot of a nearly normal distribution and a probability density function (pdf) for a normal distribution](https://miro.medium.com/max/700/1*NRlqiZGQdsIyAu0KzP7LaQ.png)


### Some more columns

P_emaildomain and R_emaildomain: purchaser and recipient email domains.

Certain transactions don't need recipient, so R_emaildomain is null.

In [None]:
sel_cols = ['id_30', 'id_31', 'id_33', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain']

cols = 2
rows = math.ceil(len(sel_cols)/cols)

f, ax = plt.subplots(rows, cols, figsize = (30,30))
#plt.rcParams.update({'font.size': 22})

for i, col in enumerate(sel_cols):
    train[col].value_counts(dropna=False).head(n=15).plot(kind='barh', ax = ax[int(i/cols)][i%cols], fontsize = 18)
    ax[int(i/cols)][i%cols].set_title('Counts of ' + col, fontsize = 18)

ax[-1][-1].axis('off')
f.tight_layout()
plt.show()

### Transaction Time

In [None]:
plt.hist(train['TransactionDT'], label='train');
plt.hist(test['TransactionDT'], label='test');
plt.legend();
plt.title('Distribution of transaction dates');

### Emails

In [None]:
sel_cols = ['P_emaildomain', 'R_emaildomain']

f, ax = plt.subplots(2, 1, figsize = (15,15))

for i, col in enumerate(sel_cols):
    # Fill missing values once and use the same series for both layers, so the
    # "NA" category has a count bar whenever it appears in the ordering.
    filled = train[col].fillna("NA")

    # Row-normalised crosstab, then keep the 35 domains with the highest fraud rate.
    ct = pd.crosstab(filled, train['isFraud'], normalize='index', dropna=False) * 100
    ct = ct.reset_index()
    ct = ct.rename(columns={0:'NoFraud', 1:'Fraud'})
    ct = ct.sort_values(by = 'Fraud', ascending = False).head(35)

    sns.countplot(x=filled, hue=train['isFraud'], ax = ax[i], order = ct[col])

    ax[i].set_title('Counts of ' + col, fontsize = 14)
    plt.setp(ax[i].get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Secondary y-axis carries the fraud percentage per domain.
    bx = ax[i].twinx()
    sns.pointplot(x=col, y='Fraud', data=ct, color='r', linestyles = '--', markers = 'x', ax = bx, order = ct[col])

# countplot already draws the isFraud legend on each panel. Trailing
# plt.legend()/plt.xticks() calls would only touch the last twin axis.
f.tight_layout()
plt.show()

In [None]:
# Split each e-mail domain into up to three dot-separated tokens (e.g. abc.com.au).
# n=2 caps the split at three parts and reindex pads missing parts with NaN, so
# the three-column assignment always receives exactly three columns.
for frame in (train, test):
    for prefix in ('P_emaildomain', 'R_emaildomain'):
        parts = frame[prefix].str.split('.', n=2, expand=True).reindex(columns=range(3))
        frame[[prefix + '_1', prefix + '_2', prefix + '_3']] = parts

Splitting emails into 3 tokens: e.g. abc.com.au

In [None]:
# Split each e-mail domain into up to three dot-separated tokens (e.g. abc.com.au).
# n=2 caps the split at three parts and reindex pads missing parts with NaN, so
# the three-column assignment always receives exactly three columns.
# Running this more than once is harmless: the same six columns are overwritten.
for frame in (train, test):
    for prefix in ('P_emaildomain', 'R_emaildomain'):
        parts = frame[prefix].str.split('.', n=2, expand=True).reindex(columns=range(3))
        frame[[prefix + '_1', prefix + '_2', prefix + '_3']] = parts

## Preparing Data for Modeling

As in this [kernel](https://www.kaggle.com/artgor/eda-and-models), we will drop columns that have a huge share of null values (>90%)

In [None]:
train.columns = [x.replace('-','_') for x in train.columns]
test.columns = [x.replace('-','_') for x in test.columns]

In [None]:
train_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
test_null_cols = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]
print('Train columns with 90% or more null values')
print(train_null_cols)
print('='*60)
print('Test columns with 90% or more null values')
print(test_null_cols)
print('='*60)

dist1/2: distances between (not limited) billing address, mailing address, zip code, IP address, phone area, etc.

In [None]:
train_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9 and col != 'isFraud']
test_top_value_cols = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

In [None]:
print('Train columns with 90% or more of same values')
print(train_top_value_cols)
print('='*60)
print('Test columns with 90% or more of same values')
print(test_top_value_cols)
print('='*60)

In [None]:
train_one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
test_one_value_cols = [col for col in test.columns if test[col].nunique() <= 1]

In [None]:
print('Train single value columns')
print(train_one_value_cols)
print('='*60)
print('Test single value columns')
print(test_one_value_cols)
print('='*60)

In [None]:
remove_cols = np.sort(np.unique(train_null_cols + test_null_cols + train_top_value_cols + test_top_value_cols + train_one_value_cols + test_one_value_cols))
print(remove_cols)

In [None]:
train = train.drop(columns = remove_cols)
test = test.drop(columns = remove_cols)

From [Discussion](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203)

**Categorical Features (Transaction Table):**
* ProductCD
* card1 - card6
* addr1, addr2
* P_emaildomain
* R_emaildomain
* M1 - M9

**Categorical Features (Identity Table):**
* DeviceType
* DeviceInfo
* id_12 - id_38



In [None]:
cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 
            'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 
            'DeviceType', 'DeviceInfo', 'ProductCD', 'card1', 'card2', 'card3',  'card4', 'card5', 'card6',
            'P_emaildomain', 'R_emaildomain', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
            'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']

cat_cols = [x for x in cat_cols if x in train.columns]
print(cat_cols)

In [None]:
# Integer-encode every categorical column, overwriting it in train and test.
for col in cat_cols:
    # A fresh encoder per column.
    lbl = LabelEncoder()
    # Cast to str first so NaN becomes the literal 'nan' and gets its own code.
    train_vals = train[col].astype(str).values
    test_vals  = test[col].astype(str).values
    # Fit on train and test values together so both share one mapping and
    # transform() never meets a label it has not seen.
    lbl.fit(list(train_vals) + list(test_vals))
    train[col] = lbl.transform(list(train_vals))
    test[col] = lbl.transform(list(test_vals))


In [None]:
# Sort train by time once and derive both X and y from that single ordering,
# so features and target are guaranteed to stay row-aligned.
train_sorted = train.sort_values('TransactionDT')

X = train_sorted.drop(columns = ['isFraud', 'TransactionDT', 'TransactionID'])
X = X.replace([np.inf, -np.inf], np.nan)

y = train_sorted['isFraud']
del train_sorted

# Test rows are deliberately left in their original order.
# NOTE(review): predictions are later written positionally into
# sample_submission.csv, which presumably shares this order - confirm.
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1)
# replace() returns a new frame; assign it back, otherwise the infinities stay.
X_test = X_test.replace([np.inf, -np.inf], np.nan)

#del train
# Keep only the identifying columns of test. This makes the cell
# non-re-runnable: a second run fails because the feature columns are gone.
test = test[["TransactionDT", 'TransactionID']]

In [None]:
train_cols_with_nulls = X.columns[X.isnull().any()].tolist()
test_cols_with_nulls = X_test.columns[X_test.isnull().any()].tolist()
cols_with_nulls = np.unique(train_cols_with_nulls + test_cols_with_nulls)
len(cols_with_nulls)

Some good discussion on metric for imbalanced datasets:

https://stats.stackexchange.com/questions/222558/classification-evaluation-metrics-for-highly-imbalanced-data


### Untuned Decision Tree

In [None]:
# Baseline tree. Columns containing nulls in train or test are dropped for this
# model; random_state pins tie-breaking so the run is repeatable.
clf = DecisionTreeClassifier(max_depth = 20, random_state = 42)
clf.fit(X.drop(columns = cols_with_nulls), y)

# Probability of the positive (fraud) class.
y_preds = clf.predict_proba(X_test.drop(columns = cols_with_nulls))[:,1]

sub = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv')
sub['isFraud'] = y_preds
# index=False: do not write the row index as an extra column in the
# submission file (the XGBoost submission is written the same way).
sub.to_csv('DT_model.csv', index = False)

In [None]:
# Rank the features the decision tree was trained on by their importance scores.
featuresRank = pd.DataFrame()
# Same column selection as was passed to clf.fit, so names and scores line up.
featuresRank['features'] = X.drop(columns = cols_with_nulls).columns
featuresRank['scores'] = clf.feature_importances_
# reset_index() without drop=True keeps the pre-sort position as an extra 'index' column.
featuresRank = featuresRank.sort_values(by=['scores'], ascending=False).reset_index()
# One bar per feature, highest score first.
featuresRank.plot(x = 'features', y = 'scores', kind="bar", fontsize=12, figsize=(25,5))

### Random Forest

In [None]:
# clf_params = {
#     #'criterion': ["gini", "entropy"], 
#     #'max_depth': range(2,20,2), 
#     #'min_samples_split': range(2, 50, 4), 
#     #'min_samples_leaf' : range(2, 50, 4), 
#     'n_estimators': [100,200], 
#     #'max_features': ['auto', 'log2', None]
# }

# cv_split = TimeSeriesSplit(n_splits=2)

# clf = RandomForestClassifier()

# clf_grid = GridSearchCV(clf, clf_params, scoring = 'roc_auc', verbose = True, n_jobs = -1, cv = cv_split)

# X_non_null = X.drop(columns = cols_with_nulls)
# X_test_non_null = X_test.drop(columns = cols_with_nulls)

# clf_grid.fit(X_non_null, y)

# # print(clf_grid.score(X_val, y_val))

# print('Best params: ', clf_grid.best_params_)
# print('Mean cross-validated score of the best_estimator', clf_grid.best_score_)


# #A lot of useful information
# #print(clf_grid.cv_results_)

# #Create submission file
# y_preds = clf_grid.predict_proba(X_test_non_null)[:,1]
# sub['isFraud'] = y_preds
# sub.to_csv('RF_model.csv')



In [None]:
!nvidia-smi

### XGBoost

Learning point for me:

XGBoost deals with missing values by moving examples in the direction of assignment that minimizes the loss. [-Source](https://datascience.stackexchange.com/questions/15305/how-does-xgboost-learn-what-are-the-inputs-for-missing-values) 

Added this to my reading list: [XGBoost: A Scalable Tree Boosting System](https://arxiv.org/pdf/1603.02754v3.pdf)

In [None]:
# Time-ordered folds: X is sorted by TransactionDT, so each validation fold
# lies after its training fold.
tscv = TimeSeriesSplit(n_splits=5)

clf = xgb.XGBClassifier(random_state = 42, tree_method = 'gpu_hist', verbosity = 2,
                        objective='binary:logistic')#, eval_metric = 'auc')

# Exhaustive grid: 7 * 5 * 3 * 3 * 6 = 1890 candidates, i.e. 9450 fits with 5 folds.
clf_params = {'max_depth' : np.arange(2,16,2),
              'subsample' : np.arange(0.5,1.0,0.1),
              'n_estimators' : [100,200,500],
              'eta' : [0.1, 0.3, 0.5],
              'min_child_weight': [1,2,4,8,16,32]
             }

# NOTE(review): n_jobs=-1 starts one worker per CPU core while every fit uses
# tree_method='gpu_hist'; the workers presumably share a single GPU - confirm
# that this does not exhaust GPU memory.
clf_grid = GridSearchCV(clf, clf_params, scoring = 'roc_auc', verbose = True, n_jobs = -1, cv = tscv)

clf_grid.fit(X, y)

print('Best params: ', clf_grid.best_params_)
print('Mean cross-validated score of the best_estimator', clf_grid.best_score_)

# Probability of the positive (fraud) class from the refit best estimator.
y_preds = clf_grid.predict_proba(X_test)[:,1]

# Load the submission template here so this cell does not depend on the
# decision-tree cell having created `sub`.
sub = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv('XGB_model.csv', index = False)

In [None]:
# Per-feature scores from the best estimator's booster (importance_type="weight").
feature_importance = clf_grid.best_estimator_.get_booster().get_score(importance_type="weight")
keys = list(feature_importance.keys())
values = list(feature_importance.values())

# One row per feature, highest score first.
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

# Show the 20 highest-scoring features
data.head(20)