In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries

In [None]:
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# scikit-learn is a rich ML library
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

# library of XGBoost algorithm. You can find one in scikit-learn, too.
import xgboost as xgb

# library for regex
import re

# library for garbage collection
import gc
gc.enable()

# Let's ignore the warnings
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [None]:
%%time
# TransactionID is the key column. We define it as the index column for ease of use.
tr_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv', index_col='TransactionID')
ts_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv', index_col='TransactionID')
tr_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv', index_col='TransactionID')
ts_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv', index_col='TransactionID')
print("Data Loaded!")

In [None]:
# Let's see the size of the tables
print("train transaction shape:", tr_transaction.shape)
print("test transaction shape:", ts_transaction.shape)
print("train identity shape:", tr_identity.shape)
print("test identity shape:", ts_identity.shape)

# Explore Data

In [None]:
# How is the data distributed among the classes?
sns.countplot(tr_transaction['isFraud'], palette='Pastel1')

In [None]:
# What portion of data in transaction table is missing?
missing_values_count = tr_transaction.isnull().sum()
print (missing_values_count)
total_cells = np.product(tr_transaction.shape)
total_missing = missing_values_count.sum()
print (f'{round((total_missing/total_cells) * 100, 2)}% of transaction data is missing!')

In [None]:
# What portion of data in identity table is missing?
missing_values_count = tr_identity.isnull().sum()
print (missing_values_count)
total_cells = np.product(tr_identity.shape)
total_missing = missing_values_count.sum()
print (f'{round((total_missing/total_cells) * 100, 2)}% of identity data is missing!')

In [None]:
# What portion of transactions have an identity record?
print(f'{round(np.sum(tr_transaction.index.isin(tr_identity.index.unique())) / len(tr_transaction) *100, 2)}% of transactions have identity.')

In [None]:
# What is the distribution of transaction's date-time?
fig = px.histogram(tr_transaction, x='TransactionDT', color='isFraud', marginal='box')
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.6)
fig.show()

In [None]:
# Let's plot it in logarithm scale for a better insight
fig, ax = plt.subplots(1, 2, figsize=(18,4))

time_val = tr_transaction.loc[tr_transaction['isFraud'] == 1]['TransactionDT'].values

sns.distplot(np.log(time_val), ax=ax[0], color='r')
ax[0].set_title('Distribution of LOG TransactionDT, isFraud=1', fontsize=14)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])

time_val = tr_transaction.loc[tr_transaction['isFraud'] == 0]['TransactionDT'].values

sns.distplot(np.log(time_val), ax=ax[1], color='b')
ax[1].set_title('Distribution of LOG TransactionDT, isFraud=0', fontsize=14)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])


plt.show()

In [None]:
# Do the date-times of transactions in train and test overlap?
plt.hist(tr_transaction['TransactionDT'], label='train')
plt.hist(ts_transaction['TransactionDT'], label='test')
plt.legend()
plt.title("Histogram of transaction datetime")

In [None]:
# What is the distribution of transactions' amount?
fig = px.histogram(tr_transaction, x='TransactionAmt', color='isFraud', marginal='box')
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.6)
fig.show()

In [None]:
gc.collect()

In [None]:
# Let's plot in logarithm scale for a better insight
fig, ax = plt.subplots(1, 2, figsize=(18,4))

time_val = tr_transaction.loc[tr_transaction['isFraud'] == 1]['TransactionAmt'].values

sns.distplot(np.log(time_val), ax=ax[0], color='r')
ax[0].set_title('Distribution of LOG TransactionAmt, isFraud=1', fontsize=14)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])

time_val = tr_transaction.loc[tr_transaction['isFraud'] == 0]['TransactionAmt'].values

sns.distplot(np.log(time_val), ax=ax[1], color='b')
ax[1].set_title('Distribution of LOG TransactionAmt, isFraud=0', fontsize=14)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])


plt.show()

In [None]:
# M is a categorical feature. Let's explore it.
fig, axes = plt.subplots(3, 3, figsize=(20, 20))
fig.suptitle("Value Counts in M features")
for i in range(3):
    for j in range(3):
        sns.countplot(data=tr_transaction, x=f'M{3*i+j+1}', hue='isFraud', ax=axes[i,j])

In [None]:
# ProductCD is a categorical feature
sns.countplot(data=tr_transaction, x="ProductCD", hue='isFraud')

In [None]:
# How many unique values are there in each card feature in the train set?
plt.figure(figsize=(35, 8))
features = [f'card{i}' for i in range(1, 7)]
uniques = [len(tr_transaction[col].unique()) for col in features]
sns.set(font_scale=1.2)
ax = sns.barplot(features, uniques, log=True)
ax.set(xlabel='Feature', ylabel='unique count', title='Number of unique values per feature TRAIN')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center")

In [None]:
# How many unique values are there in each card feature in the test set?
plt.figure(figsize=(35, 8))
features = [f'card{i}' for i in range(1, 7)]
uniques = [len(ts_transaction[col].unique()) for col in features]
sns.set(font_scale=1.2)
ax = sns.barplot(features, uniques, log=True)
ax.set(xlabel='Feature', ylabel='unique count', title='Number of unique values per feature TEST')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center")

In [None]:
# Visualizing card4 categories
sns.countplot(data=tr_transaction, x='card4', hue='isFraud')

In [None]:
# Visualizing card6 categories
sns.countplot(data=tr_transaction, x='card6', hue='isFraud')

In [None]:
# How many unique values are there in id features in the train set?
plt.figure(figsize=(35, 8))
features = list(tr_identity.columns[0:38])
uniques = [len(tr_identity[col].unique()) for col in features]
sns.set(font_scale=1.2)
ax = sns.barplot(features, uniques, log=True)
ax.set(xlabel='Feature', ylabel='unique count', title='Number of unique values per feature TRAIN')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center")

In [None]:
# How many unique values are there in id features in the test set?
plt.figure(figsize=(35, 8))
features = list(ts_identity.columns[0:38])
uniques = [len(ts_identity[col].unique()) for col in features]
sns.set(font_scale=1.2)
ax = sns.barplot(features, uniques, log=True)
ax.set(xlabel='Feature', ylabel='unique count', title='Number of unique values per feature TEST')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center") 

In [None]:
tr_identity.head()

In [None]:
# Are there categories that exist in the test set but not in the train?
for ft in features[11:]:
    print("Feature:", ft)
    print(set(ts_identity[ft].unique()).difference(set(tr_identity[ft.replace('-', '_')].unique())))
    print("*"*40)

Let's take a look at unique values in some other categorical features

In [None]:
tr_transaction['P_emaildomain'].unique()

In [None]:
tr_transaction['R_emaildomain'].unique()

In [None]:
tr_transaction['addr1'].unique()

In [None]:
tr_transaction['addr2'].unique()

In [None]:
# Visualizing DeviceType feature
sns.countplot(data=tr_identity, x='DeviceType')

In [None]:
tr_identity['DeviceInfo'].unique()

# Feature Engineering

In [None]:
# id features are not named unifyingly in the train and test and should be fixed.
ts_identity.rename(columns={x: x.replace('-', '_') for x in ts_identity.columns[:38]}, inplace=True)

In [None]:
# From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
# WARNING! THIS CAN DAMAGE THE DATA 
def reduce_mem_usage_numeric(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def reduce_mem_usage_cat(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == object and col not in ['DeviceInfo', 'id_30', 'id_31']:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
# Function to drop some columns
def columns2drop(df):
    drop_list = ['id_33', 'P_emaildomain', 'R_emaildomain', 'TransactionDT']
    for col in df.columns:
        if df[col].isnull().sum() / df.shape[0] > 0.9:
            drop_list.append(col)
    return drop_list

In [None]:
# Function to split the email domains into separate columns
def split_email_domains(df):
    df[['P_emaildomain1', 'P_emaildomain2', 'P_emaildomain3']] = df['P_emaildomain'].str.split('.', expand=True)
    df[['R_emaildomain1', 'R_emaildomain2', 'R_emaildomain3']] = df['R_emaildomain'].str.split('.', expand=True)
    for x in ['R', 'P']:
        for i in range(1, 4):
            df[f'{x}_emaildomain{i}'].fillna('', inplace=True)

In [None]:
# Function to generate a numerical column from id_33 which apparently contains string of dimensions
def split_id33(df):
    name = "id-33"
    if not 'id-33' in df.columns:
        name = "id_33"
    df[["height", "width"]] = df[name].str.split('x', expand=True)
    df['height'].fillna(0, inplace=True)
    df['width'].fillna(-1, inplace=True)
    df['aspect_ratio'] = df['height'].astype('uint16') / df['width'].astype('uint16')
    df.drop(['height', 'width'], axis=1, inplace=True)

In [None]:
# function to apply the preprocessing
def preprocessing(df, drop_list):
    split_id33(df)
    split_email_domains(df)
    
    df['Transaction_day_of_week'] = np.floor((df['TransactionDT'] / (3600 * 24) - 1) % 7)
    df['Transaction_hour'] = np.floor(df['TransactionDT'] / 3600) % 24
    
    df.drop(drop_list, axis=1, inplace=True)

In [None]:
# Functions to handle tfidf vectorization
def tokenizer(x):
    return re.split(' ._-/', x)


def tfidf_vectorizer(train_df, test_df, col):
    train_df[col].fillna('Unknown', inplace=True)
    test_df[col].fillna('Unknown', inplace=True)

    tfidf = TfidfVectorizer(decode_error='replace', lowercase=True, strip_accents='ascii', analyzer='char_wb', tokenizer=tokenizer)
    v = tfidf.fit_transform(train_df[col])
    w = tfidf.transform(test_df[col])
    
    tr_tfidf = pd.DataFrame.sparse.from_spmatrix(v, index=train_df.index, columns=[f'{col}_{i}' for i in tfidf.vocabulary_])
    ts_tfidf = pd.DataFrame.sparse.from_spmatrix(w, index=test_df.index, columns=[f'{col}_{i}' for i in tfidf.vocabulary_])
    
    for col in tr_tfidf.columns:
        tr_tfidf[col] = tr_tfidf[col].values.to_dense().astype(np.float16)
        ts_tfidf[col] = ts_tfidf[col].values.to_dense().astype(np.float16)

    tr = pd.concat([train_df, tr_tfidf], axis=1)
    del tr_tfidf, train_df
    ts = pd.concat([test_df, ts_tfidf], axis=1)
    del ts_tfidf, test_df
    
    gc.collect()
    
    return tr, ts

In [None]:
# Join the two tables
train = tr_transaction.merge(tr_identity, how='left', left_index=True, right_index=True)
y_train = train['isFraud'].astype('uint8').copy()
del tr_transaction, tr_identity

test = ts_transaction.merge(ts_identity, how='left', left_index=True, right_index=True)
del ts_transaction, ts_identity

print(f"Shape of train data: {train.shape}, Shape of test data: {test.shape}")

train.head()

In [None]:
gc.collect()

In [None]:
X_train = train.drop('isFraud', axis=1)
del train
gc.collect()

In [None]:
drop_list = columns2drop(X_train)
preprocessing(X_train, drop_list)

In [None]:
preprocessing(test, drop_list)

In [None]:
X_train = reduce_mem_usage_numeric(X_train)
test = reduce_mem_usage_numeric(test)

In [None]:
# Encoding labels in categorical features
for f in X_train.columns:
    if f not in ['DeviceInfo', 'id_30', 'id_31'] and (X_train[f].dtype=='object' or test[f].dtype=='object'):
        lbl = LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        test[f] = lbl.transform(list(test[f].values))

In [None]:
X_train, test = tfidf_vectorizer(X_train, test, 'DeviceInfo')
X_train, test = tfidf_vectorizer(X_train, test, 'id_30')
X_train, test = tfidf_vectorizer(X_train, test, 'id_31')

In [None]:
X_train.drop(['DeviceInfo', 'id_30', 'id_31'], axis=1, inplace=True)
test.drop(['DeviceInfo', 'id_30', 'id_31'], axis=1, inplace=True)

In [None]:
# Now all the features are numerical. Let's fill the missings with -1.
X_train.fillna(-1, inplace=True)
test.fillna(-1, inplace=True)

In [None]:
# Just to ensure no other category is left and if so, reduce its memory
X_train = reduce_mem_usage_cat(X_train)
test = reduce_mem_usage_cat(test)

# Build Model and Evaluate

In [None]:
NFOLDS = 5
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=1400)

y_preds = np.zeros(test.shape[0])
y_oof = np.zeros(X_train.shape[0])
score = 0
  
for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    clf = xgb.XGBClassifier(  # For more info about the parameters, visit https://xgboost.readthedocs.io/en/latest/parameter.html
        n_estimators=500,
        max_depth=12,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        gamma = 0.2,
        alpha = 5,
        missing=-1,
        tree_method='gpu_hist'
    )
    
    X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    clf.fit(X_tr, y_tr)
    y_pred_train = clf.predict_proba(X_vl)[:,1]
    y_oof[val_idx] = y_pred_train
    print("FOLD: ",fold,' AUC {}'.format(roc_auc_score(y_vl, y_pred_train)))
    score += roc_auc_score(y_vl, y_pred_train) / NFOLDS
    y_preds += clf.predict_proba(test)[:,1] / NFOLDS
    
    # delete the excess memory
    del X_tr, X_vl, y_tr, y_vl
    gc.collect()
    
    
print("\nMEAN AUC = {}".format(score))
print("OOF AUC = {}".format(roc_auc_score(y_train, y_oof)))  # OOF stands for out-of-fold

In [None]:
# Get xgBoost importances
importance_dict = {}
for import_type in ['weight', 'gain', 'cover']:
    importance_dict['xgBoost-'+import_type] = clf.get_booster().get_score(importance_type=import_type)
    
# MinMax scale all importances
importance_df = pd.DataFrame(importance_dict).fillna(0)
importance_df = pd.DataFrame(
    MinMaxScaler().fit_transform(importance_df),
    columns=importance_df.columns,
    index=importance_df.index
)

# Create mean column
importance_df['mean'] = importance_df.mean(axis=1)

# Plot the feature importances
importance_df.sort_values('mean', ascending=False).head(25).plot(kind='bar', figsize=(30, 7))

In [None]:
# delete the excess memory
del clf, importance_df
gc.collect()

In [None]:
# Prepare for submission
sub = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv', index_col='TransactionID')
sub['isFraud'] = y_preds
sub.to_csv('submission.csv')
sub.head()

# Resources

[Fraud complete EDA](https://www.kaggle.com/jesucristo/fraud-complete-eda/notebook)

[EDA for CIS Fraud Detection](https://www.kaggle.com/nroman/eda-for-cis-fraud-detection)

[~Almost~ complete Feature Engineering IEEE data](https://www.kaggle.com/kabure/almost-complete-feature-engineering-ieee-data)

[Extensive EDA and Modeling XGB Hyperopt](https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt)