In [None]:
import matplotlib.pyplot as plt
#import plotly.graph_objs as go
#import plotly.tools as tls
import seaborn as sns
import xgboost as xgb
import pandas as pd
import numpy as np
import datetime
import time
import gc
import os

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from sklearn.model_selection import KFold, TimeSeriesSplit, StratifiedKFold
from sklearn.metrics import roc_auc_score,make_scorer
#from plotly.offline import iplot, init_notebook_mode
from xgboost import XGBClassifier, plot_importance
from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
from sklearn import preprocessing


In [None]:
# List every file available under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# Read the two raw training tables used by the exploratory cells below.
df_id, df_transactions = (
    pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv'),
    pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv'),
)


# **Exploratory Data Analysis**

In [None]:
def display_all(df):
    """Render ``df`` with the pandas row and column display limits raised to 1000.

    The limits are only raised for the duration of the call; the previous
    display options are restored afterwards.
    """
    wide_view = pd.option_context(
        "display.max_rows", 1000,
        "display.max_columns", 1000,
    )
    with wide_view:
        display(df)
            
def reduce_memory_usage(df, verbose=True):
    """
    Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Integer columns are tried against int8/int16/int32/int64 and float columns
    against float16/float32/float64; the first dtype whose [min, max] limits
    contain the column's observed minimum and maximum is used. Non-numeric
    columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    The range check says nothing about precision: a float column that fits the
    float16 range is stored as float16, which keeps only about three
    significant decimal digits.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype limit (e.g. 127 for
            # int8) still fits. All-NaN columns fail every comparison and are
            # therefore left as they are.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized starting footprint cannot divide by zero.
        reduction = ((start_mem - end_mem) / start_mem) * 100 if start_mem else 0.0
        print(f'Mem. usage decreased to {end_mem} Mb {reduction} % reduction.')
    return df
                    
def describe_table(df):
    """Print the shape of ``df`` and display a per-column summary table.

    The summary has one row per column of ``df`` with its dtype, number of
    missing values, number of unique values and the values found in the first
    three rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        The summary table that was displayed.
    """
    print(f'Dataset Shape is: {df.shape}')
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary.rename(columns={'index':'Name'}, inplace=True)
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Sample rows by position so the summary works for any index labels, and
    # fall back to NaN when the frame has fewer than three rows.
    sample_labels = ('First Values', 'Second Values', 'Third Values')
    for position, label in enumerate(sample_labels):
        summary[label] = df.iloc[position].values if position < len(df) else np.nan

    display(summary)

    return summary

def countplot(df, x_value='', y_value='', title='',xlabel='', ylabel='', 
              legend=False, legend_title='',legend_labels='', point_plot=False,
              point_y_label='', point_x_value='', point_y_value='', hue=False):
    """Draw a count plot of ``df[x_value]`` with a percentage label on every bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    x_value : str
        Column whose categories form the bars.
    y_value : str
        Column used as ``hue`` (when ``hue`` is True) and as the column
        dimension of the crosstab behind the point plot.
    title, xlabel, ylabel : str
        Title and axis labels of the count plot.
    legend : bool
        When True, draw a legend with ``legend_title`` and ``legend_labels``.
    point_plot : bool
        When True, overlay a black point plot on a twin y-axis. The plotted
        values come from ``crosstab(df[x_value], df[y_value])`` normalised per
        row and multiplied by 100; ``point_x_value`` / ``point_y_value`` name
        the crosstab columns to plot and ``point_y_label`` labels the axis.
    hue : bool
        When True, split every bar by ``y_value``.

    Each bar label is the bar height divided by the number of rows in ``df``.
    """
    plt.figure(figsize=(14,22))
    total = df.shape[0]
    plt.subplot(413)

    # One category order shared by the bars and the overlay, so a point always
    # sits on top of the bar of the same category.
    category_order = df[x_value].dropna().unique()

    if hue == True:
        g = sns.countplot(x=x_value, hue=y_value, data=df, order=category_order)
    elif point_plot == True:
        g = sns.countplot(x=x_value, data=df, order=category_order)
    else:
        # No overlay to align with: keep seaborn's default bar order.
        g = sns.countplot(x=x_value, data=df)

    g.set_title(title, fontsize=16)
    g.set_xlabel(xlabel, fontsize=15)
    g.set_ylabel(ylabel, fontsize=15)
    for p in g.patches:
        height = p.get_height()
        if not np.isfinite(height):
            # Bars without data have no finite position to anchor a label to.
            continue
        g.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/total*100),
                ha="center", fontsize=10)

    if legend == True:
        plt.legend(title=legend_title, loc='best', labels=legend_labels)

    if point_plot == True:
        tmp = pd.crosstab(df[x_value], df[y_value], normalize='index')*100
        tmp = tmp.reset_index()

        gt = g.twinx()
        # Draw on the twin axis explicitly instead of relying on it being the
        # current axes.
        sns.pointplot(x=point_x_value, y=point_y_value, data=tmp, color='black',
                      order=category_order, legend=False, ax=gt)
        gt.set_ylabel(point_y_label, fontsize=15)

    plt.show()

def barplot(df, x_value, y_value, title, xlabel, ylabel, total):
    """Draw a bar plot of ``y_value`` per ``x_value`` and label each bar.

    Every bar is annotated with its height expressed as a percentage of
    ``total``, formatted with two decimals.
    """
    plt.figure(figsize=(14, 22))
    plt.subplot(413)

    axis = sns.barplot(data=df, x=x_value, y=y_value, dodge=True)
    axis.set_title(title, fontsize=20)
    axis.set_xlabel(xlabel, fontsize=18)
    axis.set_ylabel(ylabel, fontsize=18)

    for bar in axis.patches:
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2
        share_text = '{:.2f}%'.format(bar_height / total * 100)
        axis.text(label_x, bar_height + 3, share_text, ha='center', fontsize=13)

    plt.show()

def boxenplot(df, x_value, y_value, legend, title, x_label, y_label):
    """Draw a boxen plot of ``y_value`` per ``x_value``, split by the ``legend`` column."""
    plt.figure(figsize=(14, 10))
    plt.subplot(212)

    axis = sns.boxenplot(data=df, x=x_value, y=y_value, hue=legend)
    axis.set_title(title, fontsize=20)
    axis.set_xlabel(x_label, fontsize=17)
    axis.set_ylabel(y_label, fontsize=17)

    plt.subplots_adjust(hspace=0.6, top=0.85)
    plt.show()

def calculate_outliers(df_num):
    """Print how many values lie more than three standard deviations from the mean.

    Parameters
    ----------
    df_num : pandas.Series or array-like of numbers
        Values to analyse. Missing values are ignored.

    The report lists the counts below the lower cut, above the upper cut, the
    total number of outliers, the number of values strictly inside the cuts,
    and the outliers as a percentage of all non-missing observations.
    Nothing is returned.
    """
    values = pd.Series(df_num)
    # Population standard deviation (ddof=0), the numpy default.
    data_mean, data_std = values.mean(), values.std(ddof=0)

    cut = data_std * 3

    lower, upper = data_mean - cut, data_mean + cut

    # Vectorised comparisons instead of repeated Python-level passes.
    n_lower = int((values < lower).sum())
    n_upper = int((values > upper).sum())
    n_outliers = n_lower + n_upper
    n_inside = int(((values > lower) & (values < upper)).sum())
    n_observed = int(values.notna().sum())

    print(f'Identified lowest outliers: {n_lower}')
    print(f'Identified upper outliers: {n_upper}')
    print(f'Total outlier observations: {n_outliers}')
    print(f'Non-outlier observations: {n_inside}')

    # Share of outliers among ALL observations. Dividing by the non-outlier
    # count overstates the share and fails when no value lies strictly inside
    # the cuts (e.g. constant data).
    outlier_pct = round((n_outliers / n_observed) * 100, 4) if n_observed else 0.0
    print(f'Total percentual of outliers: {outlier_pct}%')

In [None]:
# Peek at the first rows (transposed so wide tables stay readable), then
# report the table sizes.
for raw_frame in (df_id, df_transactions):
    display_all(raw_frame.head().T)

for raw_frame in (df_id, df_transactions):
    print(raw_frame.shape)

# Downcast numeric columns to save memory.
df_id = reduce_memory_usage(df_id)
df_transactions = reduce_memory_usage(df_transactions)

# Work with the amount as a regular Python-float (float64) column again.
amount_as_float = df_transactions['TransactionAmt'].astype(float)
df_transactions['TransactionAmt'] = amount_as_float
plt.subplots_adjust(hspace=0.6, top=0.85)


In [None]:
#Fraud Distributions
# Total transaction amount per isFraud value, plus the overall total used to
# turn bar heights into percentages.
perc_amt = (
    df_transactions
    .groupby(['isFraud'])['TransactionAmt']
    .sum()
    .reset_index()
)
total_bar = df_transactions['TransactionAmt'].sum()

countplot(
    df=df_transactions,
    x_value="isFraud",
    title="Fraud Transactions Distribution \n# 0: No Fraud | 1: Fraud #",
    xlabel="Is fraud?",
    ylabel="Count",
)
barplot(
    perc_amt,
    "isFraud",
    "TransactionAmt",
    "Total Amount in Transaction Amt \n# 0: No Fraud | 1: Fraud #",
    "Is fraud?",
    "Total Transaction Amount Scalar",
    total_bar,
)


In [None]:
#ProductCD Distribution
# 1) plain counts, 2) counts split by target with the fraud rate overlaid,
# 3) amount distribution per product for transactions of at most 2000.
countplot(
    df=df_transactions,
    x_value="ProductCD",
    title="ProductCD Distribution",
    xlabel="ProductCD Name",
    ylabel="Count",
)
countplot(
    df=df_transactions,
    x_value="ProductCD",
    y_value="isFraud",
    title="Product CD by Target(isFraud)",
    xlabel="ProductCD Name",
    ylabel="Count",
    legend=True,
    legend_title="Fraud",
    legend_labels=["No", "Yes"],
    point_plot=True,
    point_y_label="% of Fraud Transactions",
    point_x_value="ProductCD",
    point_y_value=1,
    hue=True,
)
capped_amounts = df_transactions[df_transactions['TransactionAmt'] <= 2000]
boxenplot(
    capped_amounts,
    "ProductCD",
    "TransactionAmt",
    "isFraud",
    "Transaction Amount Distribuition by ProductCD and Target",
    "ProductCD Name",
    "Transaction Values",
)

In [None]:
#Transaction Amount Quantiles
df_transactions['TransactionAmt'] = df_transactions['TransactionAmt'].astype(float)
transaction_amt = df_transactions['TransactionAmt']
amount_quantiles = [.01, 0.025, .1, 0.25, .5, .75, .9, .975, .99]

print("Transaction Amounts Qantiles:")
print(transaction_amt.quantile(amount_quantiles))

# Same quantiles side by side for fraudulent (1) and legitimate (0) rows.
quantiles_by_target = [
    transaction_amt[df_transactions['isFraud'] == target_value]
    .quantile(amount_quantiles)
    .reset_index()
    for target_value in (1, 0)
]
print(pd.concat(quantiles_by_target, axis=1, keys=['Fraud', 'No Fraud']))

#Transaction Amount Outliers
calculate_outliers(transaction_amt)

In [None]:
#Cards Quantiles
print("Card Features Quantiles: ")
numeric_card_cols = ['card1', 'card2', 'card3', 'card5']
card_quantiles = [0.01, .025, .1, .25, .5, .75, .975, .99]
print(df_transactions[numeric_card_cols].quantile(card_quantiles))

describe_table(df_transactions[['card1', 'card2', 'card3','card4', 'card5', 'card6']])

#Card3 and Card5 Distributions
# Values seen fewer than `min_count` times are pooled into "Others".
for card_col, min_count in (('card3', 200), ('card5', 300)):
    df_temp = df_transactions[card_col].value_counts()
    rare_values = df_temp[df_temp < min_count].index
    df_transactions.loc[df_transactions[card_col].isin(rare_values), card_col] = "Others"

for card_col in ('card3', 'card5'):
    card_label = card_col.capitalize()
    countplot(df=df_transactions, x_value=card_col, y_value="isFraud",
              title=f"{card_label} Values Distribution and % of Fraud Transactions",
              xlabel=f"{card_label} Values", ylabel="Count",
              point_plot=True, point_y_label="% of Fraud Transactions",
              point_x_value=card_col, point_y_value=1)

#Card 4 and Card6 Distribution
for card_col in ('card4', 'card6'):
    card_label = card_col.capitalize()
    countplot(df=df_transactions, x_value=card_col, title=f"{card_label} Distribution",
              xlabel=f"{card_label} Values", ylabel="Count")
    countplot(df=df_transactions, x_value=card_col, y_value="isFraud",
              title=f"{card_label} by Target(isFraud)", xlabel=f"{card_label} Name",
              ylabel="Count", legend=True, legend_title="Fraud",
              legend_labels=["No", "Yes"], point_plot=True,
              point_y_label="% of Fraud Transactions", point_x_value=card_col,
              point_y_value=1, hue=True)
    boxenplot(df_transactions[df_transactions['TransactionAmt'] <= 2000],
              card_col, "TransactionAmt", "isFraud",
              f"Transaction Amount Distribuition by {card_label} and Target",
              f"{card_label} Name", "Transaction Values")


In [None]:
#M Features
m_columns = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']
describe_table(df_transactions[m_columns])

# Give missing flags their own category so they show up in the plots.
df_transactions[m_columns] = df_transactions[m_columns].fillna('Miss')

for col in m_columns:
    countplot(
        df=df_transactions,
        x_value=col,
        y_value="isFraud",
        title=col + " by Target(isFraud)",
        xlabel=col + " Name",
        ylabel="Count",
        legend=True,
        legend_title="Fraud",
        legend_labels=["No", "Yes"],
        point_plot=True,
        point_y_label="% of Fraud Transactions",
        point_x_value=col,
        point_y_value=1,
        hue=True,
    )

In [None]:
#Addr Features
addr_columns = ['addr1', 'addr2']
print('Addr Quantiles: ')
print(df_transactions[addr_columns].quantile([0.01,0.025,0.1,
                                              .25,.5,.75,.90,.99]))
describe_table(df_transactions[addr_columns])

# Pool values that occur at most `max_count` times into 'Others'.
for addr_col, max_count in (('addr1', 5000), ('addr2', 50)):
    addr_counts = df_transactions[addr_col].value_counts()
    infrequent = addr_counts[addr_counts <= max_count].index
    df_transactions.loc[df_transactions[addr_col].isin(infrequent), addr_col] = 'Others'

for addr_col in addr_columns:
    countplot(df=df_transactions, x_value=addr_col, y_value="isFraud",
              title=f"{addr_col} Distribution", xlabel=f"{addr_col} Values", ylabel="Count",
              point_plot=True, point_y_label="% of Fraud Transactions",
              point_x_value=addr_col, point_y_value=1)

In [None]:
#Email Distribution
# Raw domains that are collapsed into one provider label.
email_groups = {
    'Google': ['gmail.com', 'gmail'],
    'Yahoo Mail': ['yahoo.com', 'yahoo.com.mx', 'yahoo.co.uk',
                   'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
                   'yahoo.es'],
    'Microsoft': ['hotmail.com', 'outlook.com', 'msn.com', 'live.com.mx',
                  'hotmail.es', 'hotmail.co.uk', 'hotmail.de',
                  'outlook.es', 'live.com', 'live.fr',
                  'hotmail.fr'],
}
# Labels occurring at most this many times are pooled into "Others".
rare_thresholds = {'P_emaildomain': 500, 'R_emaildomain': 300}

for email_col, rare_threshold in rare_thresholds.items():
    for provider_label, raw_domains in email_groups.items():
        df_transactions.loc[df_transactions[email_col].isin(raw_domains), email_col] = provider_label

    # Counted after grouping, so a provider label can itself become "Others".
    domain_counts = df_transactions[email_col].value_counts()
    rare_domains = domain_counts[domain_counts <= rare_threshold].index
    df_transactions.loc[df_transactions[email_col].isin(rare_domains), email_col] = "Others"

    # Assign the filled column back explicitly: an in-place fillna called on a
    # column accessor operates on a temporary Series and is not guaranteed to
    # update the DataFrame.
    df_transactions[email_col] = df_transactions[email_col].fillna("NoInf")

countplot(df=df_transactions, x_value="P_emaildomain", y_value="isFraud", title="P_emaildomain Distribution", xlabel="P_emaildomain Values", ylabel="Count",
         point_plot=True, point_y_label="% of Fraud Transactions", point_x_value="P_emaildomain", point_y_value=1)
countplot(df=df_transactions, x_value="R_emaildomain", y_value="isFraud", title="R_emaildomain Distribution", xlabel="R_emaildomain Values", ylabel="Count",
         point_plot=True, point_y_label="% of Fraud Transactions", point_x_value="R_emaildomain", point_y_value=1)

In [None]:
# Count plot with the fraud rate overlaid for each of M1..M9.
m_flag_columns = [f'M{number}' for number in range(1, 10)]
for col in m_flag_columns:
    countplot(
        df=df_transactions,
        x_value=col,
        y_value="isFraud",
        title=f"{col} Distribution",
        xlabel=f"{col} Values",
        ylabel="Count",
        point_plot=True,
        point_y_label="% of Fraud Transactions",
        point_x_value=col,
        point_y_value=1,
    )
    

In [None]:
# Load the test-set transaction table from the Kaggle input directory.
df_test_transactions = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')

In [None]:
#it seems that train and test transaction dates don't overlap, so it would be prudent to use time-based split for validation.
for split_frame, split_name in ((df_transactions, 'train'), (df_test_transactions, 'test')):
    plt.hist(split_frame['TransactionDT'], label=split_name)
plt.legend()
plt.title('Distribution of transactiond dates');

# **Feature Engineering**


In [None]:
# Re-read the raw tables: the exploratory cells above modified df_transactions
# in place (pooled categories, filled missing values).
df_id, df_transactions = (
    pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv'),
    pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv'),
)

df_test_transactions, df_test_id = (
    pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv'),
    pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv'),
)

In [None]:
# Replace '-' with '_' in the test identity column names (one rename call
# instead of one per column).
df_test_id.rename(columns=lambda name: name.replace('-', '_'), inplace=True)

In [None]:
# Attach the identity columns to every transaction through the TransactionID
# key. `on=` must not be combined with left_index/right_index: the frames are
# read with a default positional index, so joining on row position would pair
# transactions with unrelated identity rows.
df_train = df_transactions.merge(df_id, how='left', on='TransactionID')
df_test = df_test_transactions.merge(df_test_id, how='left', on='TransactionID')

print(df_train.shape)
print(df_test.shape)

# The merged frames are all that is needed from here on; free the inputs.
del df_transactions, df_id, df_test_transactions, df_test_id

In [None]:
# Downcast the numeric columns of both merged frames to smaller dtypes;
# reduce_memory_usage prints the resulting memory footprint for each frame.
df_train = reduce_memory_usage(df_train)
df_test = reduce_memory_usage(df_test)

In [None]:
# Raw e-mail domain -> provider group.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
          'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
          'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
          'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other',
          'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
          'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 
          'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
          'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
          'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
          'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 
          'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 
          'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
          'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

# Domain endings that are relabelled as 'us'.
us_emails = ['gmail', 'net', 'edu']


def _email_suffix(domain):
    """Return the text after the last '.' of ``domain``, or 'us' for the endings in ``us_emails``."""
    ending = str(domain).split('.')[-1]
    return 'us' if ending in us_emails else ending


# Add a provider-group column and a suffix column for both e-mail fields,
# applying the same transformation to the train and the test frame.
for email_field in ['P_emaildomain', 'R_emaildomain']:
    for frame in (df_train, df_test):
        frame[email_field + '_bin'] = frame[email_field].map(emails)
    for frame in (df_train, df_test):
        frame[email_field + '_suffix'] = frame[email_field].map(_email_suffix)
    

In [None]:
# Amount-based features, built the same way for the train and the test frame.
# Each frame uses its own statistics (mean/std and per-card group statistics).
for frame in (df_train, df_test):
    # reduce_memory_usage may have stored the amount in a low-precision float
    # dtype; compute all statistics on float64 in BOTH frames.
    frame['TransactionAmt'] = frame['TransactionAmt'].astype(float)
    amount = frame['TransactionAmt']

    # Standardised amount. Each frame receives its own z-score; the test
    # values must never be written into the train frame.
    frame['Trans_min_mean'] = (amount - amount.mean()) / amount.std()

    # Amount relative to the mean / standard deviation of all transactions
    # sharing the same card1 / card4 value.
    for statistic in ('mean', 'std'):
        for card_col in ('card1', 'card4'):
            group_stat = frame.groupby([card_col])['TransactionAmt'].transform(statistic)
            frame[f'TransactionAmt_to_{statistic}_{card_col}'] = amount / group_stat

    # The log transform comes last so the ratios above use the raw amount.
    frame['TransactionAmt'] = np.log(frame['TransactionAmt'])


# **Data Cleaning**

In [None]:
def remove_columns_null(df, null_per=0.9, target=''):
    """Drop, in place, every column whose share of missing values exceeds ``null_per``.

    The ``target`` column (when given) is never dropped. Returns the same
    ``df`` object.
    """
    missing_share = df.isnull().sum() / df.shape[0]
    sparse_columns = []
    for name, share in missing_share.items():
        if share <= null_per:
            continue
        if target and name == target:
            continue
        sparse_columns.append(name)
    df.drop(columns=sparse_columns, inplace=True)
    return df
    
def remove_columns_top_values(df, top_values_per=0.9, target=''):
    """Drop, in place, every column dominated by a single value.

    A column is dropped when its most frequent value (missing values count as
    a value) makes up more than ``top_values_per`` of the rows. The ``target``
    column (when given) is never dropped. Returns the same ``df`` object.
    """
    dominated_columns = []
    for name in df.columns:
        frequencies = df[name].value_counts(dropna=False, normalize=True)
        # value_counts sorts descending, so the first entry is the top share.
        if frequencies.values[0] > top_values_per:
            dominated_columns.append(name)
    if target and target in dominated_columns:
        dominated_columns.remove(target)
    df.drop(columns=dominated_columns, inplace=True)
    return df

def convert_categorical(df, target=''):
    """Label-encode, in place, every object-dtype column except ``target``.

    Each column gets its own LabelEncoder fitted on that column's values.
    Returns the same ``df`` object.
    """
    object_columns = [
        name for name in df.columns
        if df[name].dtype == 'object' and name != target
    ]
    for name in object_columns:
        # Encode from a plain list of the column's values.
        raw_values = list(df[name].values)
        df[name] = preprocessing.LabelEncoder().fit_transform(raw_values)
    return df

def pca(df, cols, n_components, prefix='PCA_', rand_seed=4):
    """Replace the columns ``cols`` of ``df`` with their principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The columns in ``cols`` are dropped from it in place.
    cols : list of str
        Columns to feed into the PCA and to remove afterwards.
    n_components : int
        Number of principal components to keep.
    prefix : str, default 'PCA_'
        Prefix of the new column names (``prefix + component number``).
    rand_seed : int, default 4
        ``random_state`` passed to the PCA.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``cols``, with the component columns appended.
    """
    reducer = PCA(n_components=n_components, random_state=rand_seed)
    principal_components = reducer.fit_transform(df[cols])
    # Reuse df's index so the column-wise concat lines the rows up even when
    # df does not carry a default 0..n-1 index.
    principal_df = pd.DataFrame(principal_components, index=df.index)
    principal_df.rename(columns=lambda x: str(prefix)+str(x), inplace=True)
    # Drop the columns passed in `cols` (the function argument), not whatever
    # a global loop variable happens to hold.
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, principal_df], axis=1)
    return df

In [None]:
# Stack train and test so both go through the same cleaning steps; test rows
# are tagged with the placeholder 'test' in the target column.
df_test['isFraud'] = 'test'
df = pd.concat([df_train, df_test], axis=0, sort=False).reset_index(drop=True)

for cleaning_step in (remove_columns_null, remove_columns_top_values):
    df = cleaning_step(df, target='isFraud')
df = convert_categorical(df, target='isFraud')

In [None]:
# Prepare every column whose name contains 'V' for PCA: missing values become
# (column minimum - 2), then the column is rescaled to the [0, 1] range.
mass_v = []
for col in df.columns:
    if 'V' not in col:
        continue
    filled = df[col].fillna(df[col].min() - 2)
    df[col] = minmax_scale(filled, feature_range=(0, 1))
    mass_v.append(col)

# Compress the collected columns into 30 principal components.
df = pca(df, cols=mass_v, n_components=30, prefix='PCA_V')

In [None]:
df = reduce_memory_usage(df)

# Separate the stacked frame again using the 'test' placeholder in isFraud.
is_test_row = df['isFraud'] == 'test'
df_train = df[~is_test_row]
df_test = df[is_test_row].drop(columns='isFraud')

# Order rows by transaction time; features and target come from the same
# sorted frame so they stay aligned.
train_by_time = df_train.sort_values('TransactionDT')
x_train = train_by_time.drop(columns=['isFraud', 'TransactionDT'])
y_train = train_by_time['isFraud'].astype(bool)
x_test = df_test.sort_values('TransactionDT').drop(columns=['TransactionDT'])

del df_train, df_test, train_by_time


In [None]:
# Show the names of the feature columns in the training matrix.
x_train.columns

In [None]:
# Replace the remaining missing values with the sentinel -1 in both feature
# matrices, then release memory that is no longer referenced.
for feature_matrix in (x_train, x_test):
    feature_matrix.fillna(-1, inplace=True)
gc.collect()

# **Parameter Tuning**

In [None]:
def objective(params):
    """Hyperopt objective: negative mean ROC AUC of an XGBoost model over 5 folds.

    Parameters
    ----------
    params : dict
        One sample from the search space with the keys ``max_depth``,
        ``gamma``, ``subsample``, ``reg_alpha``, ``reg_lambda``,
        ``learning_rate`` and ``colsample_bytree``.

    Returns
    -------
    float
        ``-(mean validation ROC AUC)``, so that minimising it maximises AUC.

    Reads the module-level ``x_train`` and ``y_train``.
    """
    time1 = time.time()
    float_keys = ('gamma', 'subsample', 'reg_alpha', 'reg_lambda',
                  'learning_rate', 'colsample_bytree')
    # Keep three decimals per value, but pass real numbers to the estimator
    # rather than their string representation.
    params = {
        'max_depth': int(params['max_depth']),
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    print(f'Params={params}')
    FOLDS = 5
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    score_mean = 0
    for count, (tr_idx, val_idx) in enumerate(skf.split(x_train, y_train), start=1):
        clf = xgb.XGBClassifier(
            n_estimators=600, random_state=4,
            tree_method='gpu_hist', **params
        )
        x_tr, x_vl = x_train.iloc[tr_idx, :], x_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        clf.fit(x_tr, y_tr)
        # Score the positive-class probability directly; this avoids the
        # deprecated make_scorer(..., needs_proba=True) form.
        score = roc_auc_score(y_vl, clf.predict_proba(x_vl)[:, 1])
        print(score)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
    time2 = time.time() - time1
    print(f'Total Time taken to Run:{round(time2/60, 2)}')
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean/FOLDS}')
    return -(score_mean / FOLDS)

# Search space sampled by hyperopt for the XGBoost hyperparameters.
space = dict(
    max_depth=hp.quniform('max_depth', 7, 23, 1),
    gamma=hp.uniform('gamma', 0.01, 0.7),
    reg_alpha=hp.uniform('reg_alpha', 0.01, 0.4),
    reg_lambda=hp.uniform('reg_lambda', 0.01, 0.4),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 0.9),
    subsample=hp.choice('subsample', [0.2,0.4,0.5,0.6,0.7,0.8,0.9]),
)

# Run 20 TPE evaluations of the objective, then translate the raw result
# (e.g. hp.choice indices) back into actual parameter values.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
)
best_params = space_eval(space, best)

# **Modeling**

In [None]:
# Fixed hyperparameters used for the final model (this assignment replaces
# whatever the search cell stored in best_params).
best_params = {
    'colsample_bytree': 0.7119660671941589,
    'gamma': 0.33522898802386575,
    'learning_rate': 0.056727824029570835,
    'max_depth': 20.0,
    'reg_alpha': 0.23710032808707382,
    'reg_lambda': 0.25699791703652,
    'subsample': 0.9,
}
print('Best Params:', best_params)
# max_depth is stored as a float above; convert it to an integer.
best_params['max_depth'] = int(best_params['max_depth'])

In [None]:
# Fit the final classifier on the full training set with the chosen parameters.
clf = xgb.XGBClassifier(n_estimators=600, tree_method='gpu_hist', **best_params)
clf.fit(x_train, y_train)

# Keep the probability of the second class (column 1) for every test row.
test_probabilities = clf.predict_proba(x_test)
y_preds = test_probabilities[:, 1]

In [None]:
# Predict class probabilities for the test rows and keep column 1.
# NOTE(review): the training cell above already computes the same y_preds;
# this cell looks redundant — confirm before removing.
y_preds = clf.predict_proba(x_test)[:, 1]

In [None]:
# Booster feature scores with importance_type='weight', shown as a table
# sorted from highest to lowest score.
feature_important = clf.get_booster().get_score(importance_type='weight')
data = pd.DataFrame(
    {'score': list(feature_important.values())},
    index=list(feature_important.keys()),
)
data = data.sort_values(by='score', ascending=False)
data.head(20)


In [None]:
# Build the submission table: one row per test transaction with its predicted
# fraud probability. y_preds was computed from x_test, so both are in the
# same row order.
sample_submission = pd.DataFrame({
    'TransactionID': x_test['TransactionID'].values,
    'isFraud': y_preds,
})
# index=False keeps the file to the two columns above; without it pandas
# writes the row index as an extra unnamed first column.
sample_submission.to_csv('XGB_model.csv', index=False)