# Tuning Hyper Parameters for XGBoost

In [17]:
import sys, os
sys.path.append('/Users/raynoldng/Projects/ieee-fraud-detection')

import pandas as pd
import numpy as np
from sklearn import preprocessing
import scipy as sp
from scipy import stats

from fraud.tuning import optimize_hyper_parameters
import fraud.utils as utils

# Feature Engineering

In [2]:
def map_emails(df):
    emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
          'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
          'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
          'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other',
          'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
          'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 
          'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
          'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
          'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
          'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 
          'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 
          'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
          'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

    us_emails = ['gmail', 'net', 'edu']

    #https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
    for c in ['P_emaildomain', 'R_emaildomain']:
        df[c + '_bin'] = df[c].map(emails)
        
        df[c + '_suffix'] = df[c].map(lambda x: str(x).split('.')[-1])
        
        df[c + '_suffix'] = df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    return df

def map_transaction_amount(df):
    df['Trans_min_mean'] = df['TransactionAmt'] - df['TransactionAmt'].mean()
    df['Trans_min_std'] = df['Trans_min_mean'] / df['TransactionAmt'].std()

    df['TransactionAmt_to_mean_card1'] = df['TransactionAmt'] / df.groupby(['card1'])['TransactionAmt'].transform('mean')
    df['TransactionAmt_to_mean_card4'] = df['TransactionAmt'] / df.groupby(['card4'])['TransactionAmt'].transform('mean')
    df['TransactionAmt_to_std_card1'] = df['TransactionAmt'] / df.groupby(['card1'])['TransactionAmt'].transform('std')
    df['TransactionAmt_to_std_card4'] = df['TransactionAmt'] / df.groupby(['card4'])['TransactionAmt'].transform('std')

    df['TransactionAmt_log'] = np.log(df['TransactionAmt'])

    df['TransactionAmt_cents'] = df['TransactionAmt'] % 1

    return df


def encode_categorical_features(df_train, df_test):
    # NOTE this is the only feature engineering function that takes in both df_train 
    # df_test, this is because we want values across test and train datasets

    for f in df_train.drop('isFraud', axis=1).columns:
        if df_train[f].dtype=='object' or df_test[f].dtype=='object': 
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(df_train[f].values) + list(df_test[f].values))
            df_train[f] = lbl.transform(list(df_train[f].values))
            df_test[f] = lbl.transform(list(df_test[f].values))       
    
    return df_train, df_test


def drop_V_features(df):
    # applying the PCA is major PITA, for now just drop it
    mas_v = [c for c in df.columns if c.startswith('V')]
    df = df.drop(mas_v, axis=1)

    return df

In [3]:
df_train, df_test = utils.load_data(sample=True)

(10000, 434)
(10000, 433)


In [4]:
df_train_bak, df_test_bak = df_train, df_test
def restore_df():
    global df_train, df_test
    df_train, df_test = df_train_bak, df_test_bak

In [5]:
df_train = map_emails(df_train)
df_test = map_emails(df_test)

df_train = drop_V_features(df_train) # this should be why the training is so slow
df_test = drop_V_features(df_test) # this should be why the training is so slow

df_train = map_transaction_amount(df_train)
df_test = map_transaction_amount(df_test)

df_train, df_test = encode_categorical_features(df_train, df_test)

In [6]:
X_train, y_train = utils.set_X_and_y(df_train)

In [8]:
y_train.head()

27    False
30    False
83    False
84    False
95    False
Name: isFraud, dtype: bool

In [None]:
%%timeit
a = 5
b = a + 6

In [10]:
df_train['isFraud'].value_counts()

0    9632
1     368
Name: isFraud, dtype: int64

In [11]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
       
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary['Third Value'] = df.loc[2].values

    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2) 

    return summary
    

In [18]:
df_train = df_train.reset_index()
resumetable(df_train)

Dataset Shape: (10000, 109)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,level_0,int64,0,10000,0.000000e+00,1.000000e+00,2.000000e+00,13.29
1,index,int64,0,10000,7.838000e+03,6.186100e+04,1.730690e+05,13.29
2,TransactionID,int64,0,10000,2.994838e+06,3.048861e+06,3.160069e+06,13.29
3,isFraud,int64,0,2,0.000000e+00,0.000000e+00,0.000000e+00,0.23
4,TransactionDT,int64,0,9994,2.446300e+05,1.393351e+06,3.763076e+06,13.29
5,TransactionAmt,float64,0,1854,3.000000e+01,4.000000e+01,3.095000e+01,7.94
6,ProductCD,int64,0,5,4.000000e+00,1.000000e+00,4.000000e+00,1.28
7,card1,int64,0,2392,1.232100e+04,1.729300e+04,2.392000e+03,9.48
8,card2,float64,161,440,4.900000e+02,5.550000e+02,3.600000e+02,6.24
9,card3,float64,34,43,1.500000e+02,1.500000e+02,1.500000e+02,0.70
