In [None]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder


from tqdm import tqdm

import datetime
import math
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed=0):
    """Seed every random number source this notebook relies on.

    Parameters
    ----------
    seed : int, default 0
        Value handed to ``random.seed`` and ``np.random.seed`` and exported
        (as text) through the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
# Notebook-wide configuration: RNG seed, label column name and the calendar
# origin that TransactionDT second offsets are later added to.
TARGET = 'isFraud'
START_DATE = datetime.datetime(year=2017, month=11, day=30)
SEED = 42
seed_everything(SEED)

In [None]:
########################### DATA LOAD
#################################################################################
print('Load Data')
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
minified_dir = '../input/ieee-data-minification/'
train_df = pd.read_pickle(minified_dir + 'train_transaction.pkl')
test_df = pd.read_pickle(minified_dir + 'test_transaction.pkl')
train_identity = pd.read_pickle(minified_dir + 'train_identity.pkl')
test_identity = pd.read_pickle(minified_dir + 'test_identity.pkl')

# Column names of the raw training tables (transaction first, then identity).
base_columns = [*train_df.columns, *train_identity.columns]
print('Shape control:', train_df.shape, test_df.shape)

In [None]:
########################### Merge Identity columns
temp_df = train_df[['TransactionID']]
temp_df = temp_df.merge(train_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
train_df = pd.concat([train_df,temp_df], axis=1)
    
temp_df = test_df[['TransactionID']]
temp_df = temp_df.merge(test_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
test_df = pd.concat([test_df,temp_df], axis=1)

In [None]:
# Separate the label from the training features; the test frame is copied so
# the raw frames can be released right after.
X_test = test_df.copy()
X_train = train_df.drop(columns=['isFraud'])
y_train = train_df['isFraud'].copy()
print(X_train.shape, y_train.shape)

# Release the raw tables before feature engineering starts.
del train_identity, test_identity, train_df, test_df
x = gc.collect()

In [None]:
def frequency_encoding(train_df, test_df, columns, self_encoding=True):
    """Encode columns by how often each value occurs in train and test combined.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to encode. Both are modified in place and also returned.
    columns : iterable of str
        Names of columns present in both frames.
    self_encoding : bool, default True
        If True, each column is overwritten with its occurrence count.
        If False, the original column is kept and the counts are written to a
        new ``<col>_fq_enc`` column instead.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train_df`` and ``test_df`` objects that were passed in.
    """
    for col in columns:
        combined = pd.concat([train_df[col], test_df[col]])
        # dropna=False so that missing values receive a count of their own.
        fq_encode = combined.value_counts(dropna=False).to_dict()
        target_col = col if self_encoding else col + '_fq_enc'
        train_df[target_col] = train_df[col].map(fq_encode)
        test_df[target_col] = test_df[col].map(fq_encode)
    return train_df, test_df

In [None]:
def uid_aggregation(train_df, test_df, main_columns, uids, aggregations):
    """Add group-level aggregate features computed over train and test combined.

    For every combination of value column, grouping column and aggregation a
    new column named ``<uid>_<main_column>_<agg_type>`` is added to both
    frames, holding the aggregate of ``main_column`` within the row's group.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to extend. Both are modified in place and also returned.
    main_columns : iterable of str
        Columns whose values are aggregated.
    uids : iterable of str
        Columns used as grouping keys.
    aggregations : iterable of str
        Aggregation names understood by pandas ``groupby(...).agg``
        (for example ``'mean'`` or ``'std'``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train_df`` and ``test_df`` objects that were passed in.
    """
    for main_column in main_columns:
        for col in uids:
            # The combined frame and its grouping do not depend on the
            # aggregation, so build them once per (uid, value column) pair.
            combined = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]])
            grouped = combined.groupby(col)[main_column]
            for agg_type in aggregations:
                new_col_name = col + '_' + main_column + '_' + agg_type
                lookup = grouped.agg(agg_type).to_dict()
                train_df[new_col_name] = train_df[col].map(lookup)
                test_df[new_col_name] = test_df[col].map(lookup)
    return train_df, test_df


### Lấy log cho các thuộc tính TransactionAmt, card1, card2

In [None]:
# Replace TransactionAmt, card1 and card2 with their natural logarithm in both
# frames. The columns are overwritten, so re-running this cell applies the
# logarithm a second time.
for frame in (X_train, X_test):
    for log_col in ('TransactionAmt', 'card1', 'card2'):
        frame[log_col] = np.log(frame[log_col])


### Xóa bỏ các thuộc tính D có số lượng missing values lớn (>=90%)

In [None]:
# Drop the listed D columns from both frames, printing each frame's shape
# before and after the drop.
cols = ['D' + str(number) for number in (6, 7, 8, 9, 12, 13, 14)]

print(X_train.shape)
X_train = X_train.drop(columns=cols)
print(X_train.shape)

print(X_test.shape)
X_test = X_test.drop(columns=cols)
print(X_test.shape)

### Normalize giá trị các thuộc tính P_emaildomain và R_emaildomain để tránh overfit cho mô hình, giúp mô hình tổng quát hóa tốt hơn

In [None]:
# Collapse the raw values of P_emaildomain and R_emaildomain into provider
# groups, bucket rare domains and label missing values, for train and test.

# Provider label -> raw domain values that are folded into it. The same table
# is used for both e-mail columns.
_EMAIL_PROVIDER_DOMAINS = {
    'Google': ['gmail.com', 'gmail'],
    'Yahoo Mail': ['yahoo.com', 'yahoo.com.mx', 'yahoo.co.uk',
                   'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
                   'yahoo.es', 'ymail.com'],
    'Microsoft': ['hotmail.com', 'outlook.com', 'msn.com', 'live.com.mx',
                  'hotmail.es', 'hotmail.co.uk', 'hotmail.de',
                  'outlook.es', 'live.com', 'live.fr',
                  'hotmail.fr'],
}


def _normalize_emaildomain(frame, column, rare_max_count=500):
    """Normalize one e-mail domain column of ``frame`` in place.

    Steps, in order:
      1. fold known domains into their provider label;
      2. replace every value occurring at most ``rare_max_count`` times in
         this frame (counted after step 1) with ``"Others"``;
      3. replace missing values with ``"NoInf"``.
    """
    for provider, domains in _EMAIL_PROVIDER_DOMAINS.items():
        frame.loc[frame[column].isin(domains), column] = provider

    # value_counts() skips missing values, so NaN is never bucketed as "Others".
    counts = frame[column].value_counts()
    rare_domains = counts[counts <= rare_max_count].index
    frame.loc[frame[column].isin(rare_domains), column] = "Others"

    # Assign the result back instead of calling fillna(inplace=True) on the
    # attribute-accessed column, which is not guaranteed to update the frame.
    frame[column] = frame[column].fillna("NoInf")


# NOTE(review): the rarity threshold is evaluated separately for train and
# test, so a domain can be kept in one frame and become "Others" in the other.
for _frame in (X_train, X_test):
    for _column in ('P_emaildomain', 'R_emaildomain'):
        _normalize_emaildomain(_frame, _column)

### Xóa bỏ các cột V redundancy

In [None]:
# Numbers of the V columns to keep; every other column in V1..V339 is dropped
# from both frames.
v_groups = [
    [1, 3, 4, 6, 8, 11],
    [13, 14, 17, 20, 23, 26, 27, 30],
    [36, 37, 40, 41, 44, 47, 48],
    [54, 56, 59, 62, 65, 67, 68, 70],
    [76, 78, 80, 82, 86, 88, 89, 91],
    [96, 98, 99, 104],
    [107, 108, 111, 115, 117, 120, 121, 123],
    [124, 127, 129, 130, 136],
    [138, 139, 142, 147, 156, 162],
    [165, 160, 166],
    [178, 176, 173, 182],
    [187, 203, 205, 207, 215],
    [169, 171, 175, 180, 185, 188, 198, 210, 209],
    [218, 223, 224, 226, 228, 229, 235],
    [240, 258, 257, 253, 252, 260, 261],
    [264, 266, 267, 274, 277],
    [220, 221, 234, 238, 250, 271],
    [294, 284, 285, 286, 291, 297],
    [303, 305, 307, 309, 310, 320],
    [281, 283, 289, 296, 301, 314],
    [332, 325, 335, 338],
]
v = [number for group in v_groups for number in group]

kept_numbers = set(v)
v_remove = ['V' + str(item) for item in range(1, 340) if item not in kept_numbers]

print(X_train.shape)
X_train = X_train.drop(columns=v_remove)
print(X_train.shape)

print(X_test.shape)
X_test = X_test.drop(columns=v_remove)
print(X_test.shape)

### Đặt NaN cho các giá trị card1 hiếm (xuất hiện <= 2 lần trong train + test) hoặc không xuất hiện ở cả train lẫn test

In [None]:
########################## Reset values for "noise" card1
i_cols = ['card1']

for col in i_cols: 
    valid_card = pd.concat([X_train[[col]], X_test[[col]]])
    valid_card = valid_card[col].value_counts()
    valid_card = valid_card[valid_card>2]
    valid_card = list(valid_card.index)

    X_train[col] = np.where(X_train[col].isin(X_test[col]), X_train[col], np.nan)
    X_test[col]  = np.where(X_test[col].isin(X_train[col]), X_test[col], np.nan)

    X_train[col] = np.where(X_train[col].isin(valid_card), X_train[col], np.nan)
    X_test[col]  = np.where(X_test[col].isin(valid_card), X_test[col], np.nan)

### Tạo ra thuộc tính mới uid

In [None]:
# uid = card1_card2_card3_addr1, every part rendered as text and joined with '_'.
uid_parts = ['card1', 'card2', 'card3', 'addr1']
for frame in (X_train, X_test):
    uid_text = frame[uid_parts[0]].astype(str)
    for part in uid_parts[1:]:
        uid_text = uid_text + '_' + frame[part].astype(str)
    frame['uid'] = uid_text

In [None]:
# Value columns to aggregate per uid in the following aggregation cell.
i_cols = ['TransactionAmt']

### Group các transaction theo uid và tạo thuộc tính mean, std của TransactionAmt cho từng group

In [None]:
# Add the per-uid mean and standard deviation of every column in i_cols,
# computed over train and test together.
aggregations = ['mean', 'std']
X_train, X_test = uid_aggregation(
    train_df=X_train,
    test_df=X_test,
    main_columns=i_cols,
    uids=['uid'],
    aggregations=aggregations,
)
# uid was only a grouping key; remove it from the training features.
X_train = X_train.drop(columns=['uid'])

### Drop thuộc tính uid để tránh hiện tượng overfitting

In [None]:
# Remove the uid grouping key from the test features as well.
X_test = X_test.drop(columns=['uid'])

### Frequency encoding đối với một số thuộc tính

In [None]:
# Replace these columns with the occurrence count of each value over
# train and test combined.
frequency_encoded_cols = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']
X_train, X_test = frequency_encoding(X_train, X_test, frequency_encoded_cols)

### Label encoding với các thuộc tính object còn lại

In [None]:
# Integer-encode every column that is object-typed in either frame. Each
# encoder is fitted on the train and test values together so both frames
# share the same code table.
object_cols = [
    name for name in X_train.columns
    if X_train[name].dtype == 'object' or X_test[name].dtype == 'object'
]
for i in object_cols:
    train_values = list(X_train[i].values)
    test_values = list(X_test[i].values)
    lbl = LabelEncoder()
    lbl.fit(train_values + test_values)
    X_train[i] = lbl.transform(train_values)
    X_test[i] = lbl.transform(test_values)

### Fill NA

In [None]:
# Use -999 as the sentinel for every remaining missing value.
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

In [None]:
## The training set is divided into time blocks for cross-validation:
## TransactionDT (seconds added to START_DATE) is converted to a month index,
## counted as 12 * (years since 2017) + calendar month.
train_df = X_train.copy()
transaction_time = train_df['TransactionDT'].map(
    lambda seconds: START_DATE + datetime.timedelta(seconds=seconds)
)
train_df['groups'] = (transaction_time.dt.year - 2017) * 12 + transaction_time.dt.month

split_groups = train_df['groups']

### GroupKFold Cross Validation

In [None]:
from sklearn.model_selection import KFold,TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from xgboost import plot_importance
from sklearn.metrics import make_scorer
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
import xgboost as xgb

import time
def objective(params):
    """Hyperopt objective: negative mean ROC AUC over month-grouped CV folds.

    Parameters
    ----------
    params : dict
        One sample from the search space, keyed by hyper-parameter name.

    Returns
    -------
    float
        ``-(mean validation ROC AUC)`` so that minimising the objective
        maximises the AUC.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train`` and ``split_groups``.
    """
    time1 = time.time()
    # Keep hyper-parameters numeric (rounded for readable logs) instead of
    # formatting them into strings.
    # NOTE(review): num_leaves, min_child_samples, feature_fraction and
    # bagging_fraction look like LightGBM parameter names -- confirm that
    # XGBoost actually uses them.
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': round(params['gamma'], 3),
        'subsample': round(params['subsample'], 2),
        'reg_alpha': round(params['reg_alpha'], 3),
        'reg_lambda': round(params['reg_lambda'], 3),
        'learning_rate': round(params['learning_rate'], 3),
        'num_leaves': round(params['num_leaves'], 3),
        'colsample_bytree': round(params['colsample_bytree'], 3),
        'min_child_samples': round(params['min_child_samples'], 3),
        'feature_fraction': round(params['feature_fraction'], 3),
        'bagging_fraction': round(params['bagging_fraction'], 3)
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    N_SPLITS = 6
    # Folds are grouped by month index so a month never straddles train and
    # validation.
    folds = GroupKFold(n_splits=N_SPLITS)
    score_mean = 0

    fold_splits = folds.split(X_train, y_train, groups=split_groups)
    for count, (tr_idx, val_idx) in enumerate(fold_splits, start=1):
        # NOTE(review): `verbose` is kept from the original call; confirm it is
        # a recognised XGBClassifier argument (the documented one is `verbosity`).
        clf = xgb.XGBClassifier(
            n_estimators=600, random_state=4, verbose=True,
            tree_method='gpu_hist',
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        clf.fit(X_tr, y_tr)
        # Score on the positive-class probability directly; this avoids
        # make_scorer(needs_proba=True), which newer scikit-learn dropped.
        score = roc_auc_score(y_vl, clf.predict_proba(X_vl)[:, 1])
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')

    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    # Drop the last fold's objects before collecting so they can be freed.
    del X_tr, X_vl, y_tr, y_vl, clf, score
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / N_SPLITS}')
    return -(score_mean / N_SPLITS)


In [None]:
# Hyperopt search space; objective() reads every entry back by its key.
space = {
    # Maximum tree depth, sampled in steps of 1 between 7 and 23
    # (objective() casts the sampled value to int). Deeper trees can learn
    # relations specific to individual samples, i.e. overfit more easily.
    'max_depth': hp.quniform('max_depth', 7, 23, 1),

    # reg_alpha: L1 regularization term, sampled uniformly from [0.01, 0.4].
    # L1 encourages sparsity (pulls weights towards 0).
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),

    # reg_lambda: L2 regularization term, sampled uniformly from [0.01, 0.4].
    # L2 encourages smaller weights rather than zeroing them.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),

    # learning_rate (eta): shrinks the contribution of each boosting step,
    # sampled uniformly from [0.01, 0.2].
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),

    # colsample_bytree: fraction of columns randomly sampled for each tree,
    # sampled uniformly from [0.3, 0.9].
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),

    # gamma: minimum loss reduction required to make a split; larger values
    # make the algorithm more conservative. Sampled uniformly from [0.01, 0.7].
    'gamma': hp.uniform('gamma', 0.01, .7),

    # num_leaves: number of leaf nodes, chosen from 20, 30, ..., 240.
    # More leaves increase capacity but also the risk of overfitting.
    # NOTE(review): this looks like a LightGBM parameter name -- confirm
    # XGBoost uses it.
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),

    # min_child_samples: minimum number of samples per leaf, chosen from
    # 100, 110, ..., 240. Larger values reduce overfitting but may underfit.
    # NOTE(review): this looks like a LightGBM parameter name -- confirm
    # XGBoost uses it.
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),

    # subsample: fraction of rows considered when building each tree,
    # chosen from the listed values.
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),

    # feature_fraction: fraction of features randomly selected for training,
    # sampled uniformly from [0.4, 0.8]. Smaller fractions reduce overfitting.
    # NOTE(review): this looks like a LightGBM parameter name -- confirm
    # XGBoost uses it.
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),

    # bagging_fraction: fraction of the training data randomly bagged,
    # sampled uniformly from [0.4, 0.9].
    # NOTE(review): this looks like a LightGBM parameter name; in LightGBM
    # bagging also needs bagging_freq, which is not part of this space.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}

In [None]:
# # Set algoritm parameters
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=27)

# # Print best parameters
# best_params = space_eval(space, best)

### Optimized hyper parameters

In [None]:
# Tuned hyper-parameters, stored as numbers instead of formatted strings so the
# model receives values of the expected type.
# NOTE(review): num_leaves, min_child_samples, feature_fraction and
# bagging_fraction look like LightGBM parameter names -- confirm that XGBoost
# uses them.
best_params = {
    'max_depth': 12,
    'gamma': 0.338,
    'subsample': 0.80,
    'reg_alpha': 0.199,
    'reg_lambda': 0.125,
    'learning_rate': 0.023,
    'num_leaves': 70,
    'colsample_bytree': 0.571,
    'min_child_samples': 140,
    'feature_fraction': 0.410,
    'bagging_fraction': 0.828,
}
print("best_params: ", best_params)

### Predict

In [None]:
import xgboost as xgb

print(best_params)

# Final model: 600 trees with the tuned hyper-parameters, trained on the whole
# training set.
final_model_kwargs = {'n_estimators': 600, **best_params, 'tree_method': 'gpu_hist'}
clf = xgb.XGBClassifier(**final_model_kwargs)
clf.fit(X_train, y_train)

# Second predict_proba column = probability of the positive class.
y_preds = clf.predict_proba(X_test)[:, 1]

In [None]:
# Load the submission template whose isFraud column is filled in below.
submission_template_path = '../input/ieee-fraud-detection/sample_submission.csv'
sample_submission = pd.read_csv(submission_template_path)

In [None]:
# Reuse the probabilities already stored in y_preds (computed right after the
# final fit) instead of running a second full prediction pass over X_test.
sample_submission['isFraud'] = y_preds
sample_submission.to_csv('simple_xgboost.csv', index=False)