In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import gc
import time

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import plot_tree
from sklearn import metrics   
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
import xgboost as xgb


from sklearn.metrics import roc_auc_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reduce memory for 13GB limited memory in Kaggle
# Code Reference: https://www.kaggle.com/sbunzini/reduce-memory-usage-by-75
def reduce_memory_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits
        are stored as float16. float16 keeps only about three significant
        decimal digits, so pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    * Signed and unsigned integer columns go to the smallest signed integer
      type that holds their min/max (bounds inclusive) and is strictly
      narrower than the current dtype.
    * Object / string columns become ``category``.
    * Bool, datetime and other non-numeric columns, as well as extension
      dtypes (e.g. an existing ``category``), are left unchanged.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes have no numpy ``kind``; only string ones are converted.
        if not isinstance(col_type, np.dtype):
            if isinstance(col_type, pd.StringDtype):
                df[col] = df[col].astype('category')
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        # Only real integers and floats are downcast; bool/datetime/etc. would
        # otherwise be turned into floats by a range comparison.
        if col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    # Never "downcast" to a type that is not actually smaller.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        else:
            # An all-NaN column yields NaN bounds, every comparison is False
            # and the column is left as it is.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    reduction_pct = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction_pct} % ")
    return df

In [None]:
# Load the train and test transaction tables of the ieee-fraud-detection
# dataset from the read-only Kaggle input directory.
train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')

In [None]:
# Load the train and test identity tables; they are joined to the
# transaction tables on TransactionID further down.
train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')

In [None]:
# Give the test identity table the same column names as the train identity
# table (train uses id_01, id_02, ...; test uses id-01, id-02, ...).
# NOTE(review): this is a positional rename, so it assumes both files list
# their columns in the same order — confirm.
id_cols = train_identity.columns
test_identity.columns = id_cols

In [None]:
# Left-join the identity columns onto every transaction row; transactions
# without an identity record keep NaN in the identity columns.
train_transaction = train_transaction.merge(train_identity, on='TransactionID', how='left')
test_transaction = test_transaction.merge(test_identity, on='TransactionID', how='left')

In [None]:
# Remove the raw timestamp and the row identifier from the training features.
# `columns=` is used because passing the axis positionally (`drop(name, 1)`)
# raises a TypeError on pandas 2.x.
train_transaction = train_transaction.drop(columns=['TransactionDT', 'TransactionID'])

# Keep the test TransactionIDs for the submission file before dropping them.
test_id = test_transaction['TransactionID']
test_transaction = test_transaction.drop(columns=['TransactionDT', 'TransactionID'])

# Split the target off the training frame.
train_label = train_transaction['isFraud']
train_transaction = train_transaction.drop(columns=['isFraud'])

# Part 1: Data Processing for training set
## ProductCD, card1 - card6, addr1 - addr2, M1 - M9, D1 - D15, C1 - C14, TransactionAmt, dist1, dist2 

In [None]:
# Column groups of the transaction table, used throughout the preprocessing.
category_cols = (
    ['ProductCD']
    + [f'card{i}' for i in range(1, 7)]      # card1 .. card6
    + ['addr1', 'addr2']
    + [f'M{i}' for i in range(1, 10)]        # M1 .. M9
)
time_delta_cols = [f'D{i}' for i in range(1, 16)]   # D1 .. D15
counter_cols = [f'C{i}' for i in range(1, 15)]      # C1 .. C14
numeric_cols = ['TransactionAmt', 'dist1', 'dist2']
label_col = ['isFraud']
# The e-mail domain columns (P_emaildomain, R_emaildomain) are handled separately.

In [None]:
# Fill missing values in the training frame: 0 for the time-delta and counter
# columns, the column mean for the numeric columns.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# acts on a temporary Series and is not guaranteed to modify the frame
# (it does not under pandas Copy-on-Write).
for col in time_delta_cols:
    train_transaction[col] = train_transaction[col].fillna(0)

for col in counter_cols:
    train_transaction[col] = train_transaction[col].fillna(0)

for col in numeric_cols:
    train_transaction[col] = train_transaction[col].fillna(train_transaction[col].mean())

In [None]:
# Row-wise total of the D1..D15 time-delta columns, accumulated left to right.
d_total = train_transaction[time_delta_cols[0]]
for d_col in time_delta_cols[1:]:
    d_total = d_total + train_transaction[d_col]
train_transaction['Dsum'] = d_total

In [None]:
# Bucket the transaction amount into five right-closed ranges and encode the
# bucket label as an integer.
# NOTE(review): LabelEncoder numbers the labels in sorted (lexicographic)
# order, so the resulting codes do not follow the numeric order of the bins.
amt_bin_edges = [0, 50, 100, 150, 200, 100000]
amt_bin_labels = ['<=50', '>50&<=100', '>100&<=150', '>150&<=200', '>200']
train_transaction['amt_cat'] = pd.cut(
    train_transaction['TransactionAmt'],
    bins=amt_bin_edges,
    labels=amt_bin_labels,
)
labelencoder = LabelEncoder()
train_transaction['amt_cat'] = labelencoder.fit_transform(train_transaction['amt_cat'])

In [None]:
# Row-wise total of the C1..C14 counter columns, accumulated left to right.
c_total = train_transaction[counter_cols[0]]
for c_col in counter_cols[1:]:
    c_total = c_total + train_transaction[c_col]
train_transaction['Csum'] = c_total

In [None]:
# Cast every categorical column to string so it can be label-encoded later.
train_transaction[category_cols] = train_transaction[category_cols].astype('str')

In [None]:
# Cast both e-mail domain columns to string before they are compared.
for email_col in ('P_emaildomain', 'R_emaildomain'):
    train_transaction[email_col] = train_transaction[email_col].astype('str')

In [None]:
# Flag rows whose purchaser and recipient e-mail domains differ.
train_transaction['P_neq_R'] = train_transaction['P_emaildomain'].ne(
    train_transaction['R_emaildomain']
)

In [None]:
# Show how many rows have differing vs. identical e-mail domains.
train_transaction['P_neq_R'].value_counts()

In [None]:
# The raw e-mail domains are no longer needed once P_neq_R has been derived.
# `columns=` is used because passing the axis positionally (`drop(name, 1)`)
# raises a TypeError on pandas 2.x.
train_transaction = train_transaction.drop(columns=['P_emaildomain', 'R_emaildomain'])

In [None]:
# Replace every categorical column, plus the P_neq_R flag, by integer codes.
# The encoder is re-fitted for each column, so only the last fit is retained.
labelencoder = LabelEncoder()
for cat_col in category_cols + ['P_neq_R']:
    train_transaction[cat_col] = labelencoder.fit_transform(train_transaction[cat_col])

In [None]:
# Convert the M flag columns (all except M4) to integers and add them up.
# NOTE(review): M1..M9 are part of category_cols, which an earlier cell
# label-encodes to integers, so the 'T'/'F'/'None' replacements below appear
# to be no-ops and Msum adds up label codes — confirm this is intended.
m_flag_cols = ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']
for m_col in m_flag_cols:
    train_transaction[m_col] = (
        train_transaction[m_col]
        .replace('T', '1')
        .replace('F', '0')
        .replace('None', '0')
        .fillna('0')
        .astype(int)
    )

m_total = train_transaction[m_flag_cols[0]]
for m_col in m_flag_cols[1:]:
    m_total = m_total + train_transaction[m_col]
train_transaction['Msum'] = m_total

# Combined distance feature.
train_transaction['distSum'] = train_transaction['dist1'] + train_transaction['dist2']

In [None]:
# Rescale the numeric transaction features to [0, 1]. The scaler is re-fitted
# for every group, so only the last fit (distSum) is retained afterwards.
minmax_scaler = MinMaxScaler()
scaled_groups = (
    time_delta_cols,
    counter_cols,
    ['Csum'],
    ['Dsum'],
    ['Msum'],
    ['TransactionAmt'],
    ['dist1'],
    ['dist2'],
    ['distSum'],
)
for group_cols in scaled_groups:
    train_transaction[group_cols] = minmax_scaler.fit_transform(train_transaction[group_cols])

In [None]:
# All transaction-level feature names: raw column groups followed by the
# engineered columns.
features_cols = [
    *time_delta_cols,
    *counter_cols,
    *numeric_cols,
    *category_cols,
    'distSum',
    'Msum',
    'Dsum',
    'Csum',
    'amt_cat',
    'P_neq_R',
]

In [None]:
# Default size (in inches) of the feature-importance bar charts, and a font size.
fig_width, fig_height = 35, 8
font_size = 15

# Part 2 Data Processing for training set
## V columns (V1 - V339) data processing

In [None]:
# Names of all columns whose name begins with 'V'.
vcol = [name for name in train_transaction.columns if name[0] == 'V']

In [None]:
## V-columns with <= CATEGORY_COUNT distinct values are treated as categorical
CATEGORY_COUNT = 50

def find_categorical_v(df, v_cols=None, max_categories=None):
    """Split the V-columns of ``df`` into low- and high-cardinality groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. Earlier versions ignored this
        argument and always read the global training frame.
    v_cols : list of str, optional
        Columns to inspect. Defaults to every column of ``df`` whose name
        starts with 'V'.
    max_categories : int, optional
        Largest number of distinct values (NaN counted as one value) for a
        column to count as categorical. Defaults to ``CATEGORY_COUNT``.

    Returns
    -------
    (cat_v, num_v) : tuple of lists
        Each list holds ``[column_name, distinct_count]`` pairs, in the order
        the columns were inspected.
    """
    if v_cols is None:
        v_cols = [c for c in df.columns if str(c)[:1] == 'V']
    if max_categories is None:
        max_categories = CATEGORY_COUNT

    num_v = []
    cat_v = []
    for v in v_cols:
        cnt = int(df[v].nunique(dropna=False))
        if cnt <= max_categories:
            cat_v.append([v, cnt])
        else:
            num_v.append([v, cnt])
    return cat_v, num_v

In [None]:
# Split the V-columns into categorical (few distinct values) and numeric ones.
cat_v, num_v = find_categorical_v(train_transaction)
arr_cat_v = np.array(cat_v)
arr_num_v = np.array(num_v)

# Build the summary frames straight from the [name, count] pairs: this keeps
# the counts as integers and also works when one of the two groups is empty
# (indexing `arr[:, 0]` on an empty array raises IndexError).
catv_df = pd.DataFrame(cat_v, columns=['vcolname', 'catcnt'])
numv_df = pd.DataFrame(num_v, columns=['vcolname', 'catcnt'])
num_v_cols = numv_df['vcolname'].tolist()
cat_v_cols = catv_df['vcolname'].tolist()

In [None]:
# Fill missing V values: column mean for the numeric V-columns, 0 for the
# categorical ones. The result is assigned back explicitly because
# `df[col].fillna(..., inplace=True)` acts on a temporary Series and is not
# guaranteed to modify the frame (it does not under pandas Copy-on-Write).
for col in num_v_cols:
    train_transaction[col] = train_transaction[col].fillna(train_transaction[col].mean())

for col in cat_v_cols:
    train_transaction[col] = train_transaction[col].fillna(0)

# Part 3 Data Processing
## Identity Columns: id_01 - id_38, DeviceType, DeviceInfo

In [None]:
# Distribution of DeviceType values, with missing values counted as well.
train_transaction['DeviceType'].value_counts(dropna = False)

In [None]:
# Distribution of DeviceInfo values, with missing values counted as well.
train_transaction['DeviceInfo'].value_counts(dropna = False)

In [None]:
# Identity columns treated as categorical: id_12 .. id_38 plus the device columns.
id_categorial_cols = [f'id_{i}' for i in range(12, 39)] + ['DeviceType', 'DeviceInfo']

# Identity columns treated as numeric: id_01 .. id_11.
id_numeric_cols = [f'id_{i:02d}' for i in range(1, 12)]

# Fill missing values: column mean for numeric ids, the literal 'na' for
# categorical ones. The result is assigned back explicitly because
# `df[col].fillna(..., inplace=True)` acts on a temporary Series and is not
# guaranteed to modify the frame (it does not under pandas Copy-on-Write).
for col in id_numeric_cols:
    train_transaction[col] = train_transaction[col].fillna(train_transaction[col].mean())

for col in id_categorial_cols:
    train_transaction[col] = train_transaction[col].fillna('na')

In [None]:
# Collapse id_31 into a handful of browser families. Keywords are applied in
# order using a case-insensitive substring match; anything left is 'other'.
# NOTE(review): these are plain substring tests, so the short keyword 'ie'
# also matches any value that merely contains those two letters
# (e.g. a string containing "webview").
browser_keywords = ['samsung', 'chrome', 'ie', 'edge', 'firefox', 'opera', 'safari']
for keyword in browser_keywords:
    keyword_hits = train_transaction['id_31'].str.contains(keyword, case=False)
    train_transaction.loc[keyword_hits, 'id_31'] = keyword

unmatched = ~train_transaction['id_31'].isin(browser_keywords)
train_transaction.loc[unmatched, 'id_31'] = 'other'

In [None]:
# Collapse DeviceInfo into four platform families. Keywords are applied in
# order using a case-insensitive substring match; anything left is 'other'.
device_keywords = ['iOS', 'Trident', 'Windows', 'MacOS']
for keyword in device_keywords:
    keyword_hits = train_transaction['DeviceInfo'].str.contains(keyword, case=False)
    train_transaction.loc[keyword_hits, 'DeviceInfo'] = keyword

unmatched = ~train_transaction['DeviceInfo'].isin(device_keywords)
train_transaction.loc[unmatched, 'DeviceInfo'] = 'other'

In [None]:
# Check the size of each DeviceInfo bucket after the consolidation.
train_transaction['DeviceInfo'].value_counts()

In [None]:
# Integer-encode the categorical identity columns (values are compared as text).
labelencoder = LabelEncoder()
for id_col in id_categorial_cols:
    id_as_text = train_transaction[id_col].astype(str)
    train_transaction[id_col] = labelencoder.fit_transform(id_as_text)

In [None]:
# Rescale the numeric identity columns to [0, 1].
minmax_scaler = MinMaxScaler()
scaled_id_values = minmax_scaler.fit_transform(train_transaction[id_numeric_cols])
train_transaction[id_numeric_cols] = scaled_id_values

In [None]:
# All identity feature names. This rebinds `id_cols`, which earlier held the
# column index of the identity table.
id_cols = id_categorial_cols + id_numeric_cols

In [None]:
# Sanity check: selecting every feature name raises a KeyError if one is
# missing from the frame; otherwise the selected column index is displayed.
train_transaction[features_cols+vcol+id_cols].columns

# Features Selection

In [None]:
# Full list of model input columns: transaction features, V-columns, identity features.
all_cols = features_cols+vcol+id_cols

In [None]:
# Total number of candidate features.
len(all_cols)

In [None]:
def lg_modelfit(dtrain, target):
    """Fit a class-weight-balanced logistic regression and return it.

    dtrain is the feature matrix, target the labels; all other
    LogisticRegression settings are left at their defaults.
    """
    classifier = LogisticRegression(class_weight="balanced")
    return classifier.fit(dtrain, target)

def xgb_modelfit(dtrain, target):
    """Fit an XGBClassifier with the notebook's fixed hyper-parameters and return it.

    dtrain is the feature matrix, target the binary labels.
    """
    xgb_params = dict(
        learning_rate=0.01,
        n_estimators=1000,
        max_depth=8,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
        verbosity=0,
        # NOTE(review): 'gpu_hist' presumably requires a GPU runtime — confirm.
        tree_method='gpu_hist',
    )
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(dtrain, target)
    return model

def adaboost_modelfit(dtrain, target):
    """Fit an AdaBoostClassifier (1000 estimators, learning rate 0.1) and return it.

    dtrain is the feature matrix, target the labels.
    """
    booster = AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
    booster.fit(dtrain, target)
    return booster

def get_Importances(importances, cols):
    """Map feature names to scores.

    The i-th entry of ``importances`` is stored under ``cols[i]``; extra
    names at the end of ``cols`` are ignored.
    """
    return {cols[position]: score for position, score in enumerate(importances)}

In [None]:
import csv
def write_csv_kaggle_sub(fname, y_pred):
    """Write a submission CSV with columns TransactionID and isFraud.

    The IDs come from the module-level ``test_id``; the score is the second
    column of ``y_pred``. The file is written to ``fname`` without the index.
    """
    submission = pd.DataFrame()
    submission['TransactionID'] = test_id
    submission['isFraud'] = y_pred[:, 1]
    submission.to_csv(fname, index=False)

def write_feature_importances(fname, feature_score):
    """Write feature scores to ``fname`` as CSV rows ``name,score``.

    Rows are ordered by score, highest first. ``feature_score`` is a mapping
    from feature name to score.
    """
    ranked = sorted(feature_score.items(), key=lambda item: item[1], reverse=True)
    # newline='' is required by the csv module; without it platforms that
    # translate line endings emit a blank line after every row.
    with open(fname, 'w', newline='') as f:
        csv.writer(f).writerows(ranked)

def select_Top_features(feature_score, top_num):
    """Return the names of the ``top_num`` highest-scoring features, best first."""
    ranked_names = sorted(feature_score, key=feature_score.get, reverse=True)
    return ranked_names[:top_num]

def select_features_range(feature_score, ftop_num, ttop_num):
    """Return names and scores of the features ranked ``ftop_num`` .. ``ttop_num``.

    Features are ranked by score, highest first; the two bounds are used as
    a Python slice. The result is a ``(names, scores)`` pair of lists.
    """
    ranked = sorted(feature_score.items(), key=lambda pair: pair[1], reverse=True)
    names = [name for name, _ in ranked]
    scores = [score for _, score in ranked]
    return names[ftop_num:ttop_num], scores[ftop_num:ttop_num]

def select_features_byCol(feature_score, colList, ftop_num, ttop_num):
    """Like ``select_features_range`` but restricted to names in ``colList``.

    Features are ranked by score, highest first, filtered to those present
    in ``colList``, then sliced with ``[ftop_num:ttop_num]``. Returns a
    ``(names, scores)`` pair of lists.
    """
    ranked = sorted(feature_score.items(), key=lambda pair: pair[1], reverse=True)
    kept = [(name, score) for name, score in ranked if name in colList]
    names = [name for name, _ in kept]
    scores = [score for _, score in kept]
    return names[ftop_num:ttop_num], scores[ftop_num:ttop_num]

def features_plot(fscore, frange, trange, titlename):
    """Bar-chart the scores of the features ranked ``frange`` .. ``trange``.

    ``fscore`` maps feature name to score; ``titlename`` is the chart title.
    The figure size comes from the module-level fig_width / fig_height.
    """
    names, scores = select_features_range(fscore, frange, trange)
    positions = range(len(names))
    plt.figure(figsize=(fig_width, fig_height))
    plt.bar(positions, scores, align='center')
    plt.title(titlename)
    plt.xticks(positions, names)
    plt.show()

def features_plot_bycol(fscore, col_list, frange, trange, titlename):
    """Bar-chart ranked feature scores, restricted to the names in ``col_list``.

    ``fscore`` maps feature name to score; ``frange`` / ``trange`` slice the
    ranked, filtered list; ``titlename`` is the chart title. The figure size
    comes from the module-level fig_width / fig_height.
    """
    names, scores = select_features_byCol(fscore, col_list, frange, trange)
    positions = range(len(names))
    plt.figure(figsize=(fig_width, fig_height))
    plt.bar(positions, scores, align='center')
    plt.title(titlename)
    plt.xticks(positions, names)
    plt.show()

In [None]:
# Downcast the preprocessed training frame to smaller dtypes to save memory.
train_transaction = reduce_memory_usage(train_transaction)

In [None]:
# The identity table has already been merged into train_transaction; release it.
del train_identity
gc.collect()

### Find the weights of the following features by Logistic Regression
#### ProductCD, card1 - card6, addr1 - addr2, M1 - M9, D1 - D15, C1 - C14, TransactionAmt, dist1, dist2 
#### V1 - V339
#### id_01 - id_38, DeviceType, DeviceInfo

In [None]:
# Fit logistic regression on all features and score each feature by the
# magnitude of its coefficient. The absolute value is used because the
# ranking helpers sort scores in descending order: with signed coefficients
# a strongly negative (i.e. strongly predictive) feature would be ranked last.
lg_model = lg_modelfit(train_transaction[all_cols], train_label)
f_scores = get_Importances(np.abs(lg_model.coef_[0]), all_cols)
LR_all_feature_score = dict(f_scores)

In [None]:
# Save the logistic-regression feature scores (highest first), then release
# the fitted model.
write_feature_importances("LR_FeatureSelection.csv",LR_all_feature_score)
del lg_model
gc.collect()

In [None]:
# Up to 50 highest-scoring transaction-level features under logistic regression.
features_plot_bycol(LR_all_feature_score, features_cols, 0, 50,"Top 50 selected transaction cols in LR")

In [None]:
# Up to 50 highest-scoring V-columns under logistic regression.
features_plot_bycol(LR_all_feature_score, vcol, 0, 50,"Top 50 selected V's cols in LR")

In [None]:
# Up to 50 highest-scoring identity features under logistic regression.
features_plot_bycol(LR_all_feature_score, id_cols, 0, 50 ,"Top 50 selected ids' in LR")

In [None]:
# The 50 highest-scoring features overall under logistic regression.
features_plot(LR_all_feature_score, 0, 50 ,"Rank 0 - 50 selected features in LR")

### Find the importances of the following features by XGBClassifier
#### ProductCD, card1 - card6, addr1 - addr2, M1 - M9, D1 - D15, C1 - C14, TransactionAmt, dist1, dist2 
#### V1 - V339
#### id_01 - id_38, DeviceType, DeviceInfo


In [None]:
# Fit XGBoost on all features and record each feature's importance.
xgb_model = xgb_modelfit(train_transaction[all_cols], train_label)
f_scores = get_Importances(xgb_model.feature_importances_, all_cols)
XGB_all_feature_score = dict(f_scores)

In [None]:
# Save the XGBoost feature importances (highest first), then release the
# fitted model.
write_feature_importances("XGB_FeatureSelection.csv",XGB_all_feature_score)

del xgb_model
gc.collect()

In [None]:
# Up to 50 highest-scoring transaction-level features under XGBoost.
features_plot_bycol(XGB_all_feature_score, features_cols, 0, 50,"Top 50 selected transaction cols in XGB")

In [None]:
# Up to 50 highest-scoring V-columns under XGBoost.
features_plot_bycol(XGB_all_feature_score, vcol, 0, 50,"Top 50 selected V's cols in XGB")

In [None]:
# Up to 50 highest-scoring identity features under XGBoost.
features_plot_bycol(XGB_all_feature_score, id_cols, 0, 50,"Top 50 selected ids' in XGB")

In [None]:
# The 50 highest-scoring features overall under XGBoost.
features_plot(XGB_all_feature_score, 0, 50 ,"Rank 0 - 50 selected features in XGB")

### Find the importances of the following features by AdaBoostClassifier
#### ProductCD, card1 - card6, addr1 - addr2, M1 - M9, D1 - D15, C1 - C14, TransactionAmt, dist1, dist2 
#### V1 - V339
#### id_01 - id_38, DeviceType, DeviceInfo

In [None]:
# Fit AdaBoost on all features and record each feature's importance.
ada_model = adaboost_modelfit(train_transaction[all_cols], train_label)
f_scores = get_Importances(ada_model.feature_importances_, all_cols)
AdaBoost_all_feature_score = dict(f_scores)

In [None]:
# Save the AdaBoost feature importances (highest first), then release the
# fitted model.
write_feature_importances("AdaBoost_FeatureSelection.csv",AdaBoost_all_feature_score)

del ada_model
gc.collect()

In [None]:
# Up to 50 highest-scoring transaction-level features under AdaBoost.
features_plot_bycol(AdaBoost_all_feature_score, features_cols, 0, 50,"Top 50 selected transaction cols in AdaBoost")

In [None]:
# Up to 50 highest-scoring V-columns under AdaBoost.
features_plot_bycol(AdaBoost_all_feature_score, vcol, 0, 50,"Top 50 selected V's cols in AdaBoost")

In [None]:
# Up to 50 highest-scoring identity features under AdaBoost.
features_plot_bycol(AdaBoost_all_feature_score, id_cols, 0, 50,"Top 50 selected ids' in AdaBoost")

In [None]:
# The 50 highest-scoring features overall under AdaBoost.
features_plot(AdaBoost_all_feature_score, 0, 50 ,"Rank 0 - 50 selected features in AdaBoost")

# Model Building

### Test Data Processing

In [None]:
# Fill missing values in the test frame: 0 for the time-delta and counter
# columns, the column mean for the numeric columns.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# acts on a temporary Series and is not guaranteed to modify the frame
# (it does not under pandas Copy-on-Write).
# NOTE(review): the numeric columns are filled with the *test* mean, whereas
# the training frame was filled with the training mean.
for col in time_delta_cols:
    test_transaction[col] = test_transaction[col].fillna(0)

for col in counter_cols:
    test_transaction[col] = test_transaction[col].fillna(0)

for col in numeric_cols:
    test_transaction[col] = test_transaction[col].fillna(test_transaction[col].mean())

In [None]:
# Row-wise total of the D1..D15 time-delta columns (same order as for train).
d_total = test_transaction[time_delta_cols[0]]
for d_col in time_delta_cols[1:]:
    d_total = d_total + test_transaction[d_col]
test_transaction['Dsum'] = d_total

# Bucket the transaction amount exactly as for the training frame. A fresh
# encoder is created here so the cell does not depend on whichever
# `labelencoder` an earlier cell happened to leave behind.
test_transaction['amt_cat'] = pd.cut(
    test_transaction.TransactionAmt,
    bins=[0, 50, 100, 150, 200, 100000],
    labels=['<=50', '>50&<=100', '>100&<=150', '>150&<=200', '>200'],
)
labelencoder = LabelEncoder()
test_transaction['amt_cat'] = labelencoder.fit_transform(test_transaction['amt_cat'])

# Row-wise total of the C1..C14 counter columns (same order as for train).
c_total = test_transaction[counter_cols[0]]
for c_col in counter_cols[1:]:
    c_total = c_total + test_transaction[c_col]
test_transaction['Csum'] = c_total

test_transaction[category_cols] = test_transaction[category_cols].astype('str')

# Compare the e-mail domains as strings, as is done for the training frame.
# Without the cast, two missing domains compare as "different" (NaN != NaN),
# which gives P_neq_R a different meaning in train and test.
test_transaction['P_neq_R'] = (
    test_transaction['P_emaildomain'].astype('str')
    != test_transaction['R_emaildomain'].astype('str')
)

# NOTE(review): the encoder is re-fitted on the test values, so an integer
# code here is not guaranteed to denote the same category as in train.
labelencoder = LabelEncoder()
for col in category_cols:
    test_transaction[col] = labelencoder.fit_transform(test_transaction[col])

test_transaction['P_neq_R'] = labelencoder.fit_transform(test_transaction['P_neq_R'])

In [None]:
# Convert the M flag columns (all except M4) to integers and add them up.
# NOTE(review): M1..M9 are part of category_cols, which an earlier cell
# label-encodes to integers, so the 'T'/'F'/'None' replacements below appear
# to be no-ops and Msum adds up label codes — confirm this is intended.
m_flag_cols = ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']
for m_col in m_flag_cols:
    test_transaction[m_col] = (
        test_transaction[m_col]
        .replace('T', '1')
        .replace('F', '0')
        .replace('None', '0')
        .fillna('0')
        .astype(int)
    )

m_total = test_transaction[m_flag_cols[0]]
for m_col in m_flag_cols[1:]:
    m_total = m_total + test_transaction[m_col]
test_transaction['Msum'] = m_total

# Combined distance feature.
test_transaction['distSum'] = test_transaction['dist1'] + test_transaction['dist2']

In [None]:
# Rescale the numeric transaction features of the test frame to [0, 1].
# NOTE(review): the scaler is fitted on the test data itself, so the scaling
# is not the one that was applied to the training frame.
minmax_scaler = MinMaxScaler()
scaled_groups = (
    time_delta_cols,
    counter_cols,
    ['Csum'],
    ['Dsum'],
    ['Msum'],
    ['TransactionAmt'],
    ['dist1'],
    ['dist2'],
    ['distSum'],
)
for group_cols in scaled_groups:
    test_transaction[group_cols] = minmax_scaler.fit_transform(test_transaction[group_cols])

In [None]:
# Fill missing values in the V and identity columns of the test frame.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# acts on a temporary Series and is not guaranteed to modify the frame
# (it does not under pandas Copy-on-Write).
for col in num_v_cols:
    test_transaction[col] = test_transaction[col].fillna(test_transaction[col].mean())

for col in cat_v_cols:
    test_transaction[col] = test_transaction[col].fillna(0)

for col in id_numeric_cols:
    test_transaction[col] = test_transaction[col].fillna(test_transaction[col].mean())

for col in id_categorial_cols:
    test_transaction[col] = test_transaction[col].fillna('na')

# Collapse id_31 into browser families. Keywords are applied in order using a
# case-insensitive substring match; anything left is 'other'.
# NOTE(review): these are plain substring tests, so the short keyword 'ie'
# also matches any value that merely contains those two letters.
browser_keywords = ['samsung', 'chrome', 'ie', 'edge', 'firefox', 'opera', 'safari']
for keyword in browser_keywords:
    keyword_hits = test_transaction['id_31'].str.contains(keyword, case=False)
    test_transaction.loc[keyword_hits, 'id_31'] = keyword
test_transaction.loc[~test_transaction['id_31'].isin(browser_keywords), 'id_31'] = 'other'

# Collapse DeviceInfo into platform families in the same way.
device_keywords = ['iOS', 'Trident', 'Windows', 'MacOS']
for keyword in device_keywords:
    keyword_hits = test_transaction['DeviceInfo'].str.contains(keyword, case=False)
    test_transaction.loc[keyword_hits, 'DeviceInfo'] = keyword
test_transaction.loc[~test_transaction['DeviceInfo'].isin(device_keywords), 'DeviceInfo'] = 'other'

# Integer-encode the categorical identity columns.
# NOTE(review): the encoder is re-fitted on the test values, so codes are not
# guaranteed to match the ones assigned in the training frame.
labelencoder = LabelEncoder()
for col in id_categorial_cols:
    test_transaction[col] = labelencoder.fit_transform(test_transaction[col].astype(str))

# Rescale the numeric identity columns to [0, 1] (fitted on the test data).
minmax_scaler = MinMaxScaler()
test_transaction[id_numeric_cols] = minmax_scaler.fit_transform(test_transaction[id_numeric_cols])

### Select Top x Features for Logistic Regression, XGB, AdaBoost

### x = 50, 100, 150, 200

In [None]:
# Sizes of the "top x" feature subsets to evaluate.
num_features_selected_List = [
    50,
    100,
    150,
    200,
]

In [None]:
# For every feature-subset size: fit Logistic Regression on the selected LR
# features, plot ROC and precision-recall curves, and pass the test-set
# probabilities to write_csv_kaggle_sub.
for num_features_selected in num_features_selected_List:
    print("Number of features: " + str(num_features_selected))
    print("Running LR")

    start_time = time.time()
    # Select the columns once so train and test are sliced with the very same
    # list (same columns, same order).
    LR_cols = list(select_Top_features(LR_all_feature_score, num_features_selected))
    LR_train_set = train_transaction[LR_cols]
    LR_test_set = test_transaction[LR_cols]

    lg_model = lg_modelfit(LR_train_set, train_label)

    # Both curves are computed on the training data the model was fitted on,
    # so they are optimistic and not a validation score.
    Y_train_pred = lg_model.predict_proba(LR_train_set)
    fpr, tpr, _ = roc_curve(train_label, Y_train_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    lg_precision, lg_recall, _ = precision_recall_curve(train_label, Y_train_pred[:, 1])
    prc_auc = auc(lg_recall, lg_precision)

    fig, (roc_ax, prc_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

    # Left panel: ROC curve with the chance diagonal.
    roc_ax.plot(fpr, tpr, lw=2, alpha=0.3)
    roc_ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Area under ROC = %0.2f (Train set)' % (roc_auc))

    # Right panel: precision-recall curve.
    prc_ax.plot(lg_recall, lg_precision, marker='.')
    prc_ax.set_xlabel('Recall')
    prc_ax.set_ylabel('Precision')
    prc_ax.set_title('Area under PRC = %0.2f (Train set)' % (prc_auc))

    # Lay out only after labels and titles exist, so they are taken into account.
    fig.tight_layout()
    plt.show()

    y_pred = lg_model.predict_proba(LR_test_set)
    write_csv_kaggle_sub("LR_Submission_Selected2_" + str(num_features_selected) + ".csv", y_pred)

    print("--- %s seconds ---" % (time.time() - start_time))
    # Release the per-iteration frames and model before the next, larger subset.
    del LR_train_set
    del LR_test_set
    del lg_model
    del y_pred
    gc.collect()

In [None]:
# For every feature-subset size: fit AdaBoost on the selected AdaBoost
# features, plot ROC and precision-recall curves, and pass the test-set
# probabilities to write_csv_kaggle_sub.
for num_features_selected in num_features_selected_List:
    print("Number of features: " + str(num_features_selected))
    print("Running AdaBoost")
    start_time = time.time()
    # Select the columns once so train and test are sliced with the very same
    # list (same columns, same order).
    AdaBoost_cols = list(select_Top_features(AdaBoost_all_feature_score, num_features_selected))
    AdaBoost_train_set = train_transaction[AdaBoost_cols]
    AdaBoost_test_set = test_transaction[AdaBoost_cols]

    ada_model = adaboost_modelfit(AdaBoost_train_set, train_label)

    # Both curves are computed on the training data the model was fitted on,
    # so they are optimistic and not a validation score.
    Y_train_pred = ada_model.predict_proba(AdaBoost_train_set)
    fpr, tpr, _ = roc_curve(train_label, Y_train_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    ada_precision, ada_recall, _ = precision_recall_curve(train_label, Y_train_pred[:, 1])
    prc_auc = auc(ada_recall, ada_precision)

    fig, (roc_ax, prc_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

    # Left panel: ROC curve with the chance diagonal.
    roc_ax.plot(fpr, tpr, lw=2, alpha=0.3)
    roc_ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Area under ROC = %0.2f (Train set)' % (roc_auc))

    # Right panel: precision-recall curve.
    prc_ax.plot(ada_recall, ada_precision, marker='.')
    prc_ax.set_xlabel('Recall')
    prc_ax.set_ylabel('Precision')
    prc_ax.set_title('Area under PRC = %0.2f (Train set)' % (prc_auc))

    # Lay out only after labels and titles exist, so they are taken into account.
    fig.tight_layout()
    plt.show()

    y_pred = ada_model.predict_proba(AdaBoost_test_set)
    write_csv_kaggle_sub("AdaBoost_Submission_Selected2_" + str(num_features_selected) + ".csv", y_pred)

    print("--- %s seconds ---" % (time.time() - start_time))
    # Release the per-iteration frames and model before the next, larger subset.
    del AdaBoost_train_set
    del AdaBoost_test_set
    del ada_model
    del y_pred
    gc.collect()

### XGB Modeling

In [None]:
# For every feature-subset size: fit XGB on the selected XGB features, plot
# ROC and precision-recall curves, and pass the test-set probabilities to
# write_csv_kaggle_sub.
for num_features_selected in num_features_selected_List:
    print("Number of features: " + str(num_features_selected))
    print("Running XGB")
    start_time = time.time()
    # Select the columns once so train and test are sliced with the very same
    # list (same columns, same order).
    XGB_cols = list(select_Top_features(XGB_all_feature_score, num_features_selected))
    XGB_train_set = train_transaction[XGB_cols]
    XGB_test_set = test_transaction[XGB_cols]
    xgb_model = xgb_modelfit(XGB_train_set, train_label)

    # Both curves are computed on the training data the model was fitted on,
    # so they are optimistic and not a validation score.
    Y_train_pred = xgb_model.predict_proba(XGB_train_set)
    fpr, tpr, _ = roc_curve(train_label, Y_train_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    xgb_precision, xgb_recall, _ = precision_recall_curve(train_label, Y_train_pred[:, 1])
    prc_auc = auc(xgb_recall, xgb_precision)

    fig, (roc_ax, prc_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

    # Left panel: ROC curve with the chance diagonal.
    roc_ax.plot(fpr, tpr, lw=2, alpha=0.3)
    roc_ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Area under ROC = %0.2f (Train set)' % (roc_auc))

    # Right panel: precision-recall curve.
    prc_ax.plot(xgb_recall, xgb_precision, marker='.')
    prc_ax.set_xlabel('Recall')
    prc_ax.set_ylabel('Precision')
    prc_ax.set_title('Area under PRC = %0.2f (Train set)' % (prc_auc))

    # Lay out only after labels and titles exist, so they are taken into account.
    fig.tight_layout()
    plt.show()

    y_pred = xgb_model.predict_proba(XGB_test_set)
    write_csv_kaggle_sub("XGB_Submission_Selected2_" + str(num_features_selected) + ".csv", y_pred)
    print("--- %s seconds ---" % (time.time() - start_time))
    # Release the per-iteration frames and model before the next, larger subset.
    del XGB_train_set
    del XGB_test_set
    del xgb_model
    del y_pred
    gc.collect()

### XGB Modeling on the features common to the top XGB and AdaBoost features

In [None]:
# For each subset size, train XGB only on the features that appear in BOTH the
# XGB and the AdaBoost selections of that size, plot train-set curves, and pass
# the test-set probabilities to write_csv_kaggle_sub.
# NOTE(review): this rebinds the notebook-wide `num_features_selected_List`;
# any earlier cell re-run after this one will iterate over [100, 200].
num_features_selected_List = [100, 200]
for num_features_selected in num_features_selected_List:
    start_time = time.time()
    xgb_cols = select_Top_features(XGB_all_feature_score, num_features_selected)
    ada_cols = select_Top_features(AdaBoost_all_feature_score, num_features_selected)

    # Keep the order in which XGB returned the columns. A bare set intersection
    # iterates string names in a hash-dependent order that changes between
    # kernel sessions, which would make the model's input column order (and the
    # printed list) non-reproducible.
    ada_col_set = set(ada_cols)
    common_cols = [col for col in dict.fromkeys(xgb_cols) if col in ada_col_set]
    print("Number of features: " + str(len(common_cols)))
    print(common_cols)

    XGB_train_set = train_transaction[common_cols]
    XGB_test_set = test_transaction[common_cols]
    xgb_model = xgb_modelfit(XGB_train_set, train_label)

    # Both curves are computed on the training data the model was fitted on,
    # so they are optimistic and not a validation score.
    Y_train_pred = xgb_model.predict_proba(XGB_train_set)
    fpr, tpr, _ = roc_curve(train_label, Y_train_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    xgb_precision, xgb_recall, _ = precision_recall_curve(train_label, Y_train_pred[:, 1])
    prc_auc = auc(xgb_recall, xgb_precision)

    fig, (roc_ax, prc_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

    # Left panel: ROC curve with the chance diagonal.
    roc_ax.plot(fpr, tpr, lw=2, alpha=0.3)
    roc_ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Area under ROC = %0.2f (Train set)' % (roc_auc))

    # Right panel: precision-recall curve.
    prc_ax.plot(xgb_recall, xgb_precision, marker='.')
    prc_ax.set_xlabel('Recall')
    prc_ax.set_ylabel('Precision')
    prc_ax.set_title('Area under PRC = %0.2f (Train set)' % (prc_auc))

    # Lay out only after labels and titles exist, so they are taken into account.
    fig.tight_layout()
    plt.show()

    y_pred = xgb_model.predict_proba(XGB_test_set)

    # The file name carries the size of the intersection, not the requested subset size.
    write_csv_kaggle_sub("XGB_Submission_CommonSelected2_" + str(len(common_cols)) + ".csv", y_pred)
    print("--- %s seconds ---" % (time.time() - start_time))

    del XGB_train_set
    del XGB_test_set
    del xgb_model
    del y_pred
    gc.collect()

# Stratified 80-Fold training

In [None]:
# Shared configuration for the stratified K-fold cells: the fold count, the
# seeded shuffling splitter, and the frame that accumulates the fold-averaged
# test-set fraud probability per TransactionID.
kfold = 80
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)
skf_pd = pd.DataFrame()
skf_pd['TransactionID'] = test_id
# Explicit float accumulator: np.zeros_like(test_id) would inherit the dtype of
# the IDs and rely on an implicit upcast once probabilities are added to it.
skf_pd['isFraud'] = np.zeros(len(test_id), dtype=float)

In [None]:
# Stratified K-fold XGB on the features present in both the XGB and AdaBoost
# selections of size 100. Each fold model is scored on its held-out fold and
# its test-set probabilities are averaged into skf_pd['isFraud'].

# Start from a fresh float accumulator so that re-running this cell does not
# add onto predictions left over from a previous run.
skf_pd = pd.DataFrame()
skf_pd['TransactionID'] = test_id
skf_pd['isFraud'] = np.zeros(len(test_id), dtype=float)

xgb_cols = select_Top_features(XGB_all_feature_score, 100)
ada_cols = select_Top_features(AdaBoost_all_feature_score, 100)

# Keep the order in which XGB returned the columns. A bare set intersection
# iterates string names in a hash-dependent order that changes between kernel
# sessions, which would make the model's input column order non-reproducible.
ada_col_set = set(ada_cols)
common_cols = [col for col in dict.fromkeys(xgb_cols) if col in ada_col_set]
print("Number of features: " + str(len(common_cols)))
print(common_cols)

XGB_train_set = train_transaction[common_cols]
XGB_test_set = test_transaction[common_cols]

# Convert to arrays once instead of once per fold.
X_full = XGB_train_set.values
Y_label = train_label.values
X_test = XGB_test_set.values

total_roc_auc = 0
total_prc_auc = 0
fig, (roc_ax, prc_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

for train_index, valid_index in skf.split(XGB_train_set, train_label):
    X_train, X_valid = X_full[train_index], X_full[valid_index]
    Y_train, Y_valid = Y_label[train_index], Y_label[valid_index]

    xgb_model = xgb_modelfit(X_train, Y_train)
    Y_valid_pred = xgb_model.predict_proba(X_valid)

    # Held-out ROC for this fold.
    fpr, tpr, _ = roc_curve(Y_valid, Y_valid_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    total_roc_auc = total_roc_auc + roc_auc
    roc_ax.plot(fpr, tpr, lw=2, alpha=0.3)

    # Held-out precision-recall for this fold.
    xgb_precision, xgb_recall, _ = precision_recall_curve(Y_valid, Y_valid_pred[:, 1])
    prc_auc = auc(xgb_recall, xgb_precision)
    total_prc_auc = total_prc_auc + prc_auc
    prc_ax.plot(xgb_recall, xgb_precision, lw=2, alpha=0.3)

    # Predict on an array, matching the array input the fold model was fitted
    # on, and add this fold's share of the average.
    y_pred = xgb_model.predict_proba(X_test)
    skf_pd['isFraud'] += y_pred[:, 1] / kfold

roc_ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Mean area under ROC = %0.2f (%d fold)' % (total_roc_auc / kfold, kfold))

prc_ax.set_xlabel('Recall')
prc_ax.set_ylabel('Precision')
prc_ax.set_title('Mean area under PRC = %0.2f (%d fold)' % (total_prc_auc / kfold, kfold))

# Lay out only after labels and titles exist, so they are taken into account.
fig.tight_layout()
plt.show()

del XGB_train_set
del XGB_test_set
del X_full
del X_test
del xgb_model
del y_pred
gc.collect()

skf_pd.to_csv("XGB_StratifiedKFold" + str(len(common_cols)) + "_.csv", index=False)

In [None]:
# Stratified K-fold XGB on the features present in both the XGB and AdaBoost
# selections of size 200. Each fold model is scored on its held-out fold and
# its test-set probabilities are averaged into skf_pd['isFraud'].

# Fresh accumulator for this run. It is created as float explicitly:
# np.zeros_like(test_id) would inherit the dtype of the IDs and rely on an
# implicit upcast once probabilities are added to it.
skf_pd = pd.DataFrame()
skf_pd['TransactionID'] = test_id
skf_pd['isFraud'] = np.zeros(len(test_id), dtype=float)

xgb_cols = select_Top_features(XGB_all_feature_score, 200)
ada_cols = select_Top_features(AdaBoost_all_feature_score, 200)

# Keep the order in which XGB returned the columns. A bare set intersection
# iterates string names in a hash-dependent order that changes between kernel
# sessions, which would make the model's input column order non-reproducible.
ada_col_set = set(ada_cols)
common_cols = [col for col in dict.fromkeys(xgb_cols) if col in ada_col_set]
print("Number of features: " + str(len(common_cols)))
print(common_cols)

XGB_train_set = train_transaction[common_cols]
XGB_test_set = test_transaction[common_cols]

# Convert to arrays once instead of once per fold.
X_full = XGB_train_set.values
Y_label = train_label.values
X_test = XGB_test_set.values

total_roc_auc = 0
total_prc_auc = 0
fig, (roc_ax, prc_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

for train_index, valid_index in skf.split(XGB_train_set, train_label):
    X_train, X_valid = X_full[train_index], X_full[valid_index]
    Y_train, Y_valid = Y_label[train_index], Y_label[valid_index]

    xgb_model = xgb_modelfit(X_train, Y_train)
    Y_valid_pred = xgb_model.predict_proba(X_valid)

    # Held-out ROC for this fold.
    fpr, tpr, _ = roc_curve(Y_valid, Y_valid_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    total_roc_auc = total_roc_auc + roc_auc
    roc_ax.plot(fpr, tpr, lw=2, alpha=0.3)

    # Held-out precision-recall for this fold.
    xgb_precision, xgb_recall, _ = precision_recall_curve(Y_valid, Y_valid_pred[:, 1])
    prc_auc = auc(xgb_recall, xgb_precision)
    total_prc_auc = total_prc_auc + prc_auc
    prc_ax.plot(xgb_recall, xgb_precision, lw=2, alpha=0.3)

    # Predict on an array, matching the array input the fold model was fitted
    # on, and add this fold's share of the average.
    y_pred = xgb_model.predict_proba(X_test)
    skf_pd['isFraud'] += y_pred[:, 1] / kfold

roc_ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Mean area under ROC = %0.2f (%d fold)' % (total_roc_auc / kfold, kfold))

prc_ax.set_xlabel('Recall')
prc_ax.set_ylabel('Precision')
prc_ax.set_title('Mean area under PRC = %0.2f (%d fold)' % (total_prc_auc / kfold, kfold))

# Lay out only after labels and titles exist, so they are taken into account.
fig.tight_layout()
plt.show()

del XGB_train_set
del XGB_test_set
del X_full
del X_test
del xgb_model
del y_pred
gc.collect()

skf_pd.to_csv("XGB_StratifiedKFold" + str(len(common_cols)) + "_.csv", index=False)