# 0. Initialization Setting

## Library Setting

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import catboost as catb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.gridspec as gridspec
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session (keeps notebook output short,
# but also hides deprecation notices from pandas / xgboost).
warnings.filterwarnings("ignore")

import gc
# Make sure automatic garbage collection is switched on.
gc.enable()

import logging
# Root logger with its threshold lowered to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Standard plotly imports
# %pip install chart_studio
# import chart_studio.plotly as py
# import plotly.graph_objs as go
# import plotly.tools as tls
# from plotly.offline import iplot, init_notebook_mode
# import cufflinks
# import cufflinks as cf
# import plotly.figure_factory as ff

# #Using plotly + cufflinks in offline mode
# init_notebook_mode(connected=True)
# cufflinks.go_offline(connected=True)

## Parameter Setting

In [None]:
# When True, load_data reads every CSV with 'TransactionID' as the index column;
# when False, TransactionID stays a regular column and is set as the index after merging.
is_index_TransactionID = False

# Categorical encoding strategy.
# Listed options: one-hot-encoding / labelencoderlibrary / label-encoding
# NOTE(review): nothing visible in this notebook reads this option — confirm it is still needed.
label_encoding_option = 'LabelEncoderLibrary'

# 1. Load Data

## Load Data

In [None]:
# Directory holding the input CSV files. The trailing slash matters: load_data
# builds each file path by plain string concatenation.
data_path = "../input/ieee-fraud-detection/"

In [None]:
def load_data (data_path, is_index_TransactionID):
    """Read the four identity / transaction CSV files located under ``data_path``.

    Args:
        data_path: Path prefix prepended verbatim to each file name (so it must
            end with a path separator).
        is_index_TransactionID: When truthy, the 'TransactionID' column becomes
            the index of every returned frame; otherwise it stays a regular column.

    Returns:
        Tuple ``(train_identity, test_identity, train_transaction, test_transaction)``
        of DataFrames.
    """
    # index_col=None is the read_csv default, i.e. no column is promoted to the index.
    index_col = 'TransactionID' if is_index_TransactionID else None
    file_names = (
        "train_identity.csv",
        "test_identity.csv",
        "train_transaction.csv",
        "test_transaction.csv",
    )
    frames = [pd.read_csv(data_path + file_name, index_col=index_col) for file_name in file_names]
    return tuple(frames)

# Load the four raw tables using the path and index option configured above.
train_identity, test_identity, train_transaction, test_transaction = load_data(data_path, is_index_TransactionID)

## Reduce Memory Usage

In [None]:
# Print the column / dtype / memory summary of both transaction tables.
for transaction_frame in (train_transaction, test_transaction):
    transaction_frame.info()

- transaction의 데이터의 크기가 커서 메모리 사용량이 크므로 각 데이터에서 사용하는 숫자의 범위에 맞게 줄일 필요가 있음

In [None]:
## Memory Reducer Function
def reduce_memory_usage(df):
    """Downcast the numeric columns of ``df`` in place to the narrowest dtype that fits.

    For each column whose dtype is one of int16/int32/int64/float16/float32/float64
    the observed minimum and maximum are compared (inclusively) against the
    representable range of progressively wider dtypes, and the column is cast to
    the first one that holds both. Other columns (objects, booleans, unsigned
    integers, int8, ...) are left untouched.

    Float columns may be reduced to float16, which keeps the value *range* but
    only roughly three significant decimal digits.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    # Numeric dtypes eligible for downcasting.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    # Memory footprint before the conversion, in MB.
    start_memory = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # Observed value range of the column.
        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 still fits int8.
            # An empty column has NaN min/max, matches nothing and keeps its dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full precision.
                df[col] = df[col].astype(np.float64)

    # Memory footprint after the conversion, in MB.
    end_memory = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting footprint.
    reduction = ((start_memory-end_memory)/start_memory)*100 if start_memory else 0.0
    print('Memory usage decreased from {:5.2f}MB to {:5.2f}MB ({:.1f}% reduction)'.format(start_memory,end_memory, reduction))
    return df

In [None]:
# Shrink all four tables; reduce_memory_usage converts the columns of the frame
# it receives, so its return value does not need to be kept.
for frame in (train_identity, test_identity, train_transaction, test_transaction):
    reduce_memory_usage(frame)

# 2. EDA

### About Data

In [None]:
# Report the (rows, columns) shape of every loaded table.
for table_name, table in (
    ('train_transaction', train_transaction),
    ('test_transaction', test_transaction),
    ('train_identity', train_identity),
    ('test_identity', test_identity),
):
    print(f'{table_name} shape is {table.shape}')

# 3. Feature Engineering

- train, test에 대해 transaction과 identity 데이터를 합쳐서 사용함
- train, test에 대해 설명변수(X)와 종속변수(y)를 분리함

In [None]:
# Attach the identity table to each transaction table with a left join (every
# transaction is kept) so that both results end up indexed by TransactionID.
if is_index_TransactionID:
    # TransactionID is already the index of every table: join index-on-index.
    X_train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
    X_test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
else:
    # TransactionID is a regular column: join on it, then promote it to the index.
    X_train = train_transaction.merge(train_identity, how='left', on='TransactionID').set_index('TransactionID')
    X_test = test_transaction.merge(test_identity, how='left', on='TransactionID').set_index('TransactionID')

# The raw tables are no longer needed once merged.
del train_identity, test_identity, train_transaction, test_transaction

# Separate the target (y) from the explanatory variables (X).
y_train = X_train['isFraud'].copy()
X_train.drop(columns=['isFraud'], inplace=True)

## Normalize D Columns (D feature 정규화)

- D feature는 과거의 어떤 순간부터의 거래 시점까지의 "Time delta"값임
- D feature를 델타값이 아닌 그 과거의 시점의 값으로 변환하여 해당 feature가 시간에 따라 증가하는 특성을 없애
  시계열의 특성을 제거한 모델에 조금 더 의미있는 feature로 사용함
- 단, D1의 카드만든 이후 지금까지의 기간과 같이 델타값이 의미가 있는 feature는 제외함

In [None]:
# Normalize D features
# D1, D2, D3, D5 and D9 are excluded and keep their raw delta values.
excluded_d = (1, 2, 3, 5, 9)
d_columns = ['D' + str(idx) for idx in range(1, 16) if idx not in excluded_d]
# TransactionDT is in seconds; dividing by the seconds in a day converts it to days.
seconds_per_day = np.float32(24*60*60)

for d_col in d_columns:
    for frame in (X_train, X_test):
        # delta (days) - transaction time (days)
        frame[d_col] = frame[d_col] - frame.TransactionDT / seconds_per_day

- test의 id feature를 train의 id feature와 이름을 통일시킴

In [None]:
# Build the old -> new name mapping for every column starting with 'id'
# (dashes become underscores) and apply it with a single rename.
id_renames = {
    column: column.replace('-', '_')
    for column in X_test.columns
    if column.startswith('id')
}
X_test.rename(columns=id_renames, inplace=True)

## Feature Selection

- 결측치가 N개를 초과하는 feature들을 모두 제거함으로써 feature를 골라냄

In [None]:
def drop_N_missing_values_columns(df_train, df_test,N=100000):
    """Drop, in place, the columns that have more than ``N`` missing values in ``df_train``.

    The decision is based on the training frame only; the selected columns are
    then removed from both frames.

    Args:
        df_train: Training DataFrame (modified in place).
        df_test: Test DataFrame (modified in place); it must contain every
            column selected for removal.
        N: Missing-value count a column has to exceed to be dropped.

    Returns:
        None.
    """
    # Number of missing values per training column.
    null_counts = df_train.isnull().sum()
    # Strictly more than N missing values -> discard the feature.
    sparse_cols = null_counts.index[null_counts > N]

    for frame in (df_train, df_test):
        frame.drop(columns=sparse_cols, inplace=True)

In [None]:
# N is left at its default of 100000: columns with more than 100000 missing
# values in X_train are removed from both X_train and X_test.
drop_N_missing_values_columns(X_train, X_test)

## Handle Missing Values

- 결측치를 최빈값이나 평균값으로 채우는지, -1로 채우는지에 따라 방법이 다름

In [None]:
def handle_missing_values_mean_mode(df_train, df_test):
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    #train , test데이터를 합침
    df_all = pd.concat([df_train, df_test], axis = 0, sort = False)
    #모든 데이터에 대한 column명을 가져옴
    all_data_cols = df_all.columns

    # 최빈값으로 결측치 채움
    for i in all_data_cols:
        # str값의 경우 최빈값으로 결측치를 채움
        if df_all[i].dtype == 'object':
            df_all[i] = df_all[i].fillna(df_all[i].mode()[0])
        # C 또는 V feature의 경우에 최빈값으로 결측치를 채움
        elif (i.startswith("C") or (i.startswith("V"))) and df_all[i].isnull().sum() > 0:
            df_all[i] = df_all[i].fillna(df_all[i].mode()[0])

    # 평균값으로 결측치 채움
    df_all['card2'] = df_all['card2'].fillna(df_all['card2'].mean())
    df_all['card3'] = df_all['card3'].fillna(df_all['card3'].mean())
    df_all['card5'] = df_all['card5'].fillna(df_all['card5'].mean())
    df_all['D1'] = df_all['D1'].fillna(df_all['D1'].mode()[0])
    df_all['D10'] = df_all['D10'].fillna(df_all['D10'].mode()[0])
    df_all['D15'] = df_all['D15'].fillna(df_all['D15'].mode()[0])
    df_all['addr1'] = df_all['addr1'].fillna(df_all['addr1'].mean())
    df_all['addr2'] = df_all['addr2'].fillna(df_all['addr2'].mean())

    # 다시 train과 test 데이터로 나눔
    df_train = df_all[:ntrain]
    df_test = df_all[ntrain:]
    
    del df_all 
    gc.collect()

def handle_missing_values_negative_one(df_train, df_test):
    """Shift numeric features to be non-negative and mark missing values with -1.

    For every numeric column of ``df_train`` except 'TransactionAmt' and
    'TransactionDT', the smallest value observed across train and test is
    subtracted from both frames, so observed values start at 0, and the
    remaining missing entries are replaced by -1. Both frames are modified in
    place.

    Fixes over the previous version:
      * the minimum ignores NaN, so a column that is entirely missing in one
        frame no longer turns every value of the other frame into -1;
      * filled columns are assigned back to the frame instead of relying on
        ``fillna(inplace=True)`` applied to a column selection.

    Args:
        df_train: Training DataFrame (modified in place).
        df_test: Test DataFrame (modified in place).

    Returns:
        None.
    """
    untouched = ('TransactionAmt', 'TransactionDT')

    for col in df_train.columns:
        # Only numeric features are shifted / filled.
        if col in untouched or not pd.api.types.is_numeric_dtype(df_train[col]):
            continue

        frames = [frame for frame in (df_train, df_test) if col in frame.columns]

        # Smallest observed value over both frames, ignoring missing minima.
        lows = [frame[col].min() for frame in frames]
        lows = [low for low in lows if pd.notnull(low)]
        shift = np.float32(min(lows)) if lows else None

        for frame in frames:
            if shift is not None:
                # Subtract the minimum so that observed values become >= 0.
                frame[col] = frame[col] - shift
            # Every missing value becomes the -1 sentinel.
            frame[col] = frame[col].fillna(-1)

In [None]:
 # Fill missing values using the shift-to-non-negative / -1 sentinel strategy.
 # NOTE(review): this cell is indented by one space; a plain Python script would
 # reject it as an unexpected indent, so remove the space if this is exported to .py.
 handle_missing_values_negative_one(X_train, X_test)

### Encode the categorical features (Label Encode)

In [None]:
#1) One-hot encoding
def encode_one_hot (df_train, df_test):
    """One-hot encode both frames and return them with identical columns.

    The previous version assigned the encoded frames to local names and
    returned nothing, so the call had no effect. The inputs are left
    untouched; use the returned frames.

    Each frame is encoded with ``pd.get_dummies``; the test frame is then
    re-indexed on the training columns so both share the same feature set:
    dummy columns missing from test are added filled with 0, and dummy columns
    that exist only in test are dropped.

    Args:
        df_train: Training DataFrame.
        df_test: Test DataFrame.

    Returns:
        Tuple ``(encoded_train, encoded_test)`` of new DataFrames.
    """
    encoded_train = pd.get_dummies(df_train)
    encoded_test = pd.get_dummies(df_test)
    # Align test on train so a model fitted on train can score test directly.
    encoded_test = encoded_test.reindex(columns=encoded_train.columns, fill_value=0)
    return encoded_train, encoded_test

#2) Label encoding done by hand with factorize

def encode_label (df_train, df_test):
    """Replace every object-dtype column by integer labels, in place.

    For each object column of ``df_train`` the train and test values are
    concatenated and factorized together (sorted categories), so a category
    gets the same code in both frames. Missing values receive the code -1.

    The previous version wrote the codes into the globals ``X_train`` /
    ``X_test`` rather than into the frames it was given.

    Args:
        df_train: Training DataFrame (modified in place).
        df_test: Test DataFrame (modified in place); it must contain every
            object column of ``df_train``.

    Returns:
        None.
    """
    n_train = len(df_train)
    int16_capacity = np.iinfo(np.int16).max + 1

    for col in df_train.columns:
        if df_train[col].dtype=='object':
            combined = pd.concat([df_train[col], df_test[col]], axis=0)
            codes, uniques = combined.factorize(sort=True)
            # int16 is enough unless the column has more than 32768 categories.
            code_dtype = 'int16' if len(uniques) <= int16_capacity else 'int32'
            df_train[col] = codes[:n_train].astype(code_dtype)
            df_test[col] = codes[n_train:].astype(code_dtype)
    

In [None]:
# Encoding choices considered: one-hot-encoding / labelencoderlibrary / label-encoding.
# The factorize-based label encoding is the one applied here.
encode_label(X_train, X_test)

### Model Building

XGBoost

In [None]:
# CHRIS - TRAIN 75% PREDICT 25%
# Positional hold-out: the first three quarters of the rows are used for
# training, the remaining quarter for validation.
split_at = 3*len(X_train)//4
idxT, idxV = X_train.index[:split_at], X_train.index[split_at:]

[Hyperparameter tuning](https://www.kaggle.com/ohseongyeon/ieee-hyperparmeter-tuning)

In [None]:
import xgboost as xgb

# Hyper-parameters of the classifier, gathered in one place.
# missing=-1 declares -1 as the missing-value marker for the booster.
# tree_method='gpu_hist' trains on the GPU; the CPU alternative is
# tree_method='hist' (optionally with nthread=4).
# NOTE(review): newer xgboost releases reportedly expect early_stopping_rounds in
# the constructor and tree_method='hist' with device='cuda' — confirm against the
# installed version before upgrading.
xgb_params = dict(
    n_estimators=2000,
    max_depth=12,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,
    eval_metric='auc',
    tree_method='gpu_hist',
)
xgmodel = xgb.XGBClassifier(**xgb_params)

# Fit on the idxT rows and monitor the idxV rows; progress is logged every 50
# rounds and training stops after 100 rounds without improvement on the eval set.
xgmodel.fit(
    X_train.loc[idxT,:],
    y_train[idxT],
    eval_set=[(X_train.loc[idxV,:], y_train[idxV])],
    verbose=50,
    early_stopping_rounds=100,
)