In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import catboost as catb

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score


import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

import gc
# Explicitly enable automatic garbage collection.
gc.enable()

import logging
# Lower the root logger's threshold to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# 1. Data input

In [None]:
# Directory containing the four CSV files read by input_data below.
data_path = '../input/ieee-fraud-detection/'

In [None]:
def input_data(data_path):
    """Load the identity and transaction tables for the train and test sets.

    Parameters
    ----------
    data_path : str or os.PathLike
        Directory that contains ``train_identity.csv``, ``test_identity.csv``,
        ``train_transaction.csv`` and ``test_transaction.csv``. A trailing
        path separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_identity, test_identity, train_transaction, test_transaction)``
        in exactly that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when one of the files is missing.
    """
    import os  # local import keeps this cell self-contained

    def _read(file_name):
        # os.path.join adds a separator only when data_path does not already
        # end with one, so both 'dir' and 'dir/' resolve to the same file.
        return pd.read_csv(os.path.join(data_path, file_name))

    train_identity = _read('train_identity.csv')
    test_identity = _read('test_identity.csv')
    train_transaction = _read('train_transaction.csv')
    test_transaction = _read('test_transaction.csv')
    return train_identity, test_identity, train_transaction, test_transaction

# Read all four tables from data_path into module-level DataFrames.
train_identity, test_identity, train_transaction, test_transaction = input_data(data_path)

In [None]:
# Report the (rows, columns) shape of each of the four raw tables.
print('train_transaction shape is', train_transaction.shape)
print('test_transaction shape is', test_transaction.shape)
print('train_identity shape is', train_identity.shape)
print('test_identity shape is', test_identity.shape)

In [None]:
# Move TransactionID from a regular column into the index of the training transactions.
# NOTE: re-running this cell raises KeyError, because the column no longer exists afterwards.
train_transaction = train_transaction.set_index('TransactionID')
train_transaction.head()

In [None]:
# Move TransactionID from a regular column into the index of the training identity table.
# NOTE: re-running this cell raises KeyError, because the column no longer exists afterwards.
train_identity = train_identity.set_index('TransactionID')
train_identity.head()

In [None]:
# Left-join the identity table onto the transactions, keyed by TransactionID,
# so every transaction row is kept even when it has no identity record.
# NOTE(review): the train tables were re-indexed by TransactionID in earlier
# cells while the test tables were not — presumably train therefore carries the
# key as its index and test as a column; confirm this asymmetry is intended.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

# Drop the references to the four source tables now that the merged frames exist.
del train_identity, test_identity, train_transaction, test_transaction

In [None]:
# Display the merged training table.
train

In [None]:
test
# The id columns here are named id-(number), which differs from the id columns of train above.

### Handle Missing Values

In [None]:
# Replace '-' with '_' in the name of every test column that starts with 'id',
# applying all renames in a single call instead of one call per column.
id_renames = {
    name: name.replace('-', '_')
    for name in test.columns
    if name.startswith('id')
}
test.rename(columns=id_renames, inplace=True)

In [None]:
# Display test again after the id column rename.
test

In [None]:
def getNulls(data):
    """Summarise missing values for each column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``data`` with two columns:
        ``total``   - number of null entries in the column;
        ``percent`` - null entries divided by the number of rows (a fraction
        in [0, 1], not a value scaled to 100).
    """
    # Build the boolean null mask once and reuse it for both statistics.
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on the mask yields the row count per column (the mask has no nulls).
    null_share = null_counts / null_mask.count()

    return pd.concat([null_counts, null_share], axis=1, keys=['total', 'percent'])

In [None]:
# Per-column null counts and null fractions of the training frame.
missing_data_train = getNulls(train)
missing_data_train

In [None]:
# Drop every column that has more than 100,000 missing values in train.
too_sparse = missing_data_train['total'] > 100000
sel_cols = missing_data_train.loc[too_sparse].index
del missing_data_train

# The selection is based on train only; the same columns are removed from test.
train.drop(columns=sel_cols, inplace=True)
test.drop(columns=sel_cols, inplace=True)

In [None]:
# Display train after the high-missing columns were dropped.
train

In [None]:
# Display test after the high-missing columns were dropped.
test

약 250개의 columns이 줄어듦

In [None]:
# Print the names of all remaining training columns on one line.
print(train.columns.tolist())

In [None]:
# Concatenate train and test row-wise; every fill value below is therefore
# computed over both sets together. ntrain/ntest keep the original row counts.
ntrain, ntest = len(train), len(test)
all_data = pd.concat([train, test], axis=0, sort=False)
all_data_cols = all_data.columns

# Object (text) columns: fill nulls with the most frequent value.
for name in all_data_cols:
    if all_data[name].dtype == 'object':
        all_data[name] = all_data[name].fillna(all_data[name].mode()[0])

# Columns named C* or V* that still contain nulls: most frequent value as well.
for name in all_data_cols:
    if name.startswith(('C', 'V')) and all_data[name].isnull().sum() > 0:
        all_data[name] = all_data[name].fillna(all_data[name].mode()[0])

# Hand-picked columns filled with the column mean.
for name in ('card2', 'card3', 'card5', 'addr1', 'addr2'):
    all_data[name] = all_data[name].fillna(all_data[name].mean())

# Hand-picked columns filled with the most frequent value.
for name in ('D1', 'D10', 'D15'):
    all_data[name] = all_data[name].fillna(all_data[name].mode()[0])

### Encode the categorical features

In [None]:
# Encode the combined frame with pandas.get_dummies (default arguments),
# then print the resulting (rows, columns) shape.
all_data = pd.get_dummies(all_data)
print(all_data.shape)

In [None]:
# Print the names of all columns of the encoded frame on one line.
print(all_data.columns.tolist())

In [None]:
# Split the combined frame back into its train and test parts by row position:
# the first ntrain rows are train, everything after them is test.
train_part = all_data.iloc[:ntrain]
test_part = all_data.iloc[ntrain:]

# Keep the test identifiers and the training label before removing them from the features.
test_id = test_part['TransactionID']
target = train_part['isFraud']

# The identifier and the label must not be used as model inputs.
# drop() without inplace returns new, independent frames, so later column
# assignments on train/test never write into a slice of all_data.
train = train_part.drop(columns=['TransactionID', 'isFraud'])
test = test_part.drop(columns=['TransactionID', 'isFraud'])
del train_part, test_part

print(train.shape)
print(test.shape)

del ntrain
del ntest

In [None]:
# Display the training feature frame after TransactionID and isFraud were removed.
train

In [None]:
# Display the test feature frame after TransactionID and isFraud were removed.
test

### Reduce Memory Use

In [None]:
def detect_num_cols_to_shrink(list_of_num_cols, dataframe):
    """Group numeric columns by the narrowest dtype that can hold their values.

    For every signed-integer or floating-point column the observed minimum and
    maximum are compared against the representable range of each candidate
    dtype, and the column is placed in the first bucket whose range contains
    both bounds.

    Parameters
    ----------
    list_of_num_cols : iterable of str
        Column names of ``dataframe`` to examine. Columns whose dtype is not a
        NumPy signed-integer or float dtype (objects, booleans, unsigned ints,
        pandas extension dtypes) are skipped.
    dataframe : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    list of list of str
        Five lists, in this fixed order:
        ``[int8_cols, int16_cols, int32_cols, float16_cols, float32_cols]``.
        A column appears in at most one list; a column that fits none of the
        targets, or that contains only nulls, appears in no list.

    Notes
    -----
    The float buckets are chosen by magnitude only. Casting to ``float16`` or
    ``float32`` never overflows for the listed columns, but it can still round
    values, since those dtypes carry fewer significant digits than ``float64``.
    """
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32)

    convert_to_int8 = []
    convert_to_int16 = []
    convert_to_int32 = []
    convert_to_float16 = []
    convert_to_float32 = []

    int_buckets = dict(zip(int_targets, (convert_to_int8, convert_to_int16, convert_to_int32)))
    float_buckets = dict(zip(float_targets, (convert_to_float16, convert_to_float32)))

    for col in list_of_num_cols:
        dtype = dataframe[col].dtype
        # Only plain NumPy signed ints ('i') and floats ('f') are candidates.
        if not isinstance(dtype, np.dtype) or dtype.kind not in ('i', 'f'):
            continue

        # min()/max() skip nulls; both are null for an empty or all-null column.
        minimum = dataframe[col].min()
        maximum = dataframe[col].max()
        if pd.isnull(minimum) or pd.isnull(maximum):
            continue

        if dtype.kind == 'i':
            # Both bounds must fit, not merely their difference: a column
            # spanning 0..200 has a small spread but still exceeds int8's max.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= minimum and maximum <= limits.max:
                    int_buckets[target].append(col)
                    break
        else:
            # Float ranges are symmetric, so the largest magnitude decides.
            largest = max(abs(float(minimum)), abs(float(maximum)))
            for target in float_targets:
                if largest <= float(np.finfo(target).max):
                    float_buckets[target].append(col)
                    break

    return [
        convert_to_int8,
        convert_to_int16,
        convert_to_int32,
        convert_to_float16,
        convert_to_float32,
    ]

Train Data

In [None]:
# Ask detect_num_cols_to_shrink which columns of train can use a narrower dtype.
num_train_cols = train.columns.tolist()
num_cols_to_shrink_train = detect_num_cols_to_shrink(num_train_cols, train)

# The result lists come back in the order int8, int16, int32, float16, float32.
(convert_to_int8, convert_to_int16, convert_to_int32,
 convert_to_float16, convert_to_float32) = num_cols_to_shrink_train

print("starting with converting process....")

# NOTE: the int8 list is unpacked above but no int8 cast is performed here.
for target_dtype, columns in (
    ('int16', convert_to_int16),
    ('int32', convert_to_int32),
    ('float16', convert_to_float16),
    ('float32', convert_to_float32),
):
    for col in columns:
        train[col] = train[col].astype(target_dtype)

print("successfully converted!")

Test Data

In [None]:
# Ask detect_num_cols_to_shrink which columns of test can use a narrower dtype.
# The decision is made from test's own value ranges, independently of train.
num_test_cols = test.columns.tolist()
num_cols_to_shrink_test = detect_num_cols_to_shrink(num_test_cols, test)

# The result lists come back in the order int8, int16, int32, float16, float32.
(convert_to_int8, convert_to_int16, convert_to_int32,
 convert_to_float16, convert_to_float32) = num_cols_to_shrink_test

print("starting with converting process....")

# NOTE: the int8 list is unpacked above but no int8 cast is performed here.
for target_dtype, columns in (
    ('int16', convert_to_int16),
    ('int32', convert_to_int32),
    ('float16', convert_to_float16),
    ('float32', convert_to_float32),
):
    for col in columns:
        test[col] = test[col].astype(target_dtype)

print("successfully converted!")

### Model Building

XGBoost

In [None]:
# CHRIS - TRAIN 75% PREDICT 25%
# The first three quarters of the rows (in stored order) are used for fitting,
# the remaining quarter for validation; no shuffling is applied.
cutoff = 3 * len(train) // 4
idxT, idxV = train.index[:cutoff], train.index[cutoff:]

In [None]:
# Learning-rate sweep: start at 0.005 and add 0.005 after every fit for as long
# as the value stays <= 0.105, recording the model's best_score each time.
# n_estimators and max_depth are held fixed.
n_estimators = 2000
max_depth = 12
learning_rate = 0.005
lr_list = []
auc_list = []

while learning_rate <= 0.105:
    xgmodel = xgb.XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=0.8,
        colsample_bytree=0.4,
        missing=-1,
        eval_metric='auc',
        # GPU training; a CPU run would use tree_method='hist' (optionally nthread=4).
        tree_method='gpu_hist',
    )

    # Fit on the first 75% of rows and evaluate on the held-out 25%.
    model = xgmodel.fit(
        train.loc[idxT, :],
        target[idxT],
        eval_set=[(train.loc[idxV, :], target[idxV])],
        verbose=50,
        early_stopping_rounds=100,
    )

    lr_list.append(learning_rate)
    auc_list.append(model.best_score)
    learning_rate += 0.005

# NOTE: learning_rate now holds the first value above 0.105, not a tuned value.
print(lr_list)
print(auc_list)

In [None]:
# Round each learning rate to 3 decimals to hide the floating-point drift
# accumulated by the repeated "+= 0.005".
lr_list2 = [round(rate, ndigits=3) for rate in lr_list]

# One column holding the scores, indexed by the rounded learning rates.
df = pd.DataFrame(auc_list, index=lr_list2)
print(df)
sns.lineplot(data=df)
plt.show()

learning rate = 0.02 일때 가장 높은 점수를 얻었다.

In [None]:
# n_estimators sweep: 1000, 1100, ... up to 3000, with max_depth fixed and the
# learning rate fixed at best_learning_rate.
n_estimators = 1000
max_depth = 12
best_learning_rate = 0.02
estim_list = []
auc_list2 = []

while n_estimators <= 3000:
    xgmodel = xgb.XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        # Use the value chosen for this sweep explicitly. The global
        # `learning_rate` is a leftover loop counter from the learning-rate
        # sweep and does not hold 0.02.
        learning_rate=best_learning_rate,
        subsample=0.8,
        colsample_bytree=0.4,
        missing=-1,
        eval_metric='auc',
        # GPU training; a CPU run would use tree_method='hist' (optionally nthread=4).
        tree_method='gpu_hist',
    )

    # Fit on the first 75% of rows and evaluate on the held-out 25%.
    model = xgmodel.fit(
        train.loc[idxT, :],
        target[idxT],
        eval_set=[(train.loc[idxV, :], target[idxV])],
        verbose=50,
        early_stopping_rounds=100,
    )

    estim_list.append(n_estimators)
    auc_list2.append(model.best_score)
    n_estimators += 100

print(estim_list)
print(auc_list2)

In [None]:
# One column holding the scores, indexed by the n_estimators values tried.
df = pd.DataFrame(auc_list2, index=estim_list)
print(df)
sns.lineplot(data=df)
plt.show()

estimators의 변화는 점수에 영향을 주지 못했다. 중앙값인 2000을 기본값으로 삼고 다음 하이퍼파라미터를 보겠다.

In [None]:
# max_depth sweep: 5, 6, ... up to 25, with n_estimators fixed and the
# learning rate fixed at best_learning_rate.
n_estimators = 2000
max_depth = 5
best_learning_rate = 0.02
depth_list = []
auc_list3 = []

while max_depth <= 25:
    xgmodel = xgb.XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        # Use the value chosen for this sweep explicitly. The global
        # `learning_rate` is a leftover loop counter from the learning-rate
        # sweep and does not hold 0.02.
        learning_rate=best_learning_rate,
        subsample=0.8,
        colsample_bytree=0.4,
        missing=-1,
        eval_metric='auc',
        # GPU training; a CPU run would use tree_method='hist' (optionally nthread=4).
        tree_method='gpu_hist',
    )

    # Fit on the first 75% of rows and evaluate on the held-out 25%.
    model = xgmodel.fit(
        train.loc[idxT, :],
        target[idxT],
        eval_set=[(train.loc[idxV, :], target[idxV])],
        verbose=50,
        early_stopping_rounds=100,
    )

    depth_list.append(max_depth)
    auc_list3.append(model.best_score)
    max_depth += 1

print(depth_list)
print(auc_list3)

In [None]:
# One column holding the scores, indexed by the max_depth values tried.
df = pd.DataFrame(auc_list3, index=depth_list)
print(df)
sns.lineplot(data=df)
plt.show()

max_depth = 10에서 가장 높은 점수를 보였다.

이를 통해, hyperparameter가 아래와 같을 때 최적의 점수를 얻을 수 있을 것으로 예상할 수 있다.
- learning rate = 0.02
- max_depth = 10
이때, max_depth가 10~12인 범위에서는 점수 차이가 크지 않으므로, 최종 제출 전에 이 범위 내의 값 중 가장 높은 점수를 얻은 것을 선택할 수 있을 것이다.
