In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import joblib
import time
import os
import gc

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import xgboost
from xgboost import plot_importance
from xgboost import XGBClassifier

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

### LabelEncoding을 적용하는 함수입니다.

In [3]:
def label_encoding(df):
    """Integer-encode every object-dtype column of ``df``.

    Each object-dtype column is overwritten with the codes produced by a
    scikit-learn ``LabelEncoder`` fitted on that same column. The frame that
    is passed in is modified directly and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should become integer codes.

    Returns
    -------
    tuple
        ``(df, new_columns)``. ``new_columns`` lists the column names that
        were not present on entry; because encoding overwrites columns
        instead of adding any, this list is empty.
    """
    columns_before = set(df.columns)
    encoder = LabelEncoder()

    # Only object-dtype columns are encoded; every other column is untouched.
    for name in df.columns:
        if df[name].dtype == 'object':
            df[name] = encoder.fit_transform(df[name])

    # Names of columns that did not exist when the function was called.
    added = [name for name in df.columns if name not in columns_before]

    # Return the encoded frame together with the list of added column names.
    return df, added

# 1. Data Load

In [4]:
# Directory prefix of the competition CSV files (also reused when the
# sample submission is read later on).
path = 'data/'

# Both files use their first column as the row index.
train, test = (
    pd.read_csv(f'{path}{csv_name}', index_col=0)
    for csv_name in ('train.csv', 'test_x.csv')
)

#### label 값의 2를 0으로 변경해 주었습니다.

In [5]:
# Re-code the target as 2 - voted: a label of 2 becomes 0 and a label of 1
# stays 1 (for labels in {1, 2} this matches replacing 2 with 0).
train['voted'] = train['voted'].rsub(2)

# 2. Data Cleansing & Feature Engineering

### one hot encoding의 경우 xgbclassifier 모델을 학습시키기 위한 데이터에만 적용

In [6]:
# Stack train and test so that every feature transform below is applied to
# both. pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([train, test])

# One-hot encode the two nominal columns; the original 'religion' and 'race'
# columns are replaced by their dummy columns.
df = pd.get_dummies(df, columns=['religion', 'race'], dummy_na=False)

# Integer-encode the remaining object-dtype columns.
df, df_new_columns = label_encoding(df)

# Remove rows with familysize >= 100 from the labelled (training) rows only.
# Rows whose 'voted' is null are the test rows; they must all be kept,
# otherwise the prediction vector no longer has one entry per row of the
# sample submission file.
is_test_row = df['voted'].isnull()
# .copy() makes the filtered frame independent so the column assignments in
# the following cells do not trigger chained-assignment warnings.
df = df[(df['familysize'] < 100) | is_test_row].copy()

In [7]:
# Columns at the odd positions 1, 3, ..., 39 of the frame (20 columns).
qe_columns = list(df.columns[range(1, 40, 2)])

# Row-wise median of the raw values, taken before the log transform below.
df['qe_median'] = df[qe_columns].median(axis=1)

# Overwrite the raw values with log(1 + x) ...
df[qe_columns] = np.log1p(df[qe_columns])

# ... and add up the transformed values per row.
df['qe_logsum'] = df[qe_columns].sum(axis=1)

In [8]:
def get_sum_qe_label(df):
    """Bucket a single numeric value into a label.

    Despite the parameter name, ``df`` is one scalar value, not a DataFrame.

    Returns 1 when the value is below 100, 2 when it is below 150, and 0 in
    every other case (including values that compare false to both bounds).
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    for upper_bound, label in ((100, 1), (150, 2)):
        if df < upper_bound:
            return label
    return 0
    
# Label every row by the bucket its qe_logsum value falls into.
df['qe_logsum_label'] = df['qe_logsum'].apply(get_sum_qe_label)

In [9]:
def familysize_label(df):
    """Bucket a single family-size value into a label.

    Despite the parameter name, ``df`` is one scalar value, not a DataFrame.

    Returns 1 for values below 4, 2 below 8, 3 below 16, 4 below 32, and 0
    in every other case (including values that compare false to all bounds).
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    buckets = ((4, 1), (8, 2), (16, 3), (32, 4))
    for upper_bound, label in buckets:
        if df < upper_bound:
            return label
    return 0
  
# Label every row by the bucket its familysize value falls into.
df['familysize_label'] = df['familysize'].apply(familysize_label)

In [11]:
# Question letters whose 'Q<letter>A' answers are mirrored (x -> 6 - x)
# before the answers are summed.
ls_minus = ['e', 'f', 'k', 'q', 'r', 'a', 'd', 'g', 'i', 'n']

mirrored_columns = [f'Q{letter}A' for letter in ls_minus]
df[mirrored_columns] = 6 - df[mirrored_columns]

# Columns at the even positions 0, 2, ..., 38 of the frame (20 columns).
qa_columns = list(df.columns[range(0, 39, 2)])

# Row-wise total over those columns, after the mirroring above.
df['mach_score'] = df[qa_columns].sum(axis=1)

In [12]:
# Answer columns that feed each of the two sub-scores.
views_ls = ['QqA', 'QeA', 'QbA', 'QhA', 'QjA', 'QmA']
tactics_ls = ['QcA', 'QfA', 'QoA', 'QsA', 'QrA']

# Row-wise totals per group (tactics_score: excluded for CatBoost).
for score_name, member_columns in (
    ('views_score', views_ls),
    ('tactics_score', tactics_ls),
):
    df[score_name] = df[member_columns].sum(axis=1)

In [13]:
# Mirror these five items (x -> 8 - x) before they are paired up below.
for item in ('tp06', 'tp02', 'tp08', 'tp04', 'tp10'):
    df[item] = 8 - df[item]

# Each score is the sum of one untouched item and one mirrored item.
score_pairs = (
    ('extreversion_score', 'tp01', 'tp06'),        # excluded for CatBoost
    ('agreeableness_score', 'tp07', 'tp02'),
    ('conscientiousness_score', 'tp03', 'tp08'),   # excluded for CatBoost
    ('emotionalstability_score', 'tp09', 'tp04'),  # excluded for CatBoost
    ('opennessexperiences_score', 'tp05', 'tp10'),
)
for score_name, plain_item, mirrored_item in score_pairs:
    df[score_name] = df[plain_item] + df[mirrored_item]

In [14]:
# Columns whose name contains 'wr' / 'wf'. Both lists are built before any
# of the aggregate columns below are added to the frame.
wrs = [column for column in df.columns if 'wr' in column]
wfs = [column for column in df.columns if 'wf' in column]

# Snapshot of the 'wr' columns, reused for all three 'wr' statistics.
wr_values = df[wrs]

df['wrs_count'] = wr_values.sum(axis=1)
df['wfs_count'] = df[wfs].sum(axis=1)  # excluded for CatBoost

df['wrs_kurt'] = wr_values.kurtosis(axis=1)  # excluded for CatBoost
df['wrs_skew'] = wr_values.skew(axis=1)

In [15]:
# Re-code the value 0 in both columns (0 -> 7 for age_group, 0 -> 5 for
# education). The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place
# form does not update `df` when pandas Copy-on-Write is active.
df['age_group'] = df['age_group'].replace(0, 7)
df['education'] = df['education'].replace(0, 5)

# Interaction features built from the two re-coded columns: their product
# and a fixed-weight (0.52 / 0.48) linear combination.
df['n2_prod'] = df['age_group'] * df['education']
df['n2_prod_weighted'] = df['age_group'] * 0.52 + df['education'] * 0.48

### Feature Selection

In [16]:
# Columns removed before modelling: 'QaE' through 'QtE' (one per question
# letter a-t) plus 'hand'.
drop_list = [f'Q{letter}E' for letter in 'abcdefghijklmnopqrst'] + ['hand']

df = df.drop(columns=drop_list)

# 3. Modeling

## XGBClassifier Modeling

In [None]:
# XGBoost hyper-parameters. min_child_weight is stored as a float here and
# cast to int right below.
params = {
    'colsample_bylevel': 0.92859,
    'colsample_bytree': 0.55352,
    'eta': 0.015,
    'gamma': 0.62587,
    'min_child_weight': 48.0,
    'reg_alpha': 0.54,
    'reg_lambda': 1.062,
    'subsample': 0.89,
}

params['min_child_weight'] = int(params['min_child_weight'])

clf = XGBClassifier(
    **params,
    max_depth=11,
    booster='gbtree',
    n_estimators=5000,
    objective='binary:logistic',
    eval_metric='auc',
    n_jobs=-1,
    scale_pos_weight=1.206,
    random_state=55,
)

# Rows with a label form the training set; rows without one are the test set.
train = df[df['voted'].notnull()]
test = df[df['voted'].isnull()]

train = train.astype({'voted': 'int'})
test = test.drop(columns=['voted'])

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=55)
feats = [f for f in train.columns if f not in ['voted']]

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows.
oof_preds = np.zeros(train.shape[0])
xgb_sub_preds = np.zeros(test.shape[0])

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], train['voted'])):
    train_x, train_y = train[feats].iloc[train_idx], train['voted'].iloc[train_idx]
    valid_x, valid_y = train[feats].iloc[valid_idx], train['voted'].iloc[valid_idx]

    # The validation fold is the last eval_set entry; training stops once its
    # AUC has not improved for 500 rounds.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc', verbose=False, early_stopping_rounds=500)

    oof_preds[valid_idx] = clf.predict_proba(valid_x, ntree_limit=clf.best_ntree_limit)[:, 1]

    # Accumulate P(class 1), the same quantity stored in oof_preds and
    # produced by the LightGBM and CatBoost cells. The previous
    # `1 - predict_proba(...)[:, 1]` stored P(class 0) instead, so the
    # weighted blend mixed scores pointing in opposite directions.
    # NOTE(review): if the submission must be P(class 0), invert once on the
    # final blended vector rather than for a single model.
    fold_test_proba = clf.predict_proba(test[feats], ntree_limit=clf.best_ntree_limit)[:, 1]
    xgb_sub_preds += fold_test_proba / folds.n_splits

#     print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
# print('Full AUC score %.6f' % roc_auc_score(train['voted'], oof_preds))

## LGBMClassifier Modeling

In [None]:
# LightGBM hyper-parameters. The three parameters cast below are stored as
# floats here and truncated to int before use.
params = {
    'colsample_bytree': 0.675479,
    'learning_rate': 0.00645,
    'max_depth': 10.3,
    'min_child_samples': 88.41,
    'min_child_weight': 28.4,
    'min_split_gain': 0.025029,
    'num_leaves': 86.2,
    'reg_alpha': 0.544736,
    'reg_lambda': 0.15015,
    'subsample': 0.7295,
}

params['num_leaves'] = int(params['num_leaves'])
params['max_depth'] = int(params['max_depth'])
params['min_child_samples'] = int(params['min_child_samples'])

clf = LGBMClassifier(
    **params,
    objective='binary',
    subsample_for_bin=240000,
    is_unbalance=False,
    n_estimators=10000,
    n_jobs=-1,
    silent=-1,
    verbose=-1,
    random_state=55,
)

# Rows with a label form the training set; rows without one are the test set.
train = df[df['voted'].notnull()]
test = df[df['voted'].isnull()]

train = train.astype({'voted': 'int'})
test = test.drop(columns=['voted'])

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=55)
feats = [f for f in train.columns if f not in ['voted']]

# Columns to treat as categorical. 'race' and 'religion' are one-hot encoded
# earlier in the notebook and therefore no longer exist under those names, so
# the list is restricted to columns that are actually among the features
# (an unknown name would make the fit call fail).
cat_feature = ['education', 'engnat', 'married', 'urban', 'age_group', 'gender', 'race', 'religion']
cat_feature = [c for c in cat_feature if c in feats]

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows.
oof_preds = np.zeros(train.shape[0])
lgbm_sub_preds = np.zeros(test.shape[0])


for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], train['voted'])):
    train_x, train_y = train[feats].iloc[train_idx], train['voted'].iloc[train_idx]
    valid_x, valid_y = train[feats].iloc[valid_idx], train['voted'].iloc[valid_idx]

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc', verbose=False, early_stopping_rounds=800,
            feature_name=list(train[feats].columns), categorical_feature=cat_feature)

    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    # The splitter attribute is `n_splits`; the previous `n_split` raised
    # AttributeError on the first fold.
    lgbm_sub_preds += clf.predict_proba(test[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

#     print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
# print('Full AUC score %.6f' % roc_auc_score(train['voted'], oof_preds))

## CatBoostClassifier Modeling

In [None]:
# Rows with a label form the training set; rows without one are the test set.
train = df[df['voted'].notnull()]
test = df[df['voted'].isnull()]

train = train.astype({'voted': 'int'})
test = test.drop(columns=['voted'])

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=55)
feats = [f for f in train.columns if f not in ['voted']]

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows. The accumulator is named cat_sub_preds
# because that is the name the loop below and the ensemble cell use; it was
# previously initialised as `cat_preds`, which left cat_sub_preds undefined.
oof_preds = np.zeros(train.shape[0])
cat_sub_preds = np.zeros(test.shape[0])

# Columns to treat as categorical. 'race' and 'religion' are one-hot encoded
# earlier in the notebook and therefore no longer exist under those names, so
# the list is restricted to columns that are actually among the features.
cat_feature = ['education', 'engnat', 'married', 'urban', 'age_group', 'gender', 'race', 'religion']
cat_feature = [c for c in cat_feature if c in feats]

# CatBoost hyper-parameters. The three parameters cast below are stored as
# floats here and truncated to int before use.
# NOTE(review): bagging_temperature and subsample are documented for
# different bootstrap types — confirm CatBoost accepts both together.
params = {'bagging_temperature': 0.375906,
  'depth': 9.0,
  'l2_leaf_reg': 68.8,
  'learning_rate': 0.011,
  'od_wait': 138.699,
  'subsample': 0.76046}

params['depth'] = int(params['depth'])
params['l2_leaf_reg'] = int(params['l2_leaf_reg'])
params['od_wait'] = int(params['od_wait'])

clf = CatBoostClassifier(
                          **params,
                          iterations=5000,
                          eval_metric='AUC',
                          allow_writing_files=False,
                          od_type='Iter',
                          random_state=55)

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], train['voted'])):
    train_x, train_y = train[feats].iloc[train_idx], train['voted'].iloc[train_idx]
    valid_x, valid_y = train[feats].iloc[valid_idx], train['voted'].iloc[valid_idx]

    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            verbose=False, early_stopping_rounds=500,
            use_best_model=True,
            cat_features=cat_feature)

    oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
    cat_sub_preds += clf.predict_proba(test[feats])[:, 1] / folds.n_splits

#     print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

# print('Full AUC score %.6f' % roc_auc_score(train['voted'], oof_preds))

# 4. Create Submission File

In [None]:
# Fixed-weight blend of the three models' fold-averaged test predictions.
# The order (CatBoost, XGBoost, LightGBM) fixes the order of the additions.
weighted_models = (
    (cat_sub_preds, 0.55),
    (xgb_sub_preds, 0.37),
    (lgbm_sub_preds, 0.08),
)
ensemble_sub_preds = sum(preds * weight for preds, weight in weighted_models)

# Fill the sample submission's 'voted' column with the blend and write it out.
submit = pd.read_csv(f'{path}sample_submission.csv', index_col=0)
submit['voted'] = ensemble_sub_preds
submit.to_csv('ensemble.csv')