In [43]:
import os
import sys
import platform
import random
import math
from typing import List ,Dict, Tuple

import pandas as pd
import numpy as np
 
import sklearn 
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.metrics import f1_score 

from catboost import Pool,CatBoostClassifier
from datetime import datetime


In [44]:
# Directory prefix for input CSV files (concatenated directly with file names, hence the trailing slash).
DATA_PATH = "data/"
# Directory prefix for the submission CSV written at the end of the notebook.
SUBMIT_PATH = "submit/"
# Shared seed constant for stochastic steps.
SEED = 42

- 실험1) xgboost + target encoding (참고: 아래 셀들은 실제로는 one-hot encoding을 사용하며, TargetEncoder는 생성만 되고 적용되지 않음)

In [45]:
import pandas as pd
from category_encoders import TargetEncoder
# NOTE(review): `encoder` is created here but never fitted or applied in the visible cells —
# confirm whether target encoding is still intended.
encoder = TargetEncoder()


In [46]:
# Load the train and test tables from DATA_PATH.
train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test.csv')


# Show both shapes (the last expression of a cell is rendered as its output).
train_data.shape , test_data.shape

((501951, 35), (46404, 34))

In [47]:
# Parse the content-open timestamp so the hour of day can be used as a feature.
# pd.to_datetime is vectorised and also accepts an already-parsed column, so this
# cell can be re-run safely (strptime raised on the second run because the values
# were no longer strings).
for frame in (train_data, test_data):
    frame['contents_open_dt'] = pd.to_datetime(
        frame['contents_open_dt'], format='%Y-%m-%d %H:%M:%S'
    )
    frame['hour'] = frame['contents_open_dt'].dt.hour

train_data.head(3)

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target,hour
0,0,True,True,True,False,False,False,1,4,3,...,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1,12
1,1,False,False,False,True,True,False,1,3,4,...,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0,17
2,2,False,False,False,True,False,False,2,0,3,...,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0,20


In [48]:
# Based on EDA of the training data, hours are grouped into three bands
# (ordered good -> mid -> low according to the target):
#   low  : 23-07
#   mid  : 17-22
#   good : 08-16
def add_hour_band(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` where the ``hour`` column is replaced by ``hour_band``.

    ``hour_band`` is ``'low_time'`` for hours <= 7 or == 23, ``'good_time'`` for
    8-16 and ``'mid_time'`` for 17-22. Rows whose hour matches none of the
    conditions keep a missing value. One shared definition is used for train
    and test so the band boundaries cannot drift apart.
    """
    hour = df['hour']
    banded = df.drop(columns=['hour'])
    banded.loc[(hour <= 7) | (hour == 23), 'hour_band'] = 'low_time'
    banded.loc[(hour > 7) & (hour <= 16), 'hour_band'] = 'good_time'
    banded.loc[(hour >= 17) & (hour <= 22), 'hour_band'] = 'mid_time'
    return banded


train_data = add_hour_band(train_data)
test_data = add_hour_band(test_data)

In [49]:
from typing import Dict
import numpy as np
import pandas as pd

def add_code(
    df: pd.DataFrame,
    d_code: Dict[int, Dict[str, int]],
    h_code: Dict[int, Dict[str, int]],
    l_code: Dict[int, Dict[str, int]],
) -> pd.DataFrame:
    """Return a copy of ``df`` extended with hierarchy-level columns from the code tables.

    For every source column, each row's code is looked up in the matching table
    (``table[code][key]``) and stored in a new column named ``<source>_<suffix>``.
    D and L codes produce the suffixes ``n``, ``s``, ``m``, ``l``; H codes produce
    ``l`` and ``m``. A code missing from its table raises ``KeyError``.
    The input frame is not modified.
    """
    # (suffix, table key) pairs, in the order the columns are appended.
    d_levels = (
        ('n', '속성 D 세분류코드'),
        ('s', '속성 D 소분류코드'),
        ('m', '속성 D 중분류코드'),
        ('l', '속성 D 대분류코드'),
    )
    h_levels = (
        ('l', '속성 H 대분류코드'),
        ('m', '속성 H 중분류코드'),
    )
    l_levels = (
        ('n', '속성 L 세분류코드'),
        ('s', '속성 L 소분류코드'),
        ('m', '속성 L 중분류코드'),
        ('l', '속성 L 대분류코드'),
    )

    # (source columns, lookup table, levels) — D first, then H, then L.
    plan = (
        (
            ('person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3', 'contents_attribute_d'),
            d_code,
            d_levels,
        ),
        (
            ('person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3', 'contents_attribute_h'),
            h_code,
            h_levels,
        ),
        (
            ('contents_attribute_l',),
            l_code,
            l_levels,
        ),
    )

    result = df.copy()
    for source_cols, table, levels in plan:
        for source in source_cols:
            for suffix, key in levels:
                # The lambda is consumed immediately by apply, so capturing the loop variables is safe.
                result[f'{source}_{suffix}'] = result[source].apply(lambda code: table[code][key])

    return result

In [50]:
# Code tables as nested dicts: {index value: {column name: value}}.
# Read through DATA_PATH like every other input file instead of a hard-coded 'data/' prefix.
d_code = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv', index_col=0).T.to_dict()
h_code = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv', index_col=0).T.to_dict()
l_code = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv', index_col=0).T.to_dict()


# Attach the hierarchy-level columns to both splits.
train_data = add_code(train_data, d_code, h_code, l_code)
test_data = add_code(test_data, d_code, h_code, l_code)

In [51]:
# Columns removed from both splits before modelling.
cols_drop = [
    "id",
    "person_prefer_f",
    "person_prefer_g",
    "contents_open_dt",
    "contents_rn",
]

train_data = train_data.drop(cols_drop, axis=1)
test_data = test_data.drop(cols_drop, axis=1)


# x_train, y_train = preprocess_data(train_data, cols_merge = cols_merge , cols_equi= cols_equi , cols_drop = cols_drop)
# x_test, _ = preprocess_data(test_data,is_train = False, cols_merge = cols_merge , cols_equi= cols_equi  , cols_drop = cols_drop)
# x_train.shape , y_train.shape , x_test.shape

In [52]:
# The model trained with these extra features scored slightly higher.
# This list is where the difference in the number of features comes from.
# Each (col1, col2) pair is turned into a 0/1 "values are equal" column by preprocess_data.
cols_equi = [

    ("contents_attribute_c","person_prefer_c"),
    ("contents_attribute_e","person_prefer_e"),

    ("person_prefer_d_2_s" , "contents_attribute_d_s"),
    ("person_prefer_d_2_m" , "contents_attribute_d_m"),
    ("person_prefer_d_2_l" , "contents_attribute_d_l"),
    ("person_prefer_d_3_s" , "contents_attribute_d_s"),
    ("person_prefer_d_3_m" , "contents_attribute_d_m"),
    ("person_prefer_d_3_l" , "contents_attribute_d_l"),

    ("person_prefer_h_1_m" , "contents_attribute_h_m"),
    ("person_prefer_h_2_m" , "contents_attribute_h_m"),
    ("person_prefer_h_3_m" , "contents_attribute_h_m"),
    ("person_prefer_h_1_l" , "contents_attribute_h_l"),
    ("person_prefer_h_2_l" , "contents_attribute_h_l"),
    ("person_prefer_h_3_l" , "contents_attribute_h_l"),
]

    

In [53]:
# NOTE(review): bool_cols is not referenced by preprocess_data (bool columns are detected by dtype);
# kept in case other cells rely on it.
bool_cols = ['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn', 'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a']


def preprocess_data(
    df: pd.DataFrame,
    cols_equi: List[Tuple[str, str]] = None,
) -> pd.DataFrame:
    """Return a copy of ``df`` with bool columns cast to int and equality features added.

    Args:
        df: input frame; it is not modified.
        cols_equi: ``(col1, col2)`` pairs. For each pair a column named
            ``f"{col1}_{col2}"`` is added, holding 1 where the two columns are
            equal and 0 otherwise. ``None`` (the default) adds nothing.

    Returns:
        The transformed DataFrame. (The previous annotation promised a
        ``(DataFrame, ndarray)`` tuple, which the function never returned.)
    """
    out = df.copy()

    # Cast every bool-dtype column to integers; skip the assignment when there are none.
    flag_cols = out.select_dtypes(bool).columns.tolist()
    if flag_cols:
        out[flag_cols] = out[flag_cols].astype(int)

    # `or ()` replaces the former mutable `[]` default without changing call semantics.
    for col1, col2 in cols_equi or ():
        out[f"{col1}_{col2}"] = (out[col1] == out[col2]).astype(int)

    return out

In [54]:
# Build the model-ready frames; the same equality pairs are applied to both splits.
train = preprocess_data(df=train_data, cols_equi=cols_equi)
test = preprocess_data(df=test_data, cols_equi=cols_equi)

In [55]:
# List every column present after preprocessing.
train.columns

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_h_1', 'person_prefer_h_2',
       'person_prefer_h_3', 'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_rn', 'target', 'hour_band',
       'person_prefer_d_1_n', 'person_prefer_d_1_s', 'person_prefer_d_1_m',
       'person_prefer_d_1_l', 'person_prefer_d_2_n', 'person_prefer_d_2_s',
       'person_prefer_d_2_m', 'person_prefer_d_2_l', 'person_prefer_d_3_n',
       'person_prefer_d_3_s', 'person_prefer_d_3_m', 'person_prefer_d_3_

In [56]:
# Distinct-value count for every column that has more than two levels.
# Decision: proceed without person_rn.
train.nunique().loc[lambda counts: counts > 2]

person_attribute_a_1           8
person_attribute_b             6
person_prefer_c                5
person_prefer_d_1           1093
person_prefer_d_2           1081
person_prefer_d_3           1043
person_prefer_e               12
person_prefer_h_1            279
person_prefer_h_2            279
person_prefer_h_3            279
contents_attribute_i           3
contents_attribute_a           3
contents_attribute_j_1         9
contents_attribute_c           4
contents_attribute_l        1752
contents_attribute_d        1065
contents_attribute_m           5
contents_attribute_e          12
contents_attribute_h         250
person_rn                 300177
hour_band                      3
person_prefer_d_1_n          443
person_prefer_d_1_s          137
person_prefer_d_1_m           36
person_prefer_d_1_l           11
person_prefer_d_2_n          435
person_prefer_d_2_s          137
person_prefer_d_2_m           36
person_prefer_d_2_l           11
person_prefer_d_3_n          420
person_pre

In [58]:
# ~300k distinct values (users?) — presumably an identifier, so it is excluded from the features.
# Drop it from test as well so both splits keep the same feature columns.
train = train.drop(columns = ['person_rn'])
test = test.drop(columns = ['person_rn'])

In [59]:
# Handle the multi-valued variables with one-hot encoding.

# Columns with more than two distinct values.
cat_features = [column for column in train.columns if train[column].nunique() > 2]

In [60]:
# One-hot encode the multi-valued columns; values are cast to strings first so every column is expanded.
ohe_train = pd.get_dummies(train.loc[:, cat_features].astype(str))

In [61]:
# Keep only the columns that were not one-hot encoded (Index.difference also sorts them by name).
train = train.loc[:, train.columns.difference(cat_features)]
train.head(3)

Unnamed: 0,contents_attribute_c_person_prefer_c,contents_attribute_e_person_prefer_e,contents_attribute_j,contents_attribute_k,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,...,person_prefer_d_3_l_contents_attribute_d_l,person_prefer_d_3_m_contents_attribute_d_m,person_prefer_d_3_s_contents_attribute_d_s,person_prefer_h_1_l_contents_attribute_h_l,person_prefer_h_1_m_contents_attribute_h_m,person_prefer_h_2_l_contents_attribute_h_l,person_prefer_h_2_m_contents_attribute_h_m,person_prefer_h_3_l_contents_attribute_h_l,person_prefer_h_3_m_contents_attribute_h_m,target
0,0,0,2,2,1,1,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
1,1,1,1,2,0,0,0,1,1,0,...,0,0,0,1,1,1,0,1,0,0
2,0,0,2,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0


In [75]:
# Preview the one-hot encoded columns.
ohe_train.head(3)

Unnamed: 0,person_attribute_a_1_0,person_attribute_a_1_1,person_attribute_a_1_2,person_attribute_a_1_3,person_attribute_a_1_4,person_attribute_a_1_5,person_attribute_a_1_6,person_attribute_a_1_7,person_attribute_b_0,person_attribute_b_1,...,contents_attribute_l_l_2015,contents_attribute_l_l_2016,contents_attribute_l_l_2017,contents_attribute_l_l_2018,contents_attribute_l_l_2019,contents_attribute_l_l_2020,contents_attribute_l_l_2021,contents_attribute_l_l_2022,contents_attribute_l_l_2023,contents_attribute_l_l_2024
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [76]:
# Put the one-hot columns next to the remaining (non-encoded) columns.
train = pd.concat((train, ohe_train), axis='columns')

In [77]:
# One-hot encode test the same way as train.
ohe_test = pd.get_dummies(test[cat_features].astype('str'))
test = test[test.columns.difference(cat_features)]
test = pd.concat([test, ohe_test], axis=1)

# Dummies are derived from each split's own values, so test can miss categories seen in
# train, contain categories train never saw, or carry columns train dropped. Align test to
# train's feature columns (everything except the label): absent indicators become 0 and
# extra columns are discarded, so the model receives exactly the features it was fit on.
test = test.reindex(columns=train.columns.difference(['target']), fill_value=0)

In [78]:
from sklearn.model_selection import train_test_split

# Features are every column except the label (Index.difference returns them sorted by name).
X_features = train.loc[:, train.columns.difference(['target'])]
y_label = train.loc[:, 'target']
X_features.head(3)

Unnamed: 0,contents_attribute_a_1,contents_attribute_a_2,contents_attribute_a_3,contents_attribute_c_1,contents_attribute_c_2,contents_attribute_c_3,contents_attribute_c_4,contents_attribute_c_person_prefer_c,contents_attribute_d_10,contents_attribute_d_100,...,person_prefer_h_3_m_556,person_prefer_h_3_m_557,person_prefer_h_3_m_558,person_prefer_h_3_m_559,person_prefer_h_3_m_560,person_prefer_h_3_m_566,person_prefer_h_3_m_567,person_prefer_h_3_m_568,person_prefer_h_3_m_569,person_prefer_h_3_m_contents_attribute_h_m
0,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# !pip install xgboost

In [79]:
# Import XGBClassifier, the scikit-learn style wrapper class for XGBoost.
from xgboost import XGBClassifier

# Hold out 20% of the training rows for evaluation. The split is seeded from the shared
# SEED constant (42) instead of repeating the literal, so it stays in sync with the config cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.2, random_state=SEED
)

# Evaluation set handed to fit().
evals = [(X_test, y_test)]

xgb_wrapper = XGBClassifier(
    n_estimators=4000,
    learning_rate=0.05,
    max_depth=10,  # objective = 'binary:logistic',
    subsample=0.8,
    colsample_bytree=0.5,
    reg_lambda=10,
    gamma=0.25,
)


In [None]:
# Fit with early stopping on the evaluation set (logloss, patience 200, log every 10 rounds).
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the
# constructor rather than on fit() — confirm against the installed version.
xgb_wrapper.fit(X_train , y_train,  early_stopping_rounds=200,eval_set=evals, eval_metric="logloss",  verbose=10)

# Class predictions and positive-class probabilities for the held-out split.
w_preds = xgb_wrapper.predict(X_test)
w_pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]



In [63]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

# 수정된 get_clf_eval() 함수 
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and headline binary-classification metrics.

    Args:
        y_test: true labels.
        pred: predicted class labels. Required; the ``None`` default is kept only
            for signature compatibility.
        pred_proba: positive-class scores used for ROC-AUC. When omitted, the AUC
            is left out of the report instead of failing.

    Raises:
        ValueError: if ``pred`` is not supplied.
    """
    if pred is None:
        raise ValueError("pred (predicted class labels) is required")

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores rather than hard labels, so it is only computed when they are given.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)
    # The four spaces before "F1" reproduce the report layout the original format string emitted.
    report = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1
    )
    if roc_auc is not None:
        report += ', AUC:{0:.4f}'.format(roc_auc)
    print(report)

In [64]:
# Report hold-out metrics for the XGBoost model.
get_clf_eval(y_test, pred=w_preds, pred_proba=w_pred_proba)

오차 행렬
[[35018 15141]
 [10400 39832]]
정확도: 0.7456, 정밀도: 0.7246, 재현율: 0.7930,    F1: 0.7572, AUC:0.8435


In [65]:
# Number of test rows predicted as the positive class.
xgb_wrapper.predict(test).sum()

39381

In [66]:
# Fill the sample-submission layout with the model's class predictions.
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sample_submission['target'] = xgb_wrapper.predict(test)

# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs(SUBMIT_PATH, exist_ok=True)
sample_submission.to_csv(f"{SUBMIT_PATH}xgboost_target_encoding.csv", index=False)