In [3]:
# !pip install catboost

In [27]:
DATA_PATH = "data/"
SUBMIT_PATH = "submit/"
SEED = 42

In [88]:
import os
import sys
import platform
import random
import math
from typing import List ,Dict, Tuple

import pandas as pd
import numpy as np
 
import sklearn 
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.metrics import f1_score 

from catboost import Pool,CatBoostClassifier
from datetime import datetime

print(f"- os: {platform.platform()}")
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")

- os: Windows-10-10.0.19041-SP0
- python: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
- pandas: 1.3.5
- numpy: 1.22.0
- sklearn: 1.0.2


In [89]:
train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test.csv')

code_d = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv')
code_h = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv')
code_l = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv')

train_data.shape , test_data.shape

((501951, 35), (46404, 34))

In [30]:
code_d.columns

Index(['속성 D 코드', '속성 D 세분류코드', '속성 D 소분류코드', '속성 D 중분류코드', '속성 D 대분류코드'], dtype='object')

In [31]:
code_h.columns

Index(['속성 H 코드', '속성 H 중분류코드', '속성 H 대분류코드'], dtype='object')

In [32]:
code_l.columns

Index(['속성 L 코드', '속성 L 세분류코드', '속성 L 소분류코드', '속성 L 중분류코드', '속성 L 대분류코드'], dtype='object')

### 속성 코드 데이터 컬럼명 변경

In [90]:
# code_d.columns= ["attribute_d","attribute_d_d","attribute_d_s","attribute_d_m","attribute_d_l"]
# code_h.columns= ["attribute_h","attribute_h_p"]
# code_l.columns= ["attribute_l","attribute_l_d","attribute_l_s","attribute_l_m","attribute_l_l"]

code_d.columns= ["attribute_d","attribute_d_d","attribute_d_s","attribute_d_m","attribute_d_l"]
code_h.columns= ["attribute_h","attribute_h_m", "attribute_h_l"]
code_l.columns= ["attribute_l","attribute_l_d","attribute_l_s","attribute_l_m","attribute_l_l"]

In [91]:
code_d

Unnamed: 0,attribute_d,attribute_d_d,attribute_d_s,attribute_d_m,attribute_d_l
0,4,4,3,2,1
1,5,5,3,2,1
2,7,7,6,2,1
3,8,8,6,2,1
4,9,8,6,2,1
...,...,...,...,...,...
1109,1254,1254,1254,1235,1235
1110,1255,1254,1254,1235,1235
1111,1256,1254,1254,1235,1235
1112,1257,1254,1254,1235,1235


In [92]:
# hour 변수를 넣기 위해서 추가
train_data['contents_open_dt'] = train_data['contents_open_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
test_data['contents_open_dt'] = test_data['contents_open_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

train_data['hour'] = train_data['contents_open_dt'].apply(lambda x : x.timetuple()[3])
test_data['hour'] = test_data['contents_open_dt'].apply(lambda x : x.timetuple()[3])

train_data.head(3)

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target,hour
0,0,True,True,True,False,False,False,1,4,3,...,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1,12
1,1,False,False,False,True,True,False,1,3,4,...,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0,17
2,2,False,False,False,True,False,False,2,0,3,...,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0,20


In [93]:
# Train data EDA에 따라서
# good - mid - low time (target에 따라서 good <-> low 범위)
# low : 23~07
# mid : 17~22
# good : 8~16
train_data.loc[train_data['hour']<=7, 'hour_band'] = 'low_time'
train_data.loc[train_data['hour']==23, 'hour_band'] = 'low_time'

train_data.loc[(train_data['hour']>7) & (train_data['hour'] <= 16), 'hour_band'] = 'good_time'
train_data.loc[(train_data['hour']>=17) & (train_data['hour']<=22), 'hour_band'] = 'mid_time'
train_data = train_data.drop(columns = ['hour'])

test_data.loc[test_data['hour']<=7, 'hour_band'] = 'low_time'
test_data.loc[test_data['hour']==23, 'hour_band'] = 'low_time'

test_data.loc[(test_data['hour']>7) & (test_data['hour'] <= 16), 'hour_band'] = 'good_time'
test_data.loc[(test_data['hour']>=17) & (test_data['hour']<=22), 'hour_band'] = 'mid_time'
test_data = test_data.drop(columns = ['hour'])

### 속성코드 데이터 merge 함수

In [94]:
def merge_codes(df:pd.DataFrame,df_code:pd.DataFrame,col:str)->pd.DataFrame:
    df = df.copy()
    df_code = df_code.copy()
    df_code = df_code.add_prefix(f"{col}_")
    df_code.columns.values[0] = col
    return pd.merge(df,df_code,how="left",on=col)

### 데이터 전처리 함수


In [95]:
def preprocess_data(
                    df:pd.DataFrame,is_train:bool = True, cols_merge:List[Tuple[str,pd.DataFrame]] = []  , cols_equi:List[Tuple[str,str]]= [] ,
                    cols_drop:List[str] = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt"]
                    )->Tuple[pd.DataFrame,np.ndarray]:
    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df,df_code,col)

    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2] ).astype(int)

    df = df.drop(columns=cols_drop)
    return (df , y_data)

### 전처리 컬럼명 정의


In [96]:
# 위의 column 명을 변경했기 때문에 이 부분에도 수정이 있음
# cols_equi 부분 수정

# 소분류 중분류 대분류 속성코드 merge 컬럼명 및 데이터 프레임 리스트
cols_merge = [
              ("person_prefer_d_1" , code_d),
              ("person_prefer_d_2" , code_d),
              ("person_prefer_d_3" , code_d),
              ("contents_attribute_d" , code_d),
              ("person_prefer_h_1" , code_h),
              ("person_prefer_h_2" , code_h),
              ("person_prefer_h_3" , code_h),
              ("contents_attribute_h" , code_h),
              ("contents_attribute_l" , code_l),
]

# 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
cols_equi = [

    ("contents_attribute_c","person_prefer_c"),
    ("contents_attribute_e","person_prefer_e"),

    ("person_prefer_d_2_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l" , "contents_attribute_d_attribute_d_l"),
    ("person_prefer_d_3_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l" , "contents_attribute_d_attribute_d_l"),

    ("person_prefer_h_1_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
    ("person_prefer_h_2_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
    ("person_prefer_h_3_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
    ("person_prefer_h_1_attribute_h_l" , "contents_attribute_h_attribute_h_l"),
    ("person_prefer_h_2_attribute_h_l" , "contents_attribute_h_attribute_h_l"),
    ("person_prefer_h_3_attribute_h_l" , "contents_attribute_h_attribute_h_l"),

    
]

# 학습에 필요없는 컬럼 리스트
cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt", "contents_rn", ]

In [97]:
cols_merge

[('person_prefer_d_1',
        attribute_d  attribute_d_d  attribute_d_s  attribute_d_m  attribute_d_l
  0               4              4              3              2              1
  1               5              5              3              2              1
  2               7              7              6              2              1
  3               8              8              6              2              1
  4               9              8              6              2              1
  ...           ...            ...            ...            ...            ...
  1109         1254           1254           1254           1235           1235
  1110         1255           1254           1254           1235           1235
  1111         1256           1254           1254           1235           1235
  1112         1257           1254           1254           1235           1235
  1113         1258           1258           1258           1258           1258
  
  [1114 rows x

### 학습및 추론셋 전처리 !!


In [98]:
x_train, y_train = preprocess_data(train_data, cols_merge = cols_merge , cols_equi= cols_equi , cols_drop = cols_drop)
x_test, _ = preprocess_data(test_data,is_train = False, cols_merge = cols_merge , cols_equi= cols_equi  , cols_drop = cols_drop)
x_train.shape , y_train.shape , x_test.shape

((501951, 72), (501951,), (46404, 72))

In [99]:
x_train.columns

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_h_1', 'person_prefer_h_2',
       'person_prefer_h_3', 'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_rn', 'hour_band',
       'person_prefer_d_1_attribute_d_d', 'person_prefer_d_1_attribute_d_s',
       'person_prefer_d_1_attribute_d_m', 'person_prefer_d_1_attribute_d_l',
       'person_prefer_d_2_attribute_d_d', 'person_prefer_d_2_attribute_d_s',
       'person_prefer_d_2_attribute_d_m', 'person_prefer_d_2_attribute_d_l',
  

In [43]:
# # 추가한 부분
# x_train
# bool_cols = ['d_l_match_yn',	'd_m_match_yn',	'd_s_match_yn'	,'h_l_match_yn','h_m_match_yn',	'h_s_match_yn',	'person_attribute_a']

# # 나머지 범주형 변수들 모두 target encoding 변환
# for col in x_train.columns:
#     if col not in bool_cols:
#         if col == "target" :
#             break        
#         encoder = TargetEncoder()
#         x_train[col] = encoder.fit_transform(x_train[col].astype('str'), df['target']) 
#         x_test[col] = encoder.transform(x_test[col].astype('str'))
# # 변수가 str 형태로 되어 있어서, category 변수라는 것을 알려주어야 한다.


In [185]:
x_train.head(3)

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s,person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m,person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_h_1_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_1_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l
0,1,1,1,0,0,0,1,0.48361,0.487071,0.509803,...,0.550807,0.481079,0.452564,0.549095,0.498124,0.500084,0.500012,0.481011,0.499739,0.498647
1,0,0,0,1,1,0,1,0.486858,0.50085,0.49309,...,0.426427,0.481079,0.452564,0.441344,0.502553,0.500084,0.500012,0.504553,0.499739,0.501022
2,0,0,0,1,0,0,2,0.493002,0.487071,0.509803,...,0.550807,0.481079,0.452564,0.441344,0.498124,0.500084,0.500012,0.504553,0.499741,0.501022


In [44]:
['d_l_match_yn',	'd_m_match_yn',	'd_s_match_yn',	'h_l_match_yn',	'h_m_match_yn',	'h_s_match_yn',	'person_attribute_a']

['d_l_match_yn',
 'd_m_match_yn',
 'd_s_match_yn',
 'h_l_match_yn',
 'h_m_match_yn',
 'h_s_match_yn',
 'person_attribute_a']

In [45]:
y_train

array([1, 0, 0, ..., 1, 1, 1], dtype=int64)

In [46]:
x_test

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s,person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m,person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_h_1_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_1_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l
0,1,0,0,1,1,1,1,1,2,1,...,1,0,0,0,1,0,0,1,0,0
1,0,0,0,1,0,0,2,0,2,2,...,0,0,0,0,0,1,0,1,1,0
2,1,0,0,1,1,1,2,3,2,1,...,0,0,0,0,1,0,0,1,1,1
3,1,0,0,1,1,1,1,2,2,5,...,1,0,1,1,1,0,0,1,1,1
4,1,0,0,1,0,0,1,6,4,5,...,1,0,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46399,1,1,1,0,0,0,2,0,4,1,...,1,1,1,1,0,0,0,0,0,0
46400,1,0,0,1,0,0,2,0,4,1,...,1,0,1,1,0,0,0,1,0,0
46401,1,1,0,1,1,1,2,0,3,1,...,1,1,1,1,1,0,0,1,1,1
46402,1,1,0,1,1,1,1,3,3,2,...,1,1,1,1,1,0,0,1,1,1


### 범주형 컬럼 리스트(catboost 파라미터에 넣을 용도)


In [100]:
cat_features = x_train.columns[x_train.nunique() > 2].tolist()


In [56]:
cat_features

['person_attribute_a_1',
 'person_attribute_b',
 'person_prefer_c',
 'person_prefer_d_1',
 'person_prefer_d_2',
 'person_prefer_d_3',
 'person_prefer_e',
 'person_prefer_h_1',
 'person_prefer_h_2',
 'person_prefer_h_3',
 'contents_attribute_i',
 'contents_attribute_a',
 'contents_attribute_j_1',
 'contents_attribute_c',
 'contents_attribute_l',
 'contents_attribute_d',
 'contents_attribute_m',
 'contents_attribute_e',
 'contents_attribute_h',
 'person_rn',
 'hour_band',
 'person_prefer_d_1_attribute_d_d',
 'person_prefer_d_1_attribute_d_s',
 'person_prefer_d_1_attribute_d_m',
 'person_prefer_d_1_attribute_d_l',
 'person_prefer_d_2_attribute_d_d',
 'person_prefer_d_2_attribute_d_s',
 'person_prefer_d_2_attribute_d_m',
 'person_prefer_d_2_attribute_d_l',
 'person_prefer_d_3_attribute_d_d',
 'person_prefer_d_3_attribute_d_s',
 'person_prefer_d_3_attribute_d_m',
 'person_prefer_d_3_attribute_d_l',
 'contents_attribute_d_attribute_d_d',
 'contents_attribute_d_attribute_d_s',
 'contents_attr

### 학습 파라미터


In [57]:
is_holdout = False
n_splits = 5 # 기존 5
iterations = 10000 # 기존 3000
patience = 80 # 기존 50

cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

### 학습 시작!!


In [60]:
scores = []
models = []


models = []
for tri, vai in cv.split(x_train):
    print("="*50)
    preds = []

    model = CatBoostClassifier(iterations=iterations,
                               random_state=SEED,
                               task_type="GPU",
                               eval_metric="F1",
                               cat_features=cat_features,
                               # ['d_l_match_yn',	'd_m_match_yn',	'd_s_match_yn',	'h_l_match_yn',	'h_m_match_yn',	'h_s_match_yn',	'person_attribute_a'],
                               one_hot_max_size=4,
                               bagging_temperature=0.2,
#                               depth=10,
# depth default 값을 모르겠지만 학습이 엄청 오래걸리게 된다.
                               use_best_model=True)
    model.fit(x_train.iloc[tri], y_train[tri], 
            eval_set=[(x_train.iloc[vai], y_train[vai])], 
            early_stopping_rounds=patience ,
            verbose = 100
        )
    
    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        break    



Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.016489
0:	learn: 0.6117428	test: 0.6131131	best: 0.6131131 (0)	total: 256ms	remaining: 42m 40s
100:	learn: 0.6398302	test: 0.6478540	best: 0.6478540 (100)	total: 27.1s	remaining: 44m 11s
200:	learn: 0.6471398	test: 0.6576513	best: 0.6576571 (199)	total: 55s	remaining: 44m 39s
300:	learn: 0.6534750	test: 0.6692216	best: 0.6692216 (300)	total: 1m 21s	remaining: 43m 34s
400:	learn: 0.6585276	test: 0.6778207	best: 0.6778207 (400)	total: 1m 46s	remaining: 42m 27s
500:	learn: 0.6620458	test: 0.6827344	best: 0.6828140 (497)	total: 2m 11s	remaining: 41m 34s
600:	learn: 0.6642141	test: 0.6846628	best: 0.6846628 (600)	total: 2m 36s	remaining: 40m 46s
700:	learn: 0.6661976	test: 0.6865140	best: 0.6865876 (677)	total: 3m 1s	remaining: 40m 10s
800:	learn: 0.6678819	test: 0.6865363	best: 0.6868281 (731)	total: 3m 26s	remaining: 39m 30s
bestTest = 0.6868281335
bestIteration = 731
Shrink model to first 732 iterations.
Learning rate set to 0.016489
0:	learn: 0.6139656	test: 0.615

### CV 결과 확인


In [101]:
print(scores)
print(np.mean(scores))

# hyperparemeter 수정 X
# 결과 0.75 이전꺼



[0.7118805181413933]
0.7118805181413933


### threshold 정의


In [73]:
threshold = 0.4

### threshold값 변경에 따른 검증점수 확인 및 추론


In [102]:
pred_list = []
scores = []
for i,(tri, vai) in enumerate( cv.split(x_train) ):
    pred = models[i].predict_proba(x_train.iloc[vai])[:, 1]
    pred = np.where(pred >= threshold , 1, 0)
    score = f1_score(y_train[vai],pred)
    scores.append(score)
    pred = models[i].predict_proba(x_test)[:, 1]
    pred_list.append(pred)
print(scores)
print(np.mean(scores))

[0.7118805181413933, 0.7125812569909381, 0.7102494327403686, 0.7103180735041763, 0.7116297109282809]
0.7113317984610313


### 산술평균 앙상블!!


In [103]:
pred = np.mean( pred_list , axis = 0 )
pred = np.where(pred >= threshold , 1, 0)

### 제출파일!!


In [104]:
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sample_submission['target'] = pred
sample_submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [105]:
sum(sample_submission['target'])

34451

In [106]:
sample_submission.to_csv(f"{SUBMIT_PATH}catboost-append_hour.csv", index=False)

### 진행하며
- 기존 공유 코드와 다른 부분들 존재

In [11]:
# !pip install category_encoders

In [12]:
import pandas as pd
from category_encoders import TargetEncoder

In [38]:
encoder = TargetEncoder()
df = train_data.copy()


In [16]:
df.head(3)

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,0,True,True,True,False,False,False,1,4,3,...,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,False,False,False,True,True,False,1,3,4,...,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,2,False,False,False,True,False,False,2,0,3,...,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0


In [41]:
df['test_encode'] = encoder.fit_transform(df['contents_attribute_k'].astype('str'), df['target']) 
# 변수가 str 형태로 되어 있어서, category 변수라는 것을 알려주어야 한다.


In [61]:
print(df['test_encode'].head(3), df['contents_attribute_l'].head(3))

0    0.502061
1    0.502061
2    0.438810
Name: test_encode, dtype: float64 0    1608
1    1608
2    1600
Name: contents_attribute_l, dtype: int64


In [46]:
test_df = test_data.copy()

In [63]:
test_df
test_df['test_encode'] = encoder.transform(test_df['contents_attribute_k'].astype('str')) 
print(test_df['contents_attribute_k'].head(3),test_df['test_encode'].head(3))
# 변수가 str 형태로 되어 있어서, category 변수라는 것을 알려주어야 한다.


0    2
1    2
2    2
Name: contents_attribute_k, dtype: int64 0    0.502061
1    0.502061
2    0.502061
Name: test_encode, dtype: float64


In [58]:
df.dtypes

id                          int64
d_l_match_yn                 bool
d_m_match_yn                 bool
d_s_match_yn                 bool
h_l_match_yn                 bool
h_m_match_yn                 bool
h_s_match_yn                 bool
person_attribute_a          int64
person_attribute_a_1        int64
person_attribute_b          int64
person_prefer_c             int64
person_prefer_d_1           int64
person_prefer_d_2           int64
person_prefer_d_3           int64
person_prefer_e             int64
person_prefer_f             int64
person_prefer_g             int64
person_prefer_h_1           int64
person_prefer_h_2           int64
person_prefer_h_3           int64
contents_attribute_i        int64
contents_attribute_a        int64
contents_attribute_j_1      int64
contents_attribute_j        int64
contents_attribute_c        int64
contents_attribute_k        int64
contents_attribute_l        int64
contents_attribute_d        int64
contents_attribute_m        int64
contents_attri

### XGBoost 

- 실험1) xgboost + target encoding

In [146]:
train = pd.read_csv(f'{DATA_PATH}train.csv')
test = pd.read_csv(f'{DATA_PATH}test.csv')

cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt", "contents_rn", ]

In [147]:
train = train.drop(columns=cols_drop)
test = test.drop(columns=cols_drop)
train.head(3)


Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,target
0,True,True,True,False,False,False,1,4,3,5,...,2,1,2,1608,275,1,4,139,618822,1
1,False,False,False,True,True,False,1,3,4,1,...,1,1,2,1608,275,1,4,133,571659,0
2,False,False,False,True,False,False,2,0,3,5,...,2,1,1,1600,94,1,4,53,399816,0


In [148]:
bool_cols = ['d_l_match_yn',	'd_m_match_yn',	'd_s_match_yn'	,'h_l_match_yn','h_m_match_yn',	'h_s_match_yn',	'person_attribute_a']
# bool 형태 데이터 변환
for col in bool_cols :
    train[col] = train[col].astype(int)
    test[col] = test[col].astype(int)

In [154]:

# 나머지 범주형 변수들 모두 target encoding 변환
for col in train.columns:
    if col not in bool_cols:
        if col == "target" :
            break        
        encoder = TargetEncoder()
        train[col] = encoder.fit_transform(train[col].astype('str'), df['target']) 
        test[col] = encoder.transform(test[col].astype('str'))
# 변수가 str 형태로 되어 있어서, category 변수라는 것을 알려주어야 한다.


In [155]:
train.head(3)

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,target
0,1,1,1,0,0,0,1,0.48361,0.487071,0.509803,...,0.46724,0.512659,0.502061,0.501231,0.417795,0.508141,0.504105,0.463074,0.536122,1
1,0,0,0,1,1,0,1,0.486858,0.50085,0.49309,...,0.50988,0.512659,0.502061,0.501231,0.417795,0.508141,0.504105,0.543445,0.0,0
2,0,0,0,1,0,0,2,0.493002,0.487071,0.509803,...,0.46724,0.512659,0.43881,0.575518,0.484517,0.508141,0.504105,0.460276,0.536122,0


In [156]:
test.head(3)

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn
0,1,0,0,1,1,1,1,0.49974,0.49974,0.49974,...,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974
1,0,0,0,1,0,0,2,0.49974,0.49974,0.49974,...,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974
2,1,0,0,1,1,1,2,0.49974,0.49974,0.49974,...,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974,0.49974


In [157]:
from sklearn.model_selection import train_test_split
X_features = train[train.columns.difference(['target'])]
y_label = train['target']
X_features.head(3)

Unnamed: 0,contents_attribute_a,contents_attribute_c,contents_attribute_d,contents_attribute_e,contents_attribute_h,contents_attribute_i,contents_attribute_j,contents_attribute_j_1,contents_attribute_k,contents_attribute_l,...,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,person_rn
0,0.49702,0.512659,0.417795,0.504105,0.463074,0.504674,0.46724,0.472427,0.502061,0.501231,...,0.487071,0.509803,0.555977,0.501582,0.555916,0.580604,0.572908,0.541915,0.558634,0.536122
1,0.49702,0.512659,0.417795,0.504105,0.543445,0.487215,0.50988,0.527519,0.502061,0.501231,...,0.50085,0.49309,0.4965,0.492168,0.488634,0.500543,0.515464,0.495718,0.496665,0.0
2,0.507262,0.512659,0.484517,0.504105,0.460276,0.504674,0.46724,0.472427,0.43881,0.575518,...,0.487071,0.509803,0.431395,0.487861,0.435365,0.479206,0.463174,0.486959,0.469785,0.536122


In [215]:
# 사이킷런 래퍼 XGBoost 클래스인 XGBClassifier 임포트
from xgboost import XGBClassifier

X_train, X_test, y_train, y_test=train_test_split(X_features, y_label,
                                         test_size=0.2, random_state=42 )


evals = [(X_test, y_test)]

xgb_wrapper = XGBClassifier(n_estimators=4000, learning_rate=0.05, max_depth=10, objective = 'binary:logistic',
                            subsample=0.8,
                            colsample_bytree = 0.5,
                            reg_lambda = 10,
                            gamma=0.25)


In [216]:
xgb_wrapper.fit(X_train , y_train,  early_stopping_rounds=200,eval_set=evals, eval_metric="logloss",  verbose=10)

w_preds = xgb_wrapper.predict(X_test)
w_pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]



[0]	validation_0-logloss:0.67745
[10]	validation_0-logloss:0.60144
[20]	validation_0-logloss:0.56598
[30]	validation_0-logloss:0.54076
[40]	validation_0-logloss:0.51683
[50]	validation_0-logloss:0.50484
[60]	validation_0-logloss:0.49105
[70]	validation_0-logloss:0.48277
[80]	validation_0-logloss:0.47919
[90]	validation_0-logloss:0.47536
[100]	validation_0-logloss:0.47097
[110]	validation_0-logloss:0.46897
[120]	validation_0-logloss:0.46757
[130]	validation_0-logloss:0.46548
[140]	validation_0-logloss:0.46459
[150]	validation_0-logloss:0.46368
[160]	validation_0-logloss:0.46289
[170]	validation_0-logloss:0.46214
[180]	validation_0-logloss:0.46166
[190]	validation_0-logloss:0.46139
[200]	validation_0-logloss:0.46096
[210]	validation_0-logloss:0.46057
[220]	validation_0-logloss:0.46036
[230]	validation_0-logloss:0.46014
[240]	validation_0-logloss:0.46000
[250]	validation_0-logloss:0.45989
[260]	validation_0-logloss:0.45977
[270]	validation_0-logloss:0.45968
[280]	validation_0-logloss:0.45

In [217]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

# 수정된 get_clf_eval() 함수 
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [225]:
get_clf_eval(y_test , w_preds, w_pred_proba)

오차 행렬
[[34762 15397]
 [10682 39550]]
정확도: 0.7402, 정밀도: 0.7198, 재현율: 0.7873,    F1: 0.7521, AUC:0.8393


In [226]:
sum(xgb_wrapper.predict(test))

2141

In [224]:
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sample_submission['target'] = xgb_wrapper.predict(test)
sample_submission
sample_submission.to_csv(f"{SUBMIT_PATH}xgboost_target_encoding2.csv", index=False)