In [3]:
# Uncomment on a fresh environment that does not have CatBoost installed yet
# (inside Jupyter, `%pip install catboost` targets the running kernel's environment).
# !pip install catboost

In [4]:
# Notebook-wide configuration.
DATA_PATH = "data/"      # directory prefix the input CSV files are read from
SUBMIT_PATH = "submit/"  # directory prefix the submission CSV is written to
SEED = 42                # random seed shared by the CV splitter and the CatBoost models

In [5]:
import math
import os
import platform
import random
import sys
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

import sklearn
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold

from catboost import Pool, CatBoostClassifier

# Record the runtime environment so the results below can be reproduced.
for label, version in (
    ("os", platform.platform()),
    ("python", sys.version),
    ("pandas", pd.__version__),
    ("numpy", np.__version__),
    ("sklearn", sklearn.__version__),
):
    print(f"- {label}: {version}")

- os: Linux-5.4.0-42-generic-x86_64-with-glibc2.10
- python: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
- pandas: 1.1.3
- numpy: 1.19.2
- sklearn: 0.23.2


In [6]:
# Train / test tables.
train_data, test_data = (
    pd.read_csv(f"{DATA_PATH}{split}.csv") for split in ("train", "test")
)

# Hierarchical code lookup tables for attributes D, H and L.
code_d, code_h, code_l = (
    pd.read_csv(f"{DATA_PATH}속성_{letter}_코드.csv") for letter in ("D", "H", "L")
)

(train_data.shape, test_data.shape)

((501951, 35), (46404, 34))

In [7]:
# Inspect the original column headers of the attribute-D code table.
code_d.columns

Index(['속성 D 코드', '속성 D 세분류코드', '속성 D 소분류코드', '속성 D 중분류코드', '속성 D 대분류코드'], dtype='object')

In [8]:
# Inspect the original column headers of the attribute-H code table.
code_h.columns

Index(['속성 H 코드', '속성 H 중분류코드', '속성 H 대분류코드'], dtype='object')

In [9]:
# Inspect the original column headers of the attribute-L code table.
code_l.columns

Index(['속성 L 코드', '속성 L 세분류코드', '속성 L 소분류코드', '속성 L 중분류코드', '속성 L 대분류코드'], dtype='object')

### 속성 코드 데이터 컬럼명 변경

In [10]:
# Replace the Korean headers of the code tables with ASCII names.
# Column layout: the code itself, followed by its hierarchy levels:
#   _d = 세분류 (detailed), _s = 소분류 (small), _m = 중분류 (middle), _l = 대분류 (large).
# Table H only carries the middle and large levels.
code_d.columns = ["attribute_d"] + [f"attribute_d_{level}" for level in "dsml"]
code_h.columns = ["attribute_h"] + [f"attribute_h_{level}" for level in "ml"]
code_l.columns = ["attribute_l"] + [f"attribute_l_{level}" for level in "dsml"]

In [11]:
# Show the attribute-D code table after renaming (pandas truncates the display).
code_d

Unnamed: 0,attribute_d,attribute_d_d,attribute_d_s,attribute_d_m,attribute_d_l
0,4,4,3,2,1
1,5,5,3,2,1
2,7,7,6,2,1
3,8,8,6,2,1
4,9,8,6,2,1
...,...,...,...,...,...
1109,1254,1254,1254,1235,1235
1110,1255,1254,1254,1235,1235
1111,1256,1254,1254,1235,1235
1112,1257,1254,1254,1235,1235


### 속성코드 데이터 merge 함수

In [12]:
def merge_codes(df:pd.DataFrame,df_code:pd.DataFrame,col:str)->pd.DataFrame:
    """Left-join a code lookup table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the join key and is renamed
    to ``col``. Every other column is renamed to ``f"{col}_{original_name}"``,
    so the same lookup table can be merged several times (once per key
    column) without column-name collisions.

    Args:
        df: Frame that contains the key column ``col``.
        df_code: Lookup table whose first column holds the codes.
        col: Name of the key column in ``df``.

    Returns:
        A new frame with the renamed lookup columns appended. Rows of ``df``
        whose code is absent from ``df_code`` get missing values in those
        columns (left join). Neither input frame is modified.
    """
    # Build the new labels explicitly instead of writing through
    # ``columns.values`` - an Index is meant to be immutable, and mutating its
    # backing array in place is not supported by pandas.
    lookup = df_code.copy()
    lookup.columns = [col] + [f"{col}_{name}" for name in df_code.columns[1:]]

    # ``pd.merge`` already returns a new frame, so ``df`` needs no defensive copy.
    return pd.merge(df, lookup, how="left", on=col)

### 데이터 전처리 함수


In [13]:
def preprocess_data(
    df: pd.DataFrame,
    is_train: bool = True,
    cols_merge: Optional[List[Tuple[str, pd.DataFrame]]] = None,
    cols_equi: Optional[List[Tuple[str, str]]] = None,
    cols_drop: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, Optional[np.ndarray]]:
    """Turn a raw train/test frame into a feature frame (and labels).

    Steps, in order:
      1. If ``is_train``, split off the ``target`` column as the label array.
      2. Merge each code table in ``cols_merge`` via ``merge_codes``.
      3. Cast boolean columns to integers.
      4. For each pair in ``cols_equi`` add a 0/1 column named
         ``f"{col1}_{col2}"`` that is 1 where the two columns are equal
         (missing values never compare equal, so they yield 0).
      5. Drop the columns listed in ``cols_drop``.

    Args:
        df: Raw input frame; it is not modified.
        is_train: Whether ``df`` contains a ``target`` column to extract.
        cols_merge: ``(key column, code table)`` pairs; ``None`` means no merges.
        cols_equi: ``(col1, col2)`` pairs to compare; ``None`` means none.
        cols_drop: Columns removed at the end; ``None`` selects the default
            ``["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]``.

    Returns:
        ``(features, labels)`` where ``labels`` is ``None`` when ``is_train``
        is false.

    Raises:
        KeyError: If ``target`` (train mode), a compared column, or a column
            to drop is missing from the frame.
    """
    # ``None`` sentinels instead of list defaults: a list default is created
    # once and shared by every call.
    if cols_merge is None:
        cols_merge = []
    if cols_equi is None:
        cols_equi = []
    if cols_drop is None:
        cols_drop = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]

    # Work on a copy: the column assignments below would otherwise write into
    # the caller's frame.
    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:  # skip the empty-key assignment when there is nothing to cast
        df[bool_cols] = df[bool_cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)

    df = df.drop(columns=cols_drop)
    return (df, y_data)

### 전처리 컬럼명 정의


In [16]:
# The code-table columns were renamed above, so the names used in this cell
# were updated to match (the cols_equi entries in particular).

# (key column, code table) pairs: every key column gets the hierarchy levels
# of its code table merged in.
cols_merge = [
    *[(f"person_prefer_d_{slot}", code_d) for slot in (1, 2, 3)],
    ("contents_attribute_d", code_d),
    *[(f"person_prefer_h_{slot}", code_h) for slot in (1, 2, 3)],
    ("contents_attribute_h", code_h),
    ("contents_attribute_l", code_l),
]

# Column pairs to compare: each pair yields a 0/1 feature telling whether the
# member-side attribute and the content-side attribute carry the same code.
cols_equi = [
    ("contents_attribute_c", "person_prefer_c"),
    ("contents_attribute_e", "person_prefer_e"),
]

# Attribute D: preference slots 2 and 3, at the small / middle / large levels.
cols_equi += [
    (f"person_prefer_d_{slot}_attribute_d_{level}", f"contents_attribute_d_attribute_d_{level}")
    for slot in (2, 3)
    for level in ("s", "m", "l")
]

# Attribute H: all three preference slots, middle level first, then large.
cols_equi += [
    (f"person_prefer_h_{slot}_attribute_h_{level}", f"contents_attribute_h_attribute_h_{level}")
    for level in ("m", "l")
    for slot in (1, 2, 3)
]

# Columns that are not needed for training and are removed after preprocessing.
cols_drop = [
    "id",
    "person_prefer_f",
    "person_prefer_g",
    "contents_open_dt",
    "contents_rn",
]

In [27]:
# Display the merge plan; note that this prints the repr of every code table in the list.
cols_merge

[('person_prefer_d_1',
        attribute_d  attribute_d_d  attribute_d_s  attribute_d_m  attribute_d_l
  0               4              4              3              2              1
  1               5              5              3              2              1
  2               7              7              6              2              1
  3               8              8              6              2              1
  4               9              8              6              2              1
  ...           ...            ...            ...            ...            ...
  1109         1254           1254           1254           1235           1235
  1110         1255           1254           1254           1235           1235
  1111         1256           1254           1254           1235           1235
  1112         1257           1254           1254           1235           1235
  1113         1258           1258           1258           1258           1258
  
  [1114 rows x

### 학습 및 추론셋 전처리 !!


In [17]:
# Apply the identical preprocessing to both splits; only the train split has a target.
prep_kwargs = dict(cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)

x_train, y_train = preprocess_data(train_data, **prep_kwargs)
x_test = preprocess_data(test_data, is_train=False, **prep_kwargs)[0]

(x_train.shape, y_train.shape, x_test.shape)

((501951, 71), (501951,), (46404, 71))

In [28]:
# Preview the preprocessed training features (pandas truncates the display).
x_train

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s,person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m,person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_h_1_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_1_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l
0,1,1,1,0,0,0,1,4,3,5,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,1,1,0,1,3,4,1,...,0,0,0,0,1,0,0,1,1,1
2,0,0,0,1,0,0,2,0,3,5,...,1,0,0,0,0,0,0,1,0,1
3,0,0,0,1,0,0,2,0,2,5,...,0,0,0,0,0,0,0,1,0,0
4,1,1,1,0,0,0,1,3,4,5,...,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,0,0,0,1,0,0,1,1,2,2,...,0,0,0,0,0,0,0,1,0,0
501947,1,1,0,1,0,0,1,6,2,1,...,1,1,1,1,0,1,0,1,1,1
501948,1,1,1,1,0,0,1,7,4,1,...,1,1,1,1,0,0,0,1,0,0
501949,1,0,0,1,0,0,1,1,2,1,...,1,1,1,1,0,0,0,1,0,1


In [29]:
# Preview the label array extracted from the `target` column.
y_train

array([1, 0, 0, ..., 1, 1, 1])

In [30]:
# Preview the preprocessed test features (pandas truncates the display).
x_test

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s,person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m,person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_h_1_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_1_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l
0,1,0,0,1,1,1,1,1,2,1,...,1,0,0,0,1,0,0,1,0,0
1,0,0,0,1,0,0,2,0,2,2,...,0,0,0,0,0,1,0,1,1,0
2,1,0,0,1,1,1,2,3,2,1,...,0,0,0,0,1,0,0,1,1,1
3,1,0,0,1,1,1,1,2,2,5,...,1,0,1,1,1,0,0,1,1,1
4,1,0,0,1,0,0,1,6,4,5,...,1,0,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46399,1,1,1,0,0,0,2,0,4,1,...,1,1,1,1,0,0,0,0,0,0
46400,1,0,0,1,0,0,2,0,4,1,...,1,0,1,1,0,0,0,1,0,0
46401,1,1,0,1,1,1,2,0,3,1,...,1,1,1,1,1,0,0,1,1,1
46402,1,1,0,1,1,1,1,3,3,2,...,1,1,1,1,1,0,0,1,1,1


### 범주형 컬럼 리스트(catboost 파라미터에 넣을 용도)


In [18]:
# Every column with more than two distinct values is handed to CatBoost as a
# categorical feature; columns with at most two values are left out.
n_unique = x_train.nunique()
cat_features = n_unique[n_unique > 2].index.tolist()


In [33]:
# List the column names that will be passed to CatBoost as categorical features.
cat_features

['person_attribute_a_1',
 'person_attribute_b',
 'person_prefer_c',
 'person_prefer_d_1',
 'person_prefer_d_2',
 'person_prefer_d_3',
 'person_prefer_e',
 'person_prefer_h_1',
 'person_prefer_h_2',
 'person_prefer_h_3',
 'contents_attribute_i',
 'contents_attribute_a',
 'contents_attribute_j_1',
 'contents_attribute_c',
 'contents_attribute_l',
 'contents_attribute_d',
 'contents_attribute_m',
 'contents_attribute_e',
 'contents_attribute_h',
 'person_rn',
 'person_prefer_d_1_attribute_d_d',
 'person_prefer_d_1_attribute_d_s',
 'person_prefer_d_1_attribute_d_m',
 'person_prefer_d_1_attribute_d_l',
 'person_prefer_d_2_attribute_d_d',
 'person_prefer_d_2_attribute_d_s',
 'person_prefer_d_2_attribute_d_m',
 'person_prefer_d_2_attribute_d_l',
 'person_prefer_d_3_attribute_d_d',
 'person_prefer_d_3_attribute_d_s',
 'person_prefer_d_3_attribute_d_m',
 'person_prefer_d_3_attribute_d_l',
 'contents_attribute_d_attribute_d_d',
 'contents_attribute_d_attribute_d_s',
 'contents_attribute_d_attrib

### 학습 파라미터


In [38]:
# Training configuration.
is_holdout = False  # when True, the training loop stops after the first fold
n_splits = 8 # previously 5
iterations = 10000 # previously 3000; upper bound on boosting rounds
patience = 80 # previously 50; passed to CatBoost as early_stopping_rounds

# Shuffled K-fold splitter seeded with SEED. The same object is reused by the
# threshold-evaluation cell to regenerate identical folds.
# NOTE(review): StratifiedKFold is imported but unused - confirm plain KFold is intended.
cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

### 학습 시작!!


In [39]:
# Train one CatBoost model per CV fold, keeping every fitted model and its
# best validation F1.
scores = []
models = []

for tri, vai in cv.split(x_train):
    print("="*50)

    # NOTE(review): early stopping is driven by thresholded F1. In the logged
    # run the first fold stopped with bestIteration = 0 and was shrunk to a
    # single tree - consider monitoring a smoother metric (e.g. Logloss/AUC).
    model = CatBoostClassifier(
        iterations=iterations,
        random_state=SEED,
        task_type="GPU",
        eval_metric="F1",
        cat_features=cat_features,
        one_hot_max_size=4,
    )
    model.fit(
        x_train.iloc[tri], y_train[tri],
        eval_set=[(x_train.iloc[vai], y_train[vai])],
        early_stopping_rounds=patience,
        verbose=100,
    )

    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])

    # Hold-out mode: a single train/validation split is enough.
    if is_holdout:
        break

Learning rate set to 0.016364
0:	learn: 0.6472782	test: 0.6525232	best: 0.6525232 (0)	total: 43.8ms	remaining: 7m 17s
bestTest = 0.6525231665
bestIteration = 0
Shrink model to first 1 iterations.
Learning rate set to 0.016364
0:	learn: 0.6089516	test: 0.6069876	best: 0.6069876 (0)	total: 48.4ms	remaining: 8m 3s
100:	learn: 0.6418844	test: 0.6500791	best: 0.6501105 (95)	total: 3.92s	remaining: 6m 23s
200:	learn: 0.6484265	test: 0.6607986	best: 0.6609400 (199)	total: 7.86s	remaining: 6m 23s
300:	learn: 0.6543436	test: 0.6721795	best: 0.6721795 (300)	total: 11.8s	remaining: 6m 19s
400:	learn: 0.6587126	test: 0.6802200	best: 0.6802200 (400)	total: 15.7s	remaining: 6m 14s
500:	learn: 0.6610710	test: 0.6842896	best: 0.6843304 (499)	total: 19.5s	remaining: 6m 9s
600:	learn: 0.6634407	test: 0.6874674	best: 0.6874832 (598)	total: 23.3s	remaining: 6m 4s
700:	learn: 0.6656716	test: 0.6884937	best: 0.6885217 (699)	total: 27.1s	remaining: 5m 59s
800:	learn: 0.6673173	test: 0.6888529	best: 0.6892224

### CV 결과 확인


In [37]:
# Per-fold best validation F1, followed by the mean over the folds.
# Author's note: hyperparameters were left unchanged for this run; result was about 0.67.
print(scores, np.mean(scores), sep="\n")



[0.6530414195243155, 0.6837090663565776, 0.6804908458560274, 0.6775732659450081, 0.6778448763917722]
0.6745318948147402


### threshold 정의


In [22]:
# Cut-off applied to the predicted positive-class probability (>= threshold -> 1).
# NOTE(review): 0.4 rather than 0.5 was presumably chosen for its higher
# validation F1 - confirm how the value was selected.
threshold = 0.4

### threshold값 변경에 따른 검증점수 확인 및 추론


In [23]:
# Re-score every fold model on its own validation fold with the custom
# threshold, and collect each model's test-set probabilities for the ensemble.
pred_list = []
scores = []

# Pair models with folds via zip instead of indexing models[i]: in hold-out
# mode only one model was trained, while cv.split() still yields n_splits
# folds, so indexing would raise IndexError. The seeded splitter regenerates
# the folds in the same order as during training.
for model, (_, vai) in zip(models, cv.split(x_train)):
    val_proba = model.predict_proba(x_train.iloc[vai])[:, 1]
    val_pred = np.where(val_proba >= threshold, 1, 0)
    scores.append(f1_score(y_train[vai], val_pred))

    pred_list.append(model.predict_proba(x_test)[:, 1])

print(scores)
print(np.mean(scores))

[0.6669897691587606, 0.7106666773931442, 0.7102101007154605, 0.7103042950186261, 0.707188118012673]
0.7010717920597329


### 산술평균 앙상블!!


In [24]:
# Average the per-model test probabilities, then binarise with the threshold.
mean_proba = np.asarray(pred_list).mean(axis=0)
pred = (mean_proba >= threshold).astype(int)

### 제출파일!!


In [25]:
# Fill the provided submission template with the ensemble predictions.
submission_template = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
sample_submission = submission_template.assign(target=pred)
sample_submission

Unnamed: 0,id,target
0,0,1
1,1,1
2,2,1
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [26]:
# Create the output directory first so the write does not fail on a fresh checkout.
os.makedirs(SUBMIT_PATH or ".", exist_ok=True)
# NOTE(review): the file name still says "5fold" although n_splits is now 8.
sample_submission.to_csv(f"{SUBMIT_PATH}share-cb_cv_5fold.csv", index=False)

### 진행하며
- 기존 공유 코드와 다른 부분들 존재