## 라이브러리

In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

# EDA
import klib

# Preprocessing & Feature Engineering
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from scipy.stats.mstats import gmean
from sklearn.feature_selection import RFE
from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import ClassifierMixin
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
import sys

import gc
import re
from typing import List, Dict, Tuple

# 한글 폰트 설정
from statsmodels import robust
from matplotlib import font_manager, rc
%matplotlib inline

import platform
your_os = platform.system()
if your_os == 'Linux':
    rc('font', family='NanumGothic')
elif your_os == 'Windows':
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os == 'Darwin':
    rc('font', family='AppleGothic')
rc('axes', unicode_minus=False)

print(f"- os: {platform.platform()}")
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")

- os: Windows-10-10.0.22000-SP0
- python: 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]
- pandas: 1.4.0
- numpy: 1.22.1
- sklearn: 1.0.2


  from pandas import MultiIndex, Int64Index


In [2]:
DATA_PATH = "data/"
SUBMIT_PATH = "submission/"
SEED = 42

In [3]:
train = pd.read_csv(f'{DATA_PATH}train.csv')
test = pd.read_csv(f'{DATA_PATH}test.csv')

d_code = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv')
h_code = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv')
l_code = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv')

train.shape, test.shape

((501951, 35), (46404, 34))

## 데이터 살펴보기

In [4]:
train.iloc[:, :20].head()

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3
0,0,True,True,True,False,False,False,1,4,3,5,275,370,369,8,1,1,4,95,59
1,1,False,False,False,True,True,False,1,3,4,1,114,181,175,4,1,1,131,101,96
2,2,False,False,False,True,False,False,2,0,3,5,464,175,452,3,1,1,54,263,56
3,3,False,False,False,True,False,False,2,0,2,5,703,705,704,3,1,1,72,227,2
4,4,True,True,True,False,False,False,1,3,4,5,275,370,369,4,1,1,214,210,209


In [5]:
train.iloc[:, 20:].head()

Unnamed: 0,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0
3,1,3,5,1,1,2,1608,275,5,3,74,827967,572323,2020-01-13 18:09:34,0
4,1,1,10,2,1,2,1608,275,1,4,74,831614,573899,2020-03-09 20:39:22,0


### D_code
- 코드 사이에 숫자가 밑의 코드로 갈린다
- 이걸 피처로 어떻게 나타낼 수 있을까

In [6]:
d_code

Unnamed: 0,속성 D 코드,속성 D 세분류코드,속성 D 소분류코드,속성 D 중분류코드,속성 D 대분류코드
0,4,4,3,2,1
1,5,5,3,2,1
2,7,7,6,2,1
3,8,8,6,2,1
4,9,8,6,2,1
...,...,...,...,...,...
1109,1254,1254,1254,1235,1235
1110,1255,1254,1254,1235,1235
1111,1256,1254,1254,1235,1235
1112,1257,1254,1254,1235,1235


In [7]:
d_code.groupby('속성 D 대분류코드')['속성 D 중분류코드'].unique()

속성 D 대분류코드
1                                            [2, 56, 189]
216                             [217, 220, 231, 274, 297]
377                             [378, 439, 450, 473, 477]
482                                                 [482]
522                                            [523, 603]
618                        [619, 644, 659, 690, 708, 716]
744                                            [745, 824]
864                                                 [864]
926     [927, 1000, 1053, 1093, 1104, 1137, 1169, 1193...
1235                                               [1235]
1258                                               [1258]
Name: 속성 D 중분류코드, dtype: object

In [8]:
d_code.groupby('속성 D 중분류코드')['속성 D 소분류코드'].unique()

속성 D 중분류코드
2                              [3, 6, 16, 29, 37, 43]
56            [57, 58, 63, 72, 83, 91, 109, 123, 152]
189                                   [190, 197, 210]
217                                             [217]
220                                        [221, 224]
231                    [232, 238, 242, 258, 270, 273]
274                                             [274]
297     [298, 312, 315, 332, 340, 353, 356, 359, 368]
378                         [379, 382, 396, 397, 431]
439                                        [440, 446]
450                                   [451, 463, 470]
473                                             [473]
477                                             [477]
482               [483, 488, 489, 490, 494, 495, 509]
523               [524, 537, 540, 547, 560, 586, 600]
603                                             [603]
619                                        [620, 636]
644                              [645, 652, 655, 656]
659              

### H_code
* 중분류가 대분류에 이어지는 숫자 형태
* 대분류랑 코드는 D와 같은 형태인데 중분류가 너무 눈치없음

In [9]:
h_code

Unnamed: 0,속성 H 코드,속성 H 중분류코드,속성 H 대분류코드
0,2,315,1
1,4,316,3
2,5,317,3
3,6,318,3
4,7,319,3
...,...,...,...
289,309,566,308
290,310,567,308
291,311,568,308
292,313,569,312


In [12]:
h_code.groupby('속성 H 대분류코드')['속성 H 중분류코드'].unique()

속성 H 대분류코드
1                                                  [315]
3      [316, 317, 318, 319, 320, 321, 322, 323, 324, ...
30     [342, 343, 344, 345, 346, 347, 348, 349, 350, ...
48         [359, 360, 361, 362, 363, 364, 365, 366, 367]
58     [368, 369, 370, 371, 372, 373, 374, 375, 376, ...
71                        [379, 380, 381, 382, 383, 384]
78                        [385, 386, 387, 388, 389, 390]
85                        [391, 392, 393, 394, 395, 396]
92                                                 [397]
94     [398, 399, 400, 401, 402, 403, 404, 405, 406, ...
149    [432, 433, 434, 435, 436, 437, 438, 439, 440, ...
169    [451, 452, 453, 454, 455, 456, 457, 458, 459, ...
188    [463, 464, 465, 466, 467, 468, 469, 470, 471, ...
208    [480, 481, 482, 483, 484, 485, 486, 487, 488, ...
226    [495, 496, 497, 498, 499, 500, 501, 502, 503, ...
250    [518, 519, 520, 521, 522, 523, 524, 525, 526, ...
277    [542, 543, 544, 545, 546, 547, 548, 549, 550, ...
302                 

In [13]:
h_code.groupby('속성 H 대분류코드')['속성 H 코드'].unique()

속성 H 대분류코드
1                                                    [2]
3      [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
30     [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 4...
48                  [49, 50, 51, 52, 53, 54, 55, 56, 57]
58      [59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]
71                              [72, 73, 74, 75, 76, 77]
78                              [79, 80, 81, 82, 83, 84]
85                              [86, 87, 88, 89, 90, 91]
92                                                  [93]
94     [95, 96, 97, 98, 99, 100, 101, 102, 103, 104, ...
149    [150, 151, 152, 153, 154, 155, 156, 157, 158, ...
169    [170, 171, 172, 173, 174, 175, 176, 177, 178, ...
188    [189, 190, 191, 192, 193, 194, 195, 196, 197, ...
208    [209, 210, 211, 212, 213, 214, 215, 216, 217, ...
226    [227, 228, 229, 230, 231, 232, 233, 234, 235, ...
250    [251, 252, 253, 254, 255, 256, 257, 258, 259, ...
277    [278, 279, 280, 281, 282, 283, 284, 285, 286, ...
302                 

In [14]:
h_code.groupby('속성 H 중분류코드')['속성 H 코드'].unique()

속성 H 중분류코드
315      [2]
316      [4]
317      [5]
318      [6]
319      [7]
       ...  
566    [309]
567    [310]
568    [311]
569    [313]
570    [314]
Name: 속성 H 코드, Length: 256, dtype: object

### L_code

In [15]:
l_code.head()

Unnamed: 0,속성 L 코드,속성 L 세분류코드,속성 L 소분류코드,속성 L 중분류코드,속성 L 대분류코드
0,1,1,1,1,2004
1,2,2,2,1,2004
2,3,3,2,1,2004
3,4,3,2,1,2004
4,5,5,2,1,2004


In [16]:
l_code.groupby('속성 L 대분류코드')['속성 L 중분류코드'].unique()

속성 L 대분류코드
2004                                    [1, 42, 53, 2004]
2005                               [67, 74, 81, 95, 2005]
2006    [99, 168, 183, 187, 229, 257, 272, 295, 321, 3...
2007                                          [869, 2007]
2008                           [887, 893, 901, 925, 2008]
2009                                     [930, 954, 2009]
2010                             [1003, 1021, 1152, 2010]
2011                       [1271, 1301, 1316, 1323, 2011]
2012                                   [1353, 1365, 2012]
2013           [1397, 1417, 1435, 1446, 1458, 1467, 2013]
2014                             [1480, 1504, 1522, 2014]
2015                                         [1538, 2015]
2016                       [1556, 1572, 1605, 1623, 2016]
2017                             [1645, 1658, 1688, 2017]
2018                                         [1713, 2018]
2019                                         [1752, 2019]
2020                                   [1810, 1832, 2020]
202

In [17]:
l_code.groupby('속성 L 중분류코드')['속성 L 소분류코드'].unique()

속성 L 중분류코드
1       [1, 2, 18, 30, 33, 39]
42                    [42, 43]
53                [53, 54, 60]
67                [67, 68, 71]
74                [74, 75, 78]
                 ...          
2021                    [2021]
2022                    [2022]
2023                    [2023]
2024                    [2024]
2025                    [2025]
Name: 속성 L 소분류코드, Length: 99, dtype: object

In [18]:
l_code.groupby('속성 L 소분류코드')['속성 L 세분류코드'].unique()

속성 L 소분류코드
1                        [1]
2       [2, 3, 5, 9, 12, 14]
18      [18, 19, 22, 24, 27]
30                  [30, 31]
33              [33, 34, 37]
                ...         
2021                  [2021]
2022                  [2022]
2023                  [2023]
2024                  [2024]
2025                  [2025]
Name: 속성 L 세분류코드, Length: 332, dtype: object

## preprocessing & engineering

### person_rn, contents_rn 활용

In [4]:
train['person_contents_mul'] = train['person_rn'] * train['contents_rn']
test['person_contents_mul'] = test['person_rn'] * test['contents_rn']

In [5]:
train['person_contents_sum'] = train['person_rn'] + train['contents_rn']
test['person_contents_sum'] = test['person_rn'] + test['contents_rn']

### contents_open_dt 관련

In [6]:
train['contents_open_dt'] = pd.to_datetime(train['contents_open_dt'])
test['contents_open_dt'] = pd.to_datetime(test['contents_open_dt'])

In [7]:
train['contents_open_hour'] = train['contents_open_dt'].dt.hour
test['contents_open_hour'] = test['contents_open_dt'].dt.hour

In [8]:
train_hour = train.groupby('contents_open_hour').target.sum() / train.groupby('contents_open_hour').target.size()
train['contents_open_hour'] = train['contents_open_hour'].apply(lambda x: train_hour[x])
test['contents_open_hour'] = test['contents_open_hour'].apply(lambda x: train_hour[x])

### 전처리
- 코드표 결합
- 같은 범주 일치 확인

In [9]:
d_code.columns = ['attribute_d', 'attribute_d_d', 'attribute_d_s', 'attribute_d_m', 'attribute_d_l']
h_code.columns = ['attribute_h', 'attribute_h_m', 'attribute_h_l']
l_code.columns = ['attribute_l', 'attribute_l_d', 'attribute_l_s', 'attribute_l_m', 'attribute_l_l']

In [10]:
def merge_codes(df : pd.DataFrame, 
                df_code : pd.DataFrame,
                col : str) -> pd.DataFrame:
    df = df.copy()
    df_code = df_code.copy()
    df_code = df_code.add_prefix(f"{col}_")
    df_code.columns.values[0] = col
    return pd.merge(df, df_code, how="left", on=col)

In [11]:
def preprocess_data(
                    df : pd.DataFrame, 
                    is_train : bool = True, 
                    cols_merge : List[Tuple[str, pd.DataFrame]] = [], 
                    cols_equi : List[Tuple[str, str]] = [] ,
                    cols_drop : List[str] = ['id', 'person_prefer_f', 'person_prefer_g', 'contents_open_dt']
                    ) -> Tuple[pd.DataFrame, np.ndarray]:
    df = df.copy()

    y_data = None
    if is_train:
        y_data = df['target'].to_numpy()
        df = df.drop(columns='target')

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f'{col1}_{col2}'] = (df[col1] == df[col2]).astype(int)
        
    df = df.drop(columns=cols_drop)
    return (df, y_data)

In [12]:
# 소분류 중분류 대분류 속성코드 merge 컬럼명 및 데이터 프레임 리스트
cols_merge = [
              ('person_prefer_d_1', d_code),
              ('person_prefer_d_2', d_code),
              ('person_prefer_d_3', d_code),
              ('contents_attribute_d', d_code),
              ('person_prefer_h_1', h_code),
              ('person_prefer_h_2', h_code),
              ('person_prefer_h_3', h_code),
              ('contents_attribute_h', h_code),
              ('contents_attribute_l', l_code),
]

# 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
cols_equi = [

    ('contents_attribute_a', 'person_attribute_a'),
    ('contents_attribute_c', 'person_prefer_c'),
    ('contents_attribute_e', 'person_prefer_e'),

    ('person_prefer_d_1_attribute_d_s', 'contents_attribute_d_attribute_d_s'),
    ('person_prefer_d_1_attribute_d_m', 'contents_attribute_d_attribute_d_m'),
    ('person_prefer_d_2_attribute_d_d', 'contents_attribute_d_attribute_d_d'),
    ('person_prefer_d_2_attribute_d_s', 'contents_attribute_d_attribute_d_s'),
    ('person_prefer_d_2_attribute_d_m', 'contents_attribute_d_attribute_d_m'),
    ('person_prefer_d_2_attribute_d_l', 'contents_attribute_d_attribute_d_l'),
    ('person_prefer_d_3_attribute_d_d', 'contents_attribute_d_attribute_d_d'),
    ('person_prefer_d_3_attribute_d_s', 'contents_attribute_d_attribute_d_s'),
    ('person_prefer_d_3_attribute_d_m', 'contents_attribute_d_attribute_d_m'),
    ('person_prefer_d_3_attribute_d_l', 'contents_attribute_d_attribute_d_l'),

    ('person_prefer_h_2_attribute_h_m', 'contents_attribute_h_attribute_h_m'),
    ('person_prefer_h_2_attribute_h_l', 'contents_attribute_h_attribute_h_l'),
    ('person_prefer_h_3_attribute_h_m', 'contents_attribute_h_attribute_h_m'),
    ('person_prefer_h_3_attribute_h_l', 'contents_attribute_h_attribute_h_l'),

]

# 학습에 필요없는 컬럼 리스트
cols_drop = ['id', 'person_prefer_f', 'person_prefer_g', 'contents_rn', 'contents_open_dt', 'person_rn']

In [13]:
x_train, y_train = preprocess_data(train, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_test, _ = preprocess_data(test, is_train=False, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_train.shape, y_train.shape, x_test.shape

((501951, 76), (501951,), (46404, 76))

### 대중소세 코드 값 가지고 비교

In [14]:
# D : 대-중-소-세 일치여부 score
x_train['person_D_code1_score'] = (x_train['d_l_match_yn'] + x_train['person_prefer_d_1_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_train['person_prefer_d_1_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['d_m_match_yn'])
x_train['person_D_code2_score'] = (x_train['person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_train['person_prefer_d_2_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_train['person_prefer_d_2_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['person_prefer_d_2_attribute_d_d_contents_attribute_d_attribute_d_d'])
x_train['person_D_code3_score'] = (x_train['person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_train['person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_train['person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_train['person_prefer_d_3_attribute_d_d_contents_attribute_d_attribute_d_d'])

x_test['person_D_code1_score'] =( x_test['d_l_match_yn'] + x_test['person_prefer_d_1_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_test['person_prefer_d_1_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['d_m_match_yn'])
x_test['person_D_code2_score'] = (x_test['person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_test['person_prefer_d_2_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_test['person_prefer_d_2_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['person_prefer_d_2_attribute_d_d_contents_attribute_d_attribute_d_d'])
x_test['person_D_code3_score'] = (x_test['person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l'] + x_test['person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m'] \
                                + x_test['person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s'] + x_test['person_prefer_d_3_attribute_d_d_contents_attribute_d_attribute_d_d'])

In [15]:
# H : 대-중 일치여부 score
x_train['person_H_code1_score'] = (x_train['h_l_match_yn'] + x_train['h_m_match_yn'])
x_train['person_H_code2_score'] = (x_train['person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_train['person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m'])
x_train['person_H_code3_score'] = (x_train['person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_train['person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m'])

x_test['person_H_code1_score'] = (x_test['h_l_match_yn'] + x_test['h_m_match_yn'])
x_test['person_H_code2_score'] = (x_test['person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_test['person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m'])
x_test['person_H_code3_score'] = (x_test['person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l'] + x_test['person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m'])

In [16]:
x_train['content_L_code_sum'] = x_train['contents_attribute_l_attribute_l_l'] + x_train['contents_attribute_l_attribute_l_m'] \
                                + x_train['contents_attribute_l_attribute_l_s'] + x_train['contents_attribute_l_attribute_l_d']
x_train['content_L_code_mul'] = x_train['contents_attribute_l_attribute_l_l'] * x_train['contents_attribute_l_attribute_l_m'] \
                                * x_train['contents_attribute_l_attribute_l_s'] * x_train['contents_attribute_l_attribute_l_d']

x_test['content_L_code_sum'] = x_test['contents_attribute_l_attribute_l_l'] + x_test['contents_attribute_l_attribute_l_m'] \
                               + x_test['contents_attribute_l_attribute_l_s'] + x_test['contents_attribute_l_attribute_l_d']
x_test['content_L_code_mul'] = x_test['contents_attribute_l_attribute_l_l'] * x_test['contents_attribute_l_attribute_l_m'] \
                               * x_test['contents_attribute_l_attribute_l_s'] * x_test['contents_attribute_l_attribute_l_d']

In [17]:
x_train['D_H_1_mul'] = x_train['person_D_code1_score'] * x_train['person_H_code1_score']
x_train['D_H_2_mul'] = x_train['person_D_code2_score'] * x_train['person_H_code2_score']
x_train['D_H_3_mul'] = x_train['person_D_code3_score'] * x_train['person_H_code3_score']
x_train['D_H_1_sum'] = x_train['person_D_code1_score'] + x_train['person_H_code1_score']
x_train['D_H_2_sum'] = x_train['person_D_code2_score'] + x_train['person_H_code2_score']
x_train['D_H_3_sum'] = x_train['person_D_code3_score'] + x_train['person_H_code3_score']

x_test['D_H_1_mul'] = x_test['person_D_code1_score'] * x_test['person_H_code1_score']
x_test['D_H_2_mul'] = x_test['person_D_code2_score'] * x_test['person_H_code2_score']
x_test['D_H_3_mul'] = x_test['person_D_code3_score'] * x_test['person_H_code3_score']
x_test['D_H_1_sum'] = x_test['person_D_code1_score'] + x_test['person_H_code1_score']
x_test['D_H_2_sum'] = x_test['person_D_code2_score'] + x_test['person_H_code2_score']
x_test['D_H_3_sum'] = x_test['person_D_code3_score'] + x_test['person_H_code3_score']

In [18]:
x_train['DD_12_diff'] = x_train['person_D_code1_score'] - x_train['person_D_code2_score']
x_train['DD_13_diff'] = x_train['person_D_code1_score'] - x_train['person_D_code3_score']
x_train['DD_23_diff'] = x_train['person_D_code2_score'] - x_train['person_D_code3_score']

x_test['DD_12_diff'] = x_test['person_D_code1_score'] - x_test['person_D_code2_score']
x_test['DD_13_diff'] = x_test['person_D_code1_score'] - x_test['person_D_code3_score']
x_test['DD_23_diff'] = x_test['person_D_code2_score'] - x_test['person_D_code3_score']

In [19]:
x_train['HH_12_diff'] = x_train['person_H_code1_score'] - x_train['person_H_code2_score']
x_train['HH_13_diff'] = x_train['person_H_code1_score'] - x_train['person_H_code3_score']
x_train['HH_23_diff'] = x_train['person_H_code2_score'] - x_train['person_H_code3_score']

x_test['HH_12_diff'] = x_test['person_H_code1_score'] - x_test['person_H_code2_score']
x_test['HH_13_diff'] = x_test['person_H_code1_score'] - x_test['person_H_code3_score']
x_test['HH_23_diff'] = x_test['person_H_code2_score'] - x_test['person_H_code3_score']

In [20]:
x_train['DDD_sum'] = x_train['person_D_code1_score'] + x_train['person_D_code2_score'] + x_train['person_D_code3_score']
x_train['HHH_sum'] = x_train['person_H_code1_score'] + x_train['person_H_code2_score'] + x_train['person_H_code3_score']

x_test['DDD_sum'] = x_test['person_D_code1_score'] + x_test['person_D_code2_score'] + x_test['person_D_code3_score']
x_test['HHH_sum'] = x_test['person_H_code1_score'] + x_test['person_H_code2_score'] + x_test['person_H_code3_score']

In [21]:
x_train['person_contents_e_diff'] = x_train['person_prefer_e'] - x_train['contents_attribute_e']
x_test['person_contents_e_diff'] = x_test['person_prefer_e'] - x_test['contents_attribute_e']

In [22]:
x_train['D_E_1_mul'] = x_train['person_D_code1_score'] * x_train['person_contents_e_diff']
x_train['D_E_2_mul'] = x_train['person_D_code2_score'] * x_train['person_contents_e_diff']
x_train['D_E_3_mul'] = x_train['person_D_code3_score'] * x_train['person_contents_e_diff']
x_train['D_E_1_sum'] = x_train['person_D_code1_score'] + x_train['person_contents_e_diff']
x_train['D_E_2_sum'] = x_train['person_D_code2_score'] + x_train['person_contents_e_diff']
x_train['D_E_3_sum'] = x_train['person_D_code3_score'] + x_train['person_contents_e_diff']

x_test['D_E_1_mul'] = x_test['person_D_code1_score'] * x_test['person_contents_e_diff']
x_test['D_E_2_mul'] = x_test['person_D_code2_score'] * x_test['person_contents_e_diff']
x_test['D_E_3_mul'] = x_test['person_D_code3_score'] * x_test['person_contents_e_diff']
x_test['D_E_1_sum'] = x_test['person_D_code1_score'] + x_test['person_contents_e_diff']
x_test['D_E_2_sum'] = x_test['person_D_code2_score'] + x_test['person_contents_e_diff']
x_test['D_E_3_sum'] = x_test['person_D_code3_score'] + x_test['person_contents_e_diff']

In [23]:
x_train['H_E_1_mul'] = x_train['person_H_code1_score'] * x_train['person_contents_e_diff']
x_train['H_E_2_mul'] = x_train['person_H_code2_score'] * x_train['person_contents_e_diff']
x_train['H_E_3_mul'] = x_train['person_H_code3_score'] * x_train['person_contents_e_diff']
x_train['H_E_1_sum'] = x_train['person_H_code1_score'] + x_train['person_contents_e_diff']
x_train['H_E_2_sum'] = x_train['person_H_code2_score'] + x_train['person_contents_e_diff']
x_train['H_E_3_sum'] = x_train['person_H_code3_score'] + x_train['person_contents_e_diff']

x_test['H_E_1_mul'] = x_test['person_H_code1_score'] * x_test['person_contents_e_diff']
x_test['H_E_2_mul'] = x_test['person_H_code2_score'] * x_test['person_contents_e_diff']
x_test['H_E_3_mul'] = x_test['person_H_code3_score'] * x_test['person_contents_e_diff']
x_test['H_E_1_sum'] = x_test['person_H_code1_score'] + x_test['person_contents_e_diff']
x_test['H_E_2_sum'] = x_test['person_H_code2_score'] + x_test['person_contents_e_diff']
x_test['H_E_3_sum'] = x_test['person_H_code3_score'] + x_test['person_contents_e_diff']

In [24]:
x_train['L_E_mul'] = x_train['content_L_code_sum'] * x_train['person_contents_e_diff']
x_train['L_E_sum'] = x_train['content_L_code_sum'] + x_train['person_contents_e_diff']

x_test['L_E_mul'] = x_test['content_L_code_sum'] * x_test['person_contents_e_diff']
x_test['L_E_sum'] = x_test['content_L_code_sum'] + x_test['person_contents_e_diff']

In [25]:
x_train.shape, x_test.shape

((501951, 113), (46404, 113))

In [26]:
x_train.iloc[:, -20:]

Unnamed: 0,HH_12_diff,HH_13_diff,HH_23_diff,DDD_sum,HHH_sum,person_contents_e_diff,D_E_1_mul,D_E_2_mul,D_E_3_mul,D_E_1_sum,D_E_2_sum,D_E_3_sum,H_E_1_mul,H_E_2_mul,H_E_3_mul,H_E_1_sum,H_E_2_sum,H_E_3_sum,L_E_mul,L_E_sum
0,-1,0,1,6,1,4,16,4,4,8,5,5,0,4,0,4,5,4,27336,6838
1,1,1,0,0,4,0,0,0,0,0,0,0,0,0,0,2,1,1,0,6834
2,1,0,-1,2,2,-1,0,-2,0,-1,1,-1,-1,0,-1,0,-1,0,-6782,6781
3,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,6834
4,0,0,0,6,0,0,0,0,0,4,1,1,0,0,0,0,0,0,0,6834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,1,1,0,0,1,-2,0,0,0,-2,-2,-2,-2,0,0,-1,-2,-2,-6102,3049
501947,-1,0,1,10,4,0,0,0,0,4,3,3,0,0,0,1,2,1,0,2398
501948,1,1,0,9,1,-3,-12,-3,-12,1,-2,1,-3,0,0,-2,-3,-3,-9801,3264
501949,1,0,-1,9,2,0,0,0,0,1,4,4,0,0,0,1,0,1,0,3968


### 범주형 칼럼 리스트

In [27]:
cat_features = x_train.columns[x_train.nunique() > 2].tolist()
cat_features = list(set(cat_features) - set(['content_L_code_sum', 'content_L_code_mul', 'L_E_mul', 'L_E_sum']))

In [28]:
num_features = ['content_L_code_sum', 'content_L_code_mul', 'L_E_mul', 'L_E_sum']

### 가우스-랭크 스케일링

In [29]:
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted

class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.
    Parameters
        ----------
        epsilon : float, optional, default 1e-4
            A small amount added to the lower bound or subtracted
            from the upper bound. This value prevents infinite number
            from occurring when applying the inverse error function.
        copy : boolean, optional, default True
            If False, try to avoid a copy and do inplace scaling instead.
            This is not guaranteed to always work inplace; e.g. if the data is
            not a NumPy array, a copy may still be returned.
        n_jobs : int or None, optional, default None
            Number of jobs to run in parallel.
            ``None`` means 1 and ``-1`` means using all processors.
        interp_kind : str or int, optional, default 'linear'
           Specifies the kind of interpolation as a string
            ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
            'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
            refer to a spline interpolation of zeroth, first, second or third
            order; 'previous' and 'next' simply return the previous or next value
            of the point) or as an integer specifying the order of the spline
            interpolator to use.
        interp_copy : bool, optional, default False
            If True, the interpolation function makes internal copies of x and y.
            If False, references to `x` and `y` are used.
        Attributes
        ----------
        interp_func_ : list
            The interpolation function for each feature in the training set.
        """

    def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False):
        self.epsilon = epsilon
        self.copy = copy
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy
        self.fill_value = 'extrapolate'
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored
        """
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T)
        return self

    def _fit(self, x):
        x = self.drop_duplicates(x)
        rank = np.argsort(np.argsort(x))
        bound = 1.0 - self.epsilon
        factor = np.max(rank) / 2.0 * bound
        scaled_rank = np.clip(rank / factor - bound, -bound, bound)
        return interp1d(
            x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value)

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _transform(self, i, x):
        return erfinv(self.interp_func_[i](x))

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _inverse_transform(self, i, x):
        inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind,
                                   copy=self.interp_copy, fill_value=self.fill_value)
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        is_unique = np.zeros_like(x, dtype=bool)
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]

In [30]:
scaler = GaussRankScaler()
x_train[num_features] = pd.DataFrame(scaler.fit_transform(x_train[num_features]))
x_test[num_features] = pd.DataFrame(scaler.transform(x_test[num_features]))

x_train[num_features].columns = num_features
x_test[num_features].columns = num_features

In [31]:
x_train[num_features].head()

Unnamed: 0,content_L_code_sum,content_L_code_mul,L_E_mul,L_E_sum
0,0.588596,0.588596,0.917041,0.608866
1,0.588596,0.588596,0.071223,0.606081
2,0.571712,0.571712,-0.098734,0.584805
3,0.588596,0.588596,0.071223,0.606081
4,0.588596,0.588596,0.071223,0.606081


## 모델링

### 학습 파라미터

In [32]:
is_holdout = False
n_splits = 5
iterations = 3000
patience = 100

cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [33]:
x_train[num_features] = x_train[num_features].astype(float)
x_train[cat_features] = x_train[cat_features].astype(str)

x_test[num_features] = x_test[num_features].astype(float)
x_test[cat_features] = x_test[cat_features].astype(str)

### 학습

In [34]:
scores = []
models = []

for tri, vai in cv.split(x_train):
    print("="*50)
    preds = []

    model = CatBoostClassifier(iterations=iterations, 
                               random_state=SEED,
                               #task_type="GPU",
                               eval_metric="F1",
                               cat_features=cat_features,
                               one_hot_max_size=4)
    model.fit(x_train.iloc[tri], y_train[tri], 
            eval_set=[(x_train.iloc[vai], y_train[vai])], 
            early_stopping_rounds=patience,
            verbose=100
        )
    
    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        break    

Learning rate set to 0.086395
0:	learn: 0.6228265	test: 0.6256143	best: 0.6256143 (0)	total: 4.84s	remaining: 4h 1m 54s
100:	learn: 0.6585608	test: 0.6845697	best: 0.6847914 (98)	total: 8m 16s	remaining: 3h 57m 44s
200:	learn: 0.6658963	test: 0.6908721	best: 0.6908721 (200)	total: 16m 10s	remaining: 3h 45m 18s
300:	learn: 0.6698645	test: 0.6919800	best: 0.6919800 (300)	total: 24m 11s	remaining: 3h 36m 57s
400:	learn: 0.6724766	test: 0.6933551	best: 0.6935256 (372)	total: 32m 10s	remaining: 3h 28m 30s
500:	learn: 0.6745086	test: 0.6938787	best: 0.6940588 (479)	total: 40m 14s	remaining: 3h 20m 44s
600:	learn: 0.6763565	test: 0.6942549	best: 0.6943947 (554)	total: 48m 34s	remaining: 3h 13m 52s
700:	learn: 0.6779117	test: 0.6947194	best: 0.6949998 (690)	total: 56m 47s	remaining: 3h 6m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6949997652
bestIteration = 690

Shrink model to first 691 iterations.
Learning rate set to 0.086395
0:	learn: 0.6361772	test: 0.6377400

### cv 결과 확인

In [35]:
print(scores)
print(np.mean(scores))

[0.6949997652471946, 0.6940439047699356, 0.6911261995121906, 0.6909868012276177, 0.6922592352316211]
0.6926831811977119


### threshold 정의

In [36]:
threshold = 0.4

### threshold값 변경에 따른 검증점수 확인 및 추론

In [37]:
pred_list = []
scores = []
for i, (tri, vai) in enumerate( cv.split(x_train) ):
    pred = models[i].predict_proba(x_train.iloc[vai])[:, 1]
    pred = np.where(pred >= threshold , 1, 0)
    score = f1_score(y_train[vai], pred)
    scores.append(score)
    pred = models[i].predict_proba(x_test)[:, 1]
    pred_list.append(pred)
print(scores)
print(np.mean(scores))

[0.7183982843438188, 0.7155118411311514, 0.7155490473580446, 0.7163829787234043, 0.7146004110946418]
0.7160885125302122


### 산술평균 앙상블

In [38]:
pred = np.mean(pred_list, axis=0)
pred = np.where(pred >= threshold, 1, 0)

### submission

In [39]:
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sample_submission['target'] = pred
sample_submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [40]:
sample_submission['target'].value_counts()

1    32513
0    13891
Name: target, dtype: int64

In [73]:
sample_submission.to_csv(f"{SUBMIT_PATH}jp_0124_1.csv", index=False)

In [None]:
(pd.DataFrame(pred_list).T).to_csv(f"{SUBMIT_PATH}jp_0124_1_proba.csv", index=False)

### feature importance
cb = CatBoostRegressor()
cb.get_feature_importance(type= "___")

 "type" possible values:
  - PredictionValuesChange
  - LossFunctionChange
  - FeatureImportance : 
      PredictionValuesChange for non-ranking metrics and LossFunctionChange for ranking metrics
  - ShapValues : 
      Calculate SHAP Values for every object
  - Interaction :
      Calculate pairwise score between every feature

In [41]:
values = model.get_feature_importance(type='FeatureImportance')
fi = pd.DataFrame(x_train.columns)
fi['values'] = values
fi.columns = ['column', 'values']

In [42]:
fi.sort_values(by='values', ascending=False).head(30)

Unnamed: 0,column,values
19,contents_attribute_j_1,7.098501
96,DDD_sum,6.556798
24,contents_attribute_d,6.371989
23,contents_attribute_l,5.173323
27,contents_attribute_h,5.00642
10,person_prefer_d_1,3.454419
43,contents_attribute_d_attribute_d_d,3.359922
53,contents_attribute_h_attribute_h_m,3.019841
31,person_prefer_d_1_attribute_d_d,2.809022
32,person_prefer_d_1_attribute_d_s,2.697019
