In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
#import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import ClassifierMixin
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import sys
# Evaluation
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from sklearn.feature_selection import RFE

import gc
import random
import re
from typing import List ,Dict, Tuple

from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn
import catboost

# 한글 폰트 설정
from statsmodels import robust
from matplotlib import font_manager, rc
%matplotlib inline

import platform

# Choose a font family able to draw Hangul, depending on the host OS.
your_os = platform.system()
if your_os == 'Windows':
    # On Windows the family name is resolved from the font file itself.
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os in ('Linux', 'Darwin'):
    rc('font', family='NanumGothic' if your_os == 'Linux' else 'AppleGothic')
rc('axes', unicode_minus=False)

# Version check: record the environment this notebook was executed in.

print("\n".join([
    f"- os: {platform.platform()}",
    f"- python: {sys.version}",
    f"- pandas: {pd.__version__}",
    f"- numpy: {np.__version__}",
    f"- sklearn: {sklearn.__version__}",
    f"- catboost: {catboost.__version__}",
]))

- os: Windows-10-10.0.22000-SP0
- python: 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]
- pandas: 1.3.5
- numpy: 1.20.3
- sklearn: 0.24.2
- catboost: 1.0.3


In [2]:
# Random seed shared by the k-means featurizers, the CatBoost model and the CV splitter below.
SEED = 20182830

In [3]:
# Attribute code lookup tables (D, H and L), read in that order.
d_code, h_code, l_code = (
    pd.read_csv(code_path)
    for code_path in (
        'data/속성_D_코드.csv',
        'data/속성_H_코드.csv',
        'data/속성_L_코드.csv',
    )
)

# Training and test splits.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

train.shape, test.shape

((501951, 35), (46404, 34))

## preprocessing & engineering

### 순서형 변수 처리
단순히 category 형태가 아닌 ordinal 처리

In [4]:
# Cast the ordinal columns of `train` to an ordered categorical dtype.
# NOTE(review): no explicit categories are given, so they are inferred from
# this frame only — confirm they line up with the test split.
ordinal_dtype = pd.CategoricalDtype(ordered=True)
for ordinal_col in ('person_attribute_a_1',
                    'person_attribute_b',
                    'person_prefer_e',
                    'contents_attribute_e'):
    train[ordinal_col] = train[ordinal_col].astype(ordinal_dtype)

In [5]:
# Cast the ordinal columns of `test` to an ordered categorical dtype.
# NOTE(review): no explicit categories are given, so they are inferred from
# this frame only — confirm they line up with the training split.
ordinal_dtype = pd.CategoricalDtype(ordered=True)
for ordinal_col in ('person_attribute_a_1',
                    'person_attribute_b',
                    'person_prefer_e',
                    'contents_attribute_e'):
    test[ordinal_col] = test[ordinal_col].astype(ordinal_dtype)

### person_rn, content_rn
* 사람과 콘텐츠의 조합에 대한 특성을 만들어야 함

In [6]:
# Interaction feature: product of the person and contents row numbers.
for frame in (train, test):
    frame['person_contents_mul'] = frame['person_rn'] * frame['contents_rn']

### 회원속성 - 컨텐츠 속성 일치 여부 

In [7]:
# Give the lookup tables English column names: the key column first, then one
# column per hierarchy level (suffixes d/s/m/l for D and L, m/l for H).
d_code.columns = ['attribute_d'] + [f'attribute_d_{level}' for level in ('d', 's', 'm', 'l')]
h_code.columns = ['attribute_h'] + [f'attribute_h_{level}' for level in ('m', 'l')]
l_code.columns = ['attribute_l'] + [f'attribute_l_{level}' for level in ('d', 's', 'm', 'l')]

In [8]:
def merge_codes(df : pd.DataFrame, 
                df_code : pd.DataFrame,
                col : str) -> pd.DataFrame:
    """Left-join a code lookup table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the key and renamed to
    ``col``; every other column is prefixed with ``f"{col}_"`` so the same
    lookup table can be merged several times under different keys.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains the key column ``col``.
    df_code : pd.DataFrame
        Lookup table whose first column holds the code values.
    col : str
        Name of the key column in ``df``.

    Returns
    -------
    pd.DataFrame
        A new frame; neither ``df`` nor ``df_code`` is modified.
    """
    # Build the new labels explicitly instead of writing into
    # ``columns.values`` in place: an Index is meant to be immutable and
    # mutating its backing array can leave pandas' cached lookups stale.
    df_code = df_code.copy()
    df_code.columns = [col] + [f"{col}_{name}" for name in df_code.columns[1:]]
    # pd.merge returns a new frame, so ``df`` does not need a defensive copy.
    return pd.merge(df, df_code, how="left", on=col)

In [9]:
def preprocess_data(
                    df : pd.DataFrame, 
                    is_train : bool = True, 
                    cols_merge : List[Tuple[str, pd.DataFrame]] = None, 
                    cols_equi : List[Tuple[str, str]] = None,
                    cols_drop : List[str] = None
                    ) -> Tuple[pd.DataFrame, np.ndarray]:
    """Split off the target, merge code tables and add match features.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; it is copied, never modified.
    is_train : bool, default True
        When True the ``target`` column is extracted and removed from the
        returned features.
    cols_merge : list of (str, pd.DataFrame), optional
        ``(key column, lookup table)`` pairs passed to ``merge_codes`` in
        order. ``None`` (default) means no merges.
    cols_equi : list of (str, str), optional
        Column pairs to compare. For each pair two columns are added:
        ``f"{col1}_{col2}"`` (1 when equal, else 0) and
        ``f"{col1}_{col2}_squared"`` (squared difference of the int casts).
        ``None`` (default) means no pairs.
    cols_drop : list of str, optional
        Columns removed at the end. ``None`` (default) drops
        ``['id', 'person_prefer_f', 'person_prefer_g', 'contents_open_dt']``.

    Returns
    -------
    (pd.DataFrame, np.ndarray or None)
        The feature frame and the target array; the target is ``None`` when
        ``is_train`` is False.
    """
    # None sentinels instead of list literals: default lists are created once
    # at definition time and would be shared between calls.
    if cols_merge is None:
        cols_merge = []
    if cols_equi is None:
        cols_equi = []
    if cols_drop is None:
        cols_drop = ['id', 'person_prefer_f', 'person_prefer_g', 'contents_open_dt']

    df = df.copy()

    y_data = None
    if is_train:
        y_data = df['target'].to_numpy()
        df = df.drop(columns='target')

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    # Boolean flags become 0/1 integers; skip the assignment when there are none.
    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)

    for col1, col2 in cols_equi:
        # NOTE(review): if both columns are categorical, pandas only allows
        # `==` when their categories are identical — confirm for the pairs used.
        df[f'{col1}_{col2}'] = (df[col1] == df[col2]).astype(int)
        df[f'{col1}_{col2}_squared'] = (df[col1].astype(int) - df[col2].astype(int))**2

    df = df.drop(columns=cols_drop)
    return (df, y_data)

In [10]:
# (key column, lookup table) pairs used to merge the hierarchy-level
# attribute codes onto the data.
cols_merge = (
    [(f'person_prefer_d_{rank}', d_code) for rank in (1, 2, 3)]
    + [('contents_attribute_d', d_code)]
    + [(f'person_prefer_h_{rank}', h_code) for rank in (1, 2, 3)]
    + [('contents_attribute_h', h_code)]
    + [('contents_attribute_l', l_code)]
)

# Column pairs checked for "member attribute equals contents attribute".
# NOTE(review): no person_prefer_d_1 pairs are listed — presumably covered by
# the d_*_match_yn columns already present in the data; confirm.
cols_equi = [
    ('contents_attribute_c', 'person_prefer_c'),
    ('contents_attribute_e', 'person_prefer_e'),
]
cols_equi += [
    (f'person_prefer_d_{rank}_attribute_d_{level}', f'contents_attribute_d_attribute_d_{level}')
    for rank in (2, 3)
    for level in ('d', 's', 'm', 'l')
]
cols_equi += [
    (f'person_prefer_h_{rank}_attribute_h_{level}', f'contents_attribute_h_attribute_h_{level}')
    for rank in (1, 2, 3)
    for level in ('m', 'l')
]

# Columns not needed for training.
cols_drop = ['id', 'person_prefer_f', 'person_prefer_g', 'contents_rn', 'person_rn']

# Added by Minseok

In [11]:
# Both splits go through the same merge/equality/drop configuration.
shared_kwargs = dict(cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
x_train, y_train = preprocess_data(train, **shared_kwargs)
x_test, _ = preprocess_data(test, is_train=False, **shared_kwargs)
x_train.shape, y_train.shape, x_test.shape

((501951, 90), (501951,), (46404, 90))

## K-Means 군집 피처

In [12]:
from sklearn.cluster import KMeans

class KMeansFeaturizer:
    """Convert numeric data into k-means cluster memberships.

    The transformer runs k-means on the input and maps every data point to
    the id of its closest cluster. When a target variable is supplied it is
    scaled and appended to the k-means input, so points are grouped with
    similar points that also have similar targets and the clusters tend to
    follow the classification boundary.

    Parameters
    ----------
    k : int, default 100
        Number of clusters.
    target_scale : float, default 5.0
        Multiplier applied to the target before it is appended as a column.
    random_state : int or None, default None
        Seed forwarded to every ``KMeans`` instance.

    Attributes
    ----------
    km_model : KMeans
        Fitted model used by ``transform``.
    cluster_centers_ : ndarray
        Centers of ``km_model``.
    inertia_ : float
        Inertia of ``km_model``.
    """

    def __init__(self, k = 100, target_scale = 5.0, random_state = None):
        self.k = k
        self.target_scale = target_scale
        self.random_state = random_state

    def fit(self, X, y = None):
        """Run k-means on the input data and store the resulting centers.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), optional
            Target values; a list, ndarray or pandas Series is accepted.

        Returns
        -------
        self
        """
        if y is None:
            # No target: plain k-means.
            km_model = KMeans(n_clusters = self.k, n_init = 20, random_state = self.random_state)
        else:
            # With a target: scale it and append it to the k-means input.
            # np.asarray(...).reshape handles lists and pandas Series, which
            # do not support the ndarray-only `y[:, np.newaxis]` indexing.
            scaled_target = np.asarray(y).reshape(-1, 1) * self.target_scale
            data_with_target = np.hstack((X, scaled_target))

            # Pre-train on data + target.
            km_model_pretrain = KMeans(n_clusters = self.k, n_init = 20, random_state = self.random_state)
            km_model_pretrain.fit(data_with_target)

            # Second run in the original feature space (target column dropped
            # from the centers), initialised from the pre-trained centers; a
            # single iteration recomputes the assignments and the centers.
            km_model = KMeans(n_clusters = self.k,
                              init = km_model_pretrain.cluster_centers_[:, :-1],
                              n_init = 1,
                              max_iter = 1,
                              random_state = self.random_state)

        km_model.fit(X)

        self.inertia_ = km_model.inertia_
        self.km_model = km_model
        self.cluster_centers_ = km_model.cluster_centers_
        return self

    def transform(self, X, y = None):
        """Return the id of the closest cluster for each row, shape (n_samples, 1)."""
        clusters = self.km_model.predict(X)
        return clusters[:, np.newaxis]

    def fit_transform(self, X, y = None):
        """Fit on ``X`` (and optionally ``y``), then return the cluster ids of ``X``."""
        self.fit(X, y)
        return self.transform(X, y)

In [13]:
# Person features with at most 10 unique values
per = ['person_attribute_a_1',
 'person_attribute_b',
 'person_prefer_c',
 'person_prefer_e',]

In [14]:
# Content features with at most 10 unique values
con = [ 'contents_attribute_i',
 'contents_attribute_a',
 'contents_attribute_j_1',
 'contents_attribute_c',
 'contents_attribute_k',
 'contents_attribute_m',
 'contents_attribute_e',]

In [15]:
km = KMeansFeaturizer(random_state = SEED, k = 40)# In early experiments, (number of features * 10) clusters gave the best performance

In [16]:
# k-means works on numeric input. Cast the code columns to float explicitly
# rather than turning them into strings and relying on scikit-learn to parse
# the strings back into numbers.
per_k = x_train[per].astype(float)
per_k_t = x_test[per].astype(float)

# Fit the clusters on the training rows only, then assign the test rows.
per_k_train = km.fit_transform(per_k)

per_k_test = km.transform(per_k_t)

In [17]:
km = KMeansFeaturizer(random_state = SEED, k = 70)# In early experiments, (number of features * 10) clusters gave the best performance

In [18]:
# k-means works on numeric input. Cast the code columns to float explicitly
# rather than turning them into strings and relying on scikit-learn to parse
# the strings back into numbers.
con_k = x_train[con].astype(float)
con_k_t = x_test[con].astype(float)

# Fit the clusters on the training rows only, then assign the test rows.
con_k_train = km.fit_transform(con_k)

con_k_test = km.transform(con_k_t)

In [19]:
# Store the person cluster ids as string labels.
for frame, cluster_ids in ((x_train, per_k_train), (x_test, per_k_test)):
    frame["per_kmeans"] = cluster_ids.astype(str)

In [20]:
# Store the content cluster ids as string labels.
for frame, cluster_ids in ((x_train, con_k_train), (x_test, con_k_test)):
    frame["con_kmeans"] = cluster_ids.astype(str)

In [21]:
# Join the two cluster labels with "_" so that each (person cluster, content
# cluster) combination becomes its own value.
for frame in (x_train, x_test):
    frame["per_con_kmean"] = frame['per_kmeans'] + "_" + frame['con_kmeans']

## 시간

In [22]:
# Slice hour / day / month out of the raw timestamp string.
# assumes "YYYY-MM-DD HH:..." formatted strings — TODO confirm against the data
for frame in (x_train, x_test):
    opened_at = frame["contents_open_dt"]
    frame["contents_open_dt_hour"] = opened_at.apply(lambda stamp: stamp.split(" ")[1][:2])
    frame["contents_open_dt_day"] = opened_at.apply(lambda stamp: stamp.split("-")[2][:2])
    frame["contents_open_dt_month"] = opened_at.apply(lambda stamp: stamp.split("-")[1])

In [23]:
# The raw timestamp is no longer needed once its parts are extracted.
x_train = x_train.drop(columns=["contents_open_dt"])
x_test = x_test.drop(columns=["contents_open_dt"])

## 매칭 T/F 카운트

In [24]:
# For each listed feature, print the values that occur in the test split but
# never in the training split, together with the feature's distinct-value
# count in the raw test data.
lst = ['person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h']
for feature in lst:
    unseen_values = set(x_test[feature].unique()) - set(x_train[feature].unique())
    # Report any unseen value; the previous `len(...) > 1` check silently
    # skipped features with exactly one unseen value.
    if unseen_values:
        print(feature, unseen_values, test[feature].nunique())

person_prefer_d_2 {546, 445} 863
person_prefer_d_3 {1185, 578, 71, 500, 380, 761, 732} 775
contents_attribute_l {1893, 46, 79, 400, 273, 1362, 51, 1618, 1431, 1372} 1422
contents_attribute_d {384, 550, 427, 591, 721, 403, 828, 445} 919


In [25]:
# Number of D-attribute match flags (l, m, s) that are set.
for frame in (x_train, x_test):
    frame["d_match_count"] = frame["d_l_match_yn"] + frame["d_m_match_yn"] + frame["d_s_match_yn"]

In [26]:
# Number of H-attribute match flags (l, m, s) that are set.
for frame in (x_train, x_test):
    frame["h_match_count"] = frame["h_l_match_yn"] + frame["h_m_match_yn"] + frame["h_s_match_yn"]

## 선호 특징과 콘텐츠 특징을 구성
### 둘의 합

In [27]:
# Person-side interaction: prefer_c squared times prefer_e cubed.
for frame in (x_train, x_test):
    prefer_c = frame["person_prefer_c"].astype("int")
    prefer_e = frame["person_prefer_e"].astype("int")
    frame["person_prefer_mul"] = (prefer_c**2) * (prefer_e**3)

In [28]:
# Content-side interaction: product of seven content attribute codes,
# multiplied left to right in the order listed.
content_factor_cols = [
    "contents_attribute_j_1",
    "contents_attribute_a",
    "contents_attribute_i",
    "contents_attribute_c",
    "contents_attribute_k",
    "contents_attribute_m",
    "contents_attribute_e",
]
for frame in (x_train, x_test):
    running_product = frame[content_factor_cols[0]].astype("int")
    for factor_col in content_factor_cols[1:]:
        running_product = running_product * frame[factor_col].astype("int")
    frame["contents_attribute_mul"] = running_product

In [29]:
# Combine the person-side and content-side products by adding them.
for frame in (x_train, x_test):
    frame["prefer_attribute_com"] = frame["person_prefer_mul"] + frame["contents_attribute_mul"]

### 대·중·소·세 분류 코드 값 활용

In [30]:
# For each of the three preferred D codes, add up its l/m/s/d level codes and
# the code itself.
for frame in (x_train, x_test):
    for rank in (1, 2, 3):
        base = f'person_prefer_d_{rank}'
        frame[f'person_D_code{rank}_sum'] = (
            frame[f'{base}_attribute_d_l'] + frame[f'{base}_attribute_d_m']
            + frame[f'{base}_attribute_d_s'] + frame[f'{base}_attribute_d_d'] + frame[base]
        )

In [31]:
# For each of the three preferred H codes, add up its l/m level codes and the
# code itself.
for frame in (x_train, x_test):
    for rank in (1, 2, 3):
        base = f'person_prefer_h_{rank}'
        frame[f'person_H_code{rank}_sum'] = (
            frame[f'{base}_attribute_h_l'] + frame[f'{base}_attribute_h_m'] + frame[base]
        )

In [32]:
# Content D code: add up its l/m/s/d level codes and the code itself.
content_d_parts = [
    'contents_attribute_d_attribute_d_l',
    'contents_attribute_d_attribute_d_m',
    'contents_attribute_d_attribute_d_s',
    'contents_attribute_d_attribute_d_d',
    'contents_attribute_d',
]
for frame in (x_train, x_test):
    running_total = frame[content_d_parts[0]]
    for part in content_d_parts[1:]:
        running_total = running_total + frame[part]
    frame['content_D_code_sum'] = running_total

In [33]:
# Content H code: add up its l/m level codes and the code itself.
for frame in (x_train, x_test):
    frame['content_H_code_sum'] = (
        frame['contents_attribute_h_attribute_h_l']
        + frame['contents_attribute_h_attribute_h_m']
        + frame['contents_attribute_h']
    )

In [34]:
# Content L code: add up its l/m/s/d level codes and the code itself.
content_l_parts = [
    'contents_attribute_l_attribute_l_l',
    'contents_attribute_l_attribute_l_m',
    'contents_attribute_l_attribute_l_s',
    'contents_attribute_l_attribute_l_d',
    'contents_attribute_l',
]
for frame in (x_train, x_test):
    running_total = frame[content_l_parts[0]]
    for part in content_l_parts[1:]:
        running_total = running_total + frame[part]
    frame['content_L_code_sum'] = running_total

In [35]:
# Absolute gap between each preferred D code sum and the content D code sum,
# plus the product of the three gaps.
for frame in (x_train, x_test):
    for rank in (1, 2, 3):
        frame[f'D{rank}_D_compare'] = np.abs(frame[f'person_D_code{rank}_sum'] - frame['content_D_code_sum'])
    frame['D_compare'] = frame['D1_D_compare'] * frame['D2_D_compare'] * frame['D3_D_compare']

In [36]:
# Absolute gap between every pair of preferred D code sums, plus the product
# of the three gaps.
for frame in (x_train, x_test):
    for first, second in ((1, 2), (1, 3), (2, 3)):
        frame[f'D{first}_D{second}_compare'] = np.abs(
            frame[f'person_D_code{first}_sum'] - frame[f'person_D_code{second}_sum']
        )
    frame['D_D_compare'] = frame['D1_D2_compare'] * frame['D1_D3_compare'] * frame['D2_D3_compare']

In [37]:
# Absolute gap between each preferred H code sum and the content H code sum,
# plus the product of the three gaps.
for frame in (x_train, x_test):
    for rank in (1, 2, 3):
        frame[f'H{rank}_H_compare'] = np.abs(frame[f'person_H_code{rank}_sum'] - frame['content_H_code_sum'])
    frame['H_compare'] = frame['H1_H_compare'] * frame['H2_H_compare'] * frame['H3_H_compare']

In [38]:
# Absolute gap between every pair of preferred H code sums, plus the product
# of the three gaps.
for frame in (x_train, x_test):
    for first, second in ((1, 2), (1, 3), (2, 3)):
        frame[f'H{first}_H{second}_compare'] = np.abs(
            frame[f'person_H_code{first}_sum'] - frame[f'person_H_code{second}_sum']
        )
    frame['H_H_compare'] = frame['H1_H2_compare'] * frame['H1_H3_compare'] * frame['H2_H3_compare']

In [39]:
# Intermediate code-sum columns to remove now that the *_compare features are
# built. 'content_L_code_sum' is not listed, so it stays in the frames.
drop_lst = ['person_D_code1_sum', 'person_D_code2_sum', 'person_D_code3_sum',
            'person_H_code1_sum', 'person_H_code2_sum', 'person_H_code3_sum',
            'content_D_code_sum', 'content_H_code_sum',]

In [40]:
# Remove the intermediate code-sum columns from both splits.
x_train, x_test = x_train.drop(columns=drop_lst), x_test.drop(columns=drop_lst)

### 범주형 칼럼 리스트

In [41]:
# Start from every column that has at least two distinct values.
distinct_counts = x_train.nunique()
cat_features = distinct_counts[distinct_counts >= 2].index.tolist()

In [42]:
# Display the candidate column list for inspection.
cat_features

['d_l_match_yn',
 'd_m_match_yn',
 'd_s_match_yn',
 'h_l_match_yn',
 'h_m_match_yn',
 'h_s_match_yn',
 'person_attribute_a',
 'person_attribute_a_1',
 'person_attribute_b',
 'person_prefer_c',
 'person_prefer_d_1',
 'person_prefer_d_2',
 'person_prefer_d_3',
 'person_prefer_e',
 'person_prefer_h_1',
 'person_prefer_h_2',
 'person_prefer_h_3',
 'contents_attribute_i',
 'contents_attribute_a',
 'contents_attribute_j_1',
 'contents_attribute_j',
 'contents_attribute_c',
 'contents_attribute_k',
 'contents_attribute_l',
 'contents_attribute_d',
 'contents_attribute_m',
 'contents_attribute_e',
 'contents_attribute_h',
 'person_contents_mul',
 'person_prefer_d_1_attribute_d_d',
 'person_prefer_d_1_attribute_d_s',
 'person_prefer_d_1_attribute_d_m',
 'person_prefer_d_1_attribute_d_l',
 'person_prefer_d_2_attribute_d_d',
 'person_prefer_d_2_attribute_d_s',
 'person_prefer_d_2_attribute_d_m',
 'person_prefer_d_2_attribute_d_l',
 'person_prefer_d_3_attribute_d_d',
 'person_prefer_d_3_attribute_

In [43]:
# Every column with at least two distinct values is a candidate; the columns
# listed in num_features are treated as numeric, the rest as categorical.
cat_features = x_train.columns[x_train.nunique() >= 2].tolist()
# NOTE(review): the person_prefer_h_1_* "_squared" columns are not listed
# here although the h_2 / h_3 ones are — confirm whether that is intentional.
num_features = [
    'D1_D_compare', 'D2_D_compare', 'D3_D_compare', 'D_compare',
    'H1_H_compare', 'H2_H_compare', 'H3_H_compare', 'H_compare',
    'D1_D2_compare', 'D1_D3_compare', 'D2_D3_compare', 'D_D_compare',
    'H1_H2_compare', 'H1_H3_compare', 'H2_H3_compare', 'H_H_compare',
    'content_L_code_sum', 'person_contents_mul',
    'person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s_squared',
    'person_prefer_d_3_attribute_d_d_contents_attribute_d_attribute_d_d_squared',
    'person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m_squared',
    'person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l_squared',
    'person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l_squared',
    'person_prefer_d_2_attribute_d_d_contents_attribute_d_attribute_d_d_squared',
    'person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l_squared',
    'person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l_squared',
    'person_prefer_d_2_attribute_d_m_contents_attribute_d_attribute_d_m_squared',
    'person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m_squared',
    'person_prefer_d_2_attribute_d_s_contents_attribute_d_attribute_d_s_squared',
    'person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m_squared',
    'contents_attribute_e_person_prefer_e_squared',
    'contents_attribute_c_person_prefer_c_squared',
    'd_match_count',
    'h_match_count',
    'person_prefer_mul',
    'contents_attribute_mul',
    'prefer_attribute_com',
]
# Filter in column order instead of `list(set(...) - set(...))`: set ordering
# of strings depends on the hash seed, so the list order changed between runs.
numeric_names = set(num_features)
cat_features = [name for name in cat_features if name not in numeric_names]

### 스케일링

In [44]:
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted

class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.

    Each feature's distinct training values are ranked, the ranks are mapped
    linearly into ``[-(1 - epsilon), 1 - epsilon]`` and the inverse error
    function is applied. New values are mapped through an interpolation
    function fitted on (value, scaled rank) pairs.

    Parameters
    ----------
    epsilon : float, optional, default 1e-4
        A small amount added to the lower bound or subtracted
        from the upper bound. This value prevents infinite number
        from occurring when applying the inverse error function.
    copy : boolean, optional, default True
        If False, try to avoid a copy and do inplace scaling instead.
        This is not guaranteed to always work inplace; e.g. if the data is
        not a NumPy array, a copy may still be returned.
    n_jobs : int or None, optional, default None
        Number of jobs to run in parallel.
        ``None`` means 1 and ``-1`` means using all processors.
    interp_kind : str or int, optional, default 'linear'
        Specifies the kind of interpolation as a string
        ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
        'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
        refer to a spline interpolation of zeroth, first, second or third
        order; 'previous' and 'next' simply return the previous or next value
        of the point) or as an integer specifying the order of the spline
        interpolator to use.
    interp_copy : bool, optional, default False
        If True, the interpolation function makes internal copies of x and y.
        If False, references to `x` and `y` are used.

    Attributes
    ----------
    interp_func_ : list
        The interpolation function for each feature in the training set.
    """

    def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False):
        self.epsilon = epsilon
        self.copy = copy
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy
        # Values outside the fitted range are extrapolated by interp1d and
        # then clipped in `_transform`.
        self.fill_value = 'extrapolate'
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored

        Raises
        ------
        ValueError
            If a feature has fewer than two distinct values.
        """
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T)
        return self

    def _fit(self, x):
        """Build the value -> scaled-rank interpolator for a single feature."""
        x = self.drop_duplicates(x)
        if x.size < 2:
            # A constant feature has max(rank) == 0, which would divide by
            # zero below and then fail inside interp1d with an unrelated message.
            raise ValueError(
                "GaussRankScaler needs at least two distinct values per feature.")
        rank = np.argsort(np.argsort(x))
        bound = 1.0 - self.epsilon
        factor = np.max(rank) / 2.0 * bound
        scaled_rank = np.clip(rank / factor - bound, -bound, bound)
        return interp1d(
            x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value)

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _transform(self, i, x):
        """Map one feature through its interpolator and the inverse error function."""
        # Values outside the range seen in `fit` are extrapolated and can
        # leave (-1, 1), where erfinv returns inf or NaN. Clip to the same
        # bound used in `_fit` so the output stays finite.
        bound = 1.0 - self.epsilon
        return erfinv(np.clip(self.interp_func_[i](x), -bound, bound))

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _inverse_transform(self, i, x):
        """Map one scaled feature back through erf and the inverted interpolator."""
        inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind,
                                   copy=self.interp_copy, fill_value=self.fill_value)
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        """Return the first occurrence of each distinct value, in original order."""
        is_unique = np.zeros_like(x, dtype=bool)
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]

In [45]:
# Fit the scaler on the training split only and apply it to both splits.
# The scaled arrays are wrapped with the target frame's own index and the
# numeric column names so the assignment aligns by label. The previous
# wrapper had a default index and integer column labels, and the follow-up
# `x[num_features].columns = ...` lines only renamed a temporary copy.
scaler = GaussRankScaler()
x_train[num_features] = pd.DataFrame(
    scaler.fit_transform(x_train[num_features]),
    index=x_train.index,
    columns=num_features,
)
x_test[num_features] = pd.DataFrame(
    scaler.transform(x_test[num_features]),
    index=x_test.index,
    columns=num_features,
)

### feature selection

In [46]:
# CatBoost classifier used as the importance estimator for feature selection;
# it is seeded with SEED, evaluated on F1 and logs every 100 iterations.
model = CatBoostClassifier(random_state=SEED, eval_metric="F1", cat_features=cat_features, one_hot_max_size=4, verbose=100)

In [47]:
# Numeric columns become floats, categorical columns become strings.
for frame in (x_train, x_test):
    frame[num_features] = frame[num_features].astype(float)
    frame[cat_features] = frame[cat_features].astype(str)

In [48]:
from sklearn.feature_selection import SelectFromModel

# Model-based feature selection using `model` with the 'median' threshold.
sfm = SelectFromModel(model, threshold='median')

# Fit on the training data, then reduce both frames with the same fitted selector.
x_new = sfm.fit(x_train, y_train).transform(x_train)
x_te_new = sfm.transform(x_test)

# Boolean support mask and the matching column names of the kept features.
feature_idx = sfm.get_support()
feature_name = x_train.columns[feature_idx]

Learning rate set to 0.146594
0:	learn: 0.6176766	total: 2.68s	remaining: 44m 39s
100:	learn: 0.6722698	total: 5m 53s	remaining: 52m 29s
200:	learn: 0.6790754	total: 11m 13s	remaining: 44m 38s
300:	learn: 0.6823341	total: 16m 47s	remaining: 38m 58s
400:	learn: 0.6849917	total: 22m 36s	remaining: 33m 46s
500:	learn: 0.6870437	total: 28m 20s	remaining: 28m 14s
600:	learn: 0.6890181	total: 33m 50s	remaining: 22m 28s
700:	learn: 0.6905038	total: 39m 26s	remaining: 16m 49s
800:	learn: 0.6916727	total: 44m 45s	remaining: 11m 7s
900:	learn: 0.6935998	total: 50m 10s	remaining: 5m 30s
999:	learn: 0.6951411	total: 55m 33s	remaining: 0us


In [49]:
# Wrap the selector output in DataFrames labelled with the selected feature names.
x_new = pd.DataFrame(x_new, columns=feature_name)
x_te_new = pd.DataFrame(x_te_new, columns=feature_name)

In [51]:
# Display the (rows, columns) shapes of the selected train and test frames.
x_new.shape,x_te_new.shape

((501951, 59), (46404, 59))

### 학습 파라미터

In [58]:
# Training configuration.
# is_holdout: when True the training loop below breaks after the first fold.
is_holdout = False
# Number of cross-validation folds.
n_splits = 5
# Passed as `iterations` to CatBoostClassifier in the training loop.
iterations = 3000
# Passed as `early_stopping_rounds` when fitting.
patience = 100

# Shuffled K-fold splitter seeded with SEED.
cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [59]:
# Candidate categorical columns: every selected column with at least two distinct values.
cat_features = x_new.columns[x_new.nunique() >= 2].tolist()

# Selected columns that are treated as numeric.
num_features = [
    'contents_attribute_e_person_prefer_e_squared',
    'person_prefer_h_1_attribute_h_l_contents_attribute_h_attribute_h_l_squared',
    'per_kmeans', 'con_kmeans', 'per_con_kmean', 'contents_open_dt_hour',
    'contents_open_dt_month', 'contents_attribute_mul',
    'content_L_code_sum', 'D1_D_compare', 'D_compare', 'D1_D2_compare',
    'D1_D3_compare', 'D_D_compare', 'H1_H_compare', 'H_compare',
]

# Drop the numeric columns from the categorical list while preserving the
# column order of x_new. A plain set difference returns the names in an order
# that can change between kernel sessions, which makes the list irreproducible.
numeric_names = set(num_features)
cat_features = [col for col in cat_features if col not in numeric_names]

In [60]:
# Re-apply dtypes on the selected frames: numeric columns as float,
# categorical columns as str.
for frame in (x_new, x_te_new):
    frame[num_features] = frame[num_features].astype(float)
    frame[cat_features] = frame[cat_features].astype(str)

### 학습

In [61]:
# Train one CatBoost model per CV fold, keeping every fitted model and its
# best validation F1 score.
scores = []
models = []

# cv.split yields positional indices. Indexing a NumPy view of the labels keeps
# the lookup positional even when y_train is a pandas Series, where
# `y_train[tri]` would be a label-based lookup.
y_arr = np.asarray(y_train)

for tri, vai in cv.split(x_new):
    print("="*50)

    model = CatBoostClassifier(iterations=iterations,
                               random_state=SEED,
                               # task_type="GPU",  # uncomment to train on GPU
                               eval_metric="F1",
                               cat_features=cat_features,
                               one_hot_max_size=4)
    # Stop early once the validation metric has not improved for `patience` rounds.
    model.fit(x_new.iloc[tri], y_arr[tri],
              eval_set=[(x_new.iloc[vai], y_arr[vai])],
              early_stopping_rounds=patience,
              verbose=100)

    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    # Hold-out mode: train on the first split only.
    if is_holdout:
        break

Learning rate set to 0.086395
0:	learn: 0.6323506	test: 0.6546117	best: 0.6546117 (0)	total: 2.8s	remaining: 2h 20m 3s
100:	learn: 0.6601830	test: 0.6838303	best: 0.6838303 (100)	total: 4m 7s	remaining: 1h 58m 28s
200:	learn: 0.6675362	test: 0.6905417	best: 0.6905815 (192)	total: 7m 59s	remaining: 1h 51m 23s
300:	learn: 0.6713473	test: 0.6916148	best: 0.6916559 (299)	total: 11m 59s	remaining: 1h 47m 30s
400:	learn: 0.6741031	test: 0.6927639	best: 0.6930245 (367)	total: 16m	remaining: 1h 43m 44s
500:	learn: 0.6759529	test: 0.6928782	best: 0.6930560 (414)	total: 20m 49s	remaining: 1h 43m 54s
600:	learn: 0.6777427	test: 0.6932065	best: 0.6934627 (590)	total: 25m 54s	remaining: 1h 43m 24s
700:	learn: 0.6794223	test: 0.6935572	best: 0.6936492 (685)	total: 31m 24s	remaining: 1h 43m 1s
800:	learn: 0.6804999	test: 0.6934461	best: 0.6939205 (735)	total: 37m 17s	remaining: 1h 42m 21s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6939204732
bestIteration = 735

Shrink model

### cv 결과 확인

In [62]:
# Per-fold best validation F1 on the first line, their mean on the second.
print(scores, np.mean(scores), sep="\n")

[0.69392047321722, 0.6932834424695978, 0.6910418074007356, 0.6939975684921824, 0.6930818903082918]
0.6930650363776055


### threshold 정의

In [63]:
# Probability cut-off: predictions with probability >= threshold are labelled 1
# in the validation and ensemble cells below.
threshold = 0.375

### threshold값 변경에 따른 검증점수 확인 및 추론

In [64]:
# Re-score every fold at the custom probability threshold and collect each
# fold model's test-set probabilities for the ensemble.
pred_list = []
scores = []

# Positional view of the labels; cv.split yields positional indices.
y_arr = np.asarray(y_train)

# `cv` has a fixed random_state, so split() reproduces the folds used for
# training. Pairing with zip (instead of indexing models[i]) also works when
# only one model was trained in hold-out mode, where models[1] would raise
# IndexError.
for fold_model, (_, vai) in zip(models, cv.split(x_new)):
    # Column 1 of predict_proba is presumably the positive-class probability.
    val_proba = fold_model.predict_proba(x_new.iloc[vai])[:, 1]
    val_pred = np.where(val_proba >= threshold, 1, 0)
    scores.append(f1_score(y_arr[vai], val_pred))

    # Keep raw test probabilities; they are thresholded after averaging.
    pred_list.append(fold_model.predict_proba(x_te_new)[:, 1])

print(scores)
print(np.mean(scores))

[0.7170974550957825, 0.7167593378439475, 0.7157052912744848, 0.7175540665248967, 0.71428231279019]
0.7162796927058602


### threshold 0.375

### 산술평균 앙상블

In [65]:
# Arithmetic-mean ensemble: average the per-fold test probabilities element-wise,
# then label as 1 every row whose mean probability reaches the threshold.
pred = np.mean(pred_list, axis=0)
pred = (pred >= threshold).astype(int)

### submission

In [66]:
# Load the submission template and fill its `target` column with the ensemble labels.
sample_submission = pd.read_csv('data/sample_submission.csv').assign(target=pred)
sample_submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [67]:
# Count how many submission rows were assigned each predicted label.
sample_submission['target'].value_counts()

1    34254
0    12150
Name: target, dtype: int64

In [68]:
# Write the submission frame to CSV without the row index.
sample_submission.to_csv("data/pred0125_375_0.70423.csv", index=False)

In [None]:
# Save the raw per-fold test probabilities: after the transpose there is one
# column per entry of pred_list.
fold_probas = pd.DataFrame(pred_list).transpose()
fold_probas.to_csv(f"{SUBMIT_PATH}predproba0125_375_0.70423.csv", index=False)

### feature importance

In [69]:
# Feature importances of whatever `model` currently refers to (the last model
# fitted in the training loop when the cells are run in order).
values = model.get_feature_importance(type='FeatureImportance')

# Pair each selected column name with its importance value.
fi = pd.DataFrame({'column': x_new.columns, 'values': values})

In [70]:
# Show the 30 rows with the largest importance values, highest first.
fi.sort_values('values', ascending=False).iloc[:30]

Unnamed: 0,column,values
53,D_compare,9.480334
13,contents_attribute_d,7.405525
11,contents_attribute_j_1,5.773934
12,contents_attribute_l,5.71145
16,contents_attribute_h,4.580179
36,contents_attribute_h_attribute_h_m,3.958212
19,person_prefer_d_1_attribute_d_s,3.095873
18,person_prefer_d_1_attribute_d_d,2.790939
57,H1_H_compare,2.714651
0,person_attribute_a_1,2.688221
