### Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font',family='malgun gothic')
plt.rc('axes',unicode_minus=False)
import seaborn as sns

# EDA
import klib

# Preprocessing&Feature Engineering
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, RobustScaler, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile
from gensim.models import word2vec

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
import kerastuner as kt

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, ARDRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
import tensorflow as tf
from vecstack import StackingTransformer
from vecstack import stacking

# Evaluation
from sklearn.metrics import mean_squared_error # squared=False시 RMSE
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import pickle

### Read Data

In [None]:
# Load the train/test feature tables and the training target from ../input.
# All three CSV files are read with the cp949 encoding.
input_dir = os.path.abspath("../input")
df_train = pd.read_csv(input_dir + '/X_train.csv', encoding='cp949')
df_test = pd.read_csv(input_dir + '/X_test.csv', encoding='cp949')
# The target table is indexed by customer id.
y_train = pd.read_csv(input_dir + '/y_train.csv', encoding='cp949').set_index('custid')
test_id = df_test['custid'].unique()

### Feature Generation&Feature Engineering

In [None]:
# Stack train and test rows so that features are engineered on both at once.
# NOTE(review): ignore_index is not set, so each frame keeps its own row labels.
tr = pd.concat([df_train, df_test])
tr.head()

### PCA

In [None]:
# 차원축소 매소드 
from sklearn.decomposition import PCA

def dummy_to_pca(tr, column_name:str, max_seq=300, max_d=15, var_threshold=0.99) :
    """Compress a customer's category profile for one column with PCA.

    The column is one-hot encoded, averaged per ``custid`` (giving the share
    of each category among that customer's rows) and projected onto the
    smallest number of principal components whose cumulative explained
    variance reaches ``var_threshold``.

    Parameters
    ----------
    tr : DataFrame containing ``custid`` and ``column_name``.
    column_name : name of the categorical column to encode.
    max_seq : keep only the ``max_seq`` most frequent categories when the
        column has more distinct values than that.
    max_d : upper bound on the number of principal components.
    var_threshold : cumulative explained-variance ratio to reach.

    Returns
    -------
    DataFrame with a ``custid`` column followed by ``<column_name>_<k>``
    component columns. When the column is truncated to its top categories,
    customers with no row in those categories are absent from the result.
    """
    col_count = tr.groupby(column_name)[column_name].count()
    if len(col_count) > max_seq:
        # .iloc makes the "first max_seq rows" slice explicitly positional.
        tops = col_count.sort_values(ascending=False).iloc[:max_seq].index
        f = tr.loc[tr[column_name].isin(tops)][['custid', column_name]]
    else:
        tops = col_count.index
        f = tr[['custid', column_name]]
    f = pd.get_dummies(f, columns=[column_name])  # one-hot encoding
    f = f.groupby('custid').mean()

    # PCA cannot extract more components than categories, rows or columns.
    n_probe = min(max_d, len(tops), f.shape[0], f.shape[1])
    pca = PCA(n_components=n_probe)
    pca.fit(f)
    cumsum = np.cumsum(pca.explained_variance_ratio_)  # cumulative explained variance
    print(cumsum)

    # np.argmax returns 0 both when the first component already reaches the
    # threshold and when no component does, so test the two cases explicitly
    # instead of treating every "1" as "threshold never reached".
    reached = cumsum >= var_threshold
    if reached.any():
        num_d = int(np.argmax(reached)) + 1
    else:
        num_d = n_probe

    pca = PCA(n_components=num_d)
    result = pd.DataFrame(pca.fit_transform(f))
    result.columns = [column_name + '_' + str(column) for column in result.columns]
    result.index = f.index
    return result.reset_index()

In [None]:
# Start from one row per customer, then left-join the PCA-compressed
# category profile of each categorical column in turn.
features = pd.DataFrame({'custid': tr.custid.unique()})
for pca_column in ('brd_nm', 'corner_nm', 'pc_nm', 'part_nm',
                   'buyer_nm', 'team_nm', 'goodcd'):
    f = dummy_to_pca(tr, pca_column)
    features = pd.merge(features, f, on='custid', how='left')

### W2V-상품분류

In [None]:
def age_vec():
    """Build Word2Vec sentences that mix each customer's age with their items.

    Reads the module-level ``df_train``, ``y_train`` and ``level``. For every
    training customer the sentence starts with the ``age`` value(s) found in
    ``y_train`` and is followed by 20 random permutations of the distinct
    ``level`` values that customer bought.

    Returns
    -------
    list of list: one sentence per training customer. (The original built
    this list but never returned it.)
    """
    sentences = []
    df_all = df_train
    # `cust_id` instead of `id` so the builtin is not shadowed.
    for cust_id in tqdm(df_all.custid.unique()):
        x = df_all.query('custid == @cust_id')[level].unique()
        y = y_train.query('custid == @cust_id').age
        for j in range(20):
            y = np.append(y, np.random.choice(x, len(x), replace=False))
        sentences.append(list(y))
    return sentences

In [None]:
level = 'corner_nm' # product-classification level used as the W2V vocabulary

# The corpus is too small to train W2V on directly, so every customer's list
# of distinct purchased items is oversampled 20x (20 random permutations),
# followed by the customer's actual purchase sequence.
sentences = []
df_all = pd.concat([df_train, df_test])
# One groupby pass replaces two full-frame query() scans per customer.
# sort=False keeps customers in first-appearance order (the order of
# df_all.custid.unique()), so the random draws are consumed in the same order.
for _, purchased in tqdm(df_all.groupby('custid', sort=False)[level]):
    uw = purchased.unique()
    bs = np.array([])
    for j in range(20):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=False))
    sentences.append(list(bs))
    sentences.append(list(purchased.values))

In [None]:
# Word2Vec hyper-parameters.
# NOTE(review): max_features is not passed to Word2Vec because the `size=`
# argument below is commented out, so the library's default vector size is
# used instead of 300 -- confirm whether that is intended.
max_features = 300 # word-vector dimensionality
min_word_count = 1 # minimum word count
num_workers = 4 # number of parallel worker threads
context = 3 # context window size
downsampling = 1e-3 # downsampling rate for frequent words

# Train the model
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
#                           size=max_features, 
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
# Unload memory that is no longer needed
# NOTE(review): init_sims(replace=True) may be deprecated depending on the
# installed gensim version -- verify.
model.init_sims(replace=True)

In [None]:
# Make features based on Word2Vec:
# each customer is represented by the mean vector of the items they bought.
wv_by_split = []
for split_ids in (df_train.custid.unique(), df_test.custid.unique()):
    features_wv = []
    for id in tqdm(split_ids):
        bought = df_all.query('custid == @id')[level]
        features_wv.append(bought.apply(lambda x: model.wv[x]).mean())
    wv_by_split.append(np.array(features_wv))

# Wrap the arrays in DataFrames and put the customer id in the first column.
X_train2 = pd.DataFrame(wv_by_split[0])
X_train2.insert(0, 'custid', df_train.custid.unique())
X_test2 = pd.DataFrame(wv_by_split[1])
X_test2.insert(0, 'custid', df_test.custid.unique())

### W2V-브랜드명

In [None]:
### W2V - brand name
# The assignment below had been swallowed into the heading comment, so the
# "brand" embedding was silently trained on the previous level again.
level = 'brd_nm' # product-classification level used as the W2V vocabulary

# The corpus is too small to train W2V on directly, so every customer's list
# of distinct purchased items is oversampled 20x (20 random permutations),
# followed by the customer's actual purchase sequence.
sentences = []
df_all = pd.concat([df_train, df_test])
# One groupby pass replaces two full-frame query() scans per customer.
# sort=False keeps customers in first-appearance order (the order of
# df_all.custid.unique()), so the random draws are consumed in the same order.
for _, purchased in tqdm(df_all.groupby('custid', sort=False)[level]):
    uw = purchased.unique()
    bs = np.array([])
    for j in range(20):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=False))
    sentences.append(list(bs))
    sentences.append(list(purchased.values))

In [None]:
# Word2Vec hyper-parameters.
# NOTE(review): max_features is not passed to Word2Vec because the `size=`
# argument below is commented out, so the library's default vector size is
# used instead of 300 -- confirm whether that is intended.
max_features = 300 # word-vector dimensionality
min_word_count = 1 # minimum word count
num_workers = 4 # number of parallel worker threads
context = 3 # context window size
downsampling = 1e-3 # downsampling rate for frequent words

from gensim.models import word2vec

# Train the model
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
#                           size=max_features, 
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
# Unload memory that is no longer needed
# NOTE(review): init_sims(replace=True) may be deprecated depending on the
# installed gensim version -- verify.
model.init_sims(replace=True)

In [None]:
# Make features based on Word2Vec:
# each customer is represented by the mean vector of the items they bought.
wv_by_split = []
for split_ids in (df_train.custid.unique(), df_test.custid.unique()):
    features_wv = []
    for id in tqdm(split_ids):
        bought = df_all.query('custid == @id')[level]
        features_wv.append(bought.apply(lambda x: model.wv[x]).mean())
    wv_by_split.append(np.array(features_wv))

X_train3, X_test3 = wv_by_split

In [None]:
# Wrap the vector arrays in DataFrames and put the customer id first.
X_train3 = pd.DataFrame(X_train3)
X_train3.insert(0, 'custid', df_train.custid.unique())
X_test3 = pd.DataFrame(X_test3)
X_test3.insert(0, 'custid', df_test.custid.unique())

In [None]:
# Reload the Word2Vec feature tables from Excel files in the working
# directory; this replaces whatever X_train2/X_test2/X_train3/X_test3 hold.
X_train2, X_test2, X_train3, X_test3 = (
    pd.read_excel(cached_file, index_col=0)
    for cached_file in ('X_train2.xlsx', 'X_test2.xlsx',
                        'X_train3.xlsx', 'X_test3.xlsx')
)

In [None]:
def month_modify(x):
    """Subtract 12 from month numbers greater than 12; return others unchanged."""
    return x - 12 if x > 12 else x

def extract_hour(x):
    """Return the leading hour digits of a time value as a string.

    The value is rendered with ``str``; when the text is longer than three
    characters the first two characters are returned, otherwise only the
    first one (e.g. 1530 -> '15', 930 -> '9').
    """
    text = str(x)
    n_digits = 2 if len(text) > 3 else 1
    return text[:n_digits]
def extract_season(x):
    """Map a month number to its season label.

    3-5 -> '봄', 6-8 -> '여름', 9-11 -> '가을', anything else -> '겨울'.
    """
    for first, last, label in ((3, 5, '봄'), (6, 8, '여름'), (9, 11, '가을')):
        if first <= x <= last:
            return label
    return '겨울'
def time_(x):
    """Map an hour number to a time-of-day purchase-count label.

    9-11 -> '아침_구매건수', 12-17 -> '점심_구매건수',
    anything else -> '저녁_구매건수'.
    """
    for first, last, label in ((9, 11, '아침_구매건수'), (12, 17, '점심_구매건수')):
        if first <= x <= last:
            return label
    return '저녁_구매건수'
def half_year(x):
    """Label a month number as first half ('전반기') or second half ('후반기').

    Months 1-5 are the first half; everything else is the second half.
    NOTE(review): month 6 lands in the second half here -- confirm intent.
    """
    return '전반기' if 1 <= x <= 5 else '후반기'
def peak_season(x):
    """Label a month number as peak ('성수기') or off-peak ('비성수기').

    Peak months are 7, 8, 12, 1 and 2.
    """
    peak_months = (7, 8, 12, 1, 2)
    is_peak = x in peak_months
    return '성수기' if is_peak else '비성수기'
def div_month(x):
    """Label a day-of-month number as early, mid or late month.

    1-10 -> '월초', 11-20 -> '월중', anything else -> '월말'.
    """
    label = '월말'
    if 1 <= x <= 10:
        label = '월초'
    elif 11 <= x <= 20:
        label = '월중'
    return label
def noon(x):
    """Label a time value as morning ('오전') or afternoon ('오후').

    The hour is taken from the leading digits of ``str(x)``: the first two
    characters when the text is longer than three characters, otherwise the
    first one (e.g. 1530 -> 15, 930 -> 9). Hours below 12 are '오전'.

    The previous implementation looked at the last two characters, i.e. the
    minutes, so the label depended on the minute rather than the hour.
    """
    text = str(x)
    hour = int(text[:2] if len(text) > 3 else text[:1])
    return '오전' if hour < 12 else '오후'

일반변수

In [None]:
# Work on a copy of the stacked train+test frame and derive the calendar /
# time-of-day helper columns used by the features below.
df = tr.copy()
df['sales_hour'] = df.sales_time.apply(extract_hour)
df['sales_hour'] = pd.to_numeric(df['sales_hour'])
# Hours below 9 are recoded as 21.
df['sales_hour'] = np.where(df['sales_hour'] < 9, 21, df['sales_hour'])
df['방문시간대'] = df.sales_hour.apply(time_)
# Month numbers above 12 are shifted down by 12 before the month-based labels.
df['sales_month'] = df.sales_month.apply(month_modify)
df['반기'] = df.sales_month.apply(half_year)
df['성수기여부'] = df.sales_month.apply(peak_season)
df['월_초중말'] = df.sales_day.apply(div_month)
df['오전/오후'] = df.sales_time.apply(noon)

# NOTE: every pd.merge in this section uses the default how='inner'.

# Total purchase amount
f = df.groupby('custid')['tot_amt'].agg([('총구매액', 'sum')]).reset_index()
features = pd.merge(features,f, on = 'custid')
# Mean purchase amount
f = df.groupby('custid')['tot_amt'].agg([('평균구매액', 'mean')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Total discount amount
f = pd.DataFrame(df.groupby('custid').dis_amt.sum()).rename(columns={'dis_amt':'총할인금액'})
features = pd.merge(features,f, on = 'custid')
# Mean discount amount
f = pd.DataFrame(df.groupby('custid').dis_amt.mean()).rename(columns={'dis_amt':'평균할인금액'})
features = pd.merge(features,f, on = 'custid')

# Total net purchase amount
f = pd.DataFrame(df.groupby('custid').net_amt.sum()).rename(columns={'net_amt':'총실구매액'})
features = pd.merge(features,f, on = 'custid')
# Mean net purchase amount
f = pd.DataFrame(df.groupby('custid').net_amt.mean()).rename(columns={'net_amt':'평균실구매액'})
features = pd.merge(features,f, on = 'custid')

# Row counts per day of week; the 'All' margin column becomes the total
# count and the 'All' margin row is dropped with iloc[:-1].
f = pd.crosstab(df.custid,df.sales_dayofweek, margins=True).reindex(columns=['월','화','수',
                                                        '목','금','토','일','All']).iloc[:-1,:].rename(columns={'All':'총방문횟수'})
features = pd.merge(features,f, on = 'custid')

# Row counts per hour
f = pd.crosstab(df.custid, df.sales_hour).rename(columns=dict(zip(df.sales_hour.unique(),[str(i)+'시방문' for i in df.sales_hour.unique()])))
features = pd.merge(features,f, on = 'custid')

# Imported-goods purchases: sum of import_flg per customer
f = pd.DataFrame(df.groupby('custid').import_flg.sum()).rename(columns={'import_flg':'수입상품구매건수'})
features = pd.merge(features,f, on = 'custid')

# x = df[df['import_flg'] == 1].groupby('custid').size()
# f = x.reset_index().rename(columns={0: '수입상품_구매건수'}).fillna(0)
# features = pd.merge(features,f, on = 'custid')

# Number of purchases (rows per customer)
f = df.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Mean number of installment months
f = df.groupby('custid')['inst_mon'].agg([('평균할부개월수', 'mean')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Row counts per store
f = pd.crosstab(df.custid, df.str_nm).rename(columns=dict(zip(df.str_nm.unique(),[i+'방문' for i in df.str_nm.unique()])))
features = pd.merge(features,f, on = 'custid')

# Number of distinct stores visited
# NOTE(review): the same quantity is computed again further down under the
# name '방문지점개수' -- confirm whether both columns are wanted.
f = df.groupby('custid')['str_nm'].agg([('방문지점수',lambda x: x.nunique())])
features = pd.merge(features,f, on = 'custid')

# Row counts per month
f = pd.crosstab(df.custid,df.sales_month).rename(columns=dict(zip(df.sales_month.unique(), [str(i)+'월방문' for i in df.sales_month.unique()])))
features = pd.merge(features,f, on = 'custid')

# Number of distinct purchase days
# The sale date is rebuilt from month and day only (format '%m-%d'), so it
# carries no real year information.
df['sales_month'] = df['sales_month'].astype(str)
df['sales_day'] = df['sales_day'].astype(str)
df['판매일'] = df['sales_month'] + '-' + df['sales_day']
df.판매일 = pd.to_datetime(df.판매일,format='%m-%d')
f = df.groupby(by = 'custid')['판매일'].agg([('구매일수','nunique')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase cycle: days between a customer's first and last purchase date,
# divided by the number of distinct purchase dates (truncated to int).
# '판매일' is already a datetime column (built with pd.to_datetime), so the
# former astype('datetime64') casts were redundant; a unit-less 'datetime64'
# cast is also rejected by pandas >= 2.0.
f = df.groupby('custid')['판매일'].agg([('구매주기', lambda x: int((x.max() - x.min()).days / x.nunique()))]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Maximum purchase amount
f = df.groupby('custid')['tot_amt'].agg([('최고구매금액', 'max')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase counts per season
# sales_month was turned into strings earlier; convert it back to numbers.
df['sales_month'] = pd.to_numeric(df['sales_month'])
df['계절'] = df.sales_month.apply(extract_season)
f = pd.pivot_table(df, index = 'custid', columns = '계절', values = 'tot_amt',
                  aggfunc = np.size, fill_value = 0).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase counts per half-year
f = pd.crosstab(df.custid, df.반기).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase counts by peak / off-peak season
f = pd.crosstab(df.custid, df.성수기여부).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase counts by early / mid / late month
f = pd.crosstab(df.custid, df.월_초중말).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase counts by AM / PM
f = pd.crosstab(df.custid, df['오전/오후']).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of distinct stores visited
f = df.groupby(by = 'custid')['str_nm'].agg([('방문지점개수','nunique')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchased-product diversity
# NOTE(review): the numerator counts distinct goodcd values while the
# denominator n is the number of distinct corner_nm values -- confirm.
n = df.corner_nm.nunique()
f = df.groupby('custid')['goodcd'].agg([('구매상품다양성', lambda x: len(x.unique()) / n)]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Row counts per time-of-day band
f = pd.crosstab(df.custid, df.방문시간대)
features = pd.merge(features,f, on = 'custid')

# Main purchase corner (disabled)
#f = df.groupby('custid')['corner_nm'].agg([('주구매코너', lambda x: x.value_counts().index[0])]).reset_index()
#f = pd.get_dummies(f, columns=['주구매코너'])  # This method performs One-hot-encoding
#features = pd.merge(features,f, on = 'custid')

# Purchase counts per part_nm (the original comment said "per corner")
f = pd.pivot_table(df, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0)
f = f.rename(columns=dict(zip(f.columns,[i+'_구매건수' for i in f.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase counts per buyer name
f = pd.pivot_table(df, index='custid', columns='buyer_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0)
f = f.rename(columns=dict(zip(f.columns,[i+'_구매건수' for i in f.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')

# Mean net amount over rows with inst_fee == 1
# NOTE: the fillna(0) after the left merge applies to every column of features.
f = df.loc[df.inst_fee==1].groupby('custid').net_amt.agg([('무이자할부평균가격','mean')]).reset_index()
features = pd.merge(features,f,on='custid',how='left').fillna(0)

# Number of installment payments (rows with inst_mon > 1)
f = df.loc[df.inst_mon>1].groupby('custid').inst_mon.agg([('할부결제건수','count')])
features = pd.merge(features,f,on='custid',how='left').fillna(0)

# Purchase amount per visit: total per (customer, sale date), averaged per customer
f = df.groupby(['custid','판매일'])['tot_amt'].sum().reset_index().groupby('custid')['tot_amt'].agg([('내점당구매금액','mean')]).reset_index()
features = pd.merge(features,f,on='custid',how='left')

# Number of purchases per visit: row count per (customer, sale date), averaged per customer
f = df.groupby(['custid','판매일'])['tot_amt'].count().reset_index().groupby('custid')['tot_amt'].agg([('내점당구매개수','mean')]).reset_index()
features = pd.merge(features,f,on='custid',how='left')

# Mean shopping time: max - min of sales_time per (customer, sale date), averaged per customer
# NOTE(review): sales_time is subtracted as a raw value, not as a duration -- confirm units.
f = df.groupby(['custid','판매일'])['sales_time'].agg(lambda x: x.max()-x.min()).reset_index().groupby('custid').sales_time.agg([('평균쇼핑시간','mean')])
features = pd.merge(features,f,on='custid',how='left')

```python
# 파트별 구매액 합
f = pd.pivot_table(tr, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc='sum', fill_value=0).reset_index()
features = pd.merge(features,f, on = 'custid')

# 파트별 할인금액
f = pd.pivot_table(df, index='custid', columns='part_nm', values='dis_amt', 
                   aggfunc='sum', fill_value=0)
f = f.rename(columns=dict(zip(f.columns,[i+'_구매건수' for i in f.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')

# part_nm별 구매건수
f = pd.pivot_table(df, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0)
f = f.rename(columns=dict(zip(f.columns,[i+'_구매건수' for i in f.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')
```

In [None]:
# Drop the id column so that only feature columns are clipped and scaled below.
del features['custid']

In [None]:
# Outlier handling: clip every column to its own 5th-95th percentile range.
features = features.apply(lambda x: x.clip(x.quantile(0.05), x.quantile(0.95)), axis=0)

# Scaling: RobustScaler is fitted on, and applied to, all rows of features.
features.loc[:,:] = RobustScaler().fit_transform(features)

비율변수

In [None]:
# Re-attach the customer ids.
# NOTE(review): this assumes features still has one row per customer in the
# order of tr.custid.unique() -- verify that the merges above kept it so.
features['custid'] = tr.custid.unique()

# Count tables used as numerators / denominators for the ratio features.
# margins=True appends an 'All' total row and column to a crosstab.
tr['sales_hour'] = tr.sales_time.apply(extract_hour)
weekdays = pd.crosstab(tr.custid,tr.sales_dayofweek, margins=True).reindex(columns=['월','화','수',
                                                        '목','금','토','일','All']).iloc[:-1,:].rename(columns={'All':'총방문횟수'})
sales_hour = pd.crosstab(df.custid, df.sales_hour, margins=True)
sales_hour = sales_hour.rename(columns=dict(zip(sales_hour.columns,[str(i)+'시방문' for i in sales_hour.columns])))
str_nm = pd.crosstab(tr.custid, tr.str_nm,margins=True)
str_nm = str_nm.rename(columns=dict(zip(str_nm.columns,[i+'방문' for i in str_nm.columns])))
sales_month = pd.crosstab(df.custid,df.sales_month, margins=True)
sales_month = sales_month.rename(columns=dict(zip(sales_month.columns, [str(i)+'월방문' for i in sales_month])))
tr['sales_month'] = pd.to_numeric(tr['sales_month'])
tr['sales_month'] = tr.sales_month.apply(month_modify)
tr['season'] = pd.DataFrame(tr.sales_month.apply(extract_season))
season_visit = pd.crosstab(tr.custid, tr.season)
# Time-of-day band per row. The hour comes from tr['sales_hour'] (the
# extract_hour result stored above) rather than from str(x)[:2], which read
# a three-digit time such as 930 as hour 93 and so classed every purchase
# before 10:00 as evening.
tr['mln'] = pd.to_numeric(tr['sales_hour']).apply(time_)
mln = pd.crosstab(tr.custid, tr.mln,margins=True)
# Installment-payment count (rows with inst_mon > 1) and purchase count per customer.
inv = tr.loc[tr.inst_mon>1].groupby('custid').inst_mon.agg([('할부결제건수','count')])
trans_amount = tr.groupby('custid')['tot_amt'].agg([('구매건수', 'size')])
# Each crosstab below is divided by its 'All' margin column to obtain
# per-customer shares; iloc[:-1,:-1] then drops the margin row and column.
peak = pd.crosstab(df.custid, df.성수기여부, margins=True)
peak = peak.divide(peak.iloc[:,-1],axis=0).iloc[:-1,:-1]
half = pd.crosstab(df.custid, df.반기, margins=True)
half = half.divide(half.iloc[:,-1],axis=0).iloc[:-1,:-1]
# NOTE: this rebinds the name `noon`, replacing the noon() helper function.
noon = pd.crosstab(df.custid, df['오전/오후'], margins=True)
noon = noon.divide(noon.iloc[:,-1],axis=0).iloc[:-1,:-1]

In [None]:
# Share of imported goods among each customer's purchases.
import_counts = df[df['import_flg'] == 1].groupby('custid').size()
purchase_counts = df.groupby('custid').size()
x = import_counts / purchase_counts
# Customers without any imported purchase come out as NaN and become 0.
f = x.reset_index().rename(columns={0: '수입상품구매비율'}).fillna(0)
features = pd.merge(features, f, on='custid')

# Weekend visit ratio: (Saturday + Sunday purchases) / purchases on all days.
# Columns are selected by label. The earlier version picked them by position
# and only hit Sunday/Saturday because of how the Korean day names sort; its
# day-to-int mapping was computed and then discarded.
day_names = ['월', '화', '수', '목', '금', '토', '일']
df2 = pd.pivot_table(df, index='custid', columns='sales_dayofweek', values='tot_amt',
                     aggfunc=np.size, fill_value=0)
# reindex guarantees all seven day columns exist even if a day never occurs.
df2 = df2.reindex(columns=day_names, fill_value=0)
df2['주말방문비율'] = (df2['토'] + df2['일']) / df2[day_names].sum(axis=1)
f = df2[['주말방문비율']].reset_index()
features = pd.merge(features,f, on = 'custid')

# NOTE: unless stated otherwise the merges below use the default how='inner'.

# Visit share per day of week (each day column divided by the total column)
f = weekdays.iloc[:,:-1].divide(weekdays.iloc[:,-1], axis=0).rename(columns=dict(zip(weekdays.columns,
                                                                                    [str(i)+'_prop' for i in weekdays.columns])))
features = pd.merge(features,f, on = 'custid')

# Visit share per hour
f = sales_hour.iloc[:-1,:-1].divide(sales_hour.iloc[:-1,-1],axis=0).rename(columns=dict(zip(sales_hour.columns,
                                                                                    [str(i)+'_prop' for i in sales_hour.columns])))
features = pd.merge(features,f, on = 'custid')
# Visit share per store
f = str_nm.iloc[:-1,:-1].divide(str_nm.iloc[:-1,-1],axis=0).rename(columns=dict(zip(str_nm.columns,[str(i)+'_prop' for i in str_nm.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')
# Visit share per purchase month
f = sales_month.iloc[:-1,:-1].divide(sales_month.iloc[:-1,-1],axis=0).rename(columns=
                                                        dict(zip(sales_month.columns,[str(i)+'_prop' for i in sales_month.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')
# Visit share per season (season counts divided by the weekday total column)
f = season_visit.divide(weekdays.총방문횟수,axis=0).rename(columns=dict(zip(season_visit.columns,[column+'_prop' for column in season_visit.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')
# Visit share per time-of-day band
f = mln.div(mln.iloc[:,-1], axis=0).iloc[:-1,:-1].reset_index().rename(columns=dict(zip(mln.columns,[i+'_prop' for i in mln.columns])))
features = pd.merge(features,f, on = 'custid')
# Installment-payment ratio (customers without installment rows become 0)
f =(inv['할부결제건수']/trans_amount['구매건수']).reset_index().rename(columns={0:'할부결제비율'}).fillna(0)
features = pd.merge(features,f, on = 'custid')
# Peak-season visit share (second column of peak)
f = peak.iloc[:,1].reset_index().rename(columns={'성수기':'성수기방문비율'})
features = pd.merge(features,f, on = 'custid')
# Off-peak visit share (first column of peak)
f = peak.iloc[:,0].reset_index().rename(columns={'비성수기':'비성수기방문비율'})
features = pd.merge(features,f, on = 'custid')
# First-half visit share
f = half.iloc[:,0].reset_index().rename(columns={'전반기':'전반기방문비율'})
features = pd.merge(features,f, on = 'custid')
# Second-half visit share
f = half.iloc[:,1].reset_index().rename(columns={'후반기':'후반기방문비율'})
features = pd.merge(features,f, on = 'custid')
# AM visit share
f = noon.iloc[:,0].reset_index().rename(columns={'오전':'오전방문비율'})
features = pd.merge(features,f, on = 'custid')
# PM visit share
f = noon.iloc[:,1].reset_index().rename(columns={'오후':'오후방문비율'})
features = pd.merge(features,f, on = 'custid')
# Mean discount rate
# NOTE(review): rows with tot_amt == 0 would produce inf/NaN here -- verify the data.
df['할인율'] = df.dis_amt/df.tot_amt
f = df.groupby('custid')['할인율'].mean().reset_index()
features = pd.merge(features,f,on='custid',how='left')

In [None]:
# Split the combined feature table back into train and test rows by
# left-joining it onto each set's customer ids (keeps each set's id order).
train_ids = pd.DataFrame({'custid': df_train.custid.unique()})
test_ids = pd.DataFrame({'custid': df_test.custid.unique()})

X_train = train_ids.merge(features, how='left', on='custid')
X_test = test_ids.merge(features, how='left', on='custid')

merge W2V

In [None]:
# Append the first set of Word2Vec features and index both tables by customer id.
X_train = pd.merge(X_train, X_train2, how='left', on='custid').set_index('custid')
X_test = pd.merge(X_test, X_test2, how='left', on='custid').set_index('custid')

In [None]:
# Append the second set of Word2Vec features and index both tables by customer id.
X_train = pd.merge(X_train, X_train3, how='left', on='custid').set_index('custid')
X_test = pd.merge(X_test, X_test3, how='left', on='custid').set_index('custid')

One-hot Encoding

In [None]:
level = 'corner_nm'
IDtest = df_test.custid.unique()

df_all = pd.concat([df_train, df_test])
# Customer x category table: 1 where the customer has at least one row in
# that category, 0 (fill_value) otherwise. The pivot is built once and then
# split into train and test rows instead of being recomputed for each split.
onehot = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                        aggfunc=lambda x: np.where(len(x) >=1, 1, 0), fill_value=0). \
                        reset_index()
x_train = onehot.query('custid not in @IDtest').set_index('custid')
x_test = onehot.query('custid in @IDtest').set_index('custid')

# Join the indicator columns with the engineered features.
x_train = pd.merge(x_train, X_train, on = 'custid')
x_test = pd.merge(x_test, X_test, on = 'custid')

### Feature Selection

In [None]:
# Keep the full training frame so the selected column names can be recovered later.
features = x_train
# Model used for scoring (a fast model is recommended).
model = LinearRegression()

# Univariate selection: keep the top p percent of features by their score
# against the target, and compare cross-validated performance over p.
# NOTE(review): SelectPercentile runs with its default score function --
# confirm that it suits this target.
cv_scores = []
for p in tqdm(range(5,100,1)):
    selector = SelectPercentile(percentile=p)
    X_new = selector.fit_transform(x_train, y_train)
    fold_scores = cross_val_score(model, X_new, y_train, scoring='neg_mean_squared_error', cv=5)
    cv_score = fold_scores.mean()
    cv_scores.append((p,cv_score))

percentiles, scores = zip(*cv_scores)

# Print the best percentile
best_score = cv_scores[np.argmax(scores)]
print(best_score)

# Plot the performance change with p
plt.plot(percentiles, scores)
plt.xlabel('Percent of features')
plt.grid()

In [None]:
# To avoid overfitting, a value near the best p may give better results
# than the best p itself.
fs = SelectPercentile(percentile=best_score[0])
fs.fit(x_train, y_train)
x_train, x_test = fs.transform(x_train), fs.transform(x_test)

print(x_train.shape)
selected_mask = fs.get_support()
print(features.columns[selected_mask].tolist())

In [None]:
# Persist the selected train/test matrices under the selected column names.
# The test matrix used to be written to 'x_train_1round.csv' as well, which
# overwrote the train file; it now gets its own file.
selected_columns = features.columns[fs.get_support()].tolist()
pd.DataFrame(x_train, columns=selected_columns).to_csv('x_train_1round.csv')
pd.DataFrame(x_test, columns=selected_columns).to_csv('x_test_1round.csv')

```python
pd.DataFrame(x_train,columns=features.columns[fs.get_support()].tolist()).to_csv('x_train_1round.csv')
pd.DataFrame(x_test,columns=features.columns[fs.get_support()].tolist()).to_csv('x_test_1round.csv')
```

### Model Tuning

In [None]:
# Hold out 30% of the training rows as a dev set for hyper-parameter tuning.
x_train2, x_dev, y_train2, y_dev = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

In [None]:
# Search range for Ridge's regularisation strength.
pbounds = {'alpha': (0, 50)}

def rid_opt(alpha):
    """Fit Ridge on the tuning split and return the negative dev-set RMSE."""
    rid = Ridge(random_state=0, alpha=alpha)
    rid.fit(x_train2, y_train2)
    rmse = mean_squared_error(rid.predict(x_dev), y_dev, squared=False)
    # BayesianOptimization maximises, so the error is negated.
    return -rmse

BO_rid = BayesianOptimization(rid_opt, pbounds, random_state=0)
BO_rid.maximize(init_points=50, n_iter=50) # init_points: exploration, n_iter: iteration

In [None]:
# Search range for Lasso's regularisation strength.
pbounds = {'alpha': (0, 50)}

def las_opt(alpha):
    """Fit Lasso on the tuning split and return the negative dev-set RMSE."""
    las = Lasso(random_state=0, alpha=alpha)
    las.fit(x_train2, y_train2)
    rmse = mean_squared_error(las.predict(x_dev), y_dev, squared=False)
    # BayesianOptimization maximises, so the error is negated.
    return -rmse

BO_las = BayesianOptimization(las_opt, pbounds, random_state=0)
BO_las.maximize(init_points=50, n_iter=50) # init_points: exploration, n_iter: iteration

In [None]:
# Search range for ElasticNet's regularisation strength.
pbounds = {'alpha': (0, 50)}

def ela_opt(alpha):
    """Fit ElasticNet on the tuning split and return the negative dev-set RMSE."""
    ela = ElasticNet(random_state=0, alpha=alpha)
    ela.fit(x_train2, y_train2)
    rmse = mean_squared_error(ela.predict(x_dev), y_dev, squared=False)
    # BayesianOptimization maximises, so the error is negated.
    return -rmse

BO_ela = BayesianOptimization(ela_opt, pbounds, random_state=0)
BO_ela.maximize(init_points=50, n_iter=50) # init_points: exploration, n_iter: iteration

In [None]:
# Search ranges for ARDRegression.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}

def ard_opt(n_iter,alpha_1,alpha_2,lambda_1,lambda_2):
    """Fit ARDRegression on the tuning split and return the negative dev-set RMSE.

    ``n_iter`` arrives as a float from the optimiser and is rounded to an int.
    """
    ard = ARDRegression(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    ard.fit(x_train2, y_train2)
    rmse = mean_squared_error(ard.predict(x_dev), y_dev, squared=False)
    return -rmse

BO_ard = BayesianOptimization(ard_opt, pbounds, random_state=0)
BO_ard.maximize(init_points=50, n_iter=50) # init_points: exploration, n_iter: iteration

In [None]:
# Search ranges for BayesianRidge.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}

def bay_opt(n_iter,alpha_1,alpha_2,lambda_1,lambda_2):
    """Fit BayesianRidge on the tuning split and return the negative dev-set RMSE.

    ``n_iter`` arrives as a float from the optimiser and is rounded to an int.
    """
    bay = BayesianRidge(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    bay.fit(x_train2, y_train2)
    rmse = mean_squared_error(bay.predict(x_dev), y_dev, squared=False)
    return -rmse

BO_bay = BayesianOptimization(bay_opt, pbounds, random_state=0)
BO_bay.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for LGBMRegressor.
pbounds = {
    'n_estimators':(100,1000),
    'learning_rate':(0,1),
    'max_depth':(2, 32),
    'num_leaves':(2, 64),
    'min_child_samples':(10, 200),
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1),
    'colsample_bytree':(0.5, 1),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha':(0.01, 50)
}
def lgbm_opt(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight,
             subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Fit LGBMRegressor on the tuning split and return the negative dev-set RMSE.

    The optimiser proposes floats for every parameter, so integer-valued
    parameters are rounded and the sampling fractions are clamped to [0, 1].
    """
    params = {
        "n_estimators":int(round(n_estimators)),
        "learning_rate":learning_rate,
        'max_depth':int(round(max_depth)),
        'num_leaves':int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample':max(min(subsample, 1), 0),
        'colsample_bytree':max(min(colsample_bytree, 1), 0),
        # max_bin was searched but never handed to the model, so the "best"
        # max_bin later applied to the final model had never been evaluated.
        'max_bin': int(round(max_bin)),
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha
    }
    lgbm = LGBMRegressor(random_state=0, **params)
    lgbm.fit(x_train2,y_train2)
    score = mean_squared_error(lgbm.predict(x_dev),y_dev,squared=False)
    # BayesianOptimization maximises, so the error is negated.
    return -score
BO_lgbm = BayesianOptimization(lgbm_opt, pbounds, random_state=0)
BO_lgbm.maximize(init_points=50, n_iter=50)

In [None]:
# Best parameter set found by each optimiser.
(max_params_rid, max_params_las, max_params_ela,
 max_params_ard, max_params_bay, max_params_lgbm) = (
    optimizer.max['params']
    for optimizer in (BO_rid, BO_las, BO_ela, BO_ard, BO_bay, BO_lgbm)
)

In [None]:
# The optimiser returns floats; round integer-valued parameters and clamp
# the sampling fractions to [0, 1] before building the final estimators.
for best_params in (max_params_ard, max_params_bay):
    best_params['n_iter'] = int(round(best_params['n_iter']))

for int_key in ('num_leaves', 'n_estimators', 'max_depth',
                'min_child_samples', 'min_child_weight', 'max_bin'):
    max_params_lgbm[int_key] = int(round(max_params_lgbm[int_key]))

for fraction_key in ('subsample', 'colsample_bytree'):
    max_params_lgbm[fraction_key] = max(min(max_params_lgbm[fraction_key], 1), 0)

In [None]:
# Show the tuned parameter sets, one per line.
print(max_params_rid, max_params_las, max_params_ela,
      max_params_ard, max_params_bay, max_params_lgbm, sep=' \n ')

In [None]:
# Build fresh estimators from the tuned parameters.
regs_tuned = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Name every estimator by the text before '(' in its string form; the last
# entry (CatBoost) gets its name set explicitly.
regs_tuned = [(str(reg).split('(')[0], reg) for reg in regs_tuned]
regs_tuned[-1] = ('CatBoostRegressor', regs_tuned[-1][1])

# Fit each estimator on the tuning split and record its dev-set RMSE.
regs_trained = []
for name, reg in tqdm(regs_tuned.copy()):
    fitted = reg.fit(x_train2, y_train2)
    dev_rmse = mean_squared_error(reg.predict(x_dev), y_dev, squared=False)
    regs_trained.append((name, fitted, dev_rmse))

In [None]:
# Build fresh estimators from the tuned parameters.
regs_tuned = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Name every estimator by the text before '(' in its string form; the last
# entry (CatBoost) gets its name set explicitly.
regs_tuned = [(str(reg).split('(')[0], reg) for reg in regs_tuned]
regs_tuned[-1] = ('CatBoostRegressor', regs_tuned[-1][1])

# Refit every estimator on the full training data.
regs_trained_for_submissions = []
for name, reg in tqdm(regs_tuned.copy()):
    regs_trained_for_submissions.append((name, reg.fit(x_train, y_train)))

In [None]:
# Build fresh, unfitted estimators from the tuned parameters.
regs_tuned = [
    Ridge(random_state=0, **max_params_rid),
    Lasso(random_state=0, **max_params_las),
    ElasticNet(random_state=0, **max_params_ela),
    ARDRegression(**max_params_ard),
    BayesianRidge(**max_params_bay),
    LGBMRegressor(random_state=0, **max_params_lgbm),
    CatBoostRegressor(random_state=0),
]
# Name every estimator by the text before '(' in its string form; the last
# entry (CatBoost) gets its name set explicitly.
regs_tuned = [(str(reg).split('(')[0], reg) for reg in regs_tuned]
regs_tuned[-1] = ('CatBoostRegressor', regs_tuned[-1][1])

### Deep Neural Network

In [None]:
def reset_seeds(reset_graph_with_backend=None):
    """Re-seed NumPy, ``random`` and TensorFlow with fixed seeds.

    Parameters
    ----------
    reset_graph_with_backend : optional object exposing ``clear_session()``.
        When given, its session is cleared and the default TensorFlow graph
        is reset before the seeds are set.
    """
    backend = reset_graph_with_backend
    if backend is not None:
        backend.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    # A well-chosen seed can give better results.
    np.random.seed(1)
    random.seed(2)
    tf.compat.v1.set_random_seed(3)
#    os.environ['CUDA_VISIBLE_DEVICES'] = ''  # for GPU
    print("RANDOM SEEDS RESET")  # optional
   
# Seed all random generators once (no backend passed, so no graph reset).
reset_seeds()

In [None]:
# Use the first 80% of the tuning split for DNN training and the rest for validation.
i = int(round(x_train2.shape[0] * 0.8,0))
x_train3, x_val = x_train2[:i], x_train2[i:]
y_train3, y_val = y_train2[:i], y_train2[i:]

In [None]:
def model_fn(hp):
    """Build and compile a dense regression network from tuner hyperparameters.

    Args:
        hp: Hyperparameter container supplied by Keras Tuner; values are drawn
            through its ``Int`` / ``Choice`` / ``Float`` methods.

    Returns:
        A compiled ``tf.keras.Model`` with MSE loss and an RMSE metric.
    """
    n_features = x_train3.shape[1]
    inputs = tf.keras.Input(shape=(n_features,))

    hidden = inputs
    # Between 2 and 4 hidden blocks, each a Dense layer followed by Dropout.
    for layer_idx in range(hp.Int('num_layers', 2, 4, step=1)):
        units = hp.Int(f'unit_{layer_idx}', 16, 128, step=16)
        # The same hyperparameter name is used on every iteration.
        activation = hp.Choice('activation', ['relu', 'tanh'])
        hidden = tf.keras.layers.Dense(units, activation=activation)(hidden)
        drop_rate = hp.Float(f'dropout_{layer_idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = tf.keras.layers.Dropout(drop_rate)(hidden)

    outputs = tf.keras.layers.Dense(1, activation='linear')(hidden)
    model = tf.keras.Model(inputs, outputs)

    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

In [None]:
# Keras Tuner offers four search strategies: Hyperband, grid search, random search and
# Bayesian optimization. Hyperband is used here.
rmse_objective = kt.Objective('val_root_mean_squared_error', direction="min")
tuner = kt.Hyperband(
    model_fn,
    objective=rmse_objective,   # criterion the tuner optimises
    max_epochs=30,
    hyperband_iterations=2,     # the whole Hyperband procedure is repeated twice
    overwrite=True,             # with False, earlier results are reused instead of retraining
    directory='dnn_tuning',
)

# Author's note: the settings were chosen so that the search finishes quickly.
early_stopping = tf.keras.callbacks.EarlyStopping()
tuner.search(
    x_train3,
    y_train3,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping],
)

In [None]:
tuner.results_summary(1) # 1 = show only the best-performing trial

In [None]:
# Loss & RMSE of the tuned network on the dev set
dnn = tuner.get_best_models(1)[0] # request one model from the tuner and take it as the DNN
dnn.evaluate(x_dev, y_dev)

### Model Ensemble

In [None]:
# Collect the dev-set predictions of every fitted model, one column per model.
# Column labels have the form "<model> \n(<rmse>)".
pred_results = []
for name, reg, reg_score in regs_trained:
    # np.ravel flattens (n, 1)-shaped outputs, so every column holds plain floats and
    # no per-cell conversion (deprecated DataFrame.applymap) is needed afterwards.
    pred = np.ravel(reg.predict(x_dev)).astype(float)
    name = f'{name} \n({reg_score:.4f})'
    pred_results.append(pd.Series(pred, name=name))
ensemble_results = pd.concat(pred_results, axis=1)

# Plot a heatmap of the correlation between the models' predictions.
plt.figure(figsize = (8,6))
g = sns.heatmap(ensemble_results.corr(), annot=True, cmap='Blues')
g.set_title("Correlation between models")
plt.show()

In [None]:
corr_matrix = ensemble_results.corr()
# Mean correlation of each model with the other models (the diagonal 1.0 is removed).
corr = (corr_matrix.sum() - 1) / (corr_matrix.shape[0] - 1)
names = corr.index
# Column labels end with "(<rmse>)". Extract whatever sits inside the trailing
# parentheses instead of slicing a fixed width, which truncates values >= 10.
rmse = np.asarray(names.str.extract(r'\(([^()]+)\)$', expand=False), dtype=float)
df = pd.DataFrame({'model': names, 'rmse': rmse, 'cor': corr})

plt.figure(figsize=(8,6))
g = sns.scatterplot(x="cor", y="rmse", data=df, s=40, color='red')
# Label every point. Rows are iterated directly because df is indexed by model
# label, not by integer position.
for row in df.itertuples(index=False):
    g.text(row.cor + 0.003, row.rmse - 0.003,
           row.model, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min()-0.01,df.cor.max()+0.01))
plt.ylim((df.rmse.min()-0.01,df.rmse.max()+0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('RMSE')
plt.grid()
plt.show()

In [None]:
# Models that take part in the simple average. Other candidates (LinearRegression, Lasso,
# ElasticNet, ARDRegression, BayesianRidge, RandomForestRegressor, XGBRegressor) are left out.
selected = ['Ridge',
            'LGBMRegressor',
            'CatBoostRegressor'
            ]
models_for_ensemble = [(name,reg) for name,reg,score in regs_trained if name in selected]

# Unweighted mean of the dev-set predictions of all selected models; np.ravel
# flattens (n, 1)-shaped outputs so the arrays can be averaged element-wise.
avg = np.mean([np.ravel(reg.predict(x_dev)) for _, reg in models_for_ensemble], axis=0)
# RMSE computed as sqrt(MSE), which does not depend on the `squared` keyword.
score = np.sqrt(mean_squared_error(y_dev, avg))
score

In [None]:
# Find the best integer weights (summing to 30) for a weighted average of three models.
# Other candidates (LinearRegression, Lasso, ElasticNet, ARDRegression, BayesianRidge,
# RandomForestRegressor, XGBRegressor, DeepNeuralNetwork) are left out.
selected = ['Ridge',
            'LGBMRegressor',
            'CatBoostRegressor',
            ]
models_for_ensemble = [(name,reg) for name,reg,score in regs_trained if name in selected]

# The predictions do not depend on the weights, so compute them once up front.
pred_first = np.ravel(models_for_ensemble[0][1].predict(x_dev))
pred_second = np.ravel(models_for_ensemble[1][1].predict(x_dev))
pred_third = np.ravel(models_for_ensemble[2][1].predict(x_dev))

weight_total = 30
weights_avg = []
rmse_best = 1000
for i in tqdm(range(1, weight_total)):
    # j stops early enough that the remaining weight k is always at least 1.
    for j in range(1, weight_total - i):
        k = weight_total - i - j
        pred = (pred_first * i + pred_second * j + pred_third * k) / weight_total
        rmse = np.sqrt(mean_squared_error(y_dev, pred))
        if rmse < rmse_best:
            weights_avg = [i,j,k]
            rmse_best = rmse
            print(rmse, i,j,k)

print(rmse_best, weights_avg)

### Stacking

In [None]:
# Base models fed into the stacking step. Other candidates (LinearRegression, Lasso,
# ElasticNet, ARDRegression, BayesianRidge, RandomForestRegressor, XGBRegressor) are left out.
selected = ['Ridge', 'LGBMRegressor', 'CatBoostRegressor']

# Keep only the estimator objects of the selected models, in regs_trained order.
stack_estimators = []
for model_name, model, _ in regs_trained:
    if model_name in selected:
        stack_estimators.append(model)

In [None]:
# Stacking features for the full training set and the test set, produced by vecstack
# from stack_estimators with 5 shuffled folds (random_state=0).
# NOTE(review): vecstack documents `stratified` as ignored when regression=True -- confirm.
S_train, S_test = stacking(stack_estimators,
                           x_train, y_train, x_test,
                           regression=True, n_folds=5, stratified=True, shuffle=True,
                           random_state=0, verbose=2)

In [None]:
# Same stacking step applied to the train/dev split: S_train2 is built from x_train2 and
# S_dev from x_dev, so the meta-models can be tuned and scored against y_dev.
# NOTE(review): vecstack documents `stratified` as ignored when regression=True -- confirm.
S_train2, S_dev = stacking(stack_estimators,
                           x_train2, y_train2, x_dev,
                           regression=True, n_folds=5, stratified=True, shuffle=True,
                           random_state=0, verbose=2)

In [None]:
# Search range for the Ridge penalty of the meta-model fitted on S_train2.
pbounds = {'alpha': (0, 150)}


def rid_stk_opt(alpha):
    """Score a Ridge meta-model on S_train2 for one candidate ``alpha``.

    Returns the negated square root of the mean 5-fold MSE, so that larger is
    better for ``BayesianOptimization.maximize``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(Ridge(random_state=0, alpha=alpha), S_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_rid = BayesianOptimization(rid_stk_opt, pbounds, random_state=0)
BO_stk_rid.maximize(init_points=50, n_iter=50)

In [None]:
# Search range for the Lasso penalty of the meta-model fitted on S_train2.
pbounds = {'alpha': (0, 50)}


def las_stk_opt(alpha):
    """Score a Lasso meta-model on S_train2 for one candidate ``alpha``.

    Returns the negated square root of the mean 5-fold MSE, so that larger is
    better for ``BayesianOptimization.maximize``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(Lasso(random_state=0, alpha=alpha), S_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_las = BayesianOptimization(las_stk_opt, pbounds, random_state=0)
BO_stk_las.maximize(init_points=50, n_iter=50)

In [None]:
# Search range for the ElasticNet penalty of the meta-model fitted on S_train2.
pbounds = {'alpha': (0, 50)}


def ela_stk_opt(alpha):
    """Score an ElasticNet meta-model on S_train2 for one candidate ``alpha``.

    Returns the negated square root of the mean 5-fold MSE, so that larger is
    better for ``BayesianOptimization.maximize``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(ElasticNet(random_state=0, alpha=alpha), S_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_ela = BayesianOptimization(ela_stk_opt, pbounds, random_state=0)
BO_stk_ela.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the ARDRegression meta-model fitted on S_train2.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}


def ard_stk_opt(n_iter,alpha_1,alpha_2,lambda_1,lambda_2):
    """Score an ARDRegression meta-model on S_train2 for one hyperparameter set.

    ``n_iter`` arrives as a float from the optimiser and is rounded to an int.
    Returns the negated square root of the mean 5-fold MSE (larger is better).
    """
    model = ARDRegression(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(model, S_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_ard = BayesianOptimization(ard_stk_opt, pbounds, random_state=0)
BO_stk_ard.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the BayesianRidge meta-model fitted on S_train2.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}


def bay_stk_opt(n_iter,alpha_1,alpha_2,lambda_1,lambda_2):
    """Score a BayesianRidge meta-model on S_train2 for one hyperparameter set.

    ``n_iter`` arrives as a float from the optimiser and is rounded to an int.
    Returns the negated square root of the mean 5-fold MSE (larger is better).
    """
    model = BayesianRidge(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(model, S_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_bay = BayesianOptimization(bay_stk_opt, pbounds, random_state=0)
BO_stk_bay.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the LightGBM meta-model fitted on S_train2.
pbounds = {
    'n_estimators':(100,1000),
    'learning_rate':(0.0000000000000000001,1),
    'max_depth':(2, 32),
    'num_leaves':(2, 64),
    'min_child_samples':(10, 200),
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1),
    'colsample_bytree':(0.5, 1),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha':(0.01, 50)
}
def lgbm_stk_opt(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight,
             subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Score an LGBMRegressor meta-model on S_train2 for one hyperparameter set.

    The optimiser proposes floats, so integer-valued hyperparameters are rounded
    and the sampling fractions are clamped to [0, 1].

    Returns:
        The negated square root of the mean 5-fold MSE (larger is better).
    """
    params = {
        "n_estimators":int(round(n_estimators)),
        "learning_rate":learning_rate,
        'max_depth':int(round(max_depth)),
        'num_leaves':int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample':max(min(subsample, 1), 0),
        'colsample_bytree':max(min(colsample_bytree, 1), 0),
        # max_bin is part of the search space, so it must reach the model; otherwise
        # the optimiser explores a dimension that has no effect on the score.
        'max_bin': int(round(max_bin)),
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha
    }
    lgbm = LGBMRegressor(random_state=0, **params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(lgbm, S_train2, y_train2, scoring='neg_mean_squared_error', cv=skf, n_jobs=-1)
    return -np.sqrt(-np.mean(score))
# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_lgbm = BayesianOptimization(lgbm_stk_opt, pbounds, random_state=0)
BO_stk_lgbm.maximize(init_points=50, n_iter=50)

In [None]:
# Best hyperparameter set found by each optimiser, in the order
# Ridge, Lasso, ElasticNet, ARD, BayesianRidge, LightGBM.
(max_params_rid,
 max_params_las,
 max_params_ela,
 max_params_ard,
 max_params_bay,
 max_params_lgbm) = [
    optimizer.max['params']
    for optimizer in (BO_stk_rid, BO_stk_las, BO_stk_ela, BO_stk_ard, BO_stk_bay, BO_stk_lgbm)
]

In [None]:
# The optimiser returns floats; cast the integer-valued hyperparameters back to int.
lgbm_int_keys = ('num_leaves', 'n_estimators', 'max_depth',
                 'min_child_samples', 'min_child_weight', 'max_bin')
for best_params, int_keys in ((max_params_ard, ('n_iter',)),
                              (max_params_bay, ('n_iter',)),
                              (max_params_lgbm, lgbm_int_keys)):
    for key in int_keys:
        best_params[key] = int(round(best_params[key]))

# Clamp the LightGBM sampling fractions to the interval [0, 1].
for key in ('subsample', 'colsample_bytree'):
    max_params_lgbm[key] = max(min(max_params_lgbm[key], 1), 0)

In [None]:
# Show the chosen hyperparameters, one model per line.
print(max_params_rid, max_params_las, max_params_ela,
      max_params_ard, max_params_bay, max_params_lgbm, sep=' \n ')

In [None]:
# Tuned meta-models (second layer), each labelled with its class name.
stks_tuned = [
    (type(stk).__name__, stk)
    for stk in (
        Ridge(random_state=0, **max_params_rid),
        Lasso(random_state=0, **max_params_las),
        ElasticNet(random_state=0, **max_params_ela),
        ARDRegression(**max_params_ard),
        BayesianRidge(**max_params_bay),
        LGBMRegressor(random_state=0, **max_params_lgbm),
        CatBoostRegressor(random_state=0),
    )
]

# Fit every meta-model on S_train2 and record its RMSE on the dev stack features.
# RMSE is computed as sqrt(MSE), which does not depend on the `squared` keyword.
stks_trained = []
for name, stk in tqdm(list(stks_tuned)):
    fitted = stk.fit(S_train2, y_train2)
    dev_rmse = np.sqrt(mean_squared_error(y_dev, stk.predict(S_dev)))
    stks_trained.append((name, fitted, dev_rmse))

In [None]:
# Fresh (unfitted) instances of the tuned meta-models, each labelled with its class name.
stks_tuned = [
    (type(stk).__name__, stk)
    for stk in (
        Ridge(random_state=0, **max_params_rid),
        Lasso(random_state=0, **max_params_las),
        ElasticNet(random_state=0, **max_params_ela),
        ARDRegression(**max_params_ard),
        BayesianRidge(**max_params_bay),
        LGBMRegressor(random_state=0, **max_params_lgbm),
        CatBoostRegressor(random_state=0),
    )
]

# Fit every meta-model on the stack features of the full training set (S_train).
stks_trained_for_submissions = []
for name, stk in tqdm(list(stks_tuned)):
    stks_trained_for_submissions.append((name, stk.fit(S_train, y_train)))

In [None]:
# Reset stks_tuned to fresh (unfitted) instances of the tuned meta-models,
# each paired with its class name.
stks_tuned = [
    (type(stk).__name__, stk)
    for stk in (
        Ridge(random_state=0, **max_params_rid),
        Lasso(random_state=0, **max_params_las),
        ElasticNet(random_state=0, **max_params_ela),
        ARDRegression(**max_params_ard),
        BayesianRidge(**max_params_bay),
        LGBMRegressor(random_state=0, **max_params_lgbm),
        CatBoostRegressor(random_state=0),
    )
]

In [None]:
# Collect the dev-set predictions of every fitted meta-model, one column per model.
# Column labels have the form "<model> \n(<rmse>)".
pred_results = []
for name, stk, stk_score in stks_trained:
    # np.ravel flattens (n, 1)-shaped outputs, so every column holds plain floats and
    # no per-cell conversion (deprecated DataFrame.applymap) is needed afterwards.
    pred = np.ravel(stk.predict(S_dev)).astype(float)
    name = f'{name} \n({stk_score:.4f})'
    pred_results.append(pd.Series(pred, name=name))
ensemble_results = pd.concat(pred_results, axis=1)

# Plot a heatmap of the correlation between the models' predictions.
plt.figure(figsize = (8,6))
g = sns.heatmap(ensemble_results.corr(), annot=True, cmap='Blues')
g.set_title("Correlation between models")
plt.show()

In [None]:
corr_matrix = ensemble_results.corr()
# Mean correlation of each model with the other models (the diagonal 1.0 is removed).
corr = (corr_matrix.sum() - 1) / (corr_matrix.shape[0] - 1)
names = corr.index
# Column labels end with "(<rmse>)". Extract whatever sits inside the trailing
# parentheses instead of slicing a fixed width, which truncates values >= 10.
rmse = np.asarray(names.str.extract(r'\(([^()]+)\)$', expand=False), dtype=float)
df = pd.DataFrame({'model': names, 'rmse': rmse, 'cor': corr})

plt.figure(figsize=(8,6))
g = sns.scatterplot(x="cor", y="rmse", data=df, s=40, color='red')
# Label every point. Rows are iterated directly because df is indexed by model
# label, not by integer position.
for row in df.itertuples(index=False):
    g.text(row.cor + 0.003, row.rmse - 0.003,
           row.model, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min()-0.01,df.cor.max()+0.01))
plt.ylim((df.rmse.min()-0.01,df.rmse.max()+0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('RMSE')
plt.grid()
plt.show()

In [None]:
# Find the best integer weights (summing to 90) for a weighted average of three meta-models.
# Other candidates (LinearRegression, Lasso, ElasticNet, ARDRegression, BayesianRidge,
# RandomForestRegressor, XGBRegressor, DeepNeuralNetwork) are left out.
selected = ['Ridge',
            'LGBMRegressor',
            'CatBoostRegressor',
            ]
models_for_ensemble = [(name,reg) for name,reg,score in stks_trained if name in selected]
results_for_ensemble = []

# The predictions do not depend on the weights, so compute them once up front.
pred_first = np.ravel(models_for_ensemble[0][1].predict(S_dev))
pred_second = np.ravel(models_for_ensemble[1][1].predict(S_dev))
pred_third = np.ravel(models_for_ensemble[2][1].predict(S_dev))

weight_total = 90
weights_stk = []
rmse_best = 1000
for i in tqdm(range(1, weight_total)):
    # j stops early enough that the remaining weight k is always at least 1.
    for j in range(1, weight_total - i):
        k = weight_total - i - j
        pred = (pred_first * i + pred_second * j + pred_third * k) / weight_total
        rmse = np.sqrt(mean_squared_error(y_dev, pred))
        if rmse < rmse_best:
            weights_stk = [i,j,k]
            rmse_best = rmse
            print(rmse, i,j,k)

print(rmse_best, weights_stk)

#### 3-layered stacking

In [None]:
# Meta-models reused as estimators for the next stacking step. Other candidates
# (LinearRegression, Lasso, ElasticNet, ARDRegression, BayesianRidge,
# RandomForestRegressor, XGBRegressor) are left out.
selected = ['Ridge', 'LGBMRegressor', 'CatBoostRegressor']

# Keep only the estimator objects of the selected meta-models, in stks_trained order.
tri_stack_estimators = []
for model_name, model, _ in stks_trained:
    if model_name in selected:
        tri_stack_estimators.append(model)

In [None]:
S3_train, S3_test = stacking(tri_stack_estimators,
                           x_train, y_train, x_test,
                           regression=True, n_folds=5, stratified=True, shuffle=True,
                           random_state=0, verbose=0)

In [None]:
S3_train2, S3_dev = stacking(tri_stack_estimators,
                           x_train2, y_train2, x_dev,
                           regression=True, n_folds=5, stratified=True, shuffle=True,
                           random_state=0, verbose=0)

In [None]:
# Search range for the Ridge penalty of the model fitted on S3_train2.
pbounds = {'alpha': (0, 150)}


def rid_tri_stk_opt(alpha):
    """Score a Ridge model on S3_train2 for one candidate ``alpha``.

    Returns the negated square root of the mean 5-fold MSE, so that larger is
    better for ``BayesianOptimization.maximize``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(Ridge(random_state=0, alpha=alpha), S3_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_rid = BayesianOptimization(rid_tri_stk_opt, pbounds, random_state=0)
BO_stk_rid.maximize(init_points=50, n_iter=50)

In [None]:
# Search range for the Lasso penalty of the model fitted on S3_train2.
pbounds = {'alpha': (0, 50)}


def las_tri_stk_opt(alpha):
    """Score a Lasso model on S3_train2 for one candidate ``alpha``.

    Returns the negated square root of the mean 5-fold MSE, so that larger is
    better for ``BayesianOptimization.maximize``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(Lasso(random_state=0, alpha=alpha), S3_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_las = BayesianOptimization(las_tri_stk_opt, pbounds, random_state=0)
BO_stk_las.maximize(init_points=50, n_iter=50)

In [None]:
# Search range for the ElasticNet penalty of the model fitted on S3_train2.
pbounds = {'alpha': (0, 50)}


def ela_tri_stk_opt(alpha):
    """Score an ElasticNet model on S3_train2 for one candidate ``alpha``.

    Returns the negated square root of the mean 5-fold MSE, so that larger is
    better for ``BayesianOptimization.maximize``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(ElasticNet(random_state=0, alpha=alpha), S3_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_ela = BayesianOptimization(ela_tri_stk_opt, pbounds, random_state=0)
BO_stk_ela.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the ARDRegression model fitted on S3_train2.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}


def ard_tri_stk_opt(n_iter,alpha_1,alpha_2,lambda_1,lambda_2):
    """Score an ARDRegression model on S3_train2 for one hyperparameter set.

    ``n_iter`` arrives as a float from the optimiser and is rounded to an int.
    Returns the negated square root of the mean 5-fold MSE (larger is better).
    """
    model = ARDRegression(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(model, S3_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_ard = BayesianOptimization(ard_tri_stk_opt, pbounds, random_state=0)
BO_stk_ard.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the BayesianRidge model fitted on S3_train2.
pbounds = {
    'n_iter': (100, 1000),
    'alpha_1': (0, 50),
    'alpha_2': (0, 50),
    'lambda_1': (0, 10),
    'lambda_2': (0, 10),
}


def bay_tri_stk_opt(n_iter,alpha_1,alpha_2,lambda_1,lambda_2):
    """Score a BayesianRidge model on S3_train2 for one hyperparameter set.

    ``n_iter`` arrives as a float from the optimiser and is rounded to an int.
    Returns the negated square root of the mean 5-fold MSE (larger is better).
    """
    model = BayesianRidge(
        n_iter=int(round(n_iter)),
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(model, S3_train2, y_train2,
                              scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    return -np.sqrt(-np.mean(neg_mse))


# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_bay = BayesianOptimization(bay_tri_stk_opt, pbounds, random_state=0)
BO_stk_bay.maximize(init_points=50, n_iter=50)

In [None]:
# Search ranges for the LightGBM model fitted on S3_train2.
pbounds = {
    'n_estimators':(100,1000),
    'learning_rate':(0.0000000000000000001,1),
    'max_depth':(2, 32),
    'num_leaves':(2, 64),
    'min_child_samples':(10, 200),
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1),
    'colsample_bytree':(0.5, 1),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha':(0.01, 50)
}
def lgbm_tri_stk_opt(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight,
             subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Score an LGBMRegressor model on S3_train2 for one hyperparameter set.

    The optimiser proposes floats, so integer-valued hyperparameters are rounded
    and the sampling fractions are clamped to [0, 1].

    Returns:
        The negated square root of the mean 5-fold MSE (larger is better).
    """
    params = {
        "n_estimators":int(round(n_estimators)),
        "learning_rate":learning_rate,
        'max_depth':int(round(max_depth)),
        'num_leaves':int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample':max(min(subsample, 1), 0),
        'colsample_bytree':max(min(colsample_bytree, 1), 0),
        # max_bin is part of the search space, so it must reach the model; otherwise
        # the optimiser explores a dimension that has no effect on the score.
        'max_bin': int(round(max_bin)),
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha
    }
    lgbm = LGBMRegressor(random_state=0, **params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(lgbm, S3_train2, y_train2, scoring='neg_mean_squared_error', cv=skf, n_jobs=-1)
    return -np.sqrt(-np.mean(score))
# init_points: exploration probes, n_iter: optimisation iterations.
BO_stk_lgbm = BayesianOptimization(lgbm_tri_stk_opt, pbounds, random_state=0)
BO_stk_lgbm.maximize(init_points=50, n_iter=50)

In [None]:
# Best hyperparameter set found by each optimiser, in the order
# Ridge, Lasso, ElasticNet, ARD, BayesianRidge, LightGBM.
(max_params_rid,
 max_params_las,
 max_params_ela,
 max_params_ard,
 max_params_bay,
 max_params_lgbm) = [
    optimizer.max['params']
    for optimizer in (BO_stk_rid, BO_stk_las, BO_stk_ela, BO_stk_ard, BO_stk_bay, BO_stk_lgbm)
]

In [None]:
# The optimiser returns floats; cast the integer-valued hyperparameters back to int.
lgbm_int_keys = ('num_leaves', 'n_estimators', 'max_depth',
                 'min_child_samples', 'min_child_weight', 'max_bin')
for best_params, int_keys in ((max_params_ard, ('n_iter',)),
                              (max_params_bay, ('n_iter',)),
                              (max_params_lgbm, lgbm_int_keys)):
    for key in int_keys:
        best_params[key] = int(round(best_params[key]))

# Clamp the LightGBM sampling fractions to the interval [0, 1].
for key in ('subsample', 'colsample_bytree'):
    max_params_lgbm[key] = max(min(max_params_lgbm[key], 1), 0)

In [None]:
# Show the chosen hyperparameters, one model per line.
print(max_params_rid, max_params_las, max_params_ela,
      max_params_ard, max_params_bay, max_params_lgbm, sep=' \n ')

In [None]:
# Tuned models for the S3 stack features, each labelled with its class name.
tri_stks_tuned = [
    (type(stk).__name__, stk)
    for stk in (
        Ridge(random_state=0, **max_params_rid),
        Lasso(random_state=0, **max_params_las),
        ElasticNet(random_state=0, **max_params_ela),
        ARDRegression(**max_params_ard),
        BayesianRidge(**max_params_bay),
        LGBMRegressor(random_state=0, **max_params_lgbm),
        CatBoostRegressor(random_state=0),
    )
]

# Fit every model on S3_train2 and record its RMSE on S3_dev.
# RMSE is computed as sqrt(MSE), which does not depend on the `squared` keyword.
tri_stks_trained = []
for name, stk in tqdm(list(tri_stks_tuned)):
    fitted = stk.fit(S3_train2, y_train2)
    dev_rmse = np.sqrt(mean_squared_error(y_dev, stk.predict(S3_dev)))
    tri_stks_trained.append((name, fitted, dev_rmse))

In [None]:
# Fresh (unfitted) instances of the tuned models, each labelled with its class name.
tri_stks_tuned = [
    (type(stk).__name__, stk)
    for stk in (
        Ridge(random_state=0, **max_params_rid),
        Lasso(random_state=0, **max_params_las),
        ElasticNet(random_state=0, **max_params_ela),
        ARDRegression(**max_params_ard),
        BayesianRidge(**max_params_bay),
        LGBMRegressor(random_state=0, **max_params_lgbm),
        CatBoostRegressor(random_state=0),
    )
]

# Fit every model on the S3 stack features of the full training set (S3_train).
tri_stks_trained_for_submissions = []
for name, stk in tqdm(list(tri_stks_tuned)):
    tri_stks_trained_for_submissions.append((name, stk.fit(S3_train, y_train)))

In [None]:
# Reset tri_stks_tuned to fresh (unfitted) instances of the tuned models,
# each paired with its class name.
tri_stks_tuned = [
    (type(stk).__name__, stk)
    for stk in (
        Ridge(random_state=0, **max_params_rid),
        Lasso(random_state=0, **max_params_las),
        ElasticNet(random_state=0, **max_params_ela),
        ARDRegression(**max_params_ard),
        BayesianRidge(**max_params_bay),
        LGBMRegressor(random_state=0, **max_params_lgbm),
        CatBoostRegressor(random_state=0),
    )
]

In [None]:
# Sanity check: display the shape of the most recently computed `pred` array.
pred.shape

In [None]:
# Blend the weighted meta-model average with the DNN and search the best integer
# weights (summing to 100) on the dev set.
# Other candidates (LinearRegression, Lasso, ElasticNet, ARDRegression, BayesianRidge,
# RandomForestRegressor, XGBRegressor, DeepNeuralNetwork) are left out.
selected = ['Ridge',
            'LGBMRegressor',
            'CatBoostRegressor',
            ]
models_for_ensemble = [(name,reg) for name,reg,score in stks_trained if name in selected]

# Weighted average of the three meta-models, normalised by the sum of their weights.
w0,w1,w2 = weights_stk
stk_avg = (np.ravel(models_for_ensemble[0][1].predict(S_dev))*w0
           + np.ravel(models_for_ensemble[1][1].predict(S_dev))*w1
           + np.ravel(models_for_ensemble[2][1].predict(S_dev))*w2)/sum(weights_stk)

# The DNN prediction does not depend on the weights, so compute it once.
dnn_dev_pred = dnn.predict(x_dev).flatten()

weight_total = 100
weights_ad = []
rmse_best = 1000
for i in tqdm(range(1, weight_total)):
    j = weight_total - i  # the DNN gets whatever weight the stack average leaves over
    pred = (stk_avg*i + dnn_dev_pred*j)/weight_total
    rmse = np.sqrt(mean_squared_error(y_dev, pred))
    if rmse < rmse_best:
        weights_ad = [i,j]
        rmse_best = rmse
        print(rmse, i,j)

print(rmse_best, weights_ad)

### Deployment

In [None]:
# Use the meta-models that were refitted on the full training stack features
# (stks_trained_for_submissions) rather than the ones fitted on the train/dev split.
submission_models = [stk for name, stk in stks_trained_for_submissions if name in selected]

# Weighted average of the meta-models on the test stack features, using the weights
# found on the dev set and normalised by their sum.
w0,w1,w2 = weights_stk
stk_avg = (np.ravel(submission_models[0].predict(S_test))*w0
           + np.ravel(submission_models[1].predict(S_test))*w1
           + np.ravel(submission_models[2].predict(S_test))*w2)/sum(weights_stk)

# Blend the stack average with the DNN prediction and write the submission file.
w0,w1 = weights_ad
pred = (stk_avg*w0 + dnn.predict(x_test).flatten()*w1)/sum(weights_ad)
pd.DataFrame({'custid': test_id, 'age':pred}).to_csv('averagingstk77_ridlgbmcat_dnn23.csv', index=False)