# W2V 다른 것도 해보기


### Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font',family='malgun gothic')
plt.rc('axes',unicode_minus=False)
import seaborn as sns

# EDA
import klib

# Preprocessing&Feature Engineering
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, RobustScaler, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile
from gensim.models import word2vec

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, ARDRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor

# Evaluation
from sklearn.metrics import mean_squared_error # squared=False시 RMSE
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import pickle

### Read Data

In [None]:
# Load the X_train / X_test tables and the y_train table (CP949-encoded CSVs)
# from the sibling ``input`` directory; y_train is indexed by customer id.
input_dir = os.path.abspath("../input")
df_train = pd.read_csv(input_dir + '/X_train.csv', encoding='cp949')
df_test = pd.read_csv(input_dir + '/X_test.csv', encoding='cp949')
y_train = pd.read_csv(input_dir + '/y_train.csv', encoding='cp949')
y_train = y_train.set_index('custid')
# Unique customer ids of the test table, in order of first appearance.
test_id = df_test.custid.unique()

### Creating Feature&Feature Engineering

In [None]:
# Stack the train and test rows into a single frame so that features are
# computed for both sets at once (the original row index is kept as-is).
tr = pd.concat([df_train, df_test])
tr.head()

### PCA

In [None]:
# Dimensionality-reduction helper
from sklearn.decomposition import PCA

def dummy_to_pca(tr, column_name:str) :
    """One-hot encode a categorical column per customer and compress it with PCA.

    The column is one-hot encoded, averaged per ``custid`` (giving the share of
    each category among a customer's rows) and projected with PCA.  The number
    of components is the smallest one whose cumulative explained-variance
    ratio reaches 99%, capped at 15.

    Parameters
    ----------
    tr : pandas.DataFrame
        Frame containing at least ``custid`` and ``column_name``.
    column_name : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        One row per customer that has at least one retained category, with a
        ``custid`` column followed by ``<column_name>_<i>`` component columns.
    """
    max_seq = 300  # keep at most this many of the most frequent categories
    max_d = 15     # upper bound on the number of PCA components
    col_count = tr.groupby(column_name)[column_name].count()
    if len(col_count) > max_seq:
        # Too many categories: keep only rows that belong to the top ones.
        tops = col_count.sort_values(ascending=False)[0:max_seq].index
        f = tr.loc[tr[column_name].isin(tops)][['custid', column_name]]
    else:
        tops = col_count.index
        f = tr[['custid', column_name]]
    f = pd.get_dummies(f, columns=[column_name])  # one-hot encoding
    f = f.groupby('custid').mean()
    # PCA cannot extract more components than there are categories or rows.
    max_d = min(max_d, len(tops), len(f))
    pca = PCA(n_components=max_d)
    pca.fit(f)
    cumsum = np.cumsum(pca.explained_variance_ratio_)  # cumulative explained variance
    print(cumsum)
    reached = cumsum >= 0.99
    # np.argmax returns 0 both when the first component already reaches 99%
    # and when no component does, so test for the latter explicitly instead
    # of treating every "1" as "threshold never reached".
    num_d = int(np.argmax(reached)) + 1 if reached.any() else max_d
    pca = PCA(n_components=num_d)
    result = pca.fit_transform(f)
    result = pd.DataFrame(result)
    result.columns = [column_name + '_' + str(column) for column in result.columns]
    result.index = f.index
    return result.reset_index()

In [None]:
# Start from one row per customer and left-join the PCA-compressed encoding
# of each categorical column in turn.
features = pd.DataFrame({'custid': tr.custid.unique()})
for column_name in ('brd_nm', 'corner_nm', 'pc_nm', 'part_nm',
                    'buyer_nm', 'team_nm', 'goodcd'):
    f = dummy_to_pca(tr, column_name)
    features = features.merge(f, on='custid', how='left')

## W2V

In [None]:
def age_vec():
    """Build W2V "sentences" that start with a customer's age.

    For every customer in the global ``df_train`` the sentence consists of the
    ``age`` value(s) looked up in the global ``y_train`` followed by 20 random
    permutations of the customer's unique values of the global ``level``
    column.

    Returns
    -------
    list of list
        One sentence per training customer.  (The original built this list
        but discarded it by not returning anything.)
    """
    sentences = []
    df_all = df_train
    for cust_id in tqdm(df_all.custid.unique()):
        x = df_all.query('custid == @cust_id')[level].unique()
        y = y_train.query('custid == @cust_id').age
        for _ in range(20):
            # Append one shuffled copy of the customer's unique items.
            y = np.append(y, np.random.choice(x, len(x), replace=False))
        sentences.append(list(y))
    return sentences

### W2V-상품코드

In [None]:
level = 'goodcd' # product classification level

# There is too little data (i.e. corpus) to train W2V on, so each customer's
# list of purchased items is oversampled 20 times.
sentences = []
df_all = pd.concat([df_train, df_test])
for id in tqdm(df_all.custid.unique()):
    # Unique values of `level` among this customer's rows.
    uw = df_all.query('custid == @id')[level].unique()
    bs = np.array([])
    for j in range(20):
        # Append one random permutation of the unique values.
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=False))
    # Sentence 1: the 20 concatenated permutations.
    sentences.append(list(bs))
    # Sentence 2: the customer's values in their original row order.
    sentences.append(list(df_all.query('custid == @id')[level].values))

In [None]:
max_features = 200 # dimensionality of the token vectors
min_word_count = 1 # minimum token count
num_workers = 4 # number of parallel worker threads
context = 3 # context window size
downsampling = 1e-3 # downsampling threshold for frequent tokens

# Train the model
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
                          vector_size=max_features, 
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling,)
# Unload memory that is no longer needed
# NOTE(review): init_sims(replace=True) appears to be deprecated in gensim 4.x — confirm.
model.init_sims(replace=True)

In [None]:
# Per-customer feature vector: the mean of the W2V vectors looked up for the
# `level` value of each of the customer's rows (train customers first).
features_wv = []
for id in tqdm(df_train.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_train5 = np.array(features_wv)

# Same computation for the test customers.
features_wv = []
for id in tqdm(df_test.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_test5 = np.array(features_wv)

# Wrap the arrays in DataFrames and put the customer id in the first column.
X_train5 = pd.DataFrame(X_train5)
X_test5 = pd.DataFrame(X_test5)
X_train5.insert(0,'custid',df_train.custid.unique())
X_test5.insert(0,'custid',df_test.custid.unique())

### W2V-PC명

In [None]:
level = 'pc_nm' # product classification level

# There is too little data (i.e. corpus) to train W2V on, so each customer's
# list of purchased items is oversampled 20 times.
sentences = []
df_all = pd.concat([df_train, df_test])
for id in tqdm(df_all.custid.unique()):
    # Unique values of `level` among this customer's rows.
    uw = df_all.query('custid == @id')[level].unique()
    bs = np.array([])
    for j in range(20):
        # Append one random permutation of the unique values.
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=False))
    # Sentence 1: the 20 concatenated permutations.
    sentences.append(list(bs))
    # Sentence 2: the customer's values in their original row order.
    sentences.append(list(df_all.query('custid == @id')[level].values))

In [None]:
max_features = 30 # dimensionality of the token vectors
min_word_count = 1 # minimum token count
num_workers = 4 # number of parallel worker threads
context = 3 # context window size
downsampling = 1e-3 # downsampling threshold for frequent tokens

# Train the model
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
                          vector_size=max_features, 
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling,)
# Unload memory that is no longer needed
# NOTE(review): init_sims(replace=True) appears to be deprecated in gensim 4.x — confirm.
model.init_sims(replace=True)

In [None]:
# Per-customer feature vector: the mean of the W2V vectors looked up for the
# `level` value of each of the customer's rows (train customers first).
features_wv = []
for id in tqdm(df_train.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_train4 = np.array(features_wv)

# Same computation for the test customers.
features_wv = []
for id in tqdm(df_test.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_test4 = np.array(features_wv)

# Wrap the arrays in DataFrames and put the customer id in the first column.
X_train4 = pd.DataFrame(X_train4)
X_test4 = pd.DataFrame(X_test4)
X_train4.insert(0,'custid',df_train.custid.unique())
X_test4.insert(0,'custid',df_test.custid.unique())

### W2V-코너명

In [None]:
level = 'corner_nm' # product classification level

# There is too little data (i.e. corpus) to train W2V on, so each customer's
# list of purchased items is oversampled 20 times.
sentences = []
df_all = pd.concat([df_train, df_test])
for id in tqdm(df_all.custid.unique()):
    # Unique values of `level` among this customer's rows.
    uw = df_all.query('custid == @id')[level].unique()
    bs = np.array([])
    for j in range(20):
        # Append one random permutation of the unique values.
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=False))
    # Sentence 1: the 20 concatenated permutations.
    sentences.append(list(bs))
    # Sentence 2: the customer's values in their original row order.
    sentences.append(list(df_all.query('custid == @id')[level].values))

In [None]:
# NOTE(review): max_features is not passed to Word2Vec in this cell (the size
# argument below is commented out), so the library default vector size is
# used — confirm this is intended.
max_features = 300 # dimensionality of the token vectors
min_word_count = 1 # minimum token count
num_workers = 4 # number of parallel worker threads
context = 3 # context window size
downsampling = 1e-3 # downsampling threshold for frequent tokens

# Train the model
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
#                           size=max_features, 
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
# Unload memory that is no longer needed
# NOTE(review): init_sims(replace=True) appears to be deprecated in gensim 4.x — confirm.
model.init_sims(replace=True)

In [None]:
# Make features based on Word2Vec
# The mean vector of the items each customer purchased is used as the feature.
features_wv = []
for id in tqdm(df_train.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_train2 = np.array(features_wv)

# Same computation for the test customers.
features_wv = []
for id in tqdm(df_test.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_test2 = np.array(features_wv)

# Wrap the arrays in DataFrames and put the customer id in the first column.
X_train2 = pd.DataFrame(X_train2)
X_test2 = pd.DataFrame(X_test2)
X_train2.insert(0,'custid',df_train.custid.unique())
X_test2.insert(0,'custid',df_test.custid.unique())

### W2V-브랜드명

In [None]:
### W2V - brand name
level = 'brd_nm' # product classification level

# There is too little data (i.e. corpus) to train W2V on, so each customer's
# list of purchased items is oversampled 20 times.
sentences = []
df_all = pd.concat([df_train, df_test])
for id in tqdm(df_all.custid.unique()):
    # Unique values of `level` among this customer's rows.
    uw = df_all.query('custid == @id')[level].unique()
    bs = np.array([])
    for j in range(20):
        # Append one random permutation of the unique values.
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=False))
    # Sentence 1: the 20 concatenated permutations.
    sentences.append(list(bs))
    # Sentence 2: the customer's values in their original row order.
    sentences.append(list(df_all.query('custid == @id')[level].values))

In [None]:
# NOTE(review): max_features is not passed to Word2Vec in this cell (the
# vector_size argument below is commented out), so the library default vector
# size is used — confirm this is intended.
max_features = 300 # dimensionality of the token vectors
min_word_count = 1 # minimum token count
num_workers = 4 # number of parallel worker threads
context = 3 # context window size
downsampling = 1e-3 # downsampling threshold for frequent tokens

from gensim.models import word2vec

# Train the model
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
#                           vector_size=max_features, 
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
# Unload memory that is no longer needed
# NOTE(review): init_sims(replace=True) appears to be deprecated in gensim 4.x — confirm.
model.init_sims(replace=True)

In [None]:
# Make features based on Word2Vec
# 고객별로 구매한 상품의 평균벡터를 feature로 사용한다.
# Per-customer feature vector: the mean of the W2V vectors looked up for the
# `level` value of each of the customer's rows (train customers first).
features_wv = []
for id in tqdm(df_train.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_train3 = np.array(features_wv)

# Same computation for the test customers.
features_wv = []
for id in tqdm(df_test.custid.unique()):
    features_wv.append(df_all.query('custid == @id')[level] \
                              .apply(lambda x: model.wv[x]).mean())
X_test3 = np.array(features_wv)

In [None]:
# Wrap the brand-name W2V arrays in DataFrames and put the customer id in the
# first column so they can be joined on 'custid' later.
X_train3 = pd.DataFrame(X_train3)
X_test3 = pd.DataFrame(X_test3)
X_train3.insert(0,'custid',df_train.custid.unique())
X_test3.insert(0,'custid',df_test.custid.unique())

In [None]:
# Load W2V feature tables from Excel files in the working directory, using the
# first sheet column as the index.  This rebinds the X_train2..X_test5 names
# that the W2V cells above also assign.
X_train2 = pd.read_excel('X_train2.xlsx',index_col=0)
X_test2 = pd.read_excel('X_test2.xlsx',index_col=0)
X_train3 = pd.read_excel('X_train3.xlsx',index_col=0)
X_test3 = pd.read_excel('X_test3.xlsx',index_col=0)
X_train4 = pd.read_excel('X_train4.xlsx',index_col=0)
X_test4 = pd.read_excel('X_test4.xlsx',index_col=0)
X_train5 = pd.read_excel('X_train5.xlsx',index_col=0)
X_test5 = pd.read_excel('X_test5.xlsx',index_col=0)

In [None]:
def month_modify(x):
    """Subtract 12 from values greater than 12; return other values unchanged."""
    return x - 12 if x > 12 else x

def extract_hour(x):
    """Return the hour part of an ``HHMM``-style time value as a string.

    The value is interpreted as the integer ``HH * 100 + MM`` so that short
    values (e.g. ``45`` for 00:45) and float inputs (e.g. ``930.0``) yield the
    correct hour; the original character slicing returned ``'4'`` and ``'93'``
    for those.  Values that cannot be converted to an integer fall back to the
    original slicing rule.

    Parameters
    ----------
    x : int, float or str
        Time of day encoded as ``HHMM`` without a separator.

    Returns
    -------
    str
        The hour without zero padding, e.g. ``'9'`` or ``'15'``.
    """
    try:
        hhmm = int(x)
    except (TypeError, ValueError):
        # Not a plain number: keep the legacy behavior (leading characters).
        text = str(x)
        return text[:2] if len(text) > 3 else text[:1]
    return str(hhmm // 100)
def extract_season(x):
    """Map a month number to its season label (3-5, 6-8, 9-11, otherwise winter)."""
    season_ranges = ((3, 5, '봄'), (6, 8, '여름'), (9, 11, '가을'))
    for first_month, last_month, label in season_ranges:
        if first_month <= x <= last_month:
            return label
    # Every value outside 3..11 is labelled winter.
    return '겨울'
def time_(x):
    """Map an hour to a time-of-day label: 9-11 morning, 12-17 midday, otherwise evening."""
    if 9 <= x <= 11:
        return '아침_구매건수'
    return '점심_구매건수' if 12 <= x <= 17 else '저녁_구매건수'
def half_year(x):
    """Label month numbers 1-5 as the first half and everything else as the second half."""
    return '전반기' if 1 <= x <= 5 else '후반기'
def peak_season(x):
    """Label months 7, 8, 12, 1 and 2 as peak season and all others as off-peak."""
    peak_months = (7, 8, 12, 1, 2)
    if x in peak_months:
        return '성수기'
    return '비성수기'
def div_month(x):
    """Label a day of month: 1-10 early, 11-20 mid, anything else late."""
    if 1 <= x <= 10:
        return '월초'
    return '월중' if 11 <= x <= 20 else '월말'
def noon(x):
    """Classify an ``HHMM``-style time value as morning or afternoon.

    The hour is taken as ``int(x) // 100``.  The original compared the last
    two digits (the minutes) with 12, so e.g. 18:05 was labelled as morning.

    Parameters
    ----------
    x : int or str
        Time of day encoded as ``HHMM`` without a separator.

    Returns
    -------
    str
        ``'오전'`` for hours before 12, ``'오후'`` otherwise.
    """
    hour = int(x) // 100
    return '오전' if hour < 12 else '오후'

일반변수

In [None]:
# Work on a copy of the combined train+test frame and add derived columns.
df = tr.copy()
# Hour of the transaction, extracted as a string and converted to a number.
df['sales_hour'] = df.sales_time.apply(extract_hour)
df['sales_hour'] = pd.to_numeric(df['sales_hour'])
# Hours earlier than 9 are replaced with 21.
df['sales_hour'] = np.where(df['sales_hour'] < 9, 21, df['sales_hour'])
# Time-of-day label derived from the hour.
df['방문시간대'] = df.sales_hour.apply(time_)
# Month values above 12 are shifted down by 12.
df['sales_month'] = df.sales_month.apply(month_modify)
# Half-year and peak-season labels derived from the month.
df['반기'] = df.sales_month.apply(half_year)
df['성수기여부'] = df.sales_month.apply(peak_season)
# Early / mid / late part of the month derived from the day.
df['월_초중말'] = df.sales_day.apply(div_month)
# Morning / afternoon label derived from the transaction time.
df['오전/오후'] = df.sales_time.apply(noon)

# Total purchase amount
f = df.groupby('custid')['tot_amt'].agg([('총구매액', 'sum')]).reset_index()
features = pd.merge(features,f, on = 'custid')
# Average purchase amount
f = df.groupby('custid')['tot_amt'].agg([('평균구매액', 'mean')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Total discount amount
f = pd.DataFrame(df.groupby('custid').dis_amt.sum()).rename(columns={'dis_amt':'총할인금액'})
features = pd.merge(features,f, on = 'custid')
# Average discount amount
f = pd.DataFrame(df.groupby('custid').dis_amt.mean()).rename(columns={'dis_amt':'평균할인금액'})
features = pd.merge(features,f, on = 'custid')

# Total net purchase amount
f = pd.DataFrame(df.groupby('custid').net_amt.sum()).rename(columns={'net_amt':'총실구매액'})
features = pd.merge(features,f, on = 'custid')
# Average net purchase amount
f = pd.DataFrame(df.groupby('custid').net_amt.mean()).rename(columns={'net_amt':'평균실구매액'})
features = pd.merge(features,f, on = 'custid')

# Number of visits per day of week; the 'All' margin column becomes the total
# visit count and the 'All' margin row is dropped.
f = pd.crosstab(df.custid,df.sales_dayofweek, margins=True).reindex(columns=['월','화','수',
                                                        '목','금','토','일','All']).iloc[:-1,:].rename(columns={'All':'총방문횟수'})
features = pd.merge(features,f, on = 'custid')

# Number of visits per hour
f = pd.crosstab(df.custid, df.sales_hour).rename(columns=dict(zip(df.sales_hour.unique(),[str(i)+'시방문' for i in df.sales_hour.unique()])))
features = pd.merge(features,f, on = 'custid')

# Number of imported-goods purchases (sum of import_flg)
f = pd.DataFrame(df.groupby('custid').import_flg.sum()).rename(columns={'import_flg':'수입상품구매건수'})
features = pd.merge(features,f, on = 'custid')

# x = df[df['import_flg'] == 1].groupby('custid').size()
# f = x.reset_index().rename(columns={0: '수입상품_구매건수'}).fillna(0)
# features = pd.merge(features,f, on = 'custid')

# Number of purchases (rows per customer)
f = df.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Average number of installment months
f = df.groupby('custid')['inst_mon'].agg([('평균할부개월수', 'mean')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of visits per store
f = pd.crosstab(df.custid, df.str_nm).rename(columns=dict(zip(df.str_nm.unique(),[i+'방문' for i in df.str_nm.unique()])))
features = pd.merge(features,f, on = 'custid')

# Number of distinct stores visited
f = df.groupby('custid')['str_nm'].agg([('방문지점수',lambda x: x.nunique())])
features = pd.merge(features,f, on = 'custid')

# Number of purchases per month
f = pd.crosstab(df.custid,df.sales_month).rename(columns=dict(zip(df.sales_month.unique(), [str(i)+'월방문' for i in df.sales_month.unique()])))
features = pd.merge(features,f, on = 'custid')

# Number of distinct purchase days.  The date is built from month and day
# only (format '%m-%d'), so no year information is involved.
df['sales_month'] = df['sales_month'].astype(str)
df['sales_day'] = df['sales_day'].astype(str)
df['판매일'] = df['sales_month'] + '-' + df['sales_day']
df.판매일 = pd.to_datetime(df.판매일,format='%m-%d')
f = df.groupby(by = 'custid')['판매일'].agg([('구매일수','nunique')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Purchase cycle: days between the first and last purchase date divided by the
# number of distinct purchase dates, truncated to an integer.
# pd.to_datetime replaces the unit-less astype('datetime64') cast, which
# pandas >= 2.0 rejects; it is a no-op for values that are already datetimes.
f = df.groupby('custid')['판매일'].agg([('구매주기', lambda x: int((pd.to_datetime(x).max() - pd.to_datetime(x).min()).days / x.nunique()))]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Maximum purchase amount
f = df.groupby('custid')['tot_amt'].agg([('최고구매금액', 'max')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of purchases per season
df['sales_month'] = pd.to_numeric(df['sales_month'])
df['계절'] = df.sales_month.apply(extract_season)
f = pd.pivot_table(df, index = 'custid', columns = '계절', values = 'tot_amt',
                  aggfunc = np.size, fill_value = 0).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of purchases per half-year
f = pd.crosstab(df.custid, df.반기).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of purchases by peak / off-peak season
f = pd.crosstab(df.custid, df.성수기여부).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of purchases by part of the month (early / mid / late)
f = pd.crosstab(df.custid, df.월_초중말).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of purchases by morning / afternoon
f = pd.crosstab(df.custid, df['오전/오후']).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of distinct stores visited
f = df.groupby(by = 'custid')['str_nm'].agg([('방문지점개수','nunique')]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Diversity of purchased products
# NOTE(review): the denominator counts distinct corner_nm values while the
# numerator counts distinct goodcd values — confirm this is intended.
n = df.corner_nm.nunique()
f = df.groupby('custid')['goodcd'].agg([('구매상품다양성', lambda x: len(x.unique()) / n)]).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of visits per time-of-day band
f = pd.crosstab(df.custid, df.방문시간대)
features = pd.merge(features,f, on = 'custid')

# Main purchase corner (disabled)
#f = df.groupby('custid')['corner_nm'].agg([('주구매코너', lambda x: x.value_counts().index[0])]).reset_index()
#f = pd.get_dummies(f, columns=['주구매코너'])  # This method performs One-hot-encoding
#features = pd.merge(features,f, on = 'custid')

# Number of purchases per part_nm value
f = pd.pivot_table(df, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0)
f = f.rename(columns=dict(zip(f.columns,[i+'_구매건수' for i in f.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')

# Number of purchases per buyer name
f = pd.pivot_table(df, index='custid', columns='buyer_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0)
f = f.rename(columns=dict(zip(f.columns,[i+'_구매건수' for i in f.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')

# Average net amount of rows with inst_fee == 1.  The fillna(0) is applied to
# the whole merged frame, so it also fills missing values in earlier columns.
f = df.loc[df.inst_fee==1].groupby('custid').net_amt.agg([('무이자할부평균가격','mean')]).reset_index()
features = pd.merge(features,f,on='custid',how='left').fillna(0)

# Number of installment payments (rows with inst_mon > 1); again fillna(0) on
# the whole frame.
f = df.loc[df.inst_mon>1].groupby('custid').inst_mon.agg([('할부결제건수','count')])
features = pd.merge(features,f,on='custid',how='left').fillna(0)

# Purchase amount per visit (a visit is one customer / sale-date pair)
f = df.groupby(['custid','판매일'])['tot_amt'].sum().reset_index().groupby('custid')['tot_amt'].agg([('내점당구매금액','mean')]).reset_index()
features = pd.merge(features,f,on='custid',how='left')

# Number of purchases per visit
f = df.groupby(['custid','판매일'])['tot_amt'].count().reset_index().groupby('custid')['tot_amt'].agg([('내점당구매개수','mean')]).reset_index()
features = pd.merge(features,f,on='custid',how='left')

# Average shopping time per visit
# NOTE(review): the difference is taken on the raw sales_time values without
# converting them to minutes — confirm this is intended.
f = df.groupby(['custid','판매일'])['sales_time'].agg(lambda x: x.max()-x.min()).reset_index().groupby('custid').sales_time.agg([('평균쇼핑시간','mean')])
features = pd.merge(features,f,on='custid',how='left')







# PCA of the per-customer purchase counts per brand (5 components).
# The components are joined on 'custid' instead of being concatenated by row
# position, so the result does not depend on `features` being sorted the same
# way as the pivot table, and they get explicit names instead of the bare
# integer labels 0..4 that the second PCA below would otherwise duplicate.
f = pd.pivot_table(df, index='custid', columns='brd_nm', values='tot_amt',
                   aggfunc=np.size, fill_value=0)
pca = PCA(n_components=5, random_state=0)
f_pca = pca.fit_transform(f)
f = pd.DataFrame(f_pca, index=f.index,
                 columns=[f'brd_nm_cnt_pca_{i}' for i in range(f_pca.shape[1])]).reset_index()
features = pd.merge(features, f, on='custid', how='left')


# PCA of the per-customer purchase amount per brand (150 components),
# joined the same way.
f = pd.pivot_table(df, index='custid', columns='brd_nm', values='tot_amt',
                   aggfunc='sum', fill_value=0)
pca = PCA(n_components=150, random_state=0)
f_pca = pca.fit_transform(f)
f = pd.DataFrame(f_pca, index=f.index,
                 columns=[f'brd_nm_amt_pca_{i}' for i in range(f_pca.shape[1])]).reset_index()
features = pd.merge(features, f, on='custid', how='left')

비율변수

In [None]:
# Re-attach the customer ids.
# NOTE(review): this assumes the rows of `features` are still in
# tr.custid.unique() order — confirm.
features['custid'] = tr.custid.unique()

# Hour of each transaction (string) on the combined frame.
tr['sales_hour'] = tr.sales_time.apply(extract_hour)
# Visits per weekday; the 'All' margin column becomes the total visit count
# and the 'All' margin row is dropped.
weekdays = pd.crosstab(tr.custid,tr.sales_dayofweek, margins=True).reindex(columns=['월','화','수',
                                                        '목','금','토','일','All']).iloc[:-1,:].rename(columns={'All':'총방문횟수'})
# Visits per hour, per store and per month, each with 'All' margins.
sales_hour = pd.crosstab(df.custid, df.sales_hour, margins=True)
sales_hour = sales_hour.rename(columns=dict(zip(sales_hour.columns,[str(i)+'시방문' for i in sales_hour.columns])))
str_nm = pd.crosstab(tr.custid, tr.str_nm,margins=True)
str_nm = str_nm.rename(columns=dict(zip(str_nm.columns,[i+'방문' for i in str_nm.columns])))
sales_month = pd.crosstab(df.custid,df.sales_month, margins=True)
sales_month = sales_month.rename(columns=dict(zip(sales_month.columns, [str(i)+'월방문' for i in sales_month.columns])))
# Season of each transaction on the combined frame.
tr['sales_month'] = pd.to_numeric(tr['sales_month'])
tr['sales_month'] = tr.sales_month.apply(month_modify)
tr['season'] = tr.sales_month.apply(extract_season)
season_visit = pd.crosstab(tr.custid, tr.season)
# Time-of-day band.  The hour comes from the sales_hour column computed above;
# the original int(str(x)[:2]) read a three-digit time such as 930 as hour 93.
tr['mln'] = pd.to_numeric(tr['sales_hour']).apply(time_)
mln = pd.crosstab(tr.custid, tr.mln,margins=True)
# Installment payment count (inst_mon > 1) and total purchase count.
inv = tr.loc[tr.inst_mon>1].groupby('custid').inst_mon.agg([('할부결제건수','count')])
trans_amount = tr.groupby('custid')['tot_amt'].agg([('구매건수', 'size')])
# Row-wise shares: divide by the 'All' margin column, then drop the margin
# row and column.
peak = pd.crosstab(df.custid, df.성수기여부, margins=True)
peak = peak.divide(peak.iloc[:,-1],axis=0).iloc[:-1,:-1]
half = pd.crosstab(df.custid, df.반기, margins=True)
half = half.divide(half.iloc[:,-1],axis=0).iloc[:-1,:-1]
# NOTE: this rebinds the name `noon`, which until here referred to the helper
# function of the same name.
noon = pd.crosstab(df.custid, df['오전/오후'], margins=True)
noon = noon.divide(noon.iloc[:,-1],axis=0).iloc[:-1,:-1]

In [None]:
# Share of imported-goods purchases: rows with import_flg == 1 divided by all
# rows of the customer (customers without such rows get 0).
x = df[df['import_flg'] == 1].groupby('custid').size() / df.groupby('custid').size()
f = x.reset_index().rename(columns={0: '수입상품구매비율'}).fillna(0)
features = pd.merge(features,f, on = 'custid')

# Weekend visit ratio: (Saturday + Sunday purchases) / all purchases.
# The weekday columns are selected by label; the original picked them by
# position, which only hit Saturday and Sunday because of the sort order of
# the day names, and its day-to-integer mapping was never used by the pivot.
weekday_labels = ['월', '화', '수', '목', '금', '토', '일']
df2 = pd.pivot_table(df, index='custid', columns='sales_dayofweek', values='tot_amt',
                     aggfunc=np.size, fill_value=0)
# Guarantee that all seven weekday columns exist even if a day never occurs.
df2 = df2.reindex(columns=weekday_labels, fill_value=0)
df2['주말방문비율'] = (df2['토'] + df2['일']) / df2[weekday_labels].sum(axis=1)
df2 = df2.reset_index()
f = df2[['custid','주말방문비율']]
features = pd.merge(features,f, on = 'custid')

# Visit ratio per day of week (weekday counts divided by the total visit count)
f = weekdays.iloc[:,:-1].divide(weekdays.iloc[:,-1], axis=0).rename(columns=dict(zip(weekdays.columns,
                                                                                    [str(i)+'_prop' for i in weekdays.columns])))
features = pd.merge(features,f, on = 'custid')

# Visit ratio per hour (margin row and column excluded)
f = sales_hour.iloc[:-1,:-1].divide(sales_hour.iloc[:-1,-1],axis=0).rename(columns=dict(zip(sales_hour.columns,
                                                                                    [str(i)+'_prop' for i in sales_hour.columns])))
features = pd.merge(features,f, on = 'custid')
# Visit ratio per store
f = str_nm.iloc[:-1,:-1].divide(str_nm.iloc[:-1,-1],axis=0).rename(columns=dict(zip(str_nm.columns,[str(i)+'_prop' for i in str_nm.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')
# Visit ratio per purchase month
f = sales_month.iloc[:-1,:-1].divide(sales_month.iloc[:-1,-1],axis=0).rename(columns=
                                                        dict(zip(sales_month.columns,[str(i)+'_prop' for i in sales_month.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')
# Visit ratio per season (season counts divided by the total visit count)
f = season_visit.divide(weekdays.총방문횟수,axis=0).rename(columns=dict(zip(season_visit.columns,[column+'_prop' for column in season_visit.columns]))).reset_index()
features = pd.merge(features,f, on = 'custid')
# Visit ratio per time-of-day band
f = mln.div(mln.iloc[:,-1], axis=0).iloc[:-1,:-1].reset_index().rename(columns=dict(zip(mln.columns,[i+'_prop' for i in mln.columns])))
features = pd.merge(features,f, on = 'custid')
# Installment payment ratio (customers without installment rows get 0)
f =(inv['할부결제건수']/trans_amount['구매건수']).reset_index().rename(columns={0:'할부결제비율'}).fillna(0)
features = pd.merge(features,f, on = 'custid')
# Peak-season visit ratio (second column of `peak`)
f = peak.iloc[:,1].reset_index().rename(columns={'성수기':'성수기방문비율'})
features = pd.merge(features,f, on = 'custid')
# Off-peak visit ratio (first column of `peak`)
f = peak.iloc[:,0].reset_index().rename(columns={'비성수기':'비성수기방문비율'})
features = pd.merge(features,f, on = 'custid')
# First-half visit ratio
f = half.iloc[:,0].reset_index().rename(columns={'전반기':'전반기방문비율'})
features = pd.merge(features,f, on = 'custid')
# Second-half visit ratio
f = half.iloc[:,1].reset_index().rename(columns={'후반기':'후반기방문비율'})
features = pd.merge(features,f, on = 'custid')
# Morning visit ratio
f = noon.iloc[:,0].reset_index().rename(columns={'오전':'오전방문비율'})
features = pd.merge(features,f, on = 'custid')
# Afternoon visit ratio
f = noon.iloc[:,1].reset_index().rename(columns={'오후':'오후방문비율'})
features = pd.merge(features,f, on = 'custid')
# Average discount rate (dis_amt / tot_amt per row, averaged per customer)
df['할인율'] = df.dis_amt/df.tot_amt
f = df.groupby('custid')['할인율'].mean().reset_index()
features = pd.merge(features,f,on='custid',how='left')

In [None]:
# Remove the id column so that only feature columns are clipped and scaled.
del features['custid']

In [None]:
# Outlier handling: clip every column to its own 5th-95th percentile range.
features = features.apply(lambda x: x.clip(x.quantile(0.05), x.quantile(0.95)), axis=0)

# Scaling: RobustScaler fitted on, and applied to, all columns in place.
features.loc[:,:] = RobustScaler().fit_transform(features)

In [None]:
# Put the customer ids back after scaling.
# NOTE(review): assumes the rows are still in tr.custid.unique() order — confirm.
features['custid'] = tr.custid.unique()

In [None]:
# Split the combined feature table back into train and test parts by
# left-joining it onto each set's unique customer ids.
train_ids = pd.DataFrame({'custid': df_train.custid.unique()})
X_train = train_ids.merge(features, how='left', on='custid')

test_ids = pd.DataFrame({'custid': df_test.custid.unique()})
X_test = test_ids.merge(features, how='left', on='custid')

merge W2V

In [None]:
# Left-join the X_train2 / X_test2 W2V tables and index the result by customer id.
X_train = X_train.merge(X_train2, how='left', on='custid').set_index('custid')
X_test = X_test.merge(X_test2, how='left', on='custid').set_index('custid')

In [None]:
# Left-join the X_train3 / X_test3 W2V tables and index the result by customer id.
X_train = X_train.merge(X_train3, how='left', on='custid').set_index('custid')
X_test = X_test.merge(X_test3, how='left', on='custid').set_index('custid')

In [None]:
# Left-join the X_train4 / X_test4 W2V tables and index the result by customer id.
X_train = X_train.merge(X_train4, how='left', on='custid').set_index('custid')
X_test = X_test.merge(X_test4, how='left', on='custid').set_index('custid')

In [None]:
# Left-join the X_train5 / X_test5 W2V tables and index the result by customer id.
X_train = X_train.merge(X_train5, how='left', on='custid').set_index('custid')
X_test = X_test.merge(X_test5, how='left', on='custid').set_index('custid')

One-hot Encoding

In [None]:
# One-hot (purchased / not purchased) encoding of `level` per customer.
level = 'corner_nm'
IDtest = df_test.custid.unique()

df_all = pd.concat([df_train, df_test])
# Build the customer x category indicator table once (the original built the
# identical pivot twice); a cell is 1 when the customer has at least one row
# for the category and 0 otherwise.
onehot = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                        aggfunc=lambda x: np.where(len(x) >=1, 1, 0), fill_value=0). \
                        reset_index()
# Customers that appear in the test set go to x_test, all others to x_train.
x_train = onehot.query('custid not in @IDtest').set_index('custid')
x_test = onehot.query('custid in @IDtest').set_index('custid')

# Append the previously built features, joined on the customer id.
x_train = pd.merge(x_train, X_train, on = 'custid')
x_test = pd.merge(x_test, X_test, on = 'custid')

### Feature Selection

In [None]:
# Keep a reference to the full training frame (its column names are used
# later to report which features were selected).
features = x_train
# Model used for the search (a fast model is recommended)
model = LinearRegression()

# Select features by testing whether each feature has a statistically
# significant relationship with the target (class).
# Performance is evaluated while varying the share of features kept.
# NOTE(review): no score_func is passed, so SelectPercentile uses its default
# scoring function; for a regression target f_regression may be intended — confirm.
cv_scores = []
for p in tqdm(range(5,100,1)):
    X_new = SelectPercentile(percentile=p).fit_transform(x_train, y_train)    
    cv_score = cross_val_score(model, X_new, y_train, scoring='neg_mean_squared_error', cv=5).mean()
    cv_scores.append((p,cv_score))

# Print the best percentile (highest negative MSE, i.e. lowest MSE)
best_score = cv_scores[np.argmax([score for _, score in cv_scores])]
print(best_score)

# Plot the performance change with p
plt.plot([k for k, _ in cv_scores], [score for _, score in cv_scores])
plt.xlabel('Percent of features')
plt.grid()

In [None]:
-70.99788356506613
(41, -70.84265044169952)


In [None]:
# 과적합을 피하기 위해 최적의 p값 주변의 값을 선택하는게 더 나은 결과를 얻을 수 있다. 
# Fit the selector with the best percentile found above and apply it to both
# sets.  To avoid overfitting, choosing a value near the best p (rather than
# the best p itself) may give better results.
fs = SelectPercentile(percentile=best_score[0]).fit(x_train, y_train)
x_train = fs.transform(x_train)
x_test = fs.transform(x_test)

# Report the resulting shape and the names of the selected columns.
print(x_train.shape)
print(features.columns[fs.get_support()].tolist())

```python
# Save the selected train and test feature matrices.  The test matrix goes to
# its own file; the original wrote it to 'x_train0606.csv' as well, which
# overwrote the train export.
selected_columns = features.columns[fs.get_support()].tolist()
pd.DataFrame(x_train, columns=selected_columns).to_csv('x_train0606.csv')
pd.DataFrame(x_test, columns=selected_columns).to_csv('x_test0606.csv')
```

### Model Tuning

In [None]:
# Hold out 30% of the training data as a development set (fixed seed).
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

In [None]:
# Candidate regressors, each paired with the parameter grid handed to
# GridSearchCV.  Every grid lists a single value per parameter (or is empty),
# so the search only fits that one configuration per model.
clfs = [
    (
    LinearRegression(),
    {
        
    }),
        (
    Ridge(random_state=0),
    {
        'alpha':[1]
    }),
        (
    Lasso(random_state=0),
    {
        'alpha':[0.010101010101010102]
    }),
        (
    ElasticNet(random_state=0),
    {
        'alpha':[0.010101010101010102]
    }),
#         (
#     ARDRegression(),
#     {
#         'alpha_1':[0],
#         'alpha_2':[1],
#         'lambda_1':[1],
#         'lambda_2':[0.6333]
#     }),
        (
    BayesianRidge(),
    {
        'alpha_1':[0.7272727272727273],
        'alpha_2':[0.030303030303030304],
        'lambda_1':[0.9494949494949496],
        'lambda_2':[0.9494949494949496]
    }),
#         (
#     RandomForestRegressor(random_state=0),
#     {
#         'n_estimators':[400],
#         'max_depth':[None]
#     }),
        (
    XGBRegressor(random_state=0),
    {
        'n_estimators':[400],
        'learning_rate':[0.1],
        'max_depth':[3],
        'gamma':[0.7272727272727273],
        'reg_alpha':[0.9494949494949496],
        'reg_lambda':[0.9494949494949496]
    }),
        (
    LGBMRegressor(random_state=0),
    {
        'n_estimators':[200],
        'num_leaves':[90],
        'learning_rate':[0.1],
        'max_depth':[5],
        'reg_alpha':[0.56],
        'reg_lambda':[0.73]
        
    }),
        (
    CatBoostRegressor(random_state=0),
    {
        
    })
]
# Fit every candidate with GridSearchCV and record
# [model class name, fitted search object, RMSE on the dev set].
clfs_tuned = []
for clf, param_grid in tqdm(clfs):
    start = time.time()
    grid_search = GridSearchCV(clf, param_grid, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(x_train, y_train)
    clf_name = type(clf).__name__
    # score() returns the negative MSE, so sqrt(-score) is the RMSE.
    clf_score = np.sqrt(-grid_search.score(x_dev, y_dev))
    clfs_tuned.append([clf_name, grid_search, clf_score])
    # Model name, dev RMSE and elapsed seconds.
    print('{:30s} {:30f} {:.1f}'.format(clf_name, clf_score, time.time() - start))

In [None]:
 11%|█████████▎                                                                          | 1/9 [00:07<00:58,  7.37s/it]
LinearRegression                                     8.411364 7.4
 22%|██████████████████▋                                                                 | 2/9 [00:08<00:39,  5.57s/it]
Ridge                                                8.382579 1.4
 33%|████████████████████████████                                                        | 3/9 [00:43<01:25, 14.23s/it]
Lasso                                                8.466564 34.4
 44%|█████████████████████████████████████▎                                              | 4/9 [01:12<01:33, 18.76s/it]
ElasticNet                                           8.458563 29.3
 56%|██████████████████████████████████████████████▋                                     | 5/9 [03:28<03:36, 54.03s/it]
ARDRegression                                        8.407394 136.3
 67%|████████████████████████████████████████████████████████                            | 6/9 [03:37<02:01, 40.45s/it]
BayesianRidge                                        8.417398 8.7
 78%|████████████████████████████████████████████████████████████████▌                  | 7/9 [11:04<05:25, 162.52s/it]
XGBRegressor                                         8.357070 447.4
 89%|█████████████████████████████████████████████████████████████████████████▊         | 8/9 [11:40<02:04, 124.28s/it]
LGBMRegressor                                        8.359857 35.1
100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [20:03<00:00, 133.71s/it]
CatBoostRegressor                                    8.294828 503.4

### Model Ensemble

In [None]:
# Collect every tuned model's dev-set predictions as one column per model;
# the column name carries the model name and its dev RMSE (4 decimals).
pred_results = []
for name, clf, clf_score in clfs_tuned:
    pred = list(clf.predict(x_dev))
    name = f'{name} \n({clf_score:.4f})'
    pred_results.append(pd.Series(pred, name=name))
ensemble_results = pd.concat(pred_results, axis=1)
# Convert every cell to a plain float.
# NOTE(review): presumably needed because some models return one-element
# arrays per prediction — confirm before replacing with astype(float).
ensemble_results = ensemble_results.applymap(lambda x: float(x))

# Draw a heatmap to inspect the correlation between the models' predictions.
plt.figure(figsize = (8,6))
g = sns.heatmap(ensemble_results.corr(), annot=True, cmap='Blues')
g.set_title("Correlation between models")
plt.show()

In [None]:
# Mean correlation of each model's predictions with every other model:
# drop the self-correlation (1) from the column sum and average the rest.
corr_matrix = ensemble_results.corr()
corr = (corr_matrix.sum() - 1) / (corr_matrix.shape[0] - 1)
names = corr.index
# Column labels end with "(<score>)". Parse whatever sits inside the last
# parentheses instead of slicing fixed character positions, which silently
# returns a wrong number as soon as the score is not exactly 6 characters.
rmse = np.array([float(label.rsplit('(', 1)[-1].rstrip(')')) for label in names])
df = pd.DataFrame({'model': names, 'rmse': rmse, 'cor': corr})

plt.figure(figsize=(8, 6))
g = sns.scatterplot(x="cor", y="rmse", data=df, s=40, color='red')
# df is indexed by the label strings, so walk the rows directly rather than
# looking them up with integer keys (positional fallback is deprecated).
for row in df.itertuples(index=False):
    g.text(row.cor + 0.003, row.rmse - 0.003,
           row.model, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min() - 0.01, df.cor.max() + 0.01))
plt.ylim((df.rmse.min() - 0.01, df.rmse.max() + 0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('RMSE')
plt.grid()
plt.show()

In [None]:
# Unweighted voting blend of three entries from clfs_tuned.
# Left out of this run: LinearRegression, Lasso, ElasticNet, ARDRegression,
# BayesianRidge, RandomForestRegressor, XGBRegressor.
selected = ['Ridge', 'LGBMRegressor', 'CatBoostRegressor']
models_for_ensemble = [
    (label, est) for label, est, _ in clfs_tuned if label in selected
]
avg_reg = VotingRegressor(estimators=models_for_ensemble)
avg_reg.fit(x_train, y_train)

# RMSE on the dev split (squared=False asks for the root of the MSE).
dev_pred = avg_reg.predict(x_dev)
score = mean_squared_error(dev_pred, y_dev, squared=False)
print(score)

# Write the test-set predictions as a custid/age submission file.
test_pred = avg_reg.predict(x_test)
pd.DataFrame({'custid': test_id, 'age': test_pred}).to_csv(
    'averaging_rid_lgbm_cat.csv', index=False
)

In [None]:
# NOTE(review): presumably dev-set RMSE values jotted down from earlier
# ensemble runs for comparison — confirm; nothing reads these literals.
8.245049744129629 
8.242384614765621
8.239661654729673
8.220307683303453
8.21609175457467

In [None]:
# Pull the best estimator out of every entry of clfs_tuned, keeping the
# same order so positions in `model` line up with clfs_tuned.
model = [search.best_estimator_ for _, search, _ in clfs_tuned]

In [None]:
# Keep copies of the training split for the later refit step.
train_x_all, train_y_all = x_train.copy(), y_train.copy()

# Only positions 1, 6 and 8 of `model` are blended here. Positions 0, 2, 3,
# 4, 5 and 7 (Linear, Lasso, ENet, ARD, Bayesian, LGBM) were tried in earlier
# experiments and are currently switched off.
model_Ridge, model_XGB, model_Cat = model[1], model[6], model[8]

# Each estimator is fitted on the training split and then scored on the dev
# split; note the train_* names actually hold dev-set predictions.
model_Ridge.fit(x_train, y_train)
train_Ridge = model_Ridge.predict(x_dev)

model_XGB.fit(x_train, y_train, verbose=False)
train_XGB = model_XGB.predict(x_dev)

model_Cat.fit(x_train, y_train, verbose=False)
train_Cat = model_Cat.predict(x_dev)

# Order matters: the weight search indexes this list as 0, 1, 2.
models = [model_Ridge, model_XGB, model_Cat]

In [None]:
# Grid-search integer blend weights (i, j, k), each in 1..9 and summing to 10,
# for the three fitted models, keeping the triple with the lowest dev RMSE.

# The dev-set predictions do not depend on the weights, so compute them once
# instead of calling predict() three times for every weight triple.
# Only the first model's output is flattened, as in the original expression.
dev_pred_0 = models[0].predict(x_dev).flatten()
dev_pred_1 = models[1].predict(x_dev)
dev_pred_2 = models[2].predict(x_dev)

weights = []
rmse_best = float('inf')  # any real RMSE beats this, whatever the target scale
for i in range(1, 9):
    for j in range(1, 10 - i):
        # The third weight is fixed by the sum-to-10 constraint, so no inner
        # loop is needed; the visiting order matches the exhaustive scan.
        k = 10 - i - j
        pred = (dev_pred_0 * i + dev_pred_1 * j + dev_pred_2 * k) / 10
        rmse = mean_squared_error(pred, y_dev, squared=False)
        print(rmse, i, j, k)
        # Strict comparison: on ties the first triple found is kept.
        if rmse < rmse_best:
            weights = [i, j, k]
            rmse_best = rmse
print(rmse_best, weights)

In [None]:
# Refit every blended model in place on the saved copies of the training
# split; fit() mutates the estimator, so the list entries need no rebinding.
for estimator in models:
    estimator.fit(train_x_all, train_y_all)

In [None]:
# Weighted blend of the three models' test-set predictions, normalised by the
# total weight (10 for weights produced by the grid search).
pred = (
    models[0].predict(x_test).flatten() * weights[0]
    + models[1].predict(x_test) * weights[1]
    + models[2].predict(x_test) * weights[2]
) / sum(weights)

# Timestamp the file name (MMDD_HHMM) so repeated runs do not overwrite.
t = pd.Timestamp.now()
fname = f"submission_{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"

# Use test_id, the test-set customer ids read at the top of the notebook and
# used by the other submission cells; the previous name `IDtest` is not the
# identifier the rest of this file uses for those ids.
submissions = pd.concat(
    [pd.Series(test_id, name="custid"), pd.Series(pred, name="age")], axis=1
)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

In [None]:
# Unweighted voting blend of four entries from clfs_tuned, fitted on the
# x_train2 / y_train2 split.
# Left out of this run: LinearRegression, Lasso, ElasticNet, BayesianRidge,
# RandomForestRegressor, XGBRegressor.
selected = ['Ridge', 'ARDRegression', 'LGBMRegressor', 'CatBoostRegressor']
models_for_ensemble = [
    (label, est) for label, est, _ in clfs_tuned if label in selected
]
avg_reg = VotingRegressor(estimators=models_for_ensemble)
avg_reg.fit(x_train2, y_train2)

# RMSE on the dev split; the bare name on the last line displays it.
dev_pred = avg_reg.predict(x_dev)
score = mean_squared_error(dev_pred, y_dev, squared=False)
score

In [None]:
# Unweighted voting blend of Ridge and CatBoost only, fitted on the
# x_train2 / y_train2 split.
# Left out of this run: LinearRegression, Lasso, ElasticNet, ARDRegression,
# BayesianRidge, RandomForestRegressor, XGBRegressor, LGBMRegressor.
selected = ['Ridge', 'CatBoostRegressor']
models_for_ensemble = [
    (label, est) for label, est, _ in clfs_tuned if label in selected
]
avg_reg = VotingRegressor(estimators=models_for_ensemble)
avg_reg.fit(x_train2, y_train2)

# RMSE on the dev split; the bare name on the last line displays it.
dev_pred = avg_reg.predict(x_dev)
score = mean_squared_error(dev_pred, y_dev, squared=False)
score

### Model Stacking

In [None]:
# Base learners fed into the stack.
# Left out: LinearRegression, Lasso, ElasticNet, ARDRegression, BayesianRidge,
# RandomForestRegressor, XGBRegressor, LGBMRegressor.
selected = ['Ridge', 'CatBoostRegressor']
models_for_ensemble = [(name, clf) for name, clf, score in clfs_tuned if name in selected]

# Candidate final (meta) estimators; every other model is currently left out.
selected_s = ['CatBoostRegressor']
models_for_stacking = [(name, clf) for name, clf, score in clfs_tuned if name in selected_s]

# The score is an RMSE, so lower is better: start from +inf and keep the
# smallest value. The earlier version started at 0 and kept the largest
# score, which selects the worst stack whenever several are compared.
best_score = float('inf')
for name, clf in tqdm(models_for_stacking):
    stk_reg = StackingRegressor(estimators=models_for_ensemble, final_estimator=clf)
    stk_reg.fit(x_train2, y_train2)
    score = mean_squared_error(stk_reg.predict(x_dev), y_dev, squared=False)
    if score < best_score:
        best_stk_ensemble = (stk_reg, score)
        best_score = score

# NOTE: this rebinds `models` (the list used by the weighted-blend cells) to
# the winning StackingRegressor; rerun those cells before reusing the list.
models, score = best_stk_ensemble
print('{}\n{}'.format(models, score))

### Deployment

In [None]:
# Final ensemble: unweighted voting blend of Ridge and CatBoost, fitted on
# x_train / y_train and used to produce the submission file.
# Left out: LinearRegression, Lasso, ElasticNet, ARDRegression, BayesianRidge,
# RandomForestRegressor, XGBRegressor, LGBMRegressor.
selected = ['Ridge', 'CatBoostRegressor']
models_for_ensemble = [
    (label, est) for label, est, _ in clfs_tuned if label in selected
]
avg_reg = VotingRegressor(estimators=models_for_ensemble)
avg_reg.fit(x_train, y_train)

# Write the test-set predictions as a custid/age submission file.
test_pred = avg_reg.predict(x_test)
pd.DataFrame({'custid': test_id, 'age': test_pred}).to_csv(
    'averaging_rid_cat.csv', index=False
)

In [None]:
# Persist the fitted voting ensemble with pickle; the output file name is
# written without an extension.
with open('averaging_rid_cat', 'wb') as model_file:
    pickle.dump(avg_reg, model_file)