In [None]:
import numpy as np
import numpy.random as random
import pandas as pd
import datetime
import math

import sklearn
from sklearn.model_selection import train_test_split
import lightgbm as lgb

from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape, Dropout
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding

# Show up to 200 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 200)
# IPython magic: pretty-print floating point output with 3 decimal places.
%precision 3

In [None]:
# Load the data: the competition tables first, then the supplementary
# store-state and weather tables.
train, test, store = (
    pd.read_csv("/kaggle/input/rossmann-store-sales/train.csv"),
    pd.read_csv("/kaggle/input/rossmann-store-sales/test.csv"),
    pd.read_csv("/kaggle/input/rossmann-store-sales/store.csv"),
)
state, state_name, weathers = (
    pd.read_csv("/kaggle/input/rossmann-store-extra/store_states.csv"),
    pd.read_csv("/kaggle/input/rossmann-store-extra/state_names.csv"),
    pd.read_csv("/kaggle/input/rossmann-store-extra/weather.csv"),
)
# Row count of the raw test table; the submission is built for Ids 1..L.
L = len(test)

In [None]:
def rmspe(y_pred, y_test):
    """Root mean squared percentage error of ``y_pred`` against ``y_test``.

    Entries whose true value is exactly zero are left out of the score,
    because their relative error is undefined (division by zero).

    Args:
        y_pred: Predicted values (array-like, broadcastable to ``y_test``).
        y_test: True values (array-like).

    Returns:
        The RMSPE over the non-zero targets as a float, or ``nan`` when
        every target is zero (or the input is empty).
    """
    y_pred, y_test = np.broadcast_arrays(
        np.asarray(y_pred, dtype=float),
        np.asarray(y_test, dtype=float),
    )
    scored = y_test != 0
    if not scored.any():
        return float('nan')
    relative_error = (y_pred[scored] - y_test[scored]) / y_test[scored]
    return np.sqrt(np.mean(relative_error ** 2))

def date(data):
    """Add calendar features derived from the ``Date`` column, in place.

    ``Year``, ``Month`` and ``Day`` are shifted to start at 0 so they can be
    used directly as Embedding indices.  ``WeekOfYear`` keeps the ISO week
    number as is (starting at 1).

    Columns written: ``Date`` (converted to datetime), ``Year`` (offset from
    the earliest year in ``data``), ``Month``, ``Day``, ``DayOfWeek``,
    ``WeekOfYear``, ``Days`` (days since the earliest date, divided by the
    largest such value), ``DayOfMonth`` (0/1/2 bucket of ``Day``) and
    ``QuadYear`` (0..3 bucket of ``WeekOfYear``).

    Args:
        data: DataFrame with a ``Date`` column parseable by
            ``pd.to_datetime``; dates are assumed to be non-missing.

    Returns:
        The same ``data`` object with the columns above added.
    """
    data['Date'] = pd.to_datetime(data['Date'])
    dates = data['Date'].dt

    data['Year'] = dates.year - dates.year.min()
    data['Month'] = dates.month - 1
    data['Day'] = dates.day - 1
    data['DayOfWeek'] = dates.dayofweek

    # ``Series.dt.weekofyear`` was removed in pandas 2.0; ``isocalendar()``
    # is its replacement.  Fall back to the old accessor on pandas versions
    # that predate ``isocalendar``.
    if hasattr(dates, 'isocalendar'):
        data['WeekOfYear'] = dates.isocalendar().week.astype('int64')
    else:
        data['WeekOfYear'] = dates.weekofyear

    elapsed_days = (data['Date'] - data['Date'].min()).dt.days
    data['Days'] = elapsed_days / elapsed_days.max()

    # Bucket boundaries are unchanged from the row-by-row version:
    # Day <= 10 -> 0, <= 20 -> 1, otherwise 2.
    day = data['Day']
    data['DayOfMonth'] = (day > 10).astype('int64') + (day > 20).astype('int64')
    # WeekOfYear <= 13 -> 0, <= 26 -> 1, <= 39 -> 2, otherwise 3.
    week = data['WeekOfYear']
    data['QuadYear'] = ((week > 13).astype('int64')
                        + (week > 26).astype('int64')
                        + (week > 39).astype('int64'))
    return data

def label(data, column):
    """Label-encode a categorical column in place.

    Each distinct value of ``data[column]`` is replaced by its position in
    ``data[column].unique()`` (order of first appearance, starting at 0).
    Missing values stay missing; their position in ``unique()`` is simply
    not used as a label.

    The codes are computed from the original values in a single pass.
    Writing labels into the column while still comparing against it would
    merge classes whenever an original value equals an already assigned
    label (for example a column holding both ``'x'`` and ``0``).

    Args:
        data: DataFrame to modify.
        column: Name of the column to encode.

    Returns:
        None; ``data[column]`` is overwritten with the integer labels.
    """
    values = data[column]
    codes = {}
    for code, value in enumerate(values.unique()):
        if not pd.isna(value):
            codes[value] = code
    data[column] = values.map(codes)

def replace(data, weights, features, drop=True):
    """Swap categorical columns for their learned entity-embedding weights.

    For every name in ``features`` the frame ``weights[name]`` is
    left-merged onto ``data`` using ``name`` as the key, which appends the
    embedding columns.  When ``drop`` equals ``True`` the original
    categorical column is removed after each merge.

    Args:
        data: DataFrame containing the categorical columns.
        weights: Mapping from feature name to a DataFrame of embedding weights.
        features: Iterable of feature names to replace.
        drop: Whether to remove the original categorical columns.

    Returns:
        A new DataFrame with the embedding columns merged in.
    """
    remove_original = (drop == True)
    merged = data
    for name in features:
        merged = merged.merge(weights[name], how='left', on=[name])
        if remove_original:
            merged = merged.drop(columns=[name])
    return merged

def pre_store(store, state):
    """Preprocess the store table and attach the store-to-state mapping.

    Missing ``CompetitionDistance`` values are replaced by the column
    median, every other missing value by 0, and ``state`` is left-merged on
    ``Store``.

    The fills are done by assignment on a new frame rather than by chained
    ``store[col].fillna(..., inplace=True)``: the chained form silently does
    nothing under pandas Copy-on-Write, and it also modified the caller's
    frame.

    Args:
        store: Store DataFrame with ``Store`` and ``CompetitionDistance``
            columns.  It is not modified.
        state: DataFrame with a ``Store`` key column to merge in.

    Returns:
        A new DataFrame: the filled store table joined with ``state``.
    """
    distance = store['CompetitionDistance']
    filled = store.assign(CompetitionDistance=distance.fillna(distance.median()))
    filled = filled.fillna(0)
    return filled.merge(state, how='left', on=['Store'])

def pre_weather(data, weather):
    """Attach daily weather features to ``data`` and clean them.

    The weather table's ``file`` column is renamed to ``StateName`` and
    joined with the module-level ``state_name`` table, then the result is
    left-merged onto ``data`` by ``State`` and ``Date``.  Afterwards
    ``Events`` and ``State`` are label-encoded, missing weather values are
    filled, and the ``Date`` and ``StateName`` columns are dropped.

    Missing values are filled by assignment: chained
    ``data[col].fillna(..., inplace=True)`` silently does nothing under
    pandas Copy-on-Write.

    Args:
        data: Feature DataFrame with ``State`` and ``Date`` columns
            (``Date`` must already be datetime to match the weather table).
        weather: Raw weather DataFrame with ``file`` and ``Date`` columns.

    Returns:
        A new DataFrame with the weather columns merged in.
    """
    weather = weather.rename(columns={'file': 'StateName'})
    # NOTE: ``state_name`` is read from module scope, not passed in.
    weather = weather.merge(state_name, how='left', on=['StateName'])
    weather['Date'] = pd.to_datetime(weather['Date'])
    data = data.merge(weather, how='left', on=['State', 'Date'])

    # Days without a recorded event get their own category ('NaN' string).
    data['Events'] = data['Events'].fillna('NaN')
    label(data, 'Events')
    label(data, 'State')

    data['Max_Gust_SpeedKm_h'] = data['Max_Gust_SpeedKm_h'].fillna(0)
    # Remaining gaps are filled with the column median.
    for column in ('CloudCover', 'Max_VisibilityKm', 'Mean_VisibilityKm',
                   'Min_VisibilitykM'):
        data[column] = data[column].fillna(data[column].median())

    data = data.drop(['Date', 'StateName'], axis=1)
    return data

def preprocess(train, test, store):
    """Build the combined, feature-engineered train/test frame.

    Steps: tag train rows with ``Id = 0``, fill missing test values with 1,
    stack both tables, add the date features, drop rows where ``Open`` or
    ``Sales`` is 0, join the store table, derive ``CompetitionOpen`` /
    ``PromoOpen`` and the log targets, and label-encode the categorical
    columns.  ``Customers`` and ``Open`` are dropped at the end.

    The inputs are not modified; ``train`` and ``test`` are copied before
    the ``Id`` column and the fill are applied, so the function can be
    re-run on the same raw frames.

    Args:
        train: Raw training DataFrame (with ``Sales`` and ``Customers``).
        test: Raw test DataFrame (with ``Id``).
        store: Preprocessed store DataFrame keyed by ``Store``.

    Returns:
        A new DataFrame holding train rows (``Id == 0``) and test rows
        (``Id != 0``).  Test rows have missing ``SalesLog``/``CustomersLog``.
    """
    train = train.assign(Id=0)
    test = test.fillna(1)
    data = pd.concat([train, test], sort=False)
    data = date(data)
    # Test rows have missing Sales, so ``Sales != 0`` keeps them.
    data = data[(data['Open'] != 0) & (data['Sales'] != 0)]
    data = pd.merge(data, store, how='inner', on='Store')
    data['Store'] = data['Store'] - 1  # zero-based index for the Embedding

    # ``Year`` and ``Month`` are zero-based offsets after ``date()``, so the
    # elapsed-time features use the real calendar year/month from ``Date``.
    # The previous ``2015 - Year`` made the elapsed time shrink as the
    # calendar year advanced.
    calendar_year = data['Date'].dt.year
    calendar_month = data['Date'].dt.month
    data['CompetitionOpen'] = 12 * (calendar_year - data['CompetitionOpenSinceYear']) + \
        (calendar_month - data['CompetitionOpenSinceMonth'])
    data['PromoOpen'] = 12 * (calendar_year - data['Promo2SinceYear']) + \
        (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0

    data['SalesLog'] = data['Sales'].map(math.log)
    data['CustomersLog'] = data['Customers'].map(math.log)

    # The raw column mixes the integer 0 and the string '0'; unify them
    # before encoding so they share one label.
    data.loc[data['StateHoliday'] == 0, 'StateHoliday'] = '0'
    for column in ('StateHoliday', 'StoreType', 'Assortment', 'PromoInterval'):
        label(data, column)

    data = data.drop(['Customers', 'Open'], axis=1)
    return data

In [None]:
class EntitiyEmbedding:
    """Builder for a Keras network that learns entity embeddings.

    Categorical features are registered with ``add`` (one Embedding branch
    each) and other single-column features with ``dense``.  ``concatenate``
    joins all branches, stacks the fully connected head and compiles the
    model.  After ``fit`` the embedding matrices can be read back with
    ``get_weight``.
    """

    def __init__(self):
        # Names of all registered features, in registration order.
        self.features = []
        # Names of the features that own an Embedding layer.
        self.embeddings = []
        # One Keras input tensor and one branch output per registered feature.
        self.input_model = []
        self.output_model = []

    def add(self, feature, input_shape, output_shape):
        """Register a categorical feature with its own Embedding branch.

        Args:
            feature: Column name; also used to name the input/embedding layers.
            input_shape: First argument passed to ``Embedding``.
            output_shape: Second argument passed to ``Embedding`` and the
                length of the reshaped branch output.
        """
        self.features.append(feature)
        self.embeddings.append(feature)
        branch_input = Input(shape=(1,), name=(feature + '_input'))
        embedded = Embedding(input_shape, output_shape,
                             name=(feature + '_embedding'))(branch_input)
        flattened = Reshape(target_shape=(output_shape,))(embedded)
        self.input_model.append(branch_input)
        self.output_model.append(flattened)

    def dense(self, feature, output_shape):
        """Register a single-column feature that goes through a Dense layer.

        Args:
            feature: Column name; also used to name the input/dense layers.
            output_shape: Number of units of the Dense layer.
        """
        self.features.append(feature)
        branch_input = Input(shape=(1,), name=(feature + '_input'))
        projected = Dense(output_shape, name=(feature + '_dense'))(branch_input)
        self.input_model.append(branch_input)
        self.output_model.append(projected)

    def concatenate(self):
        """Join all branches, add the dense head and compile ``self.model``.

        The head is Dense(1000) and Dense(500), each followed by ReLU, then
        Dense(1) with a sigmoid.  The model is compiled with mean absolute
        error and the Adam optimizer.
        """
        hidden = Concatenate()(self.output_model)
        for units in (1000, 500):
            hidden = Dense(units, kernel_initializer="uniform")(hidden)
            hidden = Activation('relu')(hidden)
        prediction = Activation('sigmoid')(Dense(1)(hidden))
        self.model = KerasModel(inputs=self.input_model, outputs=prediction)
        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def split_features(self, X):
        """Split ``X`` into one single-column frame per registered feature.

        Returns:
            Dict mapping ``'<feature>_input'`` to ``X[[feature]]``, matching
            the names of the model's input layers.
        """
        return {name + '_input': X[[name]] for name in self.features}

    def fit(self, X_train, y_train, X_test, y_test, epochs=12, batch_size=128):
        """Train the model, validating on ``(X_test, y_test)`` every epoch.

        ``X_test`` is remembered so that ``predict()`` can be called without
        arguments afterwards.
        """
        self.X_test = X_test
        validation = (self.split_features(X_test), y_test)
        self.model.fit(self.split_features(X_train), y_train,
                       validation_data=validation,
                       epochs=epochs,
                       batch_size=batch_size)

    def predict(self, X=None):
        """Predict for ``X``, or for the validation frame from ``fit`` if omitted."""
        frame = self.X_test if X is None else X
        return self.model.predict(self.split_features(frame))

    def get_weight(self):
        """Return the embedding matrices as DataFrames.

        Returns:
            Dict mapping each embedded feature name to a DataFrame whose
            rows are the embedding vectors, whose columns are named
            ``'<feature>_<i>'`` and whose index is named after the feature.
        """
        weights = {}
        for name in self.embeddings:
            matrix = self.model.get_layer(name + '_embedding').get_weights()[0]
            column_names = [name + '_' + str(i) for i in range(matrix.shape[1])]
            frame = pd.DataFrame(matrix, columns=column_names)
            frame.index.names = [name]
            weights[name] = frame
        return weights

In [None]:
# Run the preprocessing: clean the store table, build the combined
# train/test feature frame, then attach the weather features.
store = pre_store(store, state)
data = pre_weather(preprocess(train, test, store), weathers)

# Maxima of the log targets; cmax is used to scale CustomersLog for the
# embedding network.
smax, cmax = data['SalesLog'].max(), data['CustomersLog'].max()

In [None]:
# Learn the embedding weights with the EntitiyEmbedding class.
# Categorical variables get a layer via
#   model.add(column name, number of labels, output dimension (free to tune)).
# Continuous or 0/1-only features get a layer via
#   model.dense(column name, number of outputs).

# (column name, number of labels, embedding dimension)
embedding_specs = [
    ('Store', 1115, 10),
    ('DayOfWeek', 7, 6),
    ('Year', 3, 2),
    ('Day', 31, 10),
    ('Month', 12, 6),
    ('DayOfMonth', 3, 2),
    ('QuadYear', 4, 3),
    ('State', 12, 6),
    ('StateHoliday', 4, 3),
    ('WeekOfYear', 53, 10),
    ('Assortment', 3, 2),
    ('StoreType', 4, 3),
    ('PromoInterval', 4, 3),
    ('Events', 22, 10),
]

model = EntitiyEmbedding()
for feature_name, n_labels, n_dims in embedding_specs:
    model.add(feature_name, input_shape=n_labels, output_shape=n_dims)
model.dense('Promo', output_shape=1)
model.concatenate()

In [None]:
# Split the preprocessed frame back into train rows (Id == 0) and test rows.
train = data[data['Id']==0]
# Shuffle the training rows reproducibly.
train = train.sample(frac=1, random_state=22)

test = data[data['Id']!=0]
# Keep the submission Ids before the Id column is dropped later on.
Id = test['Id']

X = train.copy()
y = train[['CustomersLog', 'SalesLog']]

# 200,000 rows are used to learn the embedding weights.
# Learning the weights and fitting the predictor on separate data limits overfitting.
X_train, X_ee, y_train, y_ee = train_test_split(X, y, test_size=200000, random_state=44)

In [None]:
# Learn the embedding weights on the held-out 200,000 rows (X_ee), using the
# predictor's training rows (X_train) as validation data.  The target is
# CustomersLog scaled by its maximum.
model.fit(
    X_train=X_ee,
    y_train=y_ee['CustomersLog'] / cmax,
    X_test=X_train,
    y_test=y_train['CustomersLog'] / cmax,
    epochs=12,
)

In [None]:
# Fetch the learned weights as a dict of DataFrames.
weights = model.get_weight()

# Replace the categorical columns with their learned weights.
X_train = replace(X_train, weights, model.embeddings)
test = replace(test, weights, model.embeddings)

# Remove the columns that are not used as predictor inputs.
unused_columns = [
    'Sales', 'Id', 'SalesLog', 'CustomersLog',
    'Dew_PointC', 'MeanDew_PointC', 'Min_DewpointC',
    'Max_Sea_Level_PressurehPa', 'Mean_Sea_Level_PressurehPa',
    'Min_Sea_Level_PressurehPa',
]
X_train = X_train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

X_train.head()

In [None]:
# Train the predictor with LightGBM on the log of sales.

params = {"objective" : "rmse",
          "boosting" : "gbdt", 
          "metric" : "rmse",
          "num_iterations" : 7500,
          "top_k" : 30, 
          "max_depth" : 8, 
          "num_leaves" : 800, 
          "min_data_in_leaf" : 20, 
          "learning_rate" : 0.02,
          "bagging_fraction" : 0.7, 
          "bagging_seed" : 3,
          "bagging_freq" : 5, 
          "feature_fraction" : 0.5, 
          "num_threads" : 4
         }

dataset_params = {"max_bin" : 200, 
                  "min_data_in_bin" : 3 
                 }
lgb_train = lgb.Dataset(X_train, label=y_train['SalesLog'], params=dataset_params)
# `verbose_eval` is omitted: it was removed from lgb.train in LightGBM 4.0,
# and without validation sets it printed nothing anyway.
# NOTE: `model` is rebound here from the embedding network to the booster.
model = lgb.train(params, lgb_train, keep_training_booster=True)

In [None]:
# Predict log-sales for the test rows and map them back to the sales scale.
pred = np.exp(model.predict(test))
pred = pd.DataFrame(pred.T, index=Id, columns=['Sales'])

# Build the submission for every Id in 1..L; Ids without a prediction
# (rows filtered out before prediction) get Sales = 0.
df = pd.DataFrame({'Id': range(1, L + 1)})
df = df.merge(pred, how='left', on=['Id'])
df = df.fillna(0)
df.to_csv("/kaggle/working/submission.csv", index=False)