In [10]:
# General imports
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import math
import random
import sys, gc, time
import os

# data
import datetime
import itertools
import json
import pickle

# visualize
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler #StandardScaler
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

# model
import lightgbm as lgb
from bayes_opt import BayesianOptimization

# custom modules
# from engine.features_yj import Features
from preprocess import load_df_added, drop_useless, check_na, run_label_all, remove_outliers, run_stdscale

In [4]:
# Directory holding the feature-engineered training pickles (relative to the notebook).
featured_DATA_DIR = '../data/20'

# Load the four training frames in one pass.
# NOTE: unpickling can execute arbitrary code -- only read pickle files you trust.
# Other pickles referenced in earlier experiments (e.g. train_fin_light_ver.pkl) are not loaded here.
df_wd_lag, df_wd_no_lag, df_wk_lag, df_wk_no_lag = (
    pd.read_pickle(featured_DATA_DIR + pickle_name)
    for pickle_name in (
        '/train_fin_wd_lag.pkl',
        '/train_fin_wd_no_lag.pkl',
        '/train_fin_wk_lag.pkl',
        '/train_fin_wk_no_lag.pkl',
    )
)

In [5]:
## Drop the lag columns that each frame does not use.
## NOTE: re-running this cell raises KeyError because the columns are already gone.
# df_wd_lag loses the two 'lag_sales_wk_*' columns.
df_wd_lag = df_wd_lag.drop(['lag_sales_wk_1', 'lag_sales_wk_2'], axis=1)
# df_wk_lag loses the five 'lag_sales_wd_*' columns.
df_wk_lag = df_wk_lag.drop(
    ['lag_sales_wd_1', 'lag_sales_wd_2', 'lag_sales_wd_3', 'lag_sales_wd_4', 'lag_sales_wd_5'],
    axis=1,
)


In [6]:
# First group of lag features (NA-filled by na_to_zeroes).
lag_col1 = [
    'lag_scode_count',
    'lag_mcode_price', 'lag_mcode_count',
    'lag_bigcat_price', 'lag_bigcat_count',
    'lag_bigcat_price_day', 'lag_bigcat_count_day',
    'lag_small_c_price', 'lag_small_c_count',
]

# Second group of lag features (NA-filled by na_to_zeroes, excluded from PCA by drop_cat).
lag_col2 = [
    'rolling_mean_7', 'rolling_mean_14',
    'lag_sales_wd_1', 'lag_sales_wd_2', 'lag_sales_wd_3', 'lag_sales_wd_4', 'lag_sales_wd_5',
    'lag_sales_wk_1', 'lag_sales_wk_2',
    'ts_pred',
    'rolling_mean_mcode_7', 'rolling_mean_mcode_14',
]

# Columns treated as categorical (excluded from PCA by drop_cat).
# NOTE(review): 'pay' is listed twice; harmless for the membership tests in this
# notebook, kept as-is so the list content is unchanged.
cat_col = [
    '상품군', 'weekdays', 'show_id', 'small_c', 'middle_c', 'big_c',
    'pay', 'months', 'hours_inweek', 'weekends', 'japp', 'parttime',
    'min_start', 'primetime', 'prime_origin', 'prime_smallc',
    'freq', 'bpower', 'steady', 'men', 'pay', 'luxury',
    'spring', 'summer', 'fall', 'winter', 'rain',
]

In [8]:
## simple function that will be used for run_preprocess
def na_to_zeroes(df):
    """
    :objective: Replace NA values with 0 in the lag columns only
                (columns named in lag_col1 or lag_col2); other columns are untouched.
    :param df: pandas dataframe that may contain some of the lag columns
    :return: pandas dataframe -- a filled copy; the input frame is NOT modified,
             so re-running a cell on the raw frame always starts from the same data
    """
    lag_names = set(lag_col1) | set(lag_col2)
    # Keep the frame's own column order for the columns that are actually present.
    present = [col for col in df.columns if col in lag_names]

    filled = df.copy()
    if present:
        filled[present] = filled[present].fillna(0)

    return filled

def drop_cat(df_pca):
    """
    :objective: Prepare a frame for PCA by removing every column listed in
                cat_col or lag_col2, plus the target column '취급액'.
    :param df_pca: pandas dataframe; must contain a '취급액' column
                   (a missing target raises KeyError from DataFrame.drop)
    :return: pandas dataframe with the remaining columns (input is not modified)
    """
    excluded = cat_col + lag_col2
    to_remove = [name for name in df_pca.columns if name in excluded]

    return df_pca.drop(columns = to_remove).drop(columns = '취급액')

def run_pca(df_pca_scaled, n_components = 5):
    """
    :objective: Fit PCA on the given (already scaled) data and project it.
    :param df_pca_scaled: data to fit and transform; passed straight to PCA.fit / PCA.transform
    :param n_components: number of principal components to keep (default 5).
                         Previously this argument was ignored and 5 was always used.
    :return: the projected data as returned by PCA.transform
             (one column per component, one row per input row)
    """
    pca = PCA(n_components = n_components)
    pca.fit(df_pca_scaled)
    df_pca = pca.transform(df_pca_scaled)

    return df_pca

## Run the whole preprocessing pipeline in one call.
## PCA is optional and is only applied to numeric features other than the 'lag' ones.
## NOTICE: outlier removal runs before the data is split into train/validation sets.
## If replace = True, the PCA components replace the corresponding numerical columns.
## To simply append the PCA columns to the original data, set replace = False.
def run_preprocess(df, pca = True, replace = True):
    """
    :objective: Run NA imputation, outlier removal, label encoding and (optionally) PCA.
    :param df: pandas dataframe to preprocess
    :param pca: if True, compute PCA components from the columns kept by drop_cat
                (after run_stdscale) and attach them to the result
    :param replace: only used when pca is True.
                    True  -> result holds only the cat_col/lag_col2 columns plus the PCA
                             components; every other column (including the target '취급액')
                             is not carried over.
                    False -> PCA components are appended to the full preprocessed frame.
    :return: pandas dataframe; PCA component columns are labelled 0..k-1
    :raises ValueError: if the PCA output has a different number of rows than the
                        preprocessed frame (raised by the DataFrame constructor)
    """
    df = na_to_zeroes(df)
    df = remove_outliers(df)
    df = run_label_all(df)
    df1 = df.copy()
    if not pca:
        return df1

    xcol = [x for x in df1.columns if x in cat_col+lag_col2]
    components = run_pca(run_stdscale(drop_cat(df1.copy())))

    # Give the components the same index as df1. A default 0..n-1 index would no longer
    # match df1 once remove_outliers has dropped rows, and concat(axis=1) aligns on the
    # index -- producing NaN-padded, misaligned rows.
    df_components = pd.DataFrame(components, index=df1.index)

    kept = df1[xcol] if replace else df1
    return pd.concat([kept, df_components], axis=1)


In [54]:
# df_wd_lag_PP = run_preprocess(df_wd_lag, pca = False, replace = False)
# df_wd_no_lag_PP = run_preprocess(df_wd_no_lag, pca = True, replace =False)
#df_wk_lag_PP = run_preprocess(df_wk_lag, pca = True, replace = False)
#df_wk_no_lag_PP = run_preprocess(df_wk_no_lag, pca = True, replace = False)

In [11]:
# change here
# Fill lag NAs, then split by row position: the first 16904 rows are used for training,
# the remaining rows for validation. 'show_id' and the target '취급액' are removed from the features.
df_wd_lag_PP = na_to_zeroes(df_wd_lag)

train1_x, val1_x = (
    part.drop(columns = ['show_id','취급액'])
    for part in (df_wd_lag_PP.iloc[:16904, :], df_wd_lag_PP.iloc[16904:, :])
)
train1_y = df_wd_lag_PP['취급액'].iloc[:16904]
val1_y = df_wd_lag_PP['취급액'].iloc[16904:]

In [12]:
def neg_mape(y_true, y_pred):
    """
    Negative mean absolute percentage error, in percent.

    Both arguments are converted with np.array; the error is taken relative to
    y_true, so zeros in y_true yield inf/nan. The sign is flipped on return.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return (-1) * (np.mean(relative_error) * 100)

In [13]:
def get_rmse(y_true, y_pred):
    """Root mean squared error between y_true and y_pred (both converted with np.array)."""
    residual = np.array(y_true) - np.array(y_pred)
    return np.sqrt(np.mean(residual**2))

In [14]:
def get_mae(y_true, y_pred):
    """Mean absolute error between y_true and y_pred (both converted with np.array)."""
    residual = np.array(y_true) - np.array(y_pred)
    return np.mean(np.abs(residual))

## DNN with categorical embeddings

In [15]:
#  hours in week, small_c, middle_c, big_c, weekdays

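Check the cardinality (number of distinct values) of each categorical variable; it determines the size of each embedding table.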

In [17]:
# Number of distinct values per categorical feature in the training split.
train1_x.loc[:, ['hours_inweek', 'small_c', 'middle_c', 'big_c', 'weekdays']].nunique()

hours_inweek    100
small_c         169
middle_c         65
big_c             9
weekdays          5
dtype: int64

In [18]:
import tensorflow as tf
import tensorflow.keras as keras

#from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, concatenate, Flatten, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback
import matplotlib.pyplot as plt

# Bayesian Methods for Hackers style sheet
plt.style.use('bmh')

# Seed both NumPy and TensorFlow: Keras weight initialisation and Dropout draw from
# TensorFlow's generator, so seeding NumPy alone does not make training repeatable.
# (Some GPU kernels may still be non-deterministic.)
np.random.seed(1234567890)
tf.random.set_seed(1234567890)
import gc

In [19]:
class PeriodicLogger(Callback):
    """
    A helper callback class that only prints the losses once in 'display' epochs.

    :param display: print a line every `display` completed epochs
    """
    def __init__(self, display=100):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.display = display
        self.epochs = 0

    def on_train_begin(self, logs=None):
        # Restart the counter for every call to fit().
        self.epochs = 0

    def on_epoch_end(self, batch, logs=None):
        # Keras passes the epoch index as the first positional argument; it is unused here
        # (the parameter keeps its original name `batch`).
        logs = logs or {}
        self.epochs += 1
        if self.epochs % self.display == 0:
            # Missing entries (e.g. no validation data was given to fit) print as nan
            # instead of raising KeyError in the middle of training.
            loss = logs.get('loss', float('nan'))
            val_loss = logs.get('val_loss', float('nan'))
            print ("Epoch: %d - loss: %f - val_loss: %f" % (self.epochs, loss, val_loss))


# Logger instance that reports every 250 epochs.
periodic_logger_250 = PeriodicLogger(250)

#### Separate the continuous and categorical variables 

In [20]:
# change here

# Continuous features: fed to the network's dense input.
continuous_cols = [
    'rolling_mean_7', 'rolling_mean_14',
    'rolling_mean_mcode_7', 'rolling_mean_mcode_14',
    'lag_sales_wd_1', 'lag_sales_wd_2', 'lag_sales_wd_3', 'lag_sales_wd_4', 'lag_sales_wd_5',
    'ts_pred',
]
# Categorical features: each one gets its own embedding.
categorical_cols = ['hours_inweek', 'small_c', 'middle_c', 'big_c', 'weekdays']

# Training split
X_train_continuous, X_train_categorical = train1_x[continuous_cols], train1_x[categorical_cols]
y_train = train1_y

# Validation split
X_val_continuous, X_val_categorical = val1_x[continuous_cols], val1_x[categorical_cols]
y_val = val1_y

In [156]:
# First ten validation targets.
y_val.iloc[:10]

16904     5600000.0
16905    10638000.0
16906    11241000.0
16907    25385000.0
16908    24011000.0
16909    42821000.0
16910    14583000.0
16911    26732000.0
16912    26254000.0
16913    25153000.0
Name: 취급액, dtype: float64

#### Normalization

In [157]:
# Standardize the continuous features of both splits to zero mean / unit std.
# The statistics come from the training split only, so nothing from validation leaks in.
# This gives each feature an equal initial importance and speeds up training.
# NOTE: this cell overwrites its inputs; running it twice normalizes already-normalized data.
train_mean = X_train_continuous.mean(axis=0)
train_std = X_train_continuous.std(axis=0)

X_train_continuous = (X_train_continuous - train_mean) / train_std
X_val_continuous = (X_val_continuous - train_mean) / train_std

# Build a model using categorical variables

In [40]:
class EmbeddingMapping():
    """
    Integer encoder for one categorical variable, meant to feed an embedding layer.

    Create one instance per categorical column, built from the training data.

    Attributes:
        embedding_dict: maps each distinct training value to an integer 1..N
                        (in order of first appearance; 0 is never produced).
        num_values:     N + 1; also the code returned for values not seen at construction.
    """
    def __init__(self, series):
        distinct = series.unique().tolist()

        # Codes start at 1 and follow the order in which values first appear.
        self.embedding_dict = dict(zip(distinct, range(1, len(distinct) + 1)))

        # One past the largest assigned code; shared by every unseen value.
        self.num_values = len(distinct) + 1

    def get_mapping(self, value):
        """Return the integer code for `value`, or num_values if it was not seen at construction."""
        return self.embedding_dict.get(value, self.num_values)

### Create an embedding column for the train/validation sets

In [41]:
# change here
# If you change the categorical variables used in the model, add/remove the
# matching mapping object and '<name>_mapping' column below.

# One encoder per categorical column, fitted on the TRAINING values only.
(hours_inweek_mapping,
 small_c_mapping,
 middle_c_mapping,
 big_c_mapping,
 weekdays_mapping) = (
    EmbeddingMapping(X_train_categorical[column])
    for column in ('hours_inweek', 'small_c', 'middle_c', 'big_c', 'weekdays')
)

# Add the integer-coded '<name>_mapping' columns to both splits with the same encoders;
# validation values never seen in training fall back to each encoder's num_values.
X_train_categorical, X_val_categorical = (
    frame.assign(
        hours_inweek_mapping=frame['hours_inweek'].apply(hours_inweek_mapping.get_mapping),
        small_c_mapping=frame['small_c'].apply(small_c_mapping.get_mapping),
        middle_c_mapping=frame['middle_c'].apply(middle_c_mapping.get_mapping),
        big_c_mapping=frame['big_c'].apply(big_c_mapping.get_mapping),
        weekdays_mapping=frame['weekdays'].apply(weekdays_mapping.get_mapping),
    )
    for frame in (X_train_categorical, X_val_categorical)
)


# Model

In [50]:
# Build and compile the network: one dense input for the continuous features plus one
# embedding per categorical feature, concatenated and passed through a stack of
# Dense / BatchNormalization / Dropout layers down to a single linear output.
num_dense_features = len(continuous_cols)
# change here
# Learning rate handed to the Adam optimizer below.
lr = 0.002
# Reset Keras' global state and free memory so re-running this cell starts from a clean graph.
tf.keras.backend.clear_session()
gc.collect()

# Dense input
# for continuous variables
dense_input = Input(shape=(num_dense_features, ), name='dense1')

# Embedding input
# change here if you add another categorical variable
# Each categorical input carries one integer code per sample.
hours_inweek_input = Input(shape=(1,), name='hours_inweek')
small_c_input = Input(shape=(1,), name='small_c')
middle_c_input = Input(shape=(1,), name='middle_c')
big_c_input = Input(shape=(1,), name='big_c')
weekdays_input = Input(shape=(1,), name='weekdays')

# if cardinality of a categorical variable is over 50,
# Howard suggests to set the output dim to be 50 
# to avoid problems from high cardinality
# input_dim is num_values+1 because EmbeddingMapping emits codes 1..num_values
# (num_values itself is the code for unseen values), so the table needs rows 0..num_values.
hours_inweek_emb = Flatten()(Embedding(input_dim=hours_inweek_mapping.num_values+1, output_dim = 50)(hours_inweek_input))
small_c_emb = Flatten()(Embedding(input_dim=small_c_mapping.num_values+1, output_dim = 50)(small_c_input))
middle_c_emb = Flatten()(Embedding(input_dim=middle_c_mapping.num_values+1, output_dim = 50)(middle_c_input))
big_c_emb = Flatten()(Embedding(input_dim=big_c_mapping.num_values+1, output_dim = 9)(big_c_input))
weekdays_emb = Flatten()(Embedding(input_dim=weekdays_mapping.num_values+1, output_dim = 5)(weekdays_input))

# Combine dense and embedding parts and add dense layers. Exit on linear scale.
# change here if you add a categorical var
x = concatenate([dense_input, 
                 hours_inweek_emb, 
                 small_c_emb, 
                 middle_c_emb, 
                 big_c_emb, 
                 weekdays_emb])

# change here if needed
# Hidden stack: widths 512, 512, 256, 256, 128, 32, 8 with decreasing dropout rates.
x = Dense(256*2, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(256*2, activation="relu")(x)
# NOTE(review): this is the only hidden block where Dropout comes BEFORE BatchNormalization;
# every other block applies BatchNormalization first -- confirm whether this is intentional.
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = Dense(128*2, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(128*2, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(64*2, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(16*2, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.1)(x)
x = Dense(4*2, activation="relu")(x)
x = BatchNormalization()(x)

# Single linear unit: the regression output.
outputs = Dense(1, activation="linear", name='output')(x)

# change here if you add sth
# Keys must match the Input layer names above and the keys of the dicts passed to fit/predict.
inputs = {"dense1": dense_input,
          "hours_inweek": hours_inweek_input, 
          "small_c": small_c_input, 
          "middle_c": middle_c_input,
          "big_c": big_c_input, 
          "weekdays": weekdays_input}

# Connect input and output
model = Model(inputs, outputs)

# Train directly on mean absolute percentage error and report it as the metric as well.
model.compile(loss=keras.losses.mean_absolute_percentage_error,
              metrics=["mape"], 
              optimizer=keras.optimizers.Adam(learning_rate=lr))


In [51]:
# Print the layer-by-layer architecture and parameter counts of the compiled model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hours_inweek (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
small_c (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
middle_c (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
big_c (InputLayer)              [(None, 1)]          0                                            
_______________________________________________________________________________________

In [52]:
# change here depending on categorical variables used
# Build the named-input dicts for fit/predict. Keys match the model's Input layer names;
# categorical entries use the integer-coded '<name>_mapping' columns.
train_inputs, val_inputs = (
    {
        "dense1": continuous_part,
        "hours_inweek": categorical_part['hours_inweek_mapping'],
        "small_c": categorical_part['small_c_mapping'],
        "middle_c": categorical_part['middle_c_mapping'],
        "big_c": categorical_part['big_c_mapping'],
        "weekdays": categorical_part['weekdays_mapping'],
    }
    for continuous_part, categorical_part in (
        (X_train_continuous, X_train_categorical),
        (X_val_continuous, X_val_categorical),
    )
)

In [None]:
# change here
epochs = 10000
# Inputs are passed as dicts keyed by Input-layer name, so their order does not matter.
# verbose=0 silences Keras' per-epoch output; periodic_logger_250 prints a line every 250 epochs.
history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=128,
    epochs=epochs,
    verbose=0,
    callbacks=[periodic_logger_250],
    validation_data=(val_inputs, y_val),
)


In [None]:
# Plot the train/validation loss values, leaving out the first `skip_epochs` epochs
# so the early, much larger losses do not flatten the rest of the curve.
skip_epochs = 250
_loss = history.history['loss'][skip_epochs:]
_val_loss = history.history['val_loss'][skip_epochs:]

fig, ax = plt.subplots(figsize=(20, 10))

# Number the x axis with the real epoch (skip_epochs+1, ...) rather than restarting at 1,
# so positions on the plot match the epochs printed during training.
train_loss_plot, = ax.plot(range(skip_epochs + 1, skip_epochs + len(_loss) + 1),
                           _loss, label='Train Loss')
val_loss_plot, = ax.plot(range(skip_epochs + 1, skip_epochs + len(_val_loss) + 1),
                         _val_loss, label='Validation Loss')

ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Train vs. validation loss")
_ = ax.legend(handles=[train_loss_plot, val_loss_plot])

In [None]:
# y_val is a Series, so .mean() already returns a scalar (calling .iloc[0] on it raises AttributeError).
print ("This is the average value we are trying to predict: %d" % y_val.mean())

## How good are the model's predictions?

In [None]:
# y_val is a Series; promote it to a one-column DataFrame (column '취급액') so the
# prediction/error columns below can be added next to it. to_frame() returns a new object,
# so y_val itself is not modified.
df = y_val.to_frame()

# Add a column for the model's predicted values
# (predict returns shape (n, 1); flatten it to one value per row)
df['pred'] = model.predict(val_inputs).ravel()

# Calculate the difference between the predicted and the actual value
df['diff'] = df['pred'] - df['취급액']

# Calculate the absolute difference between the predicted and the actual value
df['abs_diff'] = np.abs(df['diff'])

# Calculate the percentage of the difference from the actual value
df['%diff'] = 100 * (df['diff'] / df['취급액'])

# Calculate the absolute percentage difference from the actual value
df['abs_%diff'] = np.abs(df['%diff'])

### What is the biggest difference in absolute values?

In [None]:
# Sort by the 'abs_diff' field and show the 5 largest mistakes in absolute values
# The five rows with the largest absolute error.
df.sort_values(by="abs_diff", ascending=False).iloc[:5]

In [None]:
# Mean and standard deviation of the signed prediction error.
diff_mean = df['diff'].mean()
diff_std = df['diff'].std()
print("The mean is very close to 0 ({mean}) with std. {std}.".format(
    mean=round(diff_mean, 2),
    std=round(diff_std, 2),
))

In [None]:
# Histogram of the signed differences (prediction minus actual).
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(df['diff'], bins=100)
ax.set_xlabel("$")
ax.set_ylabel("# samples")
_ = ax.set_title("Difference between predicted and actual price")


### What is the biggest difference in percentage?

In [None]:
# Sort by the '%diff' field and show the 5 largest proportional mistakes
# The five rows with the largest absolute percentage error.
df.sort_values(by="abs_%diff", ascending=False).iloc[:5]

In [None]:
# Histogram of the signed percentage differences.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(df['%diff'], bins=100)
ax.set_xlabel("%")
ax.set_ylabel("# samples")
_ = ax.set_title("% of difference between predicted and actual price")