---
# [Tabular Playground Series - Jan 2022][1]
---

---
[1]: https://www.kaggle.com/c/tabular-playground-series-jan-2022

# 0. Settings

In [None]:
# Import dependencies 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

import os
import pathlib
import gc
import sys
import re
import math 
import random
import time 
from tqdm import tqdm 
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedKFold 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers.experimental import preprocessing

import transformers 
import datasets 

print('import done!')

In [None]:
# Global configuration: knobs shared by the dataset, tuning and training cells.
config = dict(
    window_size=7,   # number of consecutive time steps in every input window
    batch_size=8,    # windows per batch in the tf.data pipelines
    valid_num=200,   # presumably the number of trailing rows held out for validation
    # rnn_hidden=128,
    # learning_rate=5e-4,
    num_epochs=20,   # epochs per tuner trial and for the final fit
    max_trials=20,   # number of hyperparameter trials the tuner may run
)

# Passed as the prefetch buffer size so tf.data picks the value itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Seed every random number generator the notebook touches, for reproducible results.
def seed_all(s):
    """Seed Python's `random`, NumPy and TensorFlow with the same value.

    Also exports the `TF_CUDNN_DETERMINISTIC` and `PYTHONHASHSEED`
    environment variables, then prints a confirmation message.

    Args:
        s: Integer seed handed to every generator.
    """
    os.environ.update({
        'TF_CUDNN_DETERMINISTIC': '1',
        'PYTHONHASHSEED': str(s),
    })
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(s)
    print('Seeds setted!')


global_seed = 42
seed_all(global_seed)

# 1. Data Preprocessing

## 1.1 Data Check

In [None]:
# Paths of the competition CSV files, relative to the notebook's working directory.
data_config = {
    'train_csv_path': '../input/tabular-playground-series-jan-2022/train.csv',
    'test_csv_path': '../input/tabular-playground-series-jan-2022/test.csv',
    'sample_submission_path': '../input/tabular-playground-series-jan-2022/sample_submission.csv',
}

# Read the three tables in one pass, in the same order as the names on the left.
train_df, test_df, submission_df = (
    pd.read_csv(data_config[path_key])
    for path_key in ('train_csv_path', 'test_csv_path', 'sample_submission_path')
)

print(train_df.shape, test_df.shape, submission_df.shape)
train_df.head()

In [None]:
# Summary statistics of the target column.
train_df['num_sold'].describe()

In [None]:
# Number of training rows, followed by the dtype of every column.
print(len(train_df))
print()
train_df.dtypes

In [None]:
def unique_category(df, column):
    """Print how many distinct values `column` holds and list them.

    Args:
        df: DataFrame to inspect.
        column: Name of the column whose distinct values are reported.
    """
    values = df[column]
    report = (
        f'unique_category_number: {values.nunique()}',
        f'cagetories: {values.unique()}',
        '',  # trailing blank line separates consecutive reports
    )
    for line in report:
        print(line)

# Report the distinct values of each categorical column in turn.
for categorical_column in ('country', 'store', 'product'):
    unique_category(train_df, categorical_column)

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

## 1.2 Feature Engineering - 1

In [None]:
def date_features(df):
    """Return a copy of `df` with calendar features derived from its `date` column.

    The input frame is left untouched, so the cell that calls this function can
    be re-run safely and the caller's raw frame is never modified behind its back.

    Args:
        df: DataFrame with a `date` column that `pd.to_datetime` can parse.

    Returns:
        A new DataFrame in which `date` is converted to datetime and the
        columns `year`, `month`, `day` and `dayofweek` (Monday == 0) are
        appended, in that order.
    """
    dates = pd.to_datetime(df['date'])
    # `assign` builds a new frame; keyword order fixes the order of the new columns.
    return df.assign(
        date=dates,
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        dayofweek=dates.dt.dayofweek,
    )

# Add the calendar features, then discard the raw date and the year,
# keeping only month, day and day-of-week.
train_df = date_features(train_df).drop(columns=['date', 'year'])
test_df = date_features(test_df).drop(columns=['date', 'year'])

train_df.head()

## 1.3 Feature Engineering - 2

In [None]:
# Show the first 20 rows of the training frame.
train_df.head(20)

In [None]:
# Fold the long table (one row per time step x country x store x product) into a
# wide table with one row per time step and one column per series.
feature_num = len(train_df['country'].unique()) * len(train_df['store'].unique()) * len(train_df['product'].unique()) # 18
series_data_num = len(train_df) // feature_num # 1461

# The fold below is purely positional, so make its assumptions explicit instead of
# silently truncating or mislabelling data when they do not hold.
assert series_data_num * feature_num == len(train_df), \
    'train_df does not hold a whole number of (country, store, product) blocks'

# Label every row with its series name and fold the labels exactly like the targets.
row_labels = (
    train_df['country'].astype(str) + '_'
    + train_df['store'].astype(str) + '_'
    + train_df['product'].astype(str)
).to_numpy().reshape(series_data_num, feature_num)

# Column names come from the first block of rows, i.e. from the order in which the
# series actually appear in the data rather than from an assumed nesting order.
series_columns = row_labels[0].tolist()
assert len(set(series_columns)) == feature_num, \
    'the first block of rows does not contain every series exactly once'
assert (row_labels == row_labels[0]).all(), \
    'rows are not ordered identically within every block'

# One inner list per time step, one entry per series.
series_features = train_df['num_sold'].to_numpy().reshape(series_data_num, feature_num).tolist()

series_df = pd.DataFrame(series_features, columns=series_columns)
print(len(series_df))

series_df.head()

In [None]:
# Per-column summary statistics of the wide table (one column per series).
series_df.describe()

In [None]:
# Hold out the last `config['valid_num']` rows of the wide table for validation.
# The size comes from the global config instead of a repeated literal, and an
# explicit split index is used because a negative slice breaks for a size of 0.
valid_num = config['valid_num']
assert 0 < valid_num < len(series_df), 'valid_num must leave rows on both sides of the split'
split_at = len(series_df) - valid_num

train_series_df = series_df.iloc[:split_at].copy()
valid_series_df = series_df.iloc[split_at:].copy()

print(len(train_series_df), len(valid_series_df))

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every series with statistics estimated on the training rows only.
#
# The scaler is fitted on raw slices of `series_df` rather than on
# `train_series_df`, because this cell overwrites `train_series_df` with its scaled
# version: fitting on that name would, on a second run of the cell, fit on data
# that is already standardized and silently change `sc.mean_` / `sc.scale_`.
# NOTE: this relies on train/valid being the leading/trailing rows of `series_df`.
sc = StandardScaler()
train_rows = len(train_series_df)
raw_train_series = series_df.iloc[:train_rows]
raw_valid_series = series_df.iloc[train_rows:]

sc.fit(raw_train_series)
print(sc.mean_, sc.scale_)

train_series_df = pd.DataFrame(sc.transform(raw_train_series), columns=series_columns)
valid_series_df = pd.DataFrame(sc.transform(raw_valid_series), columns=series_columns)
all_series_df = pd.DataFrame(sc.transform(series_df), columns=series_columns)
print(len(train_series_df), len(valid_series_df), len(all_series_df))

train_series_df.head(10)

# 2. Dataset

In [None]:
window_size = config['window_size']

# Convert the frame once; every window below is a plain slice of this array.
train_values = np.array(train_series_df)
train_starts = range(len(train_series_df) - window_size)

# Inputs are `window_size` consecutive rows; the target is the same window
# shifted one row into the future.
train_X = tf.constant(
    [train_values[start:start + window_size] for start in train_starts],
    dtype=tf.float32,
)
train_y = tf.constant(
    [train_values[start + 1:start + window_size + 1] for start in train_starts],
    dtype=tf.float32,
)
print(train_X.shape, train_y.shape)

In [None]:
def make_windows(series_frame, size):
    """Cut a wide series table into overlapping input/target windows.

    Args:
        series_frame: DataFrame with one row per time step and one column per series.
        size: Number of consecutive rows in every window.

    Returns:
        A pair of float32 tensors `(inputs, targets)`. Window `k` of `inputs`
        holds rows `k .. k+size-1`; the matching window of `targets` holds
        rows `k+1 .. k+size`, i.e. the same window shifted one step ahead.
    """
    values = np.asarray(series_frame)
    starts = range(len(values) - size)
    inputs = [values[start:start + size] for start in starts]
    targets = [values[start + 1:start + size + 1] for start in starts]
    return (
        tf.constant(inputs, dtype=tf.float32),
        tf.constant(targets, dtype=tf.float32),
    )


# Same windowing for the validation rows and for the complete series.
valid_X, valid_y = make_windows(valid_series_df, window_size)
print(valid_X.shape, valid_y.shape)

all_X, all_y = make_windows(all_series_df, window_size)
print(all_X.shape, all_y.shape)

In [None]:
# Wrap inputs and targets as element-wise datasets and zip them into
# (input window, target window) pairs; print each dataset and its length.
train_X_ds, train_y_ds = map(tf.data.Dataset.from_tensor_slices, (train_X, train_y))
train_ds = tf.data.Dataset.zip((train_X_ds, train_y_ds))
print(train_ds, len(train_ds))

valid_X_ds, valid_y_ds = map(tf.data.Dataset.from_tensor_slices, (valid_X, valid_y))
valid_ds = tf.data.Dataset.zip((valid_X_ds, valid_y_ds))
print(valid_ds, len(valid_ds))

all_X_ds, all_y_ds = map(tf.data.Dataset.from_tensor_slices, (all_X, all_y))
all_ds = tf.data.Dataset.zip((all_X_ds, all_y_ds))
print(all_ds, len(all_ds))

In [None]:
BATCH_SIZE = config['batch_size']

# Group the windows into batches and let tf.data choose the prefetch depth.
# From here on, len() of each dataset counts batches, not individual windows.
train_ds = train_ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
print(train_ds)

valid_ds = valid_ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
print(valid_ds)

all_ds = all_ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
print(all_ds)

# 3. Model Training

## 3.1 Hyperparameter Tuning with keras-tuner

In [None]:
import kerastuner as kt

# Total number of optimizer steps in one training run, used as the horizon of the
# learning-rate decay. `train_ds` is already batched here, so len(train_ds) is the
# number of steps per epoch; multiplying by the batch size as well would stretch
# the decay far beyond the steps actually taken, so the rate would never approach
# its end value.
NUM_TRAIN_STEPS = len(train_ds) * config['num_epochs']

def build_model(hp):
    """Build and compile the GRU forecaster for one set of hyperparameters.

    The network is two stacked GRU layers that return full sequences, followed
    by a hidden dense layer and a linear dense layer that emits one value per
    series at every time step.

    Tuned hyperparameters: `rnn_hidden`, `dense_hidden`, `activation` and
    `initial_learning_rate`.

    The learning-rate schedule reads the module-level `NUM_TRAIN_STEPS` at the
    moment this function is called, so that value must be up to date before
    a model is built.

    Args:
        hp: keras-tuner `HyperParameters` object to sample values from.

    Returns:
        A compiled `tf.keras` model trained with mean squared error.
    """
    # Number of parallel series; used for both the input and the output width.
    feature_num = len(train_series_df.columns)

    hp_rnn_hidden = hp.Int('rnn_hidden', min_value=16, max_value=256, step=16)
    hp_dense_hidden = hp.Int('dense_hidden', min_value=16, max_value=256, step=16)
    hp_activation = hp.Choice('activation', values=['selu', 'relu', 'tanh'])

    model = tf.keras.models.Sequential([
        tf.keras.layers.GRU(hp_rnn_hidden, return_sequences=True, input_shape=[None, feature_num]),
        tf.keras.layers.GRU(hp_rnn_hidden, return_sequences=True),
        tf.keras.layers.Dense(hp_dense_hidden, activation=hp_activation),
        # One output per series: derived from the data instead of a literal 18 so
        # the output width always matches the input width.
        tf.keras.layers.Dense(feature_num)
        ])

    hp_initial_learning_rate = hp.Float('initial_learning_rate', 1e-4, 1e-3, sampling='log')
    # Decay from the sampled initial rate down to 1e-5 over NUM_TRAIN_STEPS steps.
    lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=hp_initial_learning_rate,
        end_learning_rate=1e-5,
        decay_steps=NUM_TRAIN_STEPS)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_scheduler),
              loss=tf.keras.losses.MeanSquaredError(),
              )

    return model


# Bayesian search over the hyperparameters declared in `build_model`, ranked by
# validation loss. Trial results are written under hp_tuning/ex_no_1.
tuner = kt.BayesianOptimization(
    build_model,
    directory='hp_tuning',
    project_name='ex_no_1',
    objective='val_loss',
    max_trials=config['max_trials'],
)

In [None]:
# Run the search: train on train_ds for config['num_epochs'] epochs with
# valid_ds supplying the validation data.
tuner.search(train_ds, epochs=config['num_epochs'], validation_data=valid_ds)

In [None]:
# Print a summary limited to three trials.
tuner.results_summary(num_trials=3)

## 3.2 Model Training with Best Parameters

In [None]:
# Take the single best hyperparameter set found by the search ...
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
# ... and build a fresh model from it via the tuner's hypermodel.
model = tuner.hypermodel.build(best_hps)
model.summary()

In [None]:
# Final fit on the complete series with the best hyperparameters.
#
# `build_model` reads NUM_TRAIN_STEPS at build time, so the step budget has to be
# updated BEFORE the model is built; assigning it afterwards has no effect on an
# existing model's learning-rate schedule. The model is therefore rebuilt here.
# `all_ds` is already batched, so len(all_ds) is the number of steps per epoch and
# must not be multiplied by the batch size again.
NUM_TRAIN_STEPS = len(all_ds) * config['num_epochs']
model = tuner.hypermodel.build(best_hps)
model.fit(all_ds, epochs=config['num_epochs'])

# 4. Prediction and Submission

In [None]:
def prediction(model, prediction_num):
    """Forecast `prediction_num` future steps autoregressively.

    Starts from the last `window_size` rows of the standardized series
    (`all_series_df`), predicts one step, appends it to the window, drops the
    oldest row and repeats.

    The seed window must be in the same standardized space the model was
    trained on. The returned values are in that space too and still have to be
    inverse-transformed by the caller.

    Args:
        model: Trained model mapping `(1, window_size, n_series)` to an output
            of the same shape.
        prediction_num: Number of future steps to generate.

    Returns:
        `np.ndarray` of shape `(prediction_num, 1, n_series)` holding the
        standardized predictions, one entry per future step.
    """
    # Seed with standardized values, not the raw `series_df`: the model only ever
    # saw standardized inputs during training.
    seed_window = all_series_df.iloc[-window_size:].to_numpy()
    input_for_predict = tf.expand_dims(tf.constant(seed_window, dtype=tf.float32), 0)
    predictions = []

    for _ in range(prediction_num):
        output = model(input_for_predict)  # (1, window_size, n_series)
        next_step = output[:, -1:, :]      # last time step, time axis kept: (1, 1, n_series)
        predictions.append(next_step[:, 0, :].numpy())  # (1, n_series)

        # Slide the window: drop the oldest row and append the new prediction.
        input_for_predict = tf.concat([input_for_predict[:, 1:, :], next_step], axis=1)

    return np.array(predictions)

In [None]:
# One forecast step per block of `feature_num` test rows.
n_future_steps = int(len(test_df) / feature_num)
predictions = prediction(model, n_future_steps).squeeze(axis=1)

# Undo the standardization, then flatten row by row so the values are ordered
# step by step, series by series.
pred_num_sold = (sc.mean_ + predictions * sc.scale_).ravel()

submission_df['num_sold'] = pred_num_sold
submission_df.to_csv("submission.csv", index=False)
submission_df.head()