In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import random
import math

from IPython import display as ipd
from tqdm import tqdm
import dateutil.easter as easter

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold,RepeatedKFold

from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.models import Sequential, Model

from tensorflow.keras.layers import LSTM, Bidirectional, add, concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose, AveragePooling1D, UpSampling1D
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, TimeDistributed
from tensorflow.keras.layers import Multiply, Add, Concatenate, Flatten, Average, Lambda

from tensorflow.keras.optimizers import Adam, SGD, Adadelta, Nadam
from tensorflow.keras.callbacks import EarlyStopping, Callback, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.constraints import unit_norm, max_norm

from tensorflow.keras import backend as K
from tensorflow_addons.optimizers import CyclicalLearningRate

### Utils

In [None]:
def seeding(SEED, use_tf=False):
    """Seed the random number generators used by the notebook.

    Args:
        SEED: Integer seed applied to `random`, NumPy and, optionally, TensorFlow.
        use_tf: When True, `tf.random.set_seed(SEED)` is called as well.

    Side effects:
        Sets the `PYTHONHASHSEED` and `TF_CUDNN_DETERMINISTIC` environment
        variables and prints a confirmation message.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # TF_CUDNN_DETERMINISTIC is an on/off switch, not a seed: it has to be set to
    # '1' (or 'true'); writing the seed value (e.g. '42') does not enable it.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        tf.random.set_seed(SEED)
    print('seeding done!!!')
    
## https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298201
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|)); positions
    where both values are zero contribute 0 to the mean.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against `y_true`.

    Returns:
        The mean SMAPE over all elements as a float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Use |y_true| so that negative observations cannot shrink or cancel the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.zeros_like(denominator)
    # Divide only where the denominator is non-zero; the remaining entries stay 0.0
    # and no divide-by-zero warning is emitted.
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

### Data Load

In [None]:
# Global configuration for the run.
RANDOM_SEED = 42
DEBUG = True

# The notebook trains Keras models, so TensorFlow's generator has to be seeded
# together with `random` and NumPy; `use_tf` defaults to False in seeding().
seeding(RANDOM_SEED, use_tf=True)

# Competition data: training rows, test rows and the submission template.
train = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')
submission = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

### Adding holidays info from notebook: festivities-in-finland-norway-sweden

https://www.kaggle.com/lucamassaron/festivities-in-finland-norway-sweden

### The Easter dates idea is from this notebook:

https://www.kaggle.com/ambrosm/tpsjan22-01-eda-which-makes-sense#Easter

In [None]:
import dateutil.easter as easter

# Holiday table read from nordic_holidays.csv; its `date` column is parsed to timestamps.
holidays = pd.read_csv('../input/k/lucamassaron/festivities-in-finland-norway-sweden/nordic_holidays.csv')
holidays['date'] = pd.to_datetime(holidays['date'])

# One Easter date per year, as returned by dateutil's easter.easter().
years = [2015, 2016, 2017, 2018, 2019, 2020]
easter_dates = [easter.easter(year) for year in years]

def add_holidays( df, holidays, easter_dates ):
    """Add 0/1 `holiday` and `easter` indicator columns to `df` in place.

    A row is flagged as a holiday only when its own (date, country) pair appears
    in `holidays`; a holiday in one country no longer marks the other countries.

    Args:
        df: Frame with `date` (strings or datetimes) and `country` columns.
        holidays: Frame with `date` and `country` columns listing holidays.
        easter_dates: Iterable of dates (e.g. `datetime.date`) to flag as Easter.

    Returns:
        The same `df` object, with the `holiday` and `easter` columns set.
    """
    # Compare parsed, midnight-normalized timestamps: `df.date` may still hold the
    # raw CSV strings, which never compare equal to datetime values.
    row_dates = pd.to_datetime(df['date']).dt.normalize()

    holiday_pairs = pd.DataFrame({
        'date': pd.to_datetime(holidays['date']).dt.normalize().to_numpy(),
        'country': holidays['country'].to_numpy(),
    }).drop_duplicates().assign(_is_holiday=1)
    row_keys = pd.DataFrame({
        'date': row_dates.to_numpy(),
        'country': df['country'].to_numpy(),
    })
    # A left merge against de-duplicated pairs keeps one output row per input row,
    # in the original order, so the result can be assigned back positionally.
    matched = row_keys.merge(holiday_pairs, how='left', on=['date', 'country'])
    df['holiday'] = matched['_is_holiday'].notna().astype(int).to_numpy()

    easter_days = pd.to_datetime(list(easter_dates))
    df['easter'] = row_dates.isin(easter_days).astype(int).to_numpy()
    return df
    
# Add the `holiday` and `easter` columns to the training and test frames.
train = add_holidays( train, holidays, easter_dates )
test = add_holidays( test, holidays, easter_dates )

### Very simple date-based FE

The sine and cosine seasonality features are taken from this notebook by AmbrosM:

https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model

In [None]:
def process_dates(df):
    """Parse `df.date` and add calendar features to `df` in place.

    Added columns: `year`, `month`, `week` (ISO week number), `wd4` (1 when
    dayofweek == 4), `wd56` (1 when dayofweek >= 5), `day`, and
    `sin{k}` / `cos{k}` of the day of year for k in 1, 2, 3 and 12.

    Args:
        df: Frame with a `date` column that `pd.to_datetime` can parse.

    Returns:
        The same `df` object with `date` converted and the features added.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt

    df['year'] = dates.year
    df['month'] = dates.month
    # Series.dt.week is deprecated (removed in pandas 2.0); isocalendar().week is
    # its replacement and returns the same ISO week number.
    df['week'] = dates.isocalendar().week.astype(int)
    df['wd4'] = (dates.dayofweek == 4).astype(int)
    df['wd56'] = (dates.dayofweek >= 5).astype(int)
    df['day'] = dates.day

    # Fourier terms over the year; the period is fixed at 365 days.
    year_angle = dates.dayofyear / 365 * 2 * math.pi
    for k in [1, 2, 3, 12]:
        df[f'sin{k}'] = np.sin(year_angle * k)
        df[f'cos{k}'] = np.cos(year_angle * k)

    return df

# Parse the dates and add the calendar features to both frames.
train = process_dates(train)
test = process_dates(test)

### Adding GDP and population growth data

datasets: https://www.kaggle.com/vladlee/datasets

In [None]:
def _load_world_bank_indicator(path, countries=('Finland', 'Norway', 'Sweden')):
    """Read one indicator CSV and return a year-indexed table with one column per country.

    Countries are selected by their `Country Name` value, so the result does not
    depend on the order of the rows in the file. Only columns whose header is a
    plain year number are kept, which discards the descriptive columns and any
    trailing `Unnamed: ...` column.

    Args:
        path: Location of the CSV file.
        countries: Country names to keep, in output column order.

    Returns:
        DataFrame indexed by integer year, with one column per country.

    Raises:
        KeyError: If one of `countries` is missing from the file.
    """
    raw = pd.read_csv(path)
    year_columns = [col for col in raw.columns if str(col).isdigit()]
    table = raw.set_index('Country Name').loc[list(countries), year_columns].T
    table.index = table.index.astype(int)
    table.columns.name = None
    return table


gdp_growth = _load_world_bank_indicator('../input/worldgdpgrowthbycountry20002020/API_NY.GDP.MKTP.KD.ZG_DS2_en_csv_v2_3469438-fixed.csv')
pop_growth = _load_world_bank_indicator('../input/worldpopulationgrowthbycountry20002020/API_SP.POP.GROW_DS2_en_csv_v2_3469469-fixed.csv')
gdp = _load_world_bank_indicator('../input/worldgdpbycountry20002020/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_3469429.csv')
pop = _load_world_bank_indicator('../input/worldpopulationbycountry20002020/API_SP.POP.TOTL_DS2_en_csv_v2_3469297.csv')

In [None]:
def add_growth_features(df, gdp, pop, gdp_growth, pop_growth, scaler=None, fit_scaler=True):
    """Add per-country, per-year `gdp_per_capita`, `gdp_growth` and `pop_growth` columns.

    Args:
        df: Frame with `country` and `year` columns; modified in place.
        gdp, pop, gdp_growth, pop_growth: Year-indexed tables with one column per
            country, read as `table[country][year]`.
        scaler: Optional scaler used for `gdp_per_capita`. When None, a new
            RobustScaler is created and fitted on `df` (the previous behaviour).
        fit_scaler: When True the scaler is fitted on `df` before transforming;
            pass False together with an already fitted `scaler` to reuse the
            statistics of another frame (e.g. the training data).

    Returns:
        The same `df` object with the three columns added. Rows whose year is
        outside 2015-2020 or whose country has no column in `gdp` keep 0.0.
    """
    years = [2015,2016,2017,2018,2019,2020]
    # Start from float columns: the values written below are floats.
    df['gdp_per_capita'] = 0.0
    df['gdp_growth'] = 0.0
    df['pop_growth'] = 0.0
    for column in gdp.columns:
        in_country = df['country'] == column
        for year in years:
            rows = in_country & (df['year'] == year)
            if not rows.any():
                # Nothing to fill, so the lookup tables need not contain this year.
                continue
            df.loc[rows, 'gdp_per_capita'] = gdp[column][year] / pop[column][year]
            df.loc[rows, 'gdp_growth'] = gdp_growth[column][year]
            df.loc[rows, 'pop_growth'] = pop_growth[column][year]

    if scaler is None:
        scaler = RobustScaler()
    values = df['gdp_per_capita'].to_numpy().reshape(-1, 1)
    scaled = scaler.fit_transform(values) if fit_scaler else scaler.transform(values)
    df['gdp_per_capita'] = scaled.ravel()
    return df

# Fit the GDP-per-capita scaler on the training data only and reuse it for the
# test data, so the feature has the same scale in both frames.
gdp_scaler = RobustScaler()
train = add_growth_features(train, gdp, pop, gdp_growth, pop_growth, scaler=gdp_scaler, fit_scaler=True)
test = add_growth_features(test, gdp, pop, gdp_growth, pop_growth, scaler=gdp_scaler, fit_scaler=False)

### Adding weather features from dataset:

https://www.kaggle.com/adamwurdits/finland-norway-and-sweden-weather-data-20152019

Thanks to Adam Wurdits

In [None]:
weather = pd.read_csv('../input/finland-norway-and-sweden-weather-data-20152019/nordics_weather.csv')
weather['date'] = pd.to_datetime(weather['date'])

merge_keys = ['date', 'country']
n_test_rows = len(test)
train = train.merge(weather, how='inner', on=merge_keys)
test = test.merge(weather, how='inner', on=merge_keys)

# Predictions are written to the submission frame by position, so the merge must
# neither drop nor duplicate test rows.
if len(test) != n_test_rows:
    raise ValueError(
        f'Weather merge changed the number of test rows from {n_test_rows} to {len(test)}; '
        'check the weather data for missing or duplicated (date, country) pairs.'
    )

## Adding AmbrosM's additional feature-engineering function from:

https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model#Simple-feature-engineering-(without-holidays)

(adapted to work with Keras)

In [None]:
# Feature engineering for holidays
def engineer_more(df):
    """Return a new dataframe: `df` plus 0/1 indicator columns for special dates.

    Reads the `date` (datetime) and `country` columns of `df`. Each indicator is
    1 on the rows matching one calendar day (optionally restricted to a country)
    or one fixed day offset from a yearly anchor date, and 0 elsewhere.
    """
    month = df.date.dt.month
    day = df.date.dt.day
    country = df.country

    def on(month_number, day_number):
        # Rows dated on the given day of the given month.
        return (month == month_number) & (day == day_number)

    def yearly_anchor(date_by_year):
        # Per-row anchor timestamp looked up by the row's year (NaT for other years).
        return df.date.dt.year.map({year: pd.Timestamp(text) for year, text in date_by_year.items()})

    # Boolean masks collected in output column order; cast to int at the end.
    masks = {}

    # End of year
    for d in range(24, 32):
        masks[f"dec{d}"] = on(12, d)
    for d in range(24, 32):
        masks[f"n-dec{d}"] = on(12, d) & (country == 'Norway')
    for d in range(1, 14):
        masks[f"f-jan{d}"] = on(1, d) & (country == 'Finland')
    for d in range(1, 10):
        masks[f"jan{d}"] = on(1, d) & (country == 'Norway')
    for d in range(1, 15):
        masks[f"s-jan{d}"] = on(1, d) & (country == 'Sweden')

    # May
    for d in range(1, 10):
        masks[f"may{d}"] = on(5, d)
    for d in range(19, 26):
        masks[f"may{d}"] = on(5, d) & (country == 'Norway')

    # June
    for d in range(8, 14):
        masks[f"june{d}"] = on(6, d) & (country == 'Sweden')

    # Last Wednesday of June
    wed_june_offset = df.date - yearly_anchor({2015: '2015-06-24',
                                               2016: '2016-06-29',
                                               2017: '2017-06-28',
                                               2018: '2018-06-27',
                                               2019: '2019-06-26'})
    for d in range(-4, 6):
        masks[f"wed_june{d}"] = (wed_june_offset == np.timedelta64(d, "D")) & (country != 'Norway')

    # First Sunday of November
    sun_nov_offset = df.date - yearly_anchor({2015: '2015-11-1',
                                              2016: '2016-11-6',
                                              2017: '2017-11-5',
                                              2018: '2018-11-4',
                                              2019: '2019-11-3'})
    for d in range(0, 9):
        masks[f"sun_nov{d}"] = (sun_nov_offset == np.timedelta64(d, "D")) & (country != 'Norway')

    # First half of December (Independence Day of Finland, 6th of December)
    for d in range(6, 14):
        masks[f"dec{d}"] = on(12, d) & (country == 'Finland')

    # Easter
    easter_offset = df.date - df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59)):
        masks[f"easter{d}"] = easter_offset == np.timedelta64(d, "D")

    indicators = pd.DataFrame({name: mask.astype(int) for name, mask in masks.items()})
    return pd.concat([df, indicators], axis=1)

# Append the date-indicator columns to both frames.
train = engineer_more(train)
test = engineer_more(test)

In [None]:
# Raw target and its natural logarithm.
# NOTE: despite its name, `target_sqrt` holds log(num_sold), not a square root.
target = train['num_sold']
target_sqrt = np.log(target)

# Remove the identifier, the target and the raw date columns from the features.
train = train.drop(columns=['row_id', 'num_sold', 'date', 'year', 'day'])
test = test.drop(columns=['row_id', 'date', 'year', 'day'])

In [None]:
# Names of the feature columns, taken from the test frame.
features = test.columns.tolist()
print(features)

### Encode category columns 

In [None]:
# Label-encode each categorical column: fit on the training values, then apply
# the same mapping to the test values.
categorical_columns = ['country', 'store', 'product']
encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    train[f'{column}_enc'] = encoder.fit_transform(train[column])
    test[f'{column}_enc'] = encoder.transform(test[column])
    encoders[column] = encoder

# Keep the individual encoder names available.
country_encoder = encoders['country']
store_encoder = encoders['store']
product_encoder = encoders['product']

# Only the encoded versions remain in the feature frames.
train = train.drop(columns=categorical_columns)
test = test.drop(columns=categorical_columns)

In [None]:
# Fit the scaler on the training features only.
scaler = RobustScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(train), columns=train.columns)
# Reuse the statistics learned from the training data: re-fitting on the test set
# would put train and test features on different scales. Columns are selected in
# training order so both frames line up.
test_scaled = pd.DataFrame(scaler.transform(test[train.columns]), columns=train.columns)

### Model and train

In [None]:
def create_model_cnn( units, X, optimizer=None):
    """Build and compile a small Conv1D regression model.

    Args:
        units: Base width; the convolutions use units*4, units*2 and units
            filters and the hidden Dense layer uses `units`.
        X: Feature table; only `X.shape[1]` (the number of features) is read.
        optimizer: Keras optimizer. A fresh `Adam()` is created per call when
            omitted, so models never share optimizer state through a default
            argument evaluated once at definition time.

    Returns:
        A compiled Sequential model taking input of shape (1, n_features) per
        sample and trained with the 'mape' loss.
    """
    if optimizer is None:
        optimizer = Adam()
    n_timesteps = 1
    n_features  = X.shape[1]
    # The sequence axis has length 1, so the default 'valid' padding would ask the
    # convolutions and the pooling layer for a window longer than the input and
    # the model could not be built. 'same' padding keeps the length at 1.
    model = Sequential([
        Input(shape=(n_timesteps, n_features), dtype='float32'),
        Conv1D(filters=units*4, kernel_size=5, padding='same', activation='relu', name='Conv1D-1'),
        Conv1D(filters=units*2, kernel_size=3, padding='same', activation='relu', name='Conv1D-2'),
        Conv1D(filters=units, kernel_size=2, padding='same', activation='relu', name='Conv1D-3'),
        MaxPooling1D(pool_size=4, padding='same', name='MaxPool-4'),
        Flatten(name='Flatten-5'),
        Dense(units, activation='relu', name='Dense-6'),
        Dense(1, name='Dense-7'),
    ])
    model.compile(optimizer = optimizer, loss = 'mape')
    return model

In [None]:
def create_model( units, X, optimizer=None):
    """Build and compile the stacked-LSTM regression model.

    The network is four (BatchNormalization, LSTM(units*4)) pairs followed by a
    linear Dense(units) layer and a Dense(1) output. Every LSTM returns
    sequences, so the output keeps the single timestep axis.

    Args:
        units: Base width of the network.
        X: Feature table; only `X.shape[1]` (the number of features) is read.
        optimizer: Keras optimizer. A fresh `Adam()` is created per call when
            omitted, so models never share optimizer state through a default
            argument evaluated once at definition time.

    Returns:
        A compiled Sequential model taking input of shape (1, n_features) per
        sample and trained with the 'mape' loss.
    """
    if optimizer is None:
        optimizer = Adam()
    n_timesteps = 1
    n_features  = X.shape[1]

    layers = [Input(shape=(n_timesteps, n_features), dtype='float32')]
    for _ in range(4):
        layers.append(BatchNormalization())
        layers.append(LSTM(units*4, return_sequences=True))
    layers.append(Dense(units, activation='linear'))
    layers.append(Dense(1))

    model = Sequential(layers)
    model.compile(optimizer = optimizer, loss = 'mape')
    return model

In [None]:
# Hyper-parameters for model construction and cross-validated training.
# UNITS is the `units` argument of create_model.
UNITS = 32
# EPOCHS and BATCH_SIZE are passed to model.fit.
EPOCHS = 100
BATCH_SIZE = 512
# TOTAL_SPLITS and N_REPEATS configure RepeatedKFold.
TOTAL_SPLITS = 6
N_REPEATS = 3
# LEARNING_RATE is the initial learning rate given to Adam.
LEARNING_RATE = 0.002123

# Build one model to display its architecture.
m = create_model(UNITS, train, Adam(learning_rate=LEARNING_RATE))
m.summary()

In [None]:
%%time

tf.keras.backend.clear_session()
# Callbacks shared by all folds.
lr = ReduceLROnPlateau(monitor="val_loss", factor=0.3, patience=7, verbose=1,  min_lr=1e-7)
es = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=30, verbose=1, mode="min", restore_best_weights=True)


scores = []
models = []
histories = []
# An explicit random_state makes the fold assignment reproducible on its own,
# independent of how much of the global NumPy random stream was consumed before.
folds = RepeatedKFold(n_splits=TOTAL_SPLITS, n_repeats=N_REPEATS, random_state=RANDOM_SEED)
for fold_n, (train_index, valid_index) in enumerate(folds.split(train_scaled, target_sqrt)):
    print('-'*15, '>', f'Fold {fold_n+1}', '<', '-'*15)
    # NOTE(review): `train_scaled` only supplies the split indices; the model is
    # fitted on the unscaled `train`, matching the prediction cell, which feeds
    # the unscaled `test`. Switch both together if scaled inputs are wanted.
    X_train, X_valid = train.iloc[train_index], train.iloc[valid_index]
    y_train, y_valid = target_sqrt.iloc[train_index], target_sqrt.iloc[valid_index]

    # Keras expects (samples, timesteps, features); each row is one timestep.
    X_train = X_train.values.reshape( -1, 1, X_train.shape[1])
    X_valid = X_valid.values.reshape( -1, 1, X_valid.shape[1])

    checkpoint_path = f'model_{fold_n}.h5'
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)
    callbacks = [es, checkpoint, lr]

    optimizer = Adam(learning_rate=LEARNING_RATE)
    model = create_model(UNITS, train, optimizer)
    history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size = BATCH_SIZE, epochs = EPOCHS,
                    verbose=1, shuffle=True, callbacks=callbacks)

    # Reload the checkpoint written by ModelCheckpoint(save_best_only=True).
    model.load_weights(checkpoint_path)

    oof_pred = model.predict(X_valid).reshape(-1)
    # The model is trained on log(num_sold); undo the log on both sides so the
    # reported SMAPE is measured on sales, not on their logarithm.
    oof_score = SMAPE(np.exp(y_valid.values), np.exp(oof_pred))
    print(f'OOF SMAPE: {oof_score}')

    scores.append(oof_score)
    models.append(model)
    histories.append(history)

In [None]:
# Mean of the per-fold scores collected in `scores`.
print(f'OOF SMAPE: { np.mean(scores)}')

### Plot metrics

In [None]:
col_metrics = ["loss", "val_loss", "lr"]

f, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize=(16, 8))
for hist in histories:
    # Metrics missing from a history become NaN columns and are simply not drawn.
    history_df = pd.DataFrame(hist.history, columns=col_metrics)
    ax1.plot(history_df[["loss", "val_loss"]])
    # Draw the learning rate of every fold, not only of the last one.
    ax2.plot(history_df[["lr"]])

ax1.set(title="Training and validation loss per fold", xlabel="epoch", ylabel="loss (mape)")
ax2.set(title="Learning rate per fold", xlabel="epoch", ylabel="learning rate")

plt.show()

In [None]:
# Show the (rows, columns) of the final training feature frame.
train.shape

In [None]:
## average predictions over all models

# Shape the test features as (samples, 1 timestep, features), as in training.
testX = test.to_numpy().reshape(len(test), 1, test.shape[1])

# Sum the flattened predictions of every fold model, then divide by the model count.
fold_predictions = [fold_model.predict(testX).reshape(-1) for fold_model in models]
y_pred = np.zeros(len(testX))
for fold_prediction in fold_predictions:
    y_pred += fold_prediction

y_pred = y_pred / len(models)

In [None]:
# Predictions are in log space: exponentiate, round to whole units and store.
predicted_sales = np.exp(y_pred).round().astype(int)
submission['num_sold'] = predicted_sales
submission.to_csv('submission.csv', index=False, float_format='%.6f')
submission.head(20)