In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score, r2_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Tabular Playground Jan 2022

### GOAL

Create a model that will predict the num_sold for the given new data


### Overview

Due to the nature of the dataset, we'll want to preprocess the features first and then do some EDA to look for correlations. After that, I'll implement the model as a small neural network using Keras.

In [None]:
# Competition data.
df_train = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv'
)
df_test = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv'
)

# Holidays calendar, available here:
# https://www.kaggle.com/vpallares/public-and-unofficial-holidays-nor-fin-swe-201519
df_holidays = pd.read_csv(
    '../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv'
)

# Economics dataset, available here:
# https://www.kaggle.com/siukeitin/oecd-data-fin-nor-swe-20152019
df_oecd = pd.read_csv(
    '../input/oecd-data-fin-nor-swe-20152019/oecd_monthly_data.csv'
)

# GDP dataset (the file itself is named Best_CPI.csv), available here:
# https://www.kaggle.com/sardorabdirayimov/consumer-price-index-20152019-nordic-countries
df_gdp = pd.read_csv(
    '../input/consumer-price-index-20152019-nordic-countries/Best_CPI.csv'
)

#DATAFRAMES USED IN PREVIOUS VERSIONS, DIDN'T WORK WELL FOR THIS PROJECT
#CHECK OUT ONE OF THE PREVIOUS VERSIONS FOR AN EXAMPLE OF STOCHASTIC REGRESSION WITH THE AMAZON DATAFRAME
#df_macro_comp = pd.read_csv('../input/macroeconomic-composite-finland-norway-sweden/macro_economic_idx.csv') #MACRO-ECONOMICS COMPOSITE DATASET https://www.kaggle.com/lucamassaron/macroeconomic-composite-finland-norway-sweden
#df_amazon = pd.read_csv('../input/amazon-surge-for-tps-jan-2022/Amazon search.csv') #AMAZON SEARCHES https://www.kaggle.com/anirudhyadav9784/amazon-surge-for-tps-jan-2022

In [None]:
# Per-column count of missing values (author's note: there are none, so no cleaning is needed).
df_train.isna().sum()

### Feature Engineering

The raw date field is not very useful for training the model, so I'll break it up into calendar features (day of week, day of month, weekend flag, week, month, quarter and year) to capture the time period.

Train and Test Dates

In [None]:
def set_date_features(df):
    """Add calendar features derived from the ``date`` column.

    The frame is modified in place (and also returned, so the call can be
    used in an assignment). ``date`` is converted to datetime and the
    following integer columns are added, in this order: ``day_of_week``
    (Monday=0), ``day_of_month``, ``weekend`` (1 for Saturday/Sunday, else 0),
    ``week`` (ISO week number, capped at 52), ``month``, ``quarter``, ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the feature columns added.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt

    df['day_of_week'] = dates.dayofweek
    df['day_of_month'] = dates.day
    # dayofweek is 0..6, so integer division by 5 equals 1 only for Saturday (5) and Sunday (6).
    df['weekend'] = (df['day_of_week'] // 5 == 1).astype('int')
    # ISO week 53 is folded into week 52. clip() replaces the former chained
    # assignment df['week'][mask] = 52, which triggers SettingWithCopyWarning
    # and has no effect when pandas copy-on-write is enabled.
    df['week'] = dates.isocalendar().week.clip(upper=52).astype('int')
    df['month'] = dates.month
    df['quarter'] = dates.quarter
    df['year'] = dates.year
    return df

In [None]:
# Derive the calendar features for both the training and the test frame.
df_train, df_test = set_date_features(df_train), set_date_features(df_test)

Holidays

In [None]:
# Parse the holiday dates so they can be compared with the datetime `date` column of train/test.
df_holidays['date'] = df_holidays['date'].pipe(pd.to_datetime)

In [None]:
def holiday_fe(row, holidays=None):
    """Return the holiday name for one row, or the string ``'None'``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``date``, ``country``, ``month`` and ``day_of_month``.
    holidays : pandas.DataFrame, optional
        Holiday calendar with ``date``, ``country`` and ``event`` columns.
        Defaults to the notebook-level ``df_holidays`` frame, so existing
        ``df.apply(holiday_fe, axis=1)`` calls keep working.

    Returns
    -------
    str
        The ``event`` of the first calendar entry matching the row's date and
        country; ``"New Year's Eve"`` for an unmatched 31 December; otherwise
        ``'None'``.
    """
    if holidays is None:
        holidays = df_holidays

    same_day = holidays['date'] == row['date']
    same_country = holidays['country'] == row['country']
    events = holidays.loc[same_day & same_country, 'event']

    event = 'None' if events.empty else events.iloc[0]

    # This fixes a problem with the holiday dataset: in 2019 New Year's Eve is missing.
    # The fallback applies to any 31 December that has no calendar entry.
    if event == 'None' and row['month'] == 12 and row['day_of_month'] == 31:
        event = "New Year's Eve"

    return event

In [None]:
# Row-wise holiday lookup; holiday_fe already takes a row, so no lambda wrapper is needed.
df_train['Holiday'] = df_train.apply(holiday_fe, axis=1)
df_test['Holiday'] = df_test.apply(holiday_fe, axis=1)

OECD

In [None]:
# The OECD `date` strings are dash-separated; field 0 is the year and field 1 the month.
for position, column in enumerate(('year', 'month')):
    df_oecd[column] = df_oecd['date'].map(
        lambda stamp, position=position: int(stamp.split('-')[position])
    )

In [None]:
# CCI is missing for Norway, so the whole column is dropped.
df_oecd = df_oecd.drop(columns='CCI')

In [None]:
def oecd_fe(df, oecd=None):
    """Left-join the monthly OECD indicators onto ``df``.

    Rows are matched on ``year``, ``month`` and ``country``. The OECD frame's
    own ``date`` column is removed before merging, so ``df`` keeps its ``date``
    column untouched and no ``date_x``/``date_y`` suffix clean-up is needed.
    This also makes the function work when either frame has no ``date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``year``, ``month`` and ``country`` columns.
    oecd : pandas.DataFrame, optional
        Indicator table with the same three key columns. Defaults to the
        notebook-level ``df_oecd`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df`` followed by the indicator columns.
        Rows without a match get NaN indicators (left join).
    """
    if oecd is None:
        oecd = df_oecd

    indicators = oecd.drop(columns='date', errors='ignore')
    return df.merge(indicators, how='left', on=['year', 'month', 'country'])

In [None]:
# Attach the OECD indicators to both frames.
df_train, df_test = oecd_fe(df_train), oecd_fe(df_test)

GDP

In [None]:
# Yearly GDP per country, left-joined onto both frames.
gdp_by_country_year = df_gdp[['year', 'country', 'GDP']]
df_train, df_test = (
    frame.merge(gdp_by_country_year, how='left', on=['year', 'country'])
    for frame in (df_train, df_test)
)

### Simple EDA

Let's plot out the num_sold based on the date

In [None]:
# One panel per year (two panels per row), one line per country, months 1-12 on a shared x axis.
facet_options = dict(
    col="year",
    col_wrap=2,
    hue='country',
    height=5,
    aspect=1,
    sharex=True,
    xlim=(1, 12),
)
g = sns.FacetGrid(df_train, **facet_options)
g.map_dataframe(sns.lineplot, 'month', 'num_sold')
g.add_legend()

### Dummy Variables

Next up, let's create dummy variables for country, store, product and Holiday

In [None]:
# The raw date is no longer needed once the calendar features have been extracted.
df_train, df_test = df_train.drop(columns='date'), df_test.drop(columns='date')

In [None]:
def set_dummies(df):
    """Replace the categorical columns with their one-hot encoded equivalents.

    The columns ``country``, ``store``, ``product`` and ``Holiday`` are
    removed and their dummy columns are appended to the right of the
    remaining columns. A new frame is returned.
    """
    categorical = ['country', 'store', 'product', 'Holiday']
    encoded = pd.get_dummies(df[categorical])
    remaining = df.drop(columns=categorical)
    return pd.concat([remaining, encoded], axis=1)

In [None]:
# One-hot encode the categorical columns of both frames.
df_train, df_test = set_dummies(df_train), set_dummies(df_test)

In [None]:
# Columns that exist in train but not in test.
# 'Holiday_Fourth Sunday of Advent' has to be dropped: that holiday has no date in 2019.
np.setdiff1d(ar1=df_train.columns, ar2=df_test.columns)

In [None]:
# Remove the dummy column that has no counterpart in the test frame.
df_train = df_train.drop(columns=['Holiday_Fourth Sunday of Advent'])

### Splitting

In [None]:
# Feature columns are every training column except the target.
feature_columns = df_train.columns.drop('num_sold')

X = df_train[feature_columns].values
y = df_train['num_sold'].values

# Align the test frame to the training features BY NAME. Train and test were
# one-hot encoded independently, so relying on column position would silently
# feed the wrong feature into the model if the dummy columns ever differed in
# order or number. Dummy columns absent from test are filled with 0.
# NOTE(review): `row_id` appears to remain among the features here - confirm that is intended.
X_for_real_test = df_test.reindex(columns=feature_columns, fill_value=0).values

# Hold out 30% of the training rows for validation (fixed seed for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

### Scaling

Before creating the model, I'll scale the data, fitting the scaler only on the training split in order to prevent data leakage

In [None]:
# Fit the min-max scaler on the training split only, then apply the same
# transformation to the training, validation and submission matrices.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test, X_for_real_test = (
    scaler.transform(split) for split in (X_train, X_test, X_for_real_test)
)

In [None]:
# Shape of the scaled training split.
X_train.shape

In [None]:
# Shape of the scaled hold-out split.
X_test.shape

### Modelling

In [None]:
# Width of the first layer follows the number of features instead of a
# hard-coded 66, so the network stays consistent if the feature set changes.
n_features = X_train.shape[1]

# Funnel-shaped fully connected regressor: each layer halves the width.
model = Sequential([
    # first layer - sized to the number of input features
    Dense(n_features, activation='relu'),
    # hidden layers
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(2, activation='relu'),
    # output layer
    # NOTE(review): a ReLU output keeps predictions non-negative but can get
    # stuck at 0 behind such a narrow bottleneck - consider a linear or softplus output.
    Dense(units=1, activation='relu'),
])

optimizer = Adam(learning_rate=0.001)

# Compile model
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Stop once val_loss has not improved for 10 epochs and roll the model back to
# the best epoch; without restore_best_weights the model would keep the weights
# from 10 epochs after the best one.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Train for at most 100 epochs; the hold-out split drives early stopping.
fit_options = {
    'validation_data': (X_test, y_test),
    'epochs': 100,
    'batch_size': 32,
    'verbose': 1,
    'callbacks': [early_stop],
}
model.fit(X_train, y_train, **fit_options)

### Model Evaluation

In [None]:
# Training history: one column per recorded metric, one row per epoch.
loss_curves = pd.DataFrame(model.history.history)
loss_curves.plot()

In [None]:
# Predict on the hold-out split and drop the singleton output dimension.
predictions = model.predict(X_test).squeeze()

In [None]:
# Mean absolute error on the hold-out split.
mean_absolute_error(y_true=y_test, y_pred=predictions)

In [None]:
# Root mean squared error on the hold-out split.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))

In [None]:
# Explained variance on the hold-out split.
explained_variance_score(y_true=y_test, y_pred=predictions)

In [None]:
# Coefficient of determination on the hold-out split.
r2_score(y_true=y_test, y_pred=predictions)

In [None]:
def smape(a, f):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes ``mean(2 * |f - a| / (|a| + |f|)) * 100``. A term whose actual
    and forecast are both zero is a perfect prediction and contributes 0,
    rather than turning the whole score into NaN through 0/0.

    Parameters
    ----------
    a : array-like
        Actual values.
    f : array-like
        Forecast values, broadcastable against ``a``.

    Returns
    -------
    numpy.float64
        The SMAPE score.

    Raises
    ------
    ValueError
        If ``a`` is empty (the metric is undefined without observations).
    """
    actual = np.asarray(a, dtype=float)
    forecast = np.asarray(f, dtype=float)
    if actual.size == 0:
        raise ValueError("smape requires at least one observation")

    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # Divide only where the denominator is non-zero; the remaining terms stay at 0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

In [None]:
# SMAPE on the hold-out split.
smape(a=y_test, f=predictions)

In [None]:
# Predict the competition test set and write the submission file.
predictions = model.predict(X_for_real_test).squeeze()

submission_columns = {
    'row_id': df_test['row_id'],
    'num_sold': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)