In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from tensorflow.keras.callbacks import TensorBoard
# Show the installed TensorFlow version in the cell output.
print(tf.__version__)

## Read the data

In [None]:
# Single place to change the dataset location.
DATA_DIR = '/kaggle/input/tabular-playground-series-jan-2022'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
# train_extra was read from the very same train.csv; take an independent copy
# instead of parsing the file a second time.
train_extra = train.copy()
test = pd.read_csv(f'{DATA_DIR}/test.csv')
train.head()

In [None]:
# row_id is redundant as a feature and (per the original note) leads to overfitting
train = train.drop(columns=["row_id"])
test = test.drop(columns=["row_id"])

### Feature engineering from the date column

In [None]:
from datetime import datetime

is_weekend = True  # not read by data_time_process; kept so the notebook namespace is unchanged
def data_time_process(data_type):
    """Add calendar features derived from the date column, in place.

    The first column of ``data_type`` is parsed as a ``%Y-%m-%d`` date and the
    following integer columns are added to ``data_type``:

    * ``year``        - calendar year
    * ``month``       - month number (1-12)
    * ``is_weekday``  - day-type bucket: 0 for Monday-Thursday, 1 for Friday,
                        2 for Saturday/Sunday (Friday gets its own bucket)
    * ``day_of_year`` - ordinal day within the year (1-366)
    * ``weeks``       - ISO 8601 week number

    Parameters
    ----------
    data_type : pandas.DataFrame
        Frame whose first column holds the dates. It is modified in place.

    Returns
    -------
    None
    """
    time_column = pd.to_datetime(data_type.iloc[:, 0], format='%Y-%m-%d')
    # Monday == 0 ... Sunday == 6
    day_of_week = time_column.dt.dayofweek

    # Vectorized .dt accessors replace the per-row string splitting; the explicit
    # int64 cast keeps the column dtype the same as the list-built columns had.
    data_type['year'] = time_column.dt.year.astype('int64')
    data_type['month'] = time_column.dt.month.astype('int64')
    # (>= 4) contributes 1 from Friday on, (>= 5) adds another 1 for Sat/Sun.
    data_type['is_weekday'] = (
        (day_of_week >= 4).astype('int64') + (day_of_week >= 5).astype('int64')
    )
    data_type['day_of_year'] = time_column.dt.dayofyear.astype('int64')
    data_type['weeks'] = time_column.dt.isocalendar().week.astype('int64')


In [None]:
# Add the date-derived columns to both frames (each is modified in place).
for frame in (train, test):
    data_time_process(frame)

In [None]:
# The raw date is no longer needed once the calendar features exist.
train, test = (frame.drop(columns=['date']) for frame in (train, test))

In [None]:
# Preview train after adding the date-derived columns and dropping 'date'.
train.head()

### Encode the categorical features, because a DNN only accepts numeric (float/int) inputs

In [None]:
from datetime import datetime
from category_encoders import *

# I'll use a simple one-hot encoder; binary or ordinal encoders are also worth trying.
# Maybe it's better to learn N different models (e.g. one per country) and then combine them.
CATEGORICAL_COLS = ['product', 'store', 'country']
TARGET_COL = 'num_sold'

# Fit ONE encoder on the training features and reuse it for test. Fitting separate
# encoders on train and test lets each assign its own category -> column index
# mapping, so e.g. 'country_1' would only mean the same country in both frames
# if the two files happen to list categories in the same order.
# The target is set aside first because the encoder expects the same columns
# at fit and transform time, and test has no target column.
train_target = train[TARGET_COL]
train_inputs = train.drop(columns=[TARGET_COL])

encoder = OneHotEncoder(cols=CATEGORICAL_COLS).fit(train_inputs)

train = encoder.transform(train_inputs)
train[TARGET_COL] = train_target
test = encoder.transform(test)

In [None]:
# Preview train after one-hot encoding product, store and country.
train.head()

### Creating DNN model 

In [None]:
# Columns fed to the model: the one-hot columns (numbered from 1) followed by
# the calendar features.
one_hot_sizes = {'country': 3, 'store': 2, 'product': 3}
features = [
    f'{column}_{position}'
    for column, size in one_hot_sizes.items()
    for position in range(1, size + 1)
]
features += ['year', 'month', 'is_weekday', 'day_of_year', 'weeks']

In [None]:
# 80/20 split of the labelled data: train_dataset is used for fitting,
# test_dataset is the holdout used for the offline SMAPE check.
train_dataset = train.sample(frac=0.8, random_state=0)
test_dataset = train.drop(train_dataset.index)

# Build the training features from train_dataset, NOT from the full `train`
# frame: using `train` here would put every holdout row into the training set
# and make the holdout score meaningless (data leakage).
train_features = train_dataset.copy()
test_features = test_dataset.copy()

# pop() removes the target from the feature frames; to_frame() keeps the
# labels as single-column DataFrames named 'num_sold'.
train_labels = train_features.pop('num_sold').to_frame()
test_labels = test_features.pop('num_sold').to_frame()

# Guard against leakage: no row may appear on both sides of the split.
assert train_features.index.intersection(test_features.index).empty

In [None]:
# First layer of the network: a Normalization layer adapted to the
# training feature columns.
feature_matrix = train_features[features].to_numpy()
normalizer = preprocessing.Normalization(axis=-1)
normalizer.adapt(feature_matrix)

In [None]:
# Fully connected regressor: normalization, a funnel of ReLU layers, and a
# single linear output unit. (A Dropout(0.1) after the 32-unit layer was tried
# and left disabled.)
HIDDEN_UNITS = (64, 32, 16, 8, 4)
hidden_layers = [layers.Dense(units, activation='relu') for units in HIDDEN_UNITS]
model = keras.Sequential([normalizer, *hidden_layers, layers.Dense(1)])
model.summary()

### Define the SMAPE loss used for training

In [None]:
import tensorflow.keras.backend as K
#https://keras.io/api/losses/
def smape_loss(y_true, y_pred):
    """Symmetric mean absolute percentage error (0-200 scale) as a Keras loss.

    Computes mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)) * 100.

    Parameters
    ----------
    y_true : tensor
        Ground-truth values.
    y_pred : tensor
        Model predictions, same dtype as ``y_true``.

    Returns
    -------
    tensor
        Scalar mean SMAPE over all elements.
    """
    denominator = (K.abs(y_true) + K.abs(y_pred)) / 200.0
    # When target and prediction are both 0 the plain division is 0/0 = NaN,
    # which would poison the whole batch loss. divide_no_nan yields 0 for those
    # elements, matching how the NumPy SMAPE metric treats a zero denominator.
    smape = tf.math.divide_no_nan(K.abs(y_pred - y_true), denominator)
    return K.mean(smape)

In [None]:
# Compile with Adam + SMAPE loss, then fit; Keras holds out the last 20% of
# the supplied rows for validation.
X_train = train_features[features]
y_train = train_labels[['num_sold']].to_numpy(dtype='float32')

optimizer = tf.keras.optimizers.Adam(5e-4)
model.compile(optimizer=optimizer, loss=smape_loss)
history = model.fit(
    X_train,
    y_train,
    epochs=35,
    validation_split=0.2,
    verbose=2,
)

### Visualize training

In [None]:
def plot_loss(history):
    """Plot training and validation loss per epoch from a Keras History object.

    The y-axis is fixed to the range [5, 50].
    """
    curves = history.history
    for curve_name in ('loss', 'val_loss'):
        plt.plot(curves[curve_name], label=curve_name)
    plt.ylim([5, 50])
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)
    plt.show()
plot_loss(history)

## Predicting and visualizing on the validation data

In [None]:
# Predictions for the competition test set (rows loaded from test.csv);
# these are what gets written to the submission file.
test_predictions_orig = model.predict(test[features])
# Predictions for test_features, i.e. the rows of train that were not sampled
# into train_dataset; used for the offline SMAPE check and plots.
test_predictions = model.predict(test_features[features])

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error on a 0-200 scale.

    Both inputs are flattened to 1-D before comparison. Without this, a
    ``(n,)`` target combined with the ``(n, 1)`` array returned by
    ``model.predict`` broadcasts to an ``(n, n)`` matrix and the result is the
    mean over ALL target/prediction pairs instead of the row-wise metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must hold the same number of elements as ``y_true``
        (or a single element, which is broadcast).

    Returns
    -------
    float
        Mean SMAPE. Elements where both values are 0 contribute 0.

    Raises
    ------
    ValueError
        If the flattened inputs cannot be broadcast against each other.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # Raises ValueError on a genuine length mismatch instead of silently
    # producing an outer-product style comparison.
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.zeros_like(denominator)
    # Divide only where the denominator is non-zero; other entries stay 0.
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)
# model.predict returns shape (n, 1); flatten it so it lines up element-wise
# with the 1-D label array instead of broadcasting to an (n, n) comparison.
# Original note (recorded before the flattening): adding +100 for test_labels decrease SMAPE to 62
SMAPE(test_labels['num_sold'].to_numpy(), np.asarray(test_predictions).ravel())

In [None]:
# Take every 18th holdout row (18 is the period in the data), starting at
# row 18, for plotting actual vs. predicted values.
# NOTE(review): test_dataset is a random 20% sample of train, so consecutive
# holdout rows may not follow the original 18-row period - confirm.
SAMPLE_PERIOD = 18

# Convert once up front instead of rebuilding the label array on every iteration.
holdout_actual = test_labels['num_sold'].to_numpy()
holdout_pred = np.asarray(test_predictions)

# Bound comes from the data rather than a hard-coded row count.
n_rows = min(len(holdout_actual), len(holdout_pred))
sample_idx = range(SAMPLE_PERIOD, n_rows, SAMPLE_PERIOD)

x_axis = [i / SAMPLE_PERIOD for i in sample_idx]
y_axis_pred = [holdout_pred[i] for i in sample_idx]
y_axis_orig = [holdout_actual[i] for i in sample_idx]

In [None]:
# Label both curves so actual and predicted values can be told apart.
plt.plot(x_axis, y_axis_orig, label='actual')
plt.plot(x_axis, y_axis_pred, label='predicted')
plt.xlabel('sample number (every 18th holdout row)')
plt.ylabel('num_sold')
plt.legend()
plt.show()
# The curves look almost the same, which may be a sign of overfitting

In [None]:
# Sampled holdout predictions on their own (y_axis_pred against x_axis).
plt.plot(x_axis, y_axis_pred)


### Export and explore the result

In [None]:
# Use the same absolute input location as the train/test loading cell, so the
# notebook does not depend on the current working directory here.
result = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv')

# model.predict returns shape (n, 1); flatten to 1-D before storing it as a column.
submission_pred = np.asarray(test_predictions_orig).ravel()
assert len(submission_pred) == len(result), "prediction count does not match sample_submission rows"

result['num_sold'] = submission_pred
result.to_csv('./example_dnn.csv',index=False)

In [None]:
result # score on PB ~ 17 (PB presumably = public leaderboard); the predictions look underestimated,
# maybe the December peak confuses the model; we should experiment with the architecture.
# Nevertheless, it was an experiment in using a NN with tabular data!

In [None]:
# Take every 18th submission row (18 is the period in the data), starting at
# row 18, for a quick visual check of the predicted series.
SUBMISSION_PERIOD = 18

submission_values = result['num_sold'].to_numpy()
# Bound comes from the frame itself rather than a hard-coded row count.
submission_idx = range(SUBMISSION_PERIOD, len(submission_values), SUBMISSION_PERIOD)

x_axis = [i / SUBMISSION_PERIOD for i in submission_idx]
y_axis = [submission_values[i] for i in submission_idx]

In [None]:
plt.plot(x_axis, y_axis)
# Sampled submission predictions; original note: this looks like a plausible result