In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import combinations
import math
import statistics
import scipy.stats
from scipy.stats import pearsonr
import time
from datetime import datetime
import matplotlib.dates as mdates
import dateutil.easter as easter
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from keras.layers import Bidirectional

import tensorflow as tf

# Load the train and test datasets

In [None]:
# Read both competition CSV files; the comprehension variable does not leak
# into the notebook namespace.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2022/train.csv",
        "../input/tabular-playground-series-jan-2022/test.csv",
    )
]

# Define helper functions for feature engineering, lag-window construction, and model training

In [None]:
def augment_dataset(df):
    """Return a copy of ``df`` with calendar features derived from ``date``.

    The input frame is not modified, so re-running a cell that calls this
    function always starts from the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.DatetimeIndex`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``month``, ``year``,
        ``day``, ``season`` and ``dow`` (in that order).
    """
    dates = pd.DatetimeIndex(df['date'])
    return df.assign(
        month=dates.month,
        year=dates.year,
        day=dates.day,
        # Maps months 12, 1, 2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4.
        season=dates.month % 12 // 3 + 1,
        # Monday = 0 ... Sunday = 6.
        dow=dates.dayofweek,
    )


def compute_lda(df):
    """Label-encode the categorical columns of ``df``.

    Despite its name, this function performs no discriminant analysis: it
    only replaces the ``country``, ``store`` and ``product`` columns with the
    integer codes produced by ``LabelEncoder``.

    The encoding is applied to a copy, so the caller's frame keeps its
    original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``country``, ``store`` and ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns label-encoded.
    """
    encoded = df.copy()
    for column in ('country', 'store', 'product'):
        # One encoder per column: each column has its own set of classes.
        encoder = preprocessing.LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded

def create_chunks(sequence, n_steps):
    """Build one trailing window of length ``n_steps`` per element of ``sequence``.

    The window for position ``p`` is ``sequence[p - n_steps + 1 .. p]``
    (oldest value first, the element at ``p`` last). Positions that fall
    before the start of the sequence are filled with ``0``.

    Returns a NumPy array with one row per element of ``sequence``.
    """
    values = list(sequence)
    windows = [
        [values[pos - lag] if pos - lag >= 0 else 0
         for lag in range(n_steps - 1, -1, -1)]
        for pos in range(len(values))
    ]
    return np.array(windows)


def create_df_chunks(dataframe, n_chunks):
    """Append ``n_chunks`` lagged ``num_sold`` columns to ``dataframe``.

    Column ``num_sold-{i}`` holds the value observed ``n_chunks - i`` rows
    earlier, so ``num_sold-0`` is the oldest lag and
    ``num_sold-{n_chunks-1}`` is the value of the immediately preceding row.
    Rows without enough history are padded with ``0``.

    The windows deliberately stop at the previous row: a window that includes
    the current row would put the prediction target (``num_sold``) itself
    among the features. The recursive forecasting loop in this notebook also
    builds its window from the rows preceding the one being predicted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows of a single series, in time order, with a ``num_sold`` column.
    n_chunks : int
        Number of lagged columns to create.

    Returns
    -------
    pandas.DataFrame
        ``dataframe.reset_index()`` (the old index is kept as an ``index``
        column) joined with the lagged columns.
    """
    # Positional index so the lag columns line up with reset_index() below.
    sold = dataframe['num_sold'].reset_index(drop=True)
    lagged = pd.DataFrame({
        # fill_value=0 pads the start of the series without changing dtype.
        'num_sold-{}'.format(i): sold.shift(n_chunks - i, fill_value=0)
        for i in range(n_chunks)
    })
    return dataframe.reset_index().join(lagged)

def execute_rnn(X,y):
    """Build, compile and fit a two-layer LSTM regressor; return the fitted model.

    ``X`` is read as ``(samples, timesteps, 1)``: the number of timesteps is
    taken from ``X.shape[1]`` and each timestep carries a single feature.
    ``y`` holds the regression targets.

    A checkpoint callback stores the full model (not just the weights) under
    the path ``'.'`` whenever the training loss improves. No validation data
    is passed to ``fit``, so ``loss`` is the only monitored quantity.
    """
    best_model_saver = tf.keras.callbacks.ModelCheckpoint(
        filepath='.',
        monitor='loss',
        mode='min',
        save_best_only=True,
        save_weights_only=False,
    )

    n_timesteps = X.shape[1]
    network = Sequential([
        # First LSTM returns the whole sequence so the second one can consume it.
        LSTM(50, activation='relu', return_sequences=True, input_shape=(n_timesteps, 1)),
        LSTM(10, activation='relu'),
        # Single linear output unit for the regression target.
        Dense(1),
    ])
    network.compile(optimizer='Adam', loss='mse')

    network.fit(X, y, epochs=10, verbose=1, callbacks=[best_model_saver])
    return network

def prepare_dataset(df):
    """Add lagged ``num_sold`` columns per series and stack the series.

    ``df`` is split by (country, store, product); each group is passed to
    ``create_df_chunks`` with the notebook-level ``N_CHUNK``, and the results
    are concatenated. The helper index columns ``level_0`` and ``index``
    produced by the two ``reset_index`` calls are dropped.
    """
    series_groups = df.groupby(["country","store","product"])
    per_series = [
        create_df_chunks(series_rows, N_CHUNK)
        for _, series_rows in series_groups
    ]
    stacked = pd.concat(per_series).reset_index()
    return stacked.drop(columns=['level_0','index'])


def prepare_dataset_ml_train(df, holdout_years=(2017, 2018), n_chunk=None):
    """Build the training arrays from every row outside the hold-out years.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the lag columns ``num_sold-0 .. num_sold-{n_chunk-1}``,
        the encoded ``country``/``store``/``product`` columns, the calendar
        columns ``day``/``dow``/``month``/``year`` and the target ``num_sold``.
    holdout_years : iterable of int, optional
        Years excluded from training. Defaults to ``(2017, 2018)``.
    n_chunk : int, optional
        Number of lag columns to use. Defaults to the notebook-level
        ``N_CHUNK``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` has shape ``(rows, n_features, 1)``; ``y`` has shape ``(rows,)``.
    """
    if n_chunk is None:
        # Fall back to the notebook-wide setting, resolved at call time.
        n_chunk = N_CHUNK

    col_train = ['num_sold-{}'.format(i) for i in range(n_chunk)]
    col_train = col_train + ['country','store','product','day','dow','month','year']
    print(col_train)

    df_subset = df[~df['year'].isin(list(holdout_years))]
    X = df_subset[col_train].to_numpy()
    y = df_subset['num_sold'].to_numpy()
    # Trailing axis of size 1: each feature is fed to the LSTM as one timestep.
    X = X.reshape((X.shape[0], X.shape[1], 1))
    return X,y


def prepare_dataset_ml_test(df, holdout_years=(2017, 2018), n_chunk=None):
    """Build the hold-out arrays from the rows that fall in the hold-out years.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the lag columns ``num_sold-0 .. num_sold-{n_chunk-1}``,
        the encoded ``country``/``store``/``product`` columns, the calendar
        columns ``day``/``dow``/``month``/``year`` and the target ``num_sold``.
    holdout_years : iterable of int, optional
        Years selected for evaluation. Defaults to ``(2017, 2018)``.
    n_chunk : int, optional
        Number of lag columns to use. Defaults to the notebook-level
        ``N_CHUNK``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` has shape ``(rows, n_features, 1)``; ``y`` has shape ``(rows,)``.
    """
    if n_chunk is None:
        # Fall back to the notebook-wide setting, resolved at call time.
        n_chunk = N_CHUNK

    col_train = ['num_sold-{}'.format(i) for i in range(n_chunk)]
    col_train = col_train + ['country','store','product','day','dow','month','year']
    print(col_train)

    df_subset = df[df['year'].isin(list(holdout_years))]
    X = df_subset[col_train].to_numpy()
    y = df_subset['num_sold'].to_numpy()
    # Trailing axis of size 1: each feature is fed to the LSTM as one timestep.
    X = X.reshape((X.shape[0], X.shape[1], 1))
    return X,y

# Use 3 lagged timesteps (N_CHUNK = 3) as model inputs
# Augment the dataset with calendar features (month, year, day, season, day of week)

In [None]:

# Number of lagged num_sold columns created per row and fed to the model.
N_CHUNK = 3

# Derive the calendar features, then discard the row identifier.
df = augment_dataset(train)
df = df.drop(columns=['row_id'])
df


# Label-encode the categorical columns (country, store, product)

In [None]:
# Label-encode country, store and product (no LDA is computed, despite the name).
df_lda = compute_lda(df)
df_lda

In [None]:
# Add the lagged num_sold columns per (country, store, product) series and stack the series.
df_total = prepare_dataset(df_lda)
df_total

In [None]:
# Training split: every row whose year is neither 2017 nor 2018.
X_train,y_train= prepare_dataset_ml_train(df_total)

In [None]:
# Fit the LSTM on the training split (10 epochs, see execute_rnn).
model=execute_rnn(X_train,y_train)

In [None]:
# Hold-out split: the rows from 2017 and 2018, used to evaluate the fitted model.
X_test,y_test = prepare_dataset_ml_test(df_total)
pred = model.predict(X_test)
# One row of predictions per hold-out row.
pred.shape

In [None]:
from sklearn.metrics import mean_squared_error

# scikit-learn's signature is (y_true, y_pred): ground truth first, predictions second.
mean_squared_error(y_test, pred)

In [None]:
# Hold-out rows (2017 and 2018) with a proper datetime column for plotting.
df_ml_show = df_total[(df_total['year'] == 2017)|(df_total['year'] == 2018)].copy()

# pandas assembles datetimes directly from columns named year/month/day,
# which avoids building and re-parsing one string per row.
df_ml_show['date'] = pd.to_datetime(df_ml_show[['year', 'month', 'day']])

# `pred` must still be the hold-out predictions from model.predict(X_test);
# flatten its (rows, 1) shape to one value per row.
df_ml_show['pred'] = pred.ravel()

In [None]:
sns.set(rc={'figure.figsize':(22.7,8.27)})
# color_palette() only returns a palette; set_palette() is what applies it.
sns.set_palette("tab10")

# Draw both series on the same axes and label them so the legend tells them apart.
ax = sns.lineplot(data = df_ml_show, x="date",y="num_sold", label="num_sold (actual)")
sns.lineplot(data = df_ml_show, x="date",y="pred", label="pred (model)", ax=ax)
ax.set(xlabel="date", ylabel="num_sold", title="Hold-out period: actual vs. predicted num_sold")
ax.legend();

In [None]:
# Number of trailing training rows kept as history in front of the test rows.
OFFSET = 10

# Give the test set the same calendar features and label encoding as the training data.
df_test = augment_dataset(test).drop(columns= ['row_id'])
df_total_test = compute_lda(df_test)

# Forecast frame: the last OFFSET rows of the training frame followed by all test rows.
history_tail = df_total.iloc[-OFFSET:,:].copy()
df_new = pd.concat([history_tail, df_total_test]).reset_index()

# Feature columns in the order the model was trained on: lag columns first.
col_train = ['num_sold-{}'.format(i) for i in range(N_CHUNK)] + ['country','store','product','day','dow','month','year']


In [None]:
# Recursive one-step-ahead forecast: each prediction is written back into
# df_new['num_sold'] so that it becomes part of the lag window of later rows.
#
# NOTE(review): the lag window is taken positionally (the N_CHUNK rows directly
# above the current one) without grouping by country/store/product -- confirm
# that df_new is ordered so that those rows belong to the same series.

# Derive the lag column names from N_CHUNK instead of hard-coding three names.
lag_cols = ['num_sold-{}'.format(i) for i in range(N_CHUNK)]

for index in range(OFFSET,df_new.shape[0]):

    # The N_CHUNK values immediately preceding the row being predicted.
    last_values=list(df_new.iloc[index-N_CHUNK:index,:]['num_sold'])
    chunks=create_chunks(last_values,N_CHUNK)[N_CHUNK-1]

    df_new.loc[index,lag_cols] = chunks
    arr_to_pred = df_new.iloc[index][col_train].to_numpy(dtype="float64").reshape(1,len(col_train),1)
    # Separate name so the hold-out predictions stored in `pred` are not overwritten;
    # verbose=0 avoids one progress bar per forecast row.
    step_pred = model.predict(arr_to_pred, verbose=0)
    # predict returns shape (1, 1); store the scalar, not a length-1 array.
    df_new.loc[index,'num_sold']=float(step_pred[0, 0])

df_new
