In [None]:
!pip install py7zr

# Set the needed data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import glob

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset

import os
import time

from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import log_loss, accuracy_score


import py7zr
import os

# Walk the Kaggle input directory once: list every file and unpack the 7z
# archives into the current working directory (later cells read ./*.csv).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)
        # Only hand real 7z archives to py7zr; any other file in the input
        # tree would otherwise make SevenZipFile raise and abort the cell.
        if py7zr.is_7zfile(path):
            with py7zr.SevenZipFile(path, mode='r') as z:
                z.extractall()

os.environ['WANDB_CONSOLE'] = 'off'

# 3.a. Loading, Exploring and Preprocessing the Data
We have a few files that each provide different features for the data. We will load them separately and then merge them into one convenient dataframe each for train, validation and test.

In [None]:
# train.csv is reported to be around 5 GB, so the rows before this offset are
# skipped (row 0, the header, is kept). The original note says this keeps
# roughly the half of the data starting at 1.1.2016.
TRAIN_ROWS_TO_SKIP = range(1, 66458909)


def _log1p_positive(raw):
    """Parse one raw ``unit_sales`` field: log1p of positive values, 0 otherwise."""
    value = float(raw)
    if value > 0:
        return np.log1p(value)
    return 0


stores = pd.read_csv('./stores.csv')
items = pd.read_csv('./items.csv')
oil = pd.read_csv('./oil.csv', parse_dates=['date'])

train = pd.read_csv(
    './train.csv',
    converters={'unit_sales': _log1p_positive},
    parse_dates=['date'],
    skiprows=TRAIN_ROWS_TO_SKIP,
)
test = pd.read_csv('./test.csv', parse_dates=['date'])

We want to split the data roughly 20% / 80%, but we also do not want to cut a day in half. By our estimate, the 20% used for validation ends at about 2017-02-15.

In [None]:
# `pd.datetime` was an alias of the stdlib datetime class that current pandas
# no longer provides; pd.Timestamp compares against the parsed 'date' column
# in exactly the same way.
val_start = pd.Timestamp(2017, 1, 1)
val_end = pd.Timestamp(2017, 2, 15)

# Validation: 2017-01-01 .. 2017-02-15, both ends inclusive.
cut = train.loc[train['date'] >= val_start]
validation = cut.loc[cut['date'] <= val_end]

# Training: everything strictly after the validation window. We cut to only
# 2017 because the data set is huge.
train = train.loc[train['date'] > val_end]

We will show the start of the train set, the end of the validation set and the start of the test set.

In [None]:
# First rows of the training split (rows dated after 2017-02-15).
train.head()

In [None]:
# Last rows of the validation split (rows dated up to and including 2017-02-15).
validation.tail()

In [None]:
# First rows of the test file as loaded from ./test.csv.
test.head()

We now want to prepare our train, validation and test dataframes with all possible features (this will be relevant later). Some features, like item_nbr, are strings or identifiers, so we want to encode them as integer codes.

In [None]:
def _as_codes(column):
    """Return the integer category codes of a column."""
    return column.astype('category').cat.codes


# Item table: overwrite the categorical columns with their codes and add a
# dense code for the item number itself.
for name in ('family', 'class'):
    items[name] = _as_codes(items[name])
items['item_enc'] = _as_codes(items['item_nbr'])

# Store table: same treatment, plus a dense code for the store number.
for name in ('city', 'state', 'type', 'cluster'):
    stores[name] = _as_codes(stores[name])
stores['store_enc'] = _as_codes(stores['store_nbr'])

In [None]:
# Attach the item attributes from items.csv to each split (inner join on item_nbr).
df_train, df_val, df_test = (
    pd.merge(split, items, on='item_nbr')
    for split in (train, validation, test)
)

In [None]:
# Attach the store attributes to each split (inner join on store_nbr).
df_train, df_val, df_test = (
    pd.merge(split, stores, on='store_nbr')
    for split in (df_train, df_val, df_test)
)

In [None]:
# `pd.datetime` is not available in current pandas; use pd.Timestamp.
oil_start = pd.Timestamp(2017, 1, 1)
oil = oil[oil['date'] >= oil_start]
# Same back-fill as before for rows that exist but have no value
# (fillna(method=...) is deprecated in favour of .bfill()).
oil = oil.bfill()

# The oil table evidently has no row for some calendar days: after the inner
# merge on 'date' the notebook later observes only 5 distinct weekdays.
# Give every day from the cut-off to the last quote a row, carrying the last
# known price forward (and back-filling any leading gap), so that the merge
# no longer drops sales rows.
# NOTE(review): assumes 'date' is unique in oil.csv; reindex raises otherwise.
all_days = pd.date_range(oil_start, oil['date'].max(), freq='D')
oil = (
    oil.set_index('date')
       .reindex(all_days)
       .ffill()
       .bfill()
       .rename_axis('date')
       .reset_index()
)

In [None]:
# Attach the oil price to each split (inner join on date).
df_train, df_val, df_test = (
    pd.merge(split, oil, on='date')
    for split in (df_train, df_val, df_test)
)

In [None]:
# Day of week (Monday=0 .. Sunday=6) as a categorical feature on every split.
for frame in (df_train, df_val, df_test):
    frame['day'] = frame['date'].dt.dayofweek

In [None]:
# Display the merged training frame (sales + item, store, oil and day features).
df_train

We will now define a dataset class for our forecasting

## 3.b. Solid Benchmark

We will try and do a solid benchmark using random forest algorithm

In [None]:
print('Starting classic ML')
maxrow = 300000  # fit on the first `maxrow` rows only
non_feature_cols = ['date', 'store_nbr', 'item_nbr', 'unit_sales']
y_train = df_train['unit_sales']
x_train = df_train.drop(columns=non_feature_cols)
# Tree depth is capped at the number of feature columns. random_state is
# fixed so that re-running the notebook reproduces the same benchmark.
rf = RandomForestRegressor(max_depth=x_train.shape[1], n_jobs=-1, random_state=0)
benchmark = rf.fit(x_train.iloc[:maxrow], y_train.iloc[:maxrow])
print('Finished fitting')

In [None]:
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import r2_score as r2
y_val = df_val['unit_sales']
x_val = df_val.drop(columns=['date','store_nbr','item_nbr','unit_sales'])
maxrow = 20000  # score on the first `maxrow` validation rows only
preds_bm = rf.predict(x_val[:maxrow])
# `np.int` was removed from NumPy; the builtin `int` is the same dtype.
# NOTE(review): 'unit_sales' is already log1p-transformed when train.csv is
# loaded, so MSLE applies a second log here, and the integer cast truncates
# predictions made on that log scale - confirm this is the intended metric.
loss_val = msle(y_val[:maxrow].to_numpy(), preds_bm.astype(int))
print('The loss is: ',loss_val)
acc = r2(y_val[:maxrow].to_numpy(), preds_bm)
print(f'The R^2 is {acc}')

## 3.c. Preprocessing for embedding

First we will define a dataset that does not take the date column. After that we will build a feed-forward model with embedding layers, in which we will create embeddings for store_nbr and item_nbr (among other categorical features).

In [None]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")

In [None]:
class TimeDataset(Dataset):
    """Row-wise dataset over a feature DataFrame.

    The first column of ``data`` is dropped (the callers in this notebook put
    'date' there); the remaining columns are stored as one float32 tensor.
    Within that tensor column 0 is returned as the label and the remaining
    columns as the features.
    """

    def __init__(self, data):
        # DataFrame -> float32 ndarray -> tensor; from_numpy shares the
        # array's memory instead of copying it.
        values = data.iloc[:, 1:].astype(np.float32).to_numpy()
        self.data = torch.from_numpy(values)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        target = self.data[idx, 0]
        inputs = self.data[idx, 1:]
        return inputs, target

In [None]:
# Number of distinct stores, cities and items; the model classes below use
# these as the number of rows of the corresponding embedding tables.
count_store = stores['store_nbr'].nunique()
count_city = stores['city'].nunique()
count_item = items['item_nbr'].nunique()


In [None]:
class SalesModel(nn.Module):
    """Feed-forward regressor on embeddings of item, store, city and weekday.

    Input ``x`` is a 2-D tensor whose columns 0..3 hold the integer codes of
    item, store, city and day (in that order). The four embeddings are
    concatenated, passed through dropout and three linear layers, and a
    single value per row is returned with shape ``(batch, 1)``.

    Table sizes come from the module-level ``count_city``, ``count_store``
    and ``count_item``.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in a fixed order so that parameter registration
        # (and random initialisation order) stays stable.
        self.city_embd = nn.Embedding(count_city, 5)
        self.str_embd = nn.Embedding(count_store, 10)
        self.item_embd = nn.Embedding(count_item, 100)
        self.day_embd = nn.Embedding(7, 5)
        self.dropout = nn.Dropout(0.3)

        # Width of the concatenated embedding vector.
        self.n_emb = sum(
            table.embedding_dim
            for table in (self.city_embd, self.item_embd, self.str_embd, self.day_embd)
        )

        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(self.n_emb, 70)
        self.bn = nn.BatchNorm1d(70)
        self.fc2 = nn.Linear(70, 20)
        self.fc3 = nn.Linear(20, 1)

    def forward(self, x):
        # Column i of x indexes the i-th table below.
        tables = (self.item_embd, self.str_embd, self.city_embd, self.day_embd)
        embedded = [table(x[:, col].long()) for col, table in enumerate(tables)]

        hidden = self.dropout(torch.cat(embedded, 1))
        hidden = self.bn(self.relu(self.fc1(hidden)))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Instance of the first (embedding-only) model; trained in section 3.d.
model =SalesModel()

In [None]:
def train_model(val_loader, train_loader, learn_rate, s_model, EPOCHS=5,
                train_log_every=4000, val_log_every=2000):
    """Train ``s_model`` with Adam on an L1 loss, validating after every epoch.

    Parameters
    ----------
    val_loader, train_loader : iterables of ``(x, label)`` batches
        ``x`` is fed to the model as float; the model output is expected to
        have shape ``(batch, 1)`` and is compared against ``label``.
    learn_rate : float
        Learning rate for ``torch.optim.Adam``.
    s_model : nn.Module
        Model to train; it is moved to the module-level ``device``.
    EPOCHS : int
        Number of passes over ``train_loader``.
    train_log_every, val_log_every : int
        Every this many batches the running mean loss is printed and appended
        to the returned log lists (defaults match the previous fixed values).

    Returns
    -------
    (model, train_logs, val_logs)
        The trained model and the lists of running mean losses.
    """
    train_logs = []
    val_logs = []
    model = s_model.to(device)

    # Loss function and optimizer
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)

    # The models trained here are feed-forward networks, not GRUs, so the
    # banner no longer names an architecture.
    print("Starting training")
    epoch_times = []
    for epoch in range(1, EPOCHS + 1):
        start_time = time.process_time()  # CPU time of this process
        model.train()
        avg_loss = 0.  # running sum of batch losses for this epoch
        counter = 0
        for x, label in train_loader:
            counter += 1
            model.zero_grad()
            out = model(x.to(device).float())

            loss = criterion(out.float().squeeze(1), label.to(device).float())
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            if counter % train_log_every == 0:
                print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(epoch, counter, len(train_loader), avg_loss/counter))
                train_logs.append(avg_loss/counter)

        print("Starting validation for epoch {}".format(epoch))

        model.eval()
        val_loss = 0.
        counter_val = 0
        with torch.no_grad():
            for x, label in val_loader:
                counter_val += 1
                out = model(x.to(device).float())
                # Labels stay floating point: casting them to an integer
                # dtype first would truncate the regression targets and make
                # the validation loss incomparable with the training loss.
                loss = criterion(out.float().squeeze(1), label.to(device).float())
                val_loss += loss.item()

                if counter_val % val_log_every == 0:
                    print("Epoch {}......Step: {}/{}....... Validation loss for Epoch: {}".format(epoch, counter_val, len(val_loader), val_loss/counter_val))
                    val_logs.append(val_loss / counter_val)

        current_time = time.process_time()
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print("Epoch {}/{} Done, Total Training Loss: {}".format(epoch, EPOCHS, avg_loss/max(counter, 1)))
        print("Total Validation Loss: {}".format(val_loss/max(counter_val, 1)))
        print("Time Elapsed for Epoch: {} seconds".format(str(current_time-start_time)))
        epoch_times.append(current_time-start_time)
    print("Total Training Time: {} seconds".format(str(sum(epoch_times))))
    return model, train_logs, val_logs

## 3.d. Training the model and evaluating

In [None]:
batch_size = 1024
num_workers = 0
# TimeDataset drops the first column ('date'); the second column becomes the
# label slot (the target for train/val, the row id for test).
features = ['date','unit_sales','item_enc','store_enc','city','day']
features_test =['date','id','item_enc','store_enc','city','day']

test_set = TimeDataset(df_test.loc[:,features_test])
train_set = TimeDataset(df_train.loc[:,features])
val_set = TimeDataset(df_val.loc[:,features])

# Training batches are shuffled so that consecutive batches are not all drawn
# from the same stretch of the merged frame; the incomplete last batch is
# dropped so BatchNorm never sees a single-row batch in training mode.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, drop_last=True)
# Validation and test keep their order and keep the final partial batch, so
# that every row is scored / predicted.
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, drop_last=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, drop_last=False)

In [None]:
# Train the first model with learning rate 0.01 for train_model's default
# number of epochs; the logs are plotted in the next cell.
lr = 0.01
model_trained , train_logs, val_logs = train_model(val_loader,train_loader,learn_rate=lr,s_model=model)

In [None]:
# train_model appends one point to each log every fixed number of batches
# (2000 validation batches / 4000 training batches by default), not once per
# epoch, so the x-axis is a logging step rather than an epoch index.
plt.title('Validation Loss')
plt.ylabel('Loss')
plt.xlabel('Logging step (every 2000 validation batches)')
plt.plot(val_logs)
plt.show()
plt.title('Train Loss')
plt.plot(train_logs)
plt.ylabel('Loss')
plt.xlabel('Logging step (every 4000 training batches)')
plt.show()

In [None]:
def NWRMSLE(y, pred, w):
    """Normalized weighted root mean squared logarithmic error.

    Computes ``sqrt( sum(w * (log(pred + 1) - log(y + 1))**2) / sum(w) )``.

    Parameters
    ----------
    y, pred : array-like
        True and predicted values; both must be greater than -1.
    w : array-like
        Per-row weights, same length as ``y``.

    Returns
    -------
    float
        The weighted error.
    """
    y = np.asarray(y, dtype=float)
    pred = np.asarray(pred, dtype=float)
    w = np.asarray(w, dtype=float)

    # The weight multiplies the squared error; putting it inside the square
    # would apply w**2 in the numerator while normalising by sum(w).
    squared_log_error = (np.log1p(pred) - np.log1p(y)) ** 2
    return ((w * squared_log_error).sum() / w.sum()) ** 0.5

In [None]:
def evaluate(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect its predictions.

    Parameters
    ----------
    model : nn.Module
        Model to run; it is switched to eval mode.
    test_loader : iterable of ``(x, label)`` batches
        Only ``x`` is used; the label slot is ignored.

    Returns
    -------
    list of numpy.ndarray
        One flattened prediction array per batch, in loader order.
    """
    model.eval()
    outputs = []
    start_time = time.process_time()
    # No gradients are needed for inference; this also avoids keeping an
    # autograd graph alive for every batch.
    with torch.no_grad():
        for x, _ in test_loader:
            # Feed floats, as train_model does. Casting the whole batch to
            # long would truncate any continuous input column; the models
            # convert their categorical columns to long themselves.
            out = model(x.to(device).float())
            outputs.append(out.cpu().numpy().reshape(-1))
    print("Evaluation Time: {}".format(str(time.process_time()-start_time)))
    return outputs

In [None]:
# The test set has no labels, so we only collect the model's predictions
# (evaluate returns one array of outputs per batch).
out = evaluate(model, test_loader)

In our run we got a train loss of about 3.84 and a validation loss of about 5.32. We saw that the loss on both train and validation increased with each epoch, and that overall the validation loss increased. We conclude from this that we are overfitting to a pretty small feature set.

## 3.e. Better model

We will now add more features to our model to try to improve our prediction. Some of these features will not be embedded.


In [None]:
# Sizes of the additional embedding tables: one row per distinct code.
count_fam = items['family'].nunique()
count_class = items['class'].nunique()
count_typ = stores['type'].nunique()
count_clst = stores['cluster'].nunique()
# dayofweek codes are 0..6, and an embedding table must cover the largest
# index that can occur, not just the number of distinct days seen in
# df_train. (Only 5 distinct days were observed there - presumably because
# the inner merge with the oil table drops dates without an oil quote.)
count_days = 7

In [None]:
# Sanity check of the table sizes: cluster, store type, item class, item family.
print(count_clst,count_typ,count_class,count_fam)

In [None]:
class StoreNet(nn.Module):
    """Feed-forward regressor on seven embeddings plus two raw columns.

    Input ``x`` is a 2-D tensor. Columns 0..6 hold the integer codes of item,
    store, city, day, cluster, class and family (in that order); columns 7
    and 8 are passed through unembedded. The output has shape ``(batch, 1)``.

    Table sizes come from the module-level ``count_*`` variables.
    """

    def __init__(self):
        super().__init__()

        # Embedding tables (creation order is kept fixed so that parameter
        # registration and random initialisation order stay stable).
        self.city_embd = nn.Embedding(count_city, 10)
        self.str_embd = nn.Embedding(count_store, 15)
        self.item_embd = nn.Embedding(count_item, 100)
        self.day_embd = nn.Embedding(count_days, 5)
        self.clst_embd = nn.Embedding(count_clst, 5)
        self.class_embd = nn.Embedding(count_class, 40)
        self.fam_embd = nn.Embedding(count_fam, 20)

        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.2)

        # Width of the concatenated embeddings, and of the first linear
        # layer's input once the two raw columns are appended.
        self.n_emb = sum(
            table.embedding_dim
            for table in (
                self.city_embd, self.item_embd, self.str_embd, self.day_embd,
                self.clst_embd, self.class_embd, self.fam_embd,
            )
        )
        self.relu = nn.ReLU()
        self.inpt = self.n_emb + 2

        self.fc1 = nn.Linear(self.inpt, 60)
        self.bn_emb = nn.BatchNorm1d(self.n_emb)
        self.bn1 = nn.BatchNorm1d(60)
        self.bn2 = nn.BatchNorm1d(20)
        self.fc2 = nn.Linear(60, 20)
        self.fc3 = nn.Linear(20, 1)

    def forward(self, x):
        # Column i of x indexes the i-th table below.
        tables = (
            self.item_embd, self.str_embd, self.city_embd, self.day_embd,
            self.clst_embd, self.class_embd, self.fam_embd,
        )
        embedded = torch.cat(
            [table(x[:, col].long()) for col, table in enumerate(tables)], 1
        )
        raw = x[:, 7:9]

        # Normalise and regularise the embeddings, then append the raw columns.
        hidden = self.dropout1(self.bn_emb(embedded))
        hidden = torch.cat([hidden, raw], 1)

        hidden = self.bn1(self.relu(self.fc1(hidden)))
        hidden = self.dropout2(hidden)

        hidden = self.bn2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [None]:
batch_size = 1024
num_workers = 2
# TimeDataset drops the first column ('date'); the second column becomes the
# label slot (the target for train/val, the row id for test).
features = ['date','unit_sales','item_enc','store_enc','city','day','cluster','class','family','dcoilwtico','onpromotion']
features_test =['date','id','item_enc','store_enc','city','day','cluster','class','family','dcoilwtico','onpromotion']

test_set = TimeDataset(df_test.loc[:,features_test])
train_set = TimeDataset(df_train.loc[:,features])
val_set = TimeDataset(df_val.loc[:,features])

# Training batches are shuffled so that consecutive batches are not all drawn
# from the same stretch of the merged frame; the incomplete last batch is
# dropped so BatchNorm never sees a single-row batch in training mode.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
# Validation and test keep their order and keep the final partial batch, so
# that every row is scored / predicted.
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)

In [None]:
# Train a fresh StoreNet with the same learning rate and train_model's
# default number of epochs; the logs are plotted in the next cell.
lr = 0.01
model2, train_logs2, val_logs2 = train_model(val_loader,train_loader,learn_rate=lr,s_model=StoreNet())

In [None]:
# train_model appends one point to each log every fixed number of batches
# (2000 validation batches / 4000 training batches by default), not once per
# epoch, so the x-axis is a logging step rather than an epoch index.
plt.title('Validation Loss')
plt.ylabel('Loss')
plt.xlabel('Logging step (every 2000 validation batches)')
plt.plot(val_logs2)
plt.show()
plt.title('Train Loss')
plt.plot(train_logs2)
plt.ylabel('Loss')
plt.xlabel('Logging step (every 4000 training batches)')
plt.show()

No matter what we did with the model (adding features, increasing or decreasing the model complexity, playing with the learning rate), we still got roughly the same results as the first model, with a train loss of about 4.75 and an unreasonable validation loss. We can also see that the validation loss increased as the model continued to train, which leads us to think we are overfitting.

### **When we ran the code we got the warning 'can only test child process', which according to the PyTorch forums is still an active problem that needs to be solved on their end**

## 3.f. Insights

We will plot the embeddings of the store and item numbers we received from the training

In [None]:
maxrow = 30000  # limit the number of rows for memory allocation reasons
features_test =['item_enc','store_enc','city','day','cluster','class','family','dcoilwtico','onpromotion']
# .loc[:maxrow] is label-based and inclusive of `maxrow`.
x_test = torch.tensor(df_test.loc[:maxrow,features_test].astype(np.float32).values).to(device).float()
# Inference only: make sure dropout/batch-norm are in eval mode and do not
# build an autograd graph for the whole batch.
model2.eval()
with torch.no_grad():
    preds = model2(x_test)

In [None]:
# One row per predicted test row, keeping df_test's index so that the frame
# can later be filtered with masks built from df_test.
result = pd.DataFrame({'id': df_test.loc[:maxrow,'id']})
# The model output has shape (N, 1); flatten it explicitly to a 1-D array
# instead of relying on pandas to accept a 2-D tensor as a column.
result['pred'] = preds.detach().cpu().numpy().reshape(-1)

In [None]:
df_train_redu =df_train.loc[:maxrow,:]
# First five distinct items that occur in the reduced training frame.
indx = df_train_redu.loc[:,'item_enc'].unique()[:5]
y_max = np.zeros(len(indx))
y_min = np.zeros(len(indx))
list_result =[]
# `result` holds predictions for rows of df_test, so the rows belonging to an
# item must be selected with df_test's item codes; a mask built from the
# training frame would pick test predictions of unrelated items.
test_items = df_test.loc[:maxrow,'item_enc']
for i, item in enumerate(indx):
    # `np.float` was removed from NumPy; the builtin float is the same dtype.
    sales = df_train_redu.loc[df_train_redu['item_enc']==item,'unit_sales'].to_numpy(dtype=float)
    y_max[i] = sales.max()
    y_min[i] = sales.min()
    list_result += [result.loc[test_items==item,'pred']]

In [None]:
# Red: max and min observed unit_sales per item; blue: the predictions.
plt.scatter(indx,y_max,color='r')
plt.scatter(indx,y_min,color='r')
for item, item_preds in zip(indx, list_result):
    # One scatter call per item instead of one per point: same picture, far
    # fewer matplotlib artists.
    plt.scatter(np.full(len(item_preds), item), item_preds, color='b')
plt.ylabel('Unit Sales')
plt.xlabel('Item index')

We have plotted in red the max and min values of the unit sales of each of the items on the x axis, and in blue our predictions. We want to see whether our model predicted in a reasonable range of values

In [None]:
# Weight matrices of the store and item embedding layers of the trained
# StoreNet (one row per code); projected to 2-D with t-SNE below.
store_embd = model2.str_embd.weight
item_embd = model2.item_embd.weight

In [None]:
from sklearn.manifold import TSNE

# One row per store embedding, labelled with its row position + 1.
show_str = {(i+1):v.detach().cpu().numpy() for i,v in enumerate(store_embd)}
df_show_str = pd.DataFrame.from_dict(show_str,orient='index')
tsne = TSNE(perplexity=25,n_components=2,random_state=0)
f = tsne.fit_transform(df_show_str)
x = f[:,0]
y = f[:,1]
plt.figure(figsize=(12, 9))
plt.plot(x, y, 'ro')

# Separate loop-variable names so the coordinate arrays x and y are not
# rebound to the last point's scalars by the loop.
for label, x_pt, y_pt in zip(df_show_str.index, x, y):
    plt.annotate(label, xy=(x_pt, y_pt), xytext=(5, 2), textcoords='offset points')
plt.title('Store Embedding in 2 dimension')

In [None]:
# One row per item embedding, labelled with its row position + 1, projected
# to 2-D with the t-SNE object configured for the store plot.
item_vectors = item_embd.detach().cpu().numpy()
item_show = {row + 1: vector for row, vector in enumerate(item_vectors)}
df_item_show = pd.DataFrame.from_dict(item_show, orient='index')
f2 = tsne.fit_transform(df_item_show)


In [None]:
# 2-D t-SNE coordinates of the item embeddings.
x2 = f2[:,0]
y2 = f2[:,1]
# First figure: points only.
plt.figure(figsize=(12, 9)) 
plt.plot(x2, y2, 'ro')
plt.title('Item Embedding in 2 dimension no text')
plt.show()

# Second figure: same points, each annotated with its label from
# df_item_show.index (row position + 1 of the embedding matrix).
plt.figure(figsize=(12, 9)) 
plt.plot(x2, y2, 'ro')
for label, x, y in zip(df_item_show.index, x2, y2):
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points')
plt.title('Item Embedding in 2 dimension with text')        

We can see that the stores are pretty much uniformly distributed, but the items all align on some curve, from which we can't infer much

## 3.g. Feature Extractor

We will now use the model that we got as a feature extractor and use classical machine learning to get a similar result. We will again use random forest

In [None]:
class StoreNet_fex(nn.Module):
    """Feature extractor: a trained StoreNet-style network without its last layer.

    ``forward`` repeats the backbone's computation up to and including
    ``bn2`` and returns that activation instead of the final ``fc3`` output.

    Parameters
    ----------
    backbone : nn.Module, optional
        Network providing the ``*_embd``, ``bn_emb``, ``dropout1``, ``fc1``,
        ``relu``, ``bn1``, ``dropout2``, ``fc2`` and ``bn2`` layers. Defaults
        to the module-level ``model2``.
    """

    def __init__(self, backbone=None):
        super(StoreNet_fex, self).__init__()
        self.backbone = model2 if backbone is None else backbone
        # Extracted features must be deterministic: switch this module and
        # the backbone to eval mode so dropout is disabled and batch-norm
        # uses its running statistics, whatever mode the backbone was in.
        self.eval()

    def forward(self, x):
        net = self.backbone
        # Columns 0..6 are categorical codes, columns 7..8 are passed as-is.
        a = net.item_embd(x[:,0].long())
        b = net.str_embd(x[:,1].long())
        c = net.city_embd(x[:,2].long())
        d = net.day_embd(x[:,3].long())
        e = net.clst_embd(x[:,4].long())
        f = net.class_embd(x[:,5].long())
        g = net.fam_embd(x[:,6].long())
        seq = x[:,7:9]

        x = torch.cat((a,b,c,d,e,f,g),1)

        x = net.bn_emb(x)
        x = net.dropout1(x)
        x = torch.cat([x,seq],1)

        x = net.relu(net.fc1(x))
        x = net.bn1(x)
        x = net.dropout2(x)

        x = net.relu(net.fc2(x))
        x = net.bn2(x)

        return x

In [None]:
maxrow = 300000
# Per-row metric weights: 1 for non-perishable items, 1.25 for perishable.
# Built in one vectorised step; looping `for z in weight` iterates over the
# *values* (0/1), so indexing weight[z] with them only ever rewrote the rows
# labelled 0 and 1 and left every other weight at its raw 0/1 flag.
perishable = df_val.loc[:maxrow,'perishable']
weight = pd.Series(np.where(perishable == 0, 1.0, 1.25), index=perishable.index, name='weight')

In [None]:
from sklearn.metrics import mean_squared_log_error as msle
maxrow = 300000
features = ['item_enc','store_enc','city','day','cluster','class','family','dcoilwtico','onpromotion']
print('Getting model as a feature extrator')
model_fex = StoreNet_fex()
model_fex.eval()  # deterministic features: no dropout, running BN statistics
y_train = torch.tensor(df_train.loc[:maxrow,'unit_sales'].astype(np.float32).values).to(device).float()
x_train = torch.tensor(df_train.loc[:maxrow,features].astype(np.float32).values).to(device).float()
# Feature extraction needs no gradients; skipping the autograd graph keeps
# memory use down for this many rows.
with torch.no_grad():
    x_new = model_fex(x_train)

print('Starting classic ML')

# random_state is fixed so that re-running the notebook reproduces the result.
rf = RandomForestRegressor(max_depth = x_train.shape[1], n_jobs = -1, random_state=0)
# Hand plain NumPy arrays to scikit-learn rather than torch tensors.
fittedmodel = rf.fit(x_new.detach().cpu().numpy(), y_train.cpu().numpy())
print('Finished fitting')

In [None]:
features = ['item_enc','store_enc','city','day','cluster','class','family','dcoilwtico','onpromotion']
y_val = torch.tensor(df_val.loc[:maxrow,'unit_sales'].astype(np.float32).values).to(device).float()
x_val = torch.tensor(df_val.loc[:maxrow,features].astype(np.float32).values).to(device).float()
# Feature extraction needs no gradients.
with torch.no_grad():
    x_val_new = model_fex(x_val)

preds_bm = rf.predict(x_val_new.detach().cpu().numpy())
# `np.int` was removed from NumPy; the builtin `int` is the same dtype.
# NOTE(review): 'unit_sales' is already log1p-transformed when train.csv is
# loaded, so NWRMSLE applies a second log here, and the integer cast
# truncates predictions made on that log scale - confirm this is intended.
loss_val = NWRMSLE(y_val.cpu().numpy(), preds_bm.astype(int),weight)
print('The loss is: ',loss_val)

In [None]:
# The test set has no labels, so we only collect the model's predictions
# (evaluate returns one array of outputs per batch).
out = evaluate(model2, test_loader)

We can see that with a feature extractor we get a bad loss of about 2.54 using only a small part of the rows. We still do not fully understand where our neural network failed to predict the correct values, but using it as a feature extractor has led even the classical machine learning model, which gave good results at the start of this question, to fail.