In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
import dateutil.easter as easter

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [3]:
from torch.utils.data import Dataset, DataLoader

In [4]:
from accelerate import Accelerator
import torch.optim as optim

In [5]:
import time

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
from colorama import Fore, Back, Style
r_ = Fore.RED
b_ = Fore.BLUE
c_ = Fore.CYAN
g_ = Fore.GREEN
y_ = Fore.YELLOW
m_ = Fore.MAGENTA
sr_ = Style.RESET_ALL

In [9]:
torch.cuda.is_available()

True

In [10]:
original_train_df = pd.read_csv('./data/train.csv', parse_dates=['date'])
original_test_df = pd.read_csv('./data//test.csv', parse_dates=['date'])
gdp_df = pd.read_csv('./data/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv',
                    index_col='year')

original_train_df.head(2)

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520


In [11]:
original_train_df['product'].unique()

array(['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker'], dtype=object)

In [12]:
def smape_loss(y_true, y_pred):
    """SMAPE Loss"""
    return np.abs(y_true - y_pred) / (y_true + np.abs(y_pred)) * 200

In [13]:
gdp_df

Unnamed: 0_level_0,GDP_Finland,GDP_Norway,GDP_Sweden
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,234.44,385.802,505.104
2016,240.608,368.827,515.655
2017,255.017,398.394,541.019
2018,275.58,437.0,555.455
2019,268.782,405.51,533.88


# Feature engineering

In [14]:
# Feature engineering
def get_gdp(row):
    """Return the GDP based on row.country and row.date.year"""
    country = 'GDP_' + row.country
    return gdp_df.loc[row.date.year, country]

le_dict = {feature: LabelEncoder().fit(original_train_df[feature]) for feature in ['country', 'product', 'store']}

def engineer(df):
    """Return a new dataframe with the engineered features"""
    
    new_df = pd.DataFrame({'gdp': df.apply(get_gdp, axis=1),
                           'dayofyear': df.date.dt.dayofyear,
                           'wd4': df.date.dt.weekday == 4, # Friday
                           'wd56': df.date.dt.weekday >= 5, # Saturday and Sunday
                          })

    new_df.loc[(df.date.dt.year != 2016) & (df.date.dt.month >=3), 'dayofyear'] += 1 # fix for leap years
    
    for feature in ['country', 'product', 'store']:
        new_df[feature] = le_dict[feature].transform(df[feature])
        
    # Easter
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    new_df['days_from_easter'] = (df.date - easter_date).dt.days.clip(-5, 65)
    
    # Last Sunday of May (Mother's Day)
    sun_may_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-5-31')),
                                         2016: pd.Timestamp(('2016-5-29')),
                                         2017: pd.Timestamp(('2017-5-28')),
                                         2018: pd.Timestamp(('2018-5-27')),
                                         2019: pd.Timestamp(('2019-5-26'))})
    #new_df['days_from_sun_may'] = (df.date - sun_may_date).dt.days.clip(-1, 9)
    
    # Last Wednesday of June
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    new_df['days_from_wed_jun'] = (df.date - wed_june_date).dt.days.clip(-5, 5)
    
    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    new_df['days_from_sun_nov'] = (df.date - sun_nov_date).dt.days.clip(-1, 9)
    
    return new_df

train_df = engineer(original_train_df)
train_df['date'] = original_train_df.date # used in GroupKFold
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
train_df['target'] = np.log(train_df['num_sold'] / train_df['gdp'])
test_df = engineer(original_test_df)
test_df['date'] = original_test_df.date # used in GroupKFold

In [15]:
test_df

Unnamed: 0,gdp,dayofyear,wd4,wd56,country,product,store,days_from_easter,days_from_wed_jun,days_from_sun_nov,date
0,268.782,1,False,False,0,1,0,-5,-5,-1,2019-01-01
1,268.782,1,False,False,0,0,0,-5,-5,-1,2019-01-01
2,268.782,1,False,False,0,2,0,-5,-5,-1,2019-01-01
3,268.782,1,False,False,0,1,1,-5,-5,-1,2019-01-01
4,268.782,1,False,False,0,0,1,-5,-5,-1,2019-01-01
...,...,...,...,...,...,...,...,...,...,...,...
6565,533.880,366,False,False,2,0,0,65,5,9,2019-12-31
6566,533.880,366,False,False,2,2,0,65,5,9,2019-12-31
6567,533.880,366,False,False,2,1,1,65,5,9,2019-12-31
6568,533.880,366,False,False,2,0,1,65,5,9,2019-12-31


In [16]:
in_features = ['dayofyear', 'days_from_easter', 'days_from_sun_nov', 'days_from_wed_jun', 'wd4', 'wd56','country','store','product', 'target']

In [17]:
train_df

Unnamed: 0,gdp,dayofyear,wd4,wd56,country,product,store,days_from_easter,days_from_wed_jun,days_from_sun_nov,date,num_sold,target
0,234.440,1,False,False,0,1,0,-5,-5,-1,2015-01-01,329.0,0.338858
1,234.440,1,False,False,0,0,0,-5,-5,-1,2015-01-01,520.0,0.796629
2,234.440,1,False,False,0,2,0,-5,-5,-1,2015-01-01,146.0,-0.473593
3,234.440,1,False,False,0,1,1,-5,-5,-1,2015-01-01,572.0,0.891939
4,234.440,1,False,False,0,0,1,-5,-5,-1,2015-01-01,911.0,1.357343
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,555.455,366,False,False,2,0,0,65,5,9,2018-12-31,823.0,0.393169
26294,555.455,366,False,False,2,2,0,65,5,9,2018-12-31,250.0,-0.798327
26295,555.455,366,False,False,2,1,1,65,5,9,2018-12-31,1004.0,0.591960
26296,555.455,366,False,False,2,0,1,65,5,9,2018-12-31,1441.0,0.953305


In [18]:
grp_df_train = train_df.groupby('date', as_index=False).agg(target = ('target', list),
                             dayofyear =('dayofyear', list),
                             wd4 =('wd4', list),
                             wd56 = ('wd56', list),
                             country =('country', list),
                             product =('product', list),
                             store =('store', list),
                             days_from_easter =('days_from_easter', list),
                             days_from_wed_jun =('days_from_wed_jun', list),
                             days_from_sun_nov =('days_from_sun_nov', list),
                            )

train_df2 = pd.DataFrame({'date': grp_df_train['date'].values,
              'features':grp_df_train.apply(lambda x: np.array([np.array(x[f]) for f in in_features]), axis=1)  
})

In [19]:
grp_df_test = test_df.groupby('date', as_index=False).agg(
                             dayofyear =('dayofyear', list),
                             wd4 =('wd4', list),
                             wd56 = ('wd56', list),
                             country =('country', list),
                             product =('product', list),
                             store =('store', list),
                             days_from_easter =('days_from_easter', list),
                             days_from_wed_jun =('days_from_wed_jun', list),
                             days_from_sun_nov =('days_from_sun_nov', list),
                            )

test_df2 = pd.DataFrame({'date': grp_df_test['date'].values,
              'features':grp_df_test.apply(lambda x: np.array([np.array(x[f]) for f in in_features if f!='target']), axis=1)  
})

#### Config #####

In [20]:
config = {
    'seq_length' : 60,
    'num_epochs' : 300,
    'lr' : 0.001,
    'input_size' : 180,
    'hidden_size' : 360,
    'num_layers' : 2,
    'num_classes' :18, ## This is  output dimension
    'train_shuffle': True,
    'val_shuffle': True,
    'batch_size' : 30,
    'best_model_name' : 'lstm_tsp_mlp_head_drpOut_1.bin',
    'bidirectional' : False,
    'only_last_hidden': False
}
# config_lr = {'T_max':20,
#              'eta_min':0
#             }

#### Make sequences ####

In [21]:
def sliding_windows(data, seq_length):
        x = []
        y = []

        for i in range(len(data)-seq_length-1):
            _x = data[i:(i+seq_length),:].transpose(0,2,1).reshape(seq_length,-1)
            _y = data[i+seq_length,-1]
            x.append(_x)
            y.append(_y)

        return np.array(x),np.array(y)
    
def make_sequences(df,seq_length):
    data = np.rollaxis(np.dstack(df['features'].values.tolist()),-1)
    print('Data Shape', data.shape)
    
    x, y = sliding_windows(data, seq_length)

    print('X,y shapes', x.shape,y.shape)
    
    return x,y
    

In [22]:
X,y = make_sequences(train_df2,config['seq_length'])

Data Shape (1461, 10, 18)
X,y shapes (1400, 60, 180) (1400, 18)


## Model ##

In [23]:
num_epochs = config['num_epochs']
lr = config['lr']
input_size = config['input_size']
hidden_size = config['hidden_size']
num_layers = config['num_layers']
num_classes = config['num_classes']
seq_length = config['seq_length']
bidirectional = config['bidirectional']
only_last_hidden = config['only_last_hidden']

In [24]:
class LSTMTpsModel(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers,seq_length):
        super(LSTMTpsModel, self).__init__()
        
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True,bidirectional=bidirectional)
        
        if bidirectional:
            m=2
        else:
            m=1
        
        if only_last_hidden:
            input_dim = hidden_size*m
        else:
            input_dim = self.seq_length*hidden_size*m
        
        # self.fc = nn.Linear(input_dim, self.num_classes)
        
        self.fc = nn.Sequential(nn.Linear(input_dim, input_dim//8),
                                # nn.BatchNorm1d(num_features=input_dim//8),
                                nn.Dropout(0.2),
                                nn.ReLU(),
                                
                                nn.Linear(input_dim//8, input_dim//16),
                                # nn.BatchNorm1d(num_features=input_dim//16),
                                nn.Dropout(0.2),
                                nn.ReLU(),
                                
                                nn.Linear(input_dim//16, input_dim//32),
                                # nn.BatchNorm1d(num_features=input_dim//32),
                                nn.Dropout(0.2),
                                nn.ReLU(),
                                nn.Linear(input_dim//32, self.num_classes)
                                )

    def forward(self, x):
        # Propagate input through LSTM
        h_out, (_, _) = self.lstm(x)
        if only_last_hidden:
            h_out = h_out[:,-1:,:]
        h_out = h_out.flatten(start_dim=1)
        
        out = self.fc(h_out)
        return out

In [25]:
model = LSTMTpsModel(num_classes, input_size, hidden_size, num_layers,seq_length)

In [26]:
model.load_state_dict(torch.load(config['best_model_name']))
model.eval()

LSTMTpsModel(
  (lstm): LSTM(180, 360, num_layers=2, batch_first=True)
  (fc): Sequential(
    (0): Linear(in_features=21600, out_features=2700, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=2700, out_features=1350, bias=True)
    (4): Dropout(p=0.2, inplace=False)
    (5): ReLU()
    (6): Linear(in_features=1350, out_features=675, bias=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): ReLU()
    (9): Linear(in_features=675, out_features=18, bias=True)
  )
)

##### Predictions #####

In [27]:
data_train = np.rollaxis(np.dstack(train_df2['features'].values.tolist()),-1).transpose(0,2,1)

In [28]:
data_test = np.rollaxis(np.dstack(test_df2['features'].values.tolist()),-1).transpose(0,2,1)

In [29]:
print(data_train.shape,data_test.shape)

(1461, 18, 10) (365, 18, 9)


In [30]:
last_years_data = data_train[-seq_length:,:,:]

In [31]:
last_years_data.shape

(60, 18, 10)

In [32]:
predictions = []
with torch.no_grad():
    for i in range(len(test_df2)):
        if i == 0:
            inpt = torch.Tensor(last_years_data.reshape(seq_length,-1)).unsqueeze(dim=0)
            # print(inpt.shape, i)
        elif i < seq_length:
            inpt1 = torch.Tensor(last_years_data[-seq_length+i:])
            inpt2 = torch.Tensor(data_test[:i])
            inpt3 = torch.cat(predictions[:i], dim=0).unsqueeze(dim=2)
            inpt4 = torch.cat([inpt2,inpt3], dim=2)
            
            inpt = torch.cat([inpt1,inpt4],dim=0).reshape(seq_length,-1).unsqueeze(dim=0)
            # print(inpt.shape, i)
        else:
            inpt2 = torch.Tensor(data_test[i-seq_length:i])
            inpt3 = torch.cat(predictions[i-seq_length:i], dim=0).unsqueeze(dim=2)
            inpt = torch.cat([inpt2,inpt3], dim=2).reshape(seq_length,-1).unsqueeze(dim=0)
            # print(inpt.shape, i)
        
        
        out = model(inpt)
        predictions.append(out)

In [33]:
final_preds = [pred.squeeze().tolist() for pred in predictions]

In [34]:
grp_df_test['num_sold'] = final_preds

In [35]:
grp_df_test = grp_df_test[['date','country','store','product','num_sold']]

In [36]:
test_results = grp_df_test.explode(['country','store','product','num_sold'])

In [37]:
test_results

Unnamed: 0,date,country,store,product,num_sold
0,2019-01-01,0,0,1,0.550932
0,2019-01-01,0,0,0,0.984858
0,2019-01-01,0,0,2,-0.265351
0,2019-01-01,0,1,1,1.117378
0,2019-01-01,0,1,0,1.554794
...,...,...,...,...,...
364,2019-12-31,2,0,0,0.398305
364,2019-12-31,2,0,2,-0.868827
364,2019-12-31,2,1,1,0.52475
364,2019-12-31,2,1,0,0.945948


In [38]:
for feature in ['country', 'product', 'store']:
    test_results[feature] = le_dict[feature].inverse_transform(test_results[feature].values.tolist())

In [39]:
test_results['gdp'] = test_results.apply(get_gdp, axis=1)

In [40]:
test_results['num_sold'] = test_results.apply(lambda x: np.exp(x.num_sold)* x.gdp, axis=1)

In [41]:
test_results = test_results.drop(columns = 'gdp')

In [42]:
test_results = original_test_df.merge(test_results, on = ['date','country','store','product'])

In [43]:
test_results[['row_id', 'num_sold']].to_csv('submission_'+config['best_model_name']+'.csv',index=False)