In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

Load data and get correct format.

In [None]:
# Lookup tables are read first so the sales frame can be built in one chain.
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
item_categories = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')

# Daily sales: parse the dd.mm.YYYY dates, attach each item's category,
# order the rows chronologically and derive the calendar month.
df = (
    pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
    .assign(date=lambda sales: pd.to_datetime(sales['date'], format='%d.%m.%Y'))
    .merge(items[['item_id', 'item_category_id']], how='left', on='item_id')
    .sort_values(by='date', ascending=True)
    .assign(month=lambda sales: sales['date'].dt.month)
)

df.head(5)

Define categorical and continuous features.

In [None]:
# Categorical columns; they are also used as the group-by keys of the monthly aggregation.
cat = ['date_block_num','shop_id', 'item_id','item_category_id','month'] # categorical features
# Continuous columns; the lag-building loop further down appends one name per lag to this list.
cont = ['item_price'] # continuous features
# Name of the target column.
output = 'item_cnt_day'

Aggregate the daily rows to monthly level.

In [None]:
# Collapse the daily rows to one row per combination of the `cat` keys:
# mean price and total units sold. String aggregator names are used because
# passing NumPy callables (np.mean / np.sum) to `agg` is deprecated in
# recent pandas releases; the computed values are the same.
df_agg = df.groupby(cat).agg({cont[0]: 'mean', output: 'sum'}).reset_index()
del df  # the daily frame is not referenced again below; release its memory

In [None]:
# Print column dtypes, non-null counts and memory usage of the monthly aggregate.
df_agg.info()

Calculate clusters to capture patterns in the high-dimensional categorical features.

In [None]:
# Elbow analysis: fit one MiniBatchKMeans per candidate cluster count and
# record its inertia (sum of squared distances to the closest centre).
# A fixed random_state makes the curve reproducible across kernel restarts.
list_cluster = ['shop_id','item_id','item_category_id','item_price']
cluster_counts = range(1, 20)
SSE = []
for cluster in cluster_counts:
    kmeans = MiniBatchKMeans(n_clusters=cluster, init='k-means++', random_state=101)
    kmeans.fit(df_agg[list_cluster])
    SSE.append(kmeans.inertia_)

# Collect the results in a frame and plot inertia against the cluster count.
frame = pd.DataFrame({'Cluster': list(cluster_counts), 'SSE': SSE})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.title('Elbow curve for MiniBatchKMeans')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia');

In [None]:
# Final clustering with 6 clusters (presumably read off the elbow plot - confirm).
# random_state is fixed so the cluster ids stored in `new_agg` are identical
# on every run; without it the labels change between kernel restarts.
kmeans = MiniBatchKMeans(n_clusters=6, init='k-means++', random_state=101)
kmeans.fit(df_agg[list_cluster])
new_agg = kmeans.predict(df_agg[list_cluster])
df_agg['new_agg'] = new_agg  # cluster id replaces item_id as a coarser categorical key

In [None]:
# Categorical features after clustering: same as `cat` but with the cluster id
# `new_agg` in place of `item_id`.
new_cat = ['date_block_num','shop_id', 'new_agg','item_category_id','month'] # categorical features

In [None]:
# Re-aggregate at cluster level: one row per combination of the `new_cat` keys
# with mean price and total units sold. String aggregator names replace the
# NumPy callables, whose use inside `agg` is deprecated in recent pandas.
df_agg_level_2 = df_agg.groupby(new_cat).agg({cont[0]: 'mean', output: 'sum'}).reset_index()

Encode to int categorical features.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Frames each encoder is fitted on, aligned position-by-position with `new_cat`.
# For date_block_num one extra value (max + 1) is added so that the month
# following the last observed one can be encoded later on.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
next_block = pd.DataFrame({'date_block_num': [df_agg_level_2['date_block_num'].max() + 1]})
list_dfs = [pd.concat([df_agg_level_2, next_block]),
            shops,
            df_agg_level_2,
            item_categories,
            df_agg_level_2]
assert len(list_dfs) == len(new_cat), 'one fitting frame is required per categorical column'

# One fitted encoder per categorical column, kept for encoding other frames later.
label_encoders = {}

for cat_col, fit_frame in zip(new_cat, list_dfs):
    encoder = LabelEncoder()
    encoder.fit(fit_frame[cat_col])
    df_agg_level_2[cat_col] = encoder.transform(df_agg_level_2[cat_col])
    label_encoders[cat_col] = encoder

In [None]:
# Build a string join key out of the given values.
def get_index(*args):
    """Return str() of every argument, each followed by '_', concatenated in order.

    With no arguments the result is the empty string.
    """
    return "".join(str(word) + '_' for word in args)

# Join key of every row: month block, shop, cluster and category.
df_agg_level_2['index_act'] = [
    get_index(*row_key)
    for row_key in zip(df_agg_level_2['date_block_num'],
                       df_agg_level_2['shop_id'],
                       df_agg_level_2['new_agg'],
                       df_agg_level_2['item_category_id'])
]

Get windows of 24 months.

In [None]:
n_lags = 24 # define number of lags, might it be a hyperparameter

# For every lag k = i + 1 add a column `<output>_t_<k>` holding the target of
# the same (shop, cluster, category) k month blocks earlier, or 0 when no such
# row exists.
for i in range(n_lags) :
    print(i)
    lag_col = output + '_t_' + str(i + 1)
    new_idx = 'index_ant_' + str(i + 1)

    # Key of the row that lies i + 1 blocks in the future: a row whose own key
    # (`index_act`) equals this value sees the current row as its lag.
    df_agg_level_2[new_idx] = np.vectorize(get_index)(df_agg_level_2['date_block_num'] + i + 1,
                                                      df_agg_level_2['shop_id'],
                                                      df_agg_level_2['new_agg'],
                                                      df_agg_level_2['item_category_id'])

    lagged_target = df_agg_level_2[[output, new_idx]].rename(columns={output: lag_col})
    df_agg_level_2 = pd.merge(df_agg_level_2.drop(columns=new_idx),
                              lagged_target,
                              how='left',
                              left_on='index_act',
                              right_on=new_idx)

    # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
    # temporary under pandas Copy-on-Write and would leave the NaNs in place.
    df_agg_level_2[lag_col] = df_agg_level_2[lag_col].fillna(0)
    print('Length of dataframe: {}'.format(str(len(df_agg_level_2))))

    # Register the lag as a continuous feature, without duplicating it if this
    # code is executed more than once in the same kernel.
    if lag_col not in cont:
        cont.append(lag_col)

    del df_agg_level_2[new_idx]
del df_agg_level_2['index_act']

Split data, 20% for test.

In [None]:
# Keep only the month blocks above n_lags - 1, then hold out a random 20% of
# those rows (fixed seed) as the test set; the remainder is the training set.
train = df_agg_level_2[df_agg_level_2['date_block_num'] > n_lags - 1]
test = train.sample(frac=0.2, random_state=101)
train = train.drop(index=test.index)

In [None]:
# Report the size of both splits.
print(f'Number of samples in train: {len(train)}')
print(f'Number of samples in test: {len(test)}')

In [None]:
# Print dtypes, non-null counts and memory usage of the lag-augmented frame.
df_agg_level_2.info()

Create tabular dataset for dataloaders.

In [None]:

class TabularDataset(Dataset):
    """PyTorch dataset over a pandas data frame with categorical and continuous columns.

    Every sample is the list ``[y, cont_x, cat_x]``:

    * ``y``      -- float32 array of shape (1,), the target (zeros when no
                    output column is given),
    * ``cont_x`` -- float32 array with the continuous features,
    * ``cat_x``  -- int64 array with the label-encoded categorical features.

    When a group of columns is absent a single zero column of the matching
    dtype is used, so the arrays have the same dtypes in every configuration.
    """

    def __init__(self, data, cat_cols=None, output_col=None):
        """
        Parameters
        ----------

        data: pandas data frame
        The data frame object for the input data. It must
        contain all the continuous, categorical and the
        output columns to be used. Every column that is neither
        categorical nor the output is treated as continuous.

        cat_cols: List of strings
        The names of the categorical columns in the data.
        These columns will be passed through the embedding
        layers in the model. These columns must be
        label encoded beforehand.

        output_col: string
        The name of the output variable column in the data
        provided.
        """

        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            # float32 like the real target, so loss functions get matching dtypes
            self.y = np.zeros((self.n, 1), dtype=np.float32)

        # A list copy also accepts tuples or other iterables of column names.
        self.cat_cols = list(cat_cols) if cat_cols else []
        excluded = set(self.cat_cols)
        if output_col:
            excluded.add(output_col)
        self.cont_cols = [col for col in data.columns if col not in excluded]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1), dtype=np.float32)

        if self.cat_cols:
            self.cat_X = data[self.cat_cols].astype(np.int64).values
        else:
            # int64 because embedding layers require integer indices
            self.cat_X = np.zeros((self.n, 1), dtype=np.int64)

    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of data as [y, cont_x, cat_x].
        """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

Get embeddings dimension.

In [None]:
# Number of distinct classes of every categorical feature, in `new_cat` order.
cat_dim = [len(label_encoders[name].classes_) for name in new_cat]
# (cardinality, embedding width) pairs; the width is half the cardinality
# rounded up, capped at 50.
emb_dim = list(zip(cat_dim, (min(50, (size + 1) // 2) for size in cat_dim)))

In [None]:
# Display the (cardinality, embedding width) pairs.
emb_dim

In [None]:
# Disabled variant of TabularModel (embeddings -> two Conv1d layers -> LSTM -> Linear).
# It is wrapped in a bare string literal, so none of it is executed and no class is defined here.
"""
class TabularModel(nn.Module):
    def __init__(self,emb_size,n_cont,out_size,layers, hidden_lstm,p):
        
        super().__init__()
        
        self.embeds = nn.ModuleList([nn.Embedding(ns,nd) for ns, nd in emb_size]) 
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        self.hidden_lstm = hidden_lstm

        layerlist = []
        n_emb = sum((nd for ns,nd in emb_size))
        n_in = n_emb + n_cont

        self.n_in = n_in
        self.conv1d_1 = nn.Conv1d(n_in,32,kernel_size=1)
        self.conv1d_2 = nn.Conv1d(32,64,kernel_size=1)
        self.relu = nn.ReLU(inplace=True)

        # Add an LSTM layer:
        self.lstm = nn.LSTM(64,hidden_lstm)      
        self.linear = nn.Linear(hidden_lstm,out_size)        
        self.hidden = (torch.zeros(1,1,hidden_lstm),
                       torch.zeros(1,1,hidden_lstm))

    def forward(self,x_cont,x_cat):
        embeddings = []
        for i, e in enumerate(self.embeds) : 
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings,1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x,x_cont],1)
         
        x = x.view(-1,self.n_in,1)
        x = self.conv1d_1(x)  
        x = self.relu(x)  
        x = self.conv1d_2(x)  
        x = self.relu(x)  

        lstm_out, self.hidden = self.lstm(x.view(len(x),1,-1), self.hidden)
        pred = self.linear(lstm_out.view(len(x),-1))

        return pred
"""

Create the tabular model using torch.

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network for tabular data.

    Each categorical feature goes through its own embedding layer and the
    continuous features through batch normalisation; both parts are
    concatenated and passed through a stack of Linear -> ReLU -> BatchNorm ->
    Dropout blocks followed by a final Linear output layer.
    """

    def __init__(self,emb_size,n_cont,out_size,layers, hidden_lstm,p):

        """
        Parameters
        ----------

        emb_size: list of Tuples
        One (number of classes, embedding width) pair per categorical feature

        n_cont: int
        Number of continuous features

        out_size: int
        Dimension of the output

        layers: list of Integers
        Dimension of the hidden layers; may be empty, in which case the
        output layer is applied directly to the concatenated inputs

        hidden_lstm: int
        Only stored on the instance; it is not used by this architecture

        p: float
        Dropout probability, used for the embeddings and every hidden block

        """

        super().__init__()

        self.embeds = nn.ModuleList([nn.Embedding(ns,nd) for ns, nd in emb_size])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        self.hidden_lstm = hidden_lstm

        layerlist = []
        n_emb = sum(nd for _, nd in emb_size)
        n_in = n_emb + n_cont  # width of the concatenated input vector

        for width in layers :
            layerlist.append(nn.Linear(n_in,width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # n_in is now the width of the last hidden layer, or the input width
        # when `layers` is empty (indexing layers[-1] would raise IndexError).
        layerlist.append(nn.Linear(n_in,out_size))
        self.layers = nn.Sequential(*layerlist)

    def forward(self,x_cont,x_cat):
        """Return the predictions for one batch.

        x_cont holds the continuous features (one row per sample) and x_cat
        the integer-encoded categorical features, one column per embedding.
        """
        # Column i of x_cat is looked up in the i-th embedding table.
        embeddings = [emb(x_cat[:,i]) for i, emb in enumerate(self.embeds)]
        x = torch.cat(embeddings,1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x,x_cont],1)
        x = self.layers(x)

        return x


Define model to train.

In [None]:
# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
torch.manual_seed(100)  # fixed seed for the weight initialisation
# Two hidden layers (200 and 100 units), dropout 0.5, one output value.
model = TabularModel(emb_size=emb_dim,
                     n_cont=len(cont),
                     out_size=1,
                     layers=[200, 100],
                     hidden_lstm=50,
                     p=0.5).to(device)

In [None]:
# Display the module structure of the model.
model

In [None]:
# Mean squared error; the training loop takes its square root to obtain RMSE.
criterion = nn.MSELoss()
# Adam over all parameters of `model` with learning rate 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Both loaders use a batch size equal to the full split, i.e. every epoch
# consists of exactly one batch.
train_tabular = TabularDataset(train, cat_cols=new_cat, output_col=output)
batchsize_train = len(train)
dataloader_train = DataLoader(train_tabular,
                              batch_size=batchsize_train,
                              shuffle=True,
                              num_workers=1)

test_tabular = TabularDataset(test, cat_cols=new_cat, output_col=output)
batchsize_test = len(test)
dataloader_test = DataLoader(test_tabular,
                             batch_size=batchsize_test,
                             shuffle=True,
                             num_workers=1)

Train model.

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')  # NOTE(review): silences every warning from here on

import time
start_time = time.time()

epochs = 60
losses = []       # training RMSE of the last batch of each epoch (plain floats)
test_losses = []  # validation RMSE of each epoch (plain floats)

for i in range(1, epochs + 1):
    # --- training: dropout active, batch-norm statistics updated ---
    model.train()
    for y, x_cont, x_cat in dataloader_train:  # batch -> y, cont_x, cat_x
        y = y.to(device)
        x_cont = x_cont.to(device)
        x_cat = x_cat.to(device)

        y_pred = model(x_cont, x_cat)
        loss = torch.sqrt(criterion(y_pred, y))  # RMSE

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # .item() stores a float instead of a tensor that keeps the autograd graph
    # (and possibly GPU memory) alive and cannot be plotted directly.
    train_loss = loss.item()
    losses.append(train_loss)

    # --- validation: dropout off, batch norm uses its running statistics ---
    model.eval()
    squared_error = 0.0
    n_seen = 0
    with torch.no_grad():
        for y, x_cont, x_cat in dataloader_test:
            # The batch must live on the same device as the model.
            y = y.to(device)
            x_cont = x_cont.to(device)
            x_cat = x_cat.to(device)

            y_val = model(x_cont, x_cat)
            squared_error += torch.sum((y_val - y) ** 2).item()
            n_seen += len(y)
    # RMSE over all validation samples, not only over the last batch.
    val_loss = (squared_error / n_seen) ** 0.5
    test_losses.append(val_loss)

    if i % 5 == 0:
        print(f'epoch: {i:3}/{epochs:3}  loss: {train_loss:10.8f} validation: {val_loss:10.8f}')


print(f'epoch: {i:3}  loss: {train_loss:10.8f} validation: {val_loss:10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:

# We can attempt with 48 epochs.

# float() accepts plain numbers as well as 0-dim tensors (including tensors
# on the GPU or attached to the autograd graph), which cannot be plotted directly.
plt.figure(figsize=(17,5))
plt.plot([float(value) for value in losses], label='training loss')
plt.plot([float(value) for value in test_losses], label='validation loss')
plt.title('Loss at the end of each epoch')
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.legend();

Fit the model with all data.

In [None]:
# Fresh model with the same architecture and seed, to be trained on all rows.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
torch.manual_seed(100)
final_model = TabularModel(emb_size=emb_dim,
                           n_cont=len(cont),
                           out_size=1,
                           layers=[200, 100],
                           hidden_lstm=50,
                           p=0.5).to(device)
# The loss and the optimizer are re-created so that they refer to `final_model`.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(final_model.parameters(), lr=0.01)

In [None]:
# Every row above month block n_lags - 1, with no hold-out this time.
all_data = df_agg_level_2[df_agg_level_2['date_block_num'] > n_lags - 1].copy()

# As before, the whole data set is served as one single batch.
all_tabular = TabularDataset(all_data, cat_cols=new_cat, output_col=output)
batchsize_all = len(all_data)
dataloader_all = DataLoader(all_tabular,
                            batch_size=batchsize_all,
                            shuffle=True,
                            num_workers=1)


In [None]:
import time
start_time = time.time()

epochs = 45
losses = []  # training RMSE of the last batch of each epoch (plain floats)

final_model.train()  # make sure dropout / batch norm are in training mode
for i in range(1, epochs + 1):
    for y, x_cont, x_cat in dataloader_all:  # batch -> y, cont_x, cat_x
        y = y.to(device)
        x_cont = x_cont.to(device)
        x_cat = x_cat.to(device)

        y_pred = final_model(x_cont, x_cat)
        loss = torch.sqrt(criterion(y_pred, y))  # RMSE

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Store a float rather than the tensor, which would keep the autograd
    # graph alive and cannot be plotted directly.
    epoch_loss = loss.item()
    losses.append(epoch_loss)

    if i % 5 == 0:
        print(f'epoch: {i:3}/{epochs:3}  loss: {epoch_loss:10.8f} ')


In [None]:
# float() accepts plain numbers as well as 0-dim tensors (including tensors
# on the GPU or attached to the autograd graph), which cannot be plotted directly.
plt.figure(figsize=(17,5))
plt.plot([float(value) for value in losses], label='training loss')
plt.title('Loss at the end of each epoch')
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.legend();

Get values (price and item category) for test.

In [None]:
# Load the test pairs and attach each item's category.
df_test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
df_test = pd.merge(df_test,items[['item_id','item_category_id']], how='left', on='item_id')
df_test['month'] = 11  # presumably the calendar month of the block to predict - confirm
# First block after the last one present in the training aggregate.
df_test['date_block_num'] = df_agg_level_2['date_block_num'].max() + 1

# Join with dim_price (mean price per item) because the clustering needs a price.
dim_price = df_agg.groupby(['item_id','item_category_id']).agg({'item_price': 'mean'}).reset_index()
df_test = pd.merge(df_test,dim_price, how='left', on=['item_id','item_category_id'])
# Items without any recorded sale get price 0. The result is assigned back:
# `df[col].fillna(..., inplace=True)` acts on a temporary under pandas
# Copy-on-Write and would leave the NaNs in place.
df_test['item_price'] = df_test['item_price'].fillna(0)
df_test.info()


Calculate clusters.

In [None]:

# Assign every test row to one of the clusters of the already fitted `kmeans`,
# using the same feature columns (`list_cluster`) it was fitted on.
df_test['new_agg'] = kmeans.predict(df_test[list_cluster])
df_test.info()

Encode test.

In [None]:
# encode for embedding layers
# Re-uses the encoders fitted on the training data so the integer codes match.
# NOTE(review): LabelEncoder.transform is expected to raise on labels it was
# not fitted on - confirm that all test values are covered.
for cat_col in new_cat:
    df_test[cat_col] = label_encoders[cat_col].transform(df_test[cat_col])

In [None]:
# Item-level join key on the monthly aggregate (month block, shop, item,
# category), used to look up the lags of the test rows.
df_agg['index_act'] = [
    get_index(*row_key)
    for row_key in zip(df_agg['date_block_num'],
                       df_agg['shop_id'],
                       df_agg['item_id'],
                       df_agg['item_category_id'])
]

Get windows for test.

In [None]:
n_lags = 24 # define number of lags, might it be a hyperparameter

# NOTE(review): these lags are item-level sales (keyed by item_id), whereas the
# lags the model was trained on are totals per (shop, cluster, category).
# The two are on different scales - confirm that this is intended.
for i in range(n_lags) :
    print(i)
    lag_col = output + '_t_' + str(i + 1)
    new_idx = 'index_ant_' + str(i + 1)

    # Key of the row i + 1 month blocks before the block being predicted.
    df_test[new_idx] = np.vectorize(get_index)(df_test['date_block_num'] - i - 1,
                                               df_test['shop_id'],
                                               df_test['item_id'],
                                               df_test['item_category_id'])

    df_test = pd.merge(df_test,
                       df_agg[[output,'index_act']].rename(columns={output: lag_col}),
                       how='left',
                       left_on=new_idx,
                       right_on='index_act')

    # No sale recorded for that month -> 0. The result is assigned back:
    # `df[col].fillna(..., inplace=True)` acts on a temporary under pandas
    # Copy-on-Write and would leave the NaNs in place.
    df_test[lag_col] = df_test[lag_col].fillna(0)
    print('Length of dataframe: {}'.format(str(len(df_test))))

    # Drop both helper key columns (the merge brought `index_act` along).
    df_test = df_test.drop(columns=[new_idx, 'index_act'])
del df_agg['index_act']


Calculate predictions with the final model.

In [None]:
# Feature frame for inference: categorical columns followed by the continuous ones.
test_final = df_test[new_cat + cont]
batchsize_test = len(test_final)

# No output column: the dataset yields zeros as target.
test_tabular = TabularDataset(test_final, cat_cols= new_cat)
# shuffle must be False here: the predictions are written back to df_test by
# position, so a shuffled loader would attach every prediction to the wrong ID.
dataloader_test = DataLoader(test_tabular, batchsize_test, shuffle=False, num_workers=1)

In [None]:
# Pull only the first batch out of the loader (handy for inspecting shapes).
k, (y, x_cont, x_cat) = next(enumerate(dataloader_test))

In [None]:
final_model.eval()  # inference mode: dropout off, batch norm uses running statistics
batch_predictions = []
with torch.no_grad() :
    for k, (y,x_cont,x_cat) in enumerate(dataloader_test):
        # The inputs must live on the same device as the model.
        batch_output = final_model(x_cont.to(device), x_cat.to(device))
        # Keep the output of every batch (on the CPU), not only of the last one.
        batch_predictions.append(batch_output.cpu())
y_val = torch.cat(batch_predictions)

In [None]:
# Sales counts cannot be negative. Only a lower bound is needed: the former
# upper bound was the maximum prediction itself, which changes nothing in the
# normal case but makes clamp return negative values when every prediction
# is below zero.
y_val_clip = y_val.clip(min=0)
# detach/cpu are no-ops for a CPU tensor without gradient, but are required
# before .numpy() if the predictions live on the GPU.
y_val_clip_round = y_val_clip.round().detach().cpu().numpy()

In [None]:
# Positional assignment: this is only correct when the predictions are in the
# same row order as df_test, i.e. the prediction DataLoader must not shuffle.
df_test['item_cnt_month'] = y_val_clip_round

Quick sanity check: compare the predicted total with the totals of past months.

In [None]:
# Total predicted units next to the totals of lags 1, 2 and 12, one per line.
print('\n'.join(
    prefix + ': ' + str(df_test[column].sum())
    for prefix, column in (('0', 'item_cnt_month'),
                           ('1', 'item_cnt_day_t_1'),
                           ('2', 'item_cnt_day_t_2'),
                           ('12', 'item_cnt_day_t_12'))
))

Get sample submission.

In [None]:
# Write the ID / prediction pairs, without the frame index, as the submission file.
df_test.to_csv('sample_submission.csv', columns=['ID', 'item_cnt_month'], index=False)