In [None]:
!pip install py7zr
!pip install shap
!pip install catboost

In [None]:
%load_ext tensorboard
import os
import py7zr
import pandas as pd
import pickle
import datetime
from datetime import date, timedelta
import matplotlib.pyplot as plt
import shap
import numpy as np
import tensorflow as tf
from catboost import CatBoostRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score, mean_squared_error
from tensorflow.keras.layers import Input, Embedding, dot, Dot, add,Flatten, concatenate,Dropout, Dense,BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model 
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Directory prefix that every pd.read_csv call below prepends to the CSV file names.
PATH_DIR = '/kaggle/input/favoritagrocerysalesforecastingextracted/'

# Data Extractor

In [None]:

# Lookup tables; only the oil series needs its date column parsed.
oil = pd.read_csv(PATH_DIR+'oil.csv', parse_dates=["date"])
items = pd.read_csv(PATH_DIR+'items.csv')
stores = pd.read_csv(PATH_DIR+'stores.csv')

test = pd.read_csv(PATH_DIR+'test.csv', parse_dates=["date"])

# unit_sales is stored as log1p(unit_sales); non-positive values are mapped to 0.
train = pd.read_csv(
    PATH_DIR+'train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)
# `pd.np` was deprecated in pandas 1.0 and later removed; use numpy directly.
train.unit_sales = train.unit_sales.astype(np.float64)

# `pd.datetime` was deprecated alongside `pd.np`; pd.Timestamp compares the same way.
SPLIT_START = pd.Timestamp(2017, 5, 1)
SPLIT_CUTOFF = pd.Timestamp(2017, 5, 25)

train_1 = train.loc[train.date >= SPLIT_START]
# .copy() makes both splits independent frames, so the column assignments done
# later in preprocessing() do not act on slices of train_1.
train = train_1.loc[train_1.date > SPLIT_CUTOFF].copy()
validation = train_1.loc[train_1.date <= SPLIT_CUTOFF].copy()

In [None]:
# Last rows of the training split (rows dated after 2017-05-25).
train.tail()

In [None]:
# First rows of the validation split (rows dated 2017-05-01 through 2017-05-25).
validation.head()

In [None]:
# Row count of each split, one line per split.
for split_name, split_df in (('train', train), ('validation', validation)):
    print('Length of the ' + split_name, len(split_df))

In [None]:
# Show the test frame loaded from test.csv.
test

We explore the CSV files that we have and print information about them.

In [None]:
# (frame, label) pairs; for each one print a preview and the number of nulls per column.
dataframes = [(oil,"oil"),(items,"items"),(stores,"stores"),(test,"test"),(train,"train")]
for frame, label in dataframes:
    print("df NAME:", label)
    print(frame.head())
    print(frame.isnull().sum())
    print('========================')


Filling NaN values in the oil table using a centered 10-point Gaussian-weighted moving average

In [None]:
# Every date that occurs in any split, so the oil series gets a row for each of them.
# Series.append was removed in pandas 2.0; pd.concat gives the same result.
dates = pd.concat([train.date, test.date, validation.date])
unique_dates = pd.DataFrame({'date': dates.unique()})

# The outer merge adds a row (with a missing price) for each date absent from oil.csv.
oil = oil.merge(unique_dates, on='date', how='outer')
oil = oil.sort_values('date')

# Missing prices are taken from a centred 10-row Gaussian-weighted rolling mean (std=1).
oil = oil.fillna(oil['dcoilwtico'].rolling(10, min_periods=1, center=True, win_type='gaussian')
                 .mean(std=1).to_frame())

Preprocessing the data: transform the categorical values into the [0...length] format and perform min-max scaling
  on the continuous variables

In [None]:
def _encode_column(values, encoding, column):
  """Map raw ids to integer codes; raise KeyError if an id has no entry in `encoding`."""
  codes = values.map(encoding)
  if codes.isnull().any():
    unknown = values[codes.isnull()].unique()[:5].tolist()
    raise KeyError(f"{column}: values without an encoding, e.g. {unknown}")
  return codes.astype('int64')


def preprocessing(items,stores,oil,train,test,validation):
  """
  Encode the categorical columns as consecutive integers [0...n_unique) and
  min-max scale the continuous oil price.

  The inputs are copied first, so the frames passed in by the caller are left
  unmodified.

  Args:
    items(DataFrame): needs `item_nbr`, `class`, `family`
    stores(DataFrame): needs `store_nbr`, `city`, `type`, `state`, `cluster`
    oil(DataFrame): needs `dcoilwtico`
    train, test, validation(DataFrame): need `store_nbr` and `item_nbr`
  Output:
    items, stores, oil, train, test, validation. the encoded / scaled copies
    item_enc, cluster_enc, store_enc. dicts mapping original value -> integer code
  Raises:
    KeyError: if a frame holds a store or item id that is absent from `stores` / `items`
  """
  # Copies keep the function free of side effects on the caller's frames
  # (train and validation may themselves be slices of a larger frame).
  items, stores, oil = items.copy(), stores.copy(), oil.copy()
  train, test, validation = train.copy(), test.copy(), validation.copy()

  # Codes follow the order of first appearance in the lookup tables.
  item_enc = {v:k for (k,v) in enumerate(items.item_nbr.unique())}
  class_enc = {v:k for (k,v) in enumerate(items['class'].unique())}
  family_enc = {v:k for (k,v) in enumerate(items['family'].unique())}

  store_enc = {v:k for (k,v) in enumerate(stores.store_nbr.unique())}
  city_enc = {v:k for (k,v) in enumerate(stores['city'].unique())}
  type_enc = {v:k for (k,v) in enumerate(stores['type'].unique())}
  state_enc = {v:k for (k,v) in enumerate(stores['state'].unique())}
  cluster_enc = {v:k for (k,v) in enumerate(stores['cluster'].unique())}

  for frame in (train, test, validation):
    frame['store_nbr'] = _encode_column(frame['store_nbr'], store_enc, 'store_nbr')
    frame['item_nbr'] = _encode_column(frame['item_nbr'], item_enc, 'item_nbr')

  item_encodings = (('item_nbr', item_enc), ('class', class_enc), ('family', family_enc))
  for column, encoding in item_encodings:
    items[column] = _encode_column(items[column], encoding, column)

  store_encodings = (('store_nbr', store_enc), ('city', city_enc), ('type', type_enc),
                     ('state', state_enc), ('cluster', cluster_enc))
  for column, encoding in store_encodings:
    stores[column] = _encode_column(stores[column], encoding, column)

  # Scale the oil price to [0, 1]; float32 matches the dtype of the network input.
  scaler = MinMaxScaler()
  scaled = scaler.fit_transform(oil[['dcoilwtico']])
  oil['dcoilwtico'] = scaled.ravel().astype('float32')

  return items, stores, oil, train, test, validation, item_enc, cluster_enc, store_enc


items, stores, oil, train, test, validation, item_enc, cluster_enc, store_enc = preprocessing(items,stores,oil,train,test,validation)

Merging the different dataframes into one dataframe and describing the information

In [None]:
def merge_dfs(main_df, oil, stores, items):
    """
    Left-join the oil, store and item lookup tables onto a sales frame.

    Args:
     main_df(DataFrame): rows to enrich; needs `date`, `store_nbr` and `item_nbr`
     oil(DataFrame): joined on `date`
     stores(DataFrame): joined on `store_nbr`
     items(DataFrame): joined on `item_nbr`; must provide `perishable`
    Output:
      df. merged frame with an added `day` column (day of week) and without
          the `date` and `perishable` columns
      wightes. per-row evaluation weight: 1.25 where perishable is 1, 1.0 where it is 0
    """
    merged = main_df
    for lookup, key in ((oil, 'date'), (stores, 'store_nbr'), (items, 'item_nbr')):
        merged = merged.merge(lookup, on=key, how='left')

    merged['day'] = merged['date'].dt.dayofweek

    perishable_weight = {0: 1.0, 1: 1.25}
    sample_weights = merged['perishable'].map(perishable_weight)

    return merged.drop(['date', 'perishable'], axis=1), sample_weights


def information(items,stores):
    """
    Count the distinct values of each categorical feature.

    Output (in this order):
      n_class, n_store, n_type, n_city, n_state, n_cluster, n_items, n_family
    """
    def count_unique(frame, column):
        return len(frame[column].unique())

    item_counts = {col: count_unique(items, col)
                   for col in ('item_nbr', 'class', 'family')}
    store_counts = {col: count_unique(stores, col)
                    for col in ('store_nbr', 'type', 'city', 'state', 'cluster')}

    return (item_counts['class'],
            store_counts['store_nbr'],
            store_counts['type'],
            store_counts['city'],
            store_counts['state'],
            store_counts['cluster'],
            item_counts['item_nbr'],
            item_counts['family'])


# Join the oil, store and item tables onto each split; every call also returns
# the per-row evaluation weights of that split.
(df_train, wightes_train), (df_test, wightes_test), (df_validation, wightes_validtion) = [
    merge_dfs(split, oil, stores, items) for split in (train, test, validation)
]

# Number of distinct values per categorical feature.
n_class, n_store, n_type, n_city, n_state, n_cluster, n_items, n_family = information(items,stores)

print(df_train.describe(), end='\n\n')
print(df_train.dtypes)


In [None]:
# Show the merged training frame produced by merge_dfs.
df_train

In [None]:
def simple_to_Xy_array(df):
  """
  Build the inputs of the first (simple) DL regressor.

  Args:
    df(DataFrame):  merged data containing the four input columns and `unit_sales`
  Output:
    x. list of four Series, in the order store_nbr, item_nbr, cluster, day
    y. the `unit_sales` Series
  """
  input_columns = ('store_nbr', 'item_nbr', 'cluster', 'day')
  X = [df[column] for column in input_columns]
  y = df['unit_sales']
  return X, y

def to_Xy_array(df):
  """
  Build the inputs of the second (larger) DL regressor.

  Args:
    df(DataFrame):  merged data containing the eleven input columns and `unit_sales`
  Output:
    x. list of eleven Series, in the order store_nbr, item_nbr, cluster, day, city,
       family, type, state, class, onpromotion, dcoilwtico
    y. the `unit_sales` Series
  """
  input_columns = (
      'store_nbr', 'item_nbr', 'cluster', 'day',
      'city', 'family', 'type', 'state', 'class',
      'onpromotion', 'dcoilwtico',
  )
  X = [df[column] for column in input_columns]
  y = df['unit_sales']
  return X, y

def to_Xy_df(df):
  """
  Split a merged frame into features and target for the classical ML model.

  Every column except `unit_sales` is kept as a feature. An earlier hand-written
  feature list was never applied to the frame, so it has been removed instead of
  suggesting a column selection that does not happen.

  Args:
    df(DataFrame):  merged data containing `unit_sales`
  Output:
    x. DataFrame with all columns of `df` except `unit_sales`
    y. the `unit_sales` Series
  """
  target_class = 'unit_sales'

  X = df.drop(columns=[target_class])
  y = df[target_class]

  return X, y
        
# Training split in the three layouts used below: a feature DataFrame (to_Xy_df),
# the 4-column list (simple_to_Xy_array) and the 11-column list (to_Xy_array).
x_train_df, Y_train_df = to_Xy_df(df_train)
x_train_s, Y_train_s = simple_to_Xy_array(df_train)
x_train, Y_train = to_Xy_array(df_train)

# The same three layouts for the validation split.
x_val_df, Y_val_df = to_Xy_df(df_validation)
x_val_s, Y_val_s = simple_to_Xy_array(df_validation)
x_val, Y_val = to_Xy_array(df_validation)



In [None]:
def NWRMSLE(y, pred, w):
  """
  Normalized weighted root mean squared logarithmic error:

      sqrt( sum_i w_i * (ln(pred_i + 1) - ln(y_i + 1))**2 / sum_i w_i )

  The logarithm is taken inside this function, so `y` and `pred` are expected
  in the original (non-log) scale.

  Args:
    y(np.array):  actual values
    pred(np.array):  predicted values
    w(np.array):  weights of the samples
  Output:
    float. metric score
  """
  y = np.asarray(y, dtype=float)
  pred = np.asarray(pred, dtype=float)
  w = np.asarray(w, dtype=float)

  # The weight scales the squared log error; squaring (w * error) would apply w**2.
  squared_log_error = (np.log1p(pred) - np.log1p(y)) ** 2
  return np.sqrt((w * squared_log_error).sum() / w.sum())

# 3b. training CatBoost Model

In [None]:
# CatBoost baseline with default settings, fitted on every merged column except the target.
model = CatBoostRegressor()
# NOTE(review): `history_cat` is later handed to shap.Explainer as the model, so fit()
# presumably returns the fitted regressor rather than a training history — confirm.
history_cat = model.fit(x_train_df, Y_train_df)


In [None]:
# Validation predictions, floored at zero.
predictions = np.clip(model.predict(x_val_df), 0, None)
# Targets (and therefore predictions) are in log1p space because of the unit_sales
# converter used when train.csv is read, while NWRMSLE applies the logarithm itself.
# Map both back to the original scale so the log is not taken twice.
score = NWRMSLE(np.expm1(Y_val_df), np.expm1(predictions), wightes_validtion.values)
print('NWRMSLE CAT BOOST:',score)

In [None]:
def shap_expliner(model,X_train):
  """
  Draw a SHAP bar plot of the feature attributions of `model` on `X_train`.

  Args:
    model: fitted model accepted by shap.Explainer
    X_train(DataFrame): rows to explain
  """
  shap.initjs()
  attributions = shap.Explainer(model)(X_train)
  shap.plots.bar(attributions)
  # Alternative views of the same explanation, left disabled:
  #   shap.plots.waterfall(attributions[0])   # the first prediction only
  #   shap.plots.beeswarm(attributions)       # effects of all the features


shap_expliner(history_cat,x_train_df)

# 3c+e. Preprocessing steps to create the embedding models

* For the first model we embedded the following features:

  1.   store id
  2.  item id
  3. cluster
  4. day

* For the second model we also embedded the city, family, type, state and class features, and added the onpromotion and dcoilwtico features to the model.



In [None]:
# Mean squared logarithmic error, reported as an extra metric by the first model.
msle = tf.keras.losses.MeanSquaredLogarithmicError()

def get_embedding_model():
  """
  First simple model for 3c.

  Four integer inputs (store, item, cluster, day of week) are each embedded,
  concatenated and passed through Dense(100) -> Dense(50) -> Dense(1) with
  batch normalisation and dropout in between. The model is compiled with
  Adam(0.001), MSE loss and the MAE / MSLE metrics, its summary is printed,
  and the compiled model is returned.

  Relies on the module-level `n_store`, `n_items` and `n_cluster` counts.

  NOTE: later cells read this model's weights and layers by position
  (get_weights()[0], get_weights()[1], layers[-3]), so the order in which
  layers are created here should not be changed casually.
  """
  # One Input + Embedding pair per categorical feature; all embeddings are L2-regularised.
  store_in = Input(shape=(1,), dtype='int64', name='dim_store')
  s = Embedding(n_store, 15, input_length=1, embeddings_regularizer=l2(1e-4),)(store_in)

  item_in = Input(shape=(1,), dtype='int64', name='dim_item')
  m = Embedding(n_items, 50, input_length=1, embeddings_regularizer=l2(1e-4))(item_in)

  cluster_in = Input(shape=(1,), dtype='int64', name='din_cluster')
  c = Embedding(n_cluster, 5, input_length=1, embeddings_regularizer=l2(1e-4))(cluster_in)

  # Day of week: 7 possible values.
  day_in = Input(shape=(1,), dtype='int64', name='day_in')
  d = Embedding(7, 4, input_length=1, embeddings_regularizer=l2(1e-4))(day_in)



  # Regression head on top of the concatenated embeddings.
  x = concatenate([s, m, c, d])
  x = Flatten()(x)
  x = BatchNormalization()(x)
  x = Dense(100, activation='relu')(x)
  x = BatchNormalization()(x)
  x = Dropout(0.5)(x)
  # This Dense(50) output is what the later feature-extractor cell taps via layers[-3].
  x = Dense(50, activation='relu')(x)
  x = Dropout(0.5)(x)
  x = Dense(1)(x)
  # Input order must match the column order produced by simple_to_Xy_array.
  nn = Model([store_in,item_in,cluster_in,day_in], x)
  nn.compile(Adam(0.001), loss='mse',metrics=['mae',msle])
  nn.summary()
  return nn

def get_embedding_model_2():
  """
  More complex model for 3e.

  Nine embedded categorical inputs (store, item, cluster, day, city, family,
  type, state, class) plus two float inputs (onpromotion, dcoilwtico) are
  concatenated and passed through Dense(150) -> Dense(100) -> Dense(50) ->
  Dense(1) with batch normalisation and dropout. The model is compiled with
  the 'adam' optimizer, MSE loss and the MAE metric, its summary is printed,
  and the compiled model is returned.

  Relies on the module-level `n_*` counts of distinct values.
  """
  store_in = Input(shape=(1,), dtype='int64', name='dim_store')
  s = Embedding(n_store, 15, input_length=1, embeddings_regularizer=l2(1e-4),)(store_in)

  item_in = Input(shape=(1,), dtype='int64', name='dim_item')
  m = Embedding(n_items, 50, input_length=1, embeddings_regularizer=l2(1e-4))(item_in)

  cluster_in = Input(shape=(1,), dtype='int64', name='din_cluster')
  c = Embedding(n_cluster, 5, input_length=1, embeddings_regularizer=l2(1e-4))(cluster_in)

  day_in = Input(shape=(1,), dtype='int64', name='day_in')
  d = Embedding(7, 4, input_length=1, embeddings_regularizer=l2(1e-4))(day_in)

  # City and type need distinct variable names: sharing one name overwrote the city
  # embedding, so it never reached the concatenation and the type embedding was
  # concatenated twice.
  city_in = Input(shape=(1,), dtype='int64', name='city_in')
  ci = Embedding(n_city, 7, input_length=1, embeddings_regularizer=l2(1e-4))(city_in)

  family_in = Input(shape=(1,), dtype='int64', name='family_in')
  f = Embedding(n_family, 15, input_length=1, embeddings_regularizer=l2(1e-4))(family_in)

  type_in = Input(shape=(1,), dtype='int64', name='type_in')
  ty = Embedding(n_type, 3, input_length=1, embeddings_regularizer=l2(1e-4))(type_in)

  state_in = Input(shape=(1,), dtype='int64', name='state_in')
  st = Embedding(n_state, 5, input_length=1, embeddings_regularizer=l2(1e-4))(state_in)

  class_in = Input(shape=(1,), dtype='int64', name='class_inday_in')
  cl = Embedding(n_class, 20, input_length=1, embeddings_regularizer=l2(1e-4))(class_in)

  # Continuous inputs are declared as (1, 1) so they concatenate with the
  # (1, dim)-shaped embedding outputs along the last axis.
  onpromotion_in = Input(shape=(1,1), dtype='float32', name='onpromotion_in')
  dcoilwtico_in = Input(shape=(1,1), dtype='float32', name='dcoilwtico_in')

  x = concatenate([s,m,c,d,ci,f,ty,st,cl,onpromotion_in,dcoilwtico_in])
  x = Flatten()(x)
  x = BatchNormalization()(x)
  x = Dense(150, activation='relu')(x)
  x = BatchNormalization()(x)
  x = Dropout(0.5)(x)
  x = Dense(100, activation='relu')(x)
  x = BatchNormalization()(x)
  x = Dropout(0.2)(x)
  x = Dense(50, activation='relu')(x)
  x = BatchNormalization()(x)
  x = Dropout(0.1)(x)
  x = Dense(1)(x)
  # Input order must match the column order produced by to_Xy_array.
  nn = Model([store_in,item_in,cluster_in,day_in,city_in,family_in,type_in,state_in,class_in,onpromotion_in,dcoilwtico_in], x)
  nn.compile(optimizer='adam', loss='mse',metrics=['mae'])
  nn.summary()
  return nn

def train_embedding(X,y,X_val,y_val, better_model=False):
  """
  Build one of the two embedding networks and fit it.

  Args:
    X: list of input columns of the training split
    y: training target
    X_val: list of input columns of the validation split
    y_val: validation target
    better_model(bool): False -> get_embedding_model(), True -> get_embedding_model_2()
  Output:
    embedding_model. the fitted model
    history. the object returned by fit()
  """
  # Validation data is passed to fit(), so early stopping watches the validation
  # loss; the training loss keeps falling while a model overfits and would not
  # trigger it.
  callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
  embedding_model = get_embedding_model_2() if better_model else get_embedding_model()

  # One timestamped TensorBoard run directory per call.
  log_dir = '/kaggle/working/'+"logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
  history = embedding_model.fit(X, y, batch_size=512, epochs=10, validation_data=(X_val, y_val),callbacks=[callback,tensorboard_callback])

  return embedding_model, history

# 3d. Predicted targets using only the feature embeddings in a DL regressor

In [None]:
# Fit the simple four-input network (better_model is left at its default, False).
embedding_model_1, history_1 = train_embedding(x_train_s,Y_train_s,x_val_s,Y_val_s)

In [None]:
%tensorboard --logdir ./logs

In [None]:
def predict(model, x):
    """
    Run `model.predict` and return the predictions as a flat, non-negative array.

    Args:
      model: object exposing `predict(x, batch_size=..., verbose=...)`
      x: model inputs
    Output:
      1-D np.array with every negative prediction replaced by 0
    """
    # `workers` / `use_multiprocessing` only applied to generator or Sequence inputs
    # and are rejected by newer Keras releases, so they are no longer passed.
    y = model.predict(x, batch_size=128, verbose=1)
    y = np.asarray(y).reshape(-1)
    # An open upper bound floors at zero without calling max() on an empty result.
    return np.clip(y, 0, None)

# Score the simple embedding model on the validation split. Targets and predictions
# are in log1p space (unit_sales is log1p-transformed when train.csv is read) and
# NWRMSLE applies the logarithm itself, so both are mapped back with expm1 first.
predictions = predict(embedding_model_1, x_val_s)
score = NWRMSLE(np.expm1(Y_val_s), np.expm1(predictions), wightes_validtion.values)
print('NWRMSLE VALIDATION:',score)

In [None]:
def save_to_csv_test(df_test,embedding_model,name, complex_m=False, out_dir='/kaggle/working/'):
  """
  Predict unit sales for the test frame and write them to `<out_dir>/model_<name>.csv`.

  Args:
    df_test(DataFrame): merged test frame; must contain `id` and the model's input columns
    embedding_model: fitted model exposing `predict(list_of_columns, verbose=...)`
    name(str): suffix of the output file name
    complex_m(bool): False -> the 4 inputs of the simple model,
                     True -> the 11 inputs of the larger model
    out_dir(str): directory the csv is written to
  Output:
    None. The csv has the columns `id` and `unit_sales`.
  """
  simple_features = ['store_nbr', 'item_nbr', 'cluster', 'day']
  complex_features = simple_features + ['city', 'family', 'type', 'state', 'class',
                                        'onpromotion', 'dcoilwtico']
  features = complex_features if complex_m else simple_features
  x_test = [df_test[f] for f in features]

  y_log = np.asarray(embedding_model.predict(x_test, verbose=1)).reshape(-1)
  # The training target is log1p(unit_sales) (see the converter used when train.csv
  # is read), so predictions are floored at zero and mapped back with expm1 before
  # being written as unit_sales.
  unit_sales = np.expm1(np.clip(y_log, 0, None))

  # Ids come from the frame that produced the inputs, not from a global, so ids and
  # predictions always refer to the same rows.
  submission = pd.DataFrame({'id': df_test['id'].values, 'unit_sales': unit_sales})
  submission.to_csv(os.path.join(out_dir, 'model_'+name+'.csv'), index=False)

# Write the test predictions of the simple model to model_1.csv.
save_to_csv_test(df_test,embedding_model_1,'1',complex_m=False)

# 3e. Fitting the second model

In [None]:
# Fit the larger eleven-input network (better_model=True selects get_embedding_model_2).
embedding_model_2, history_2 = train_embedding(x_train,Y_train,x_val,Y_val,better_model=True)

In [None]:
# Score the larger embedding model against the target extracted alongside its own
# inputs (Y_val). Targets and predictions are in log1p space and NWRMSLE applies the
# logarithm itself, so both are mapped back with expm1 first.
predictions = predict(embedding_model_2, x_val)
score = NWRMSLE(np.expm1(Y_val), np.expm1(predictions), wightes_validtion.values)
print('NWRMSLE VALIDATION:',score)

In [None]:
# Write the test predictions of the larger model to model_2.csv.
save_to_csv_test(df_test,embedding_model_2,'2',complex_m=True)

# 3f. Insights from the embeddings of categorical features

We are plotting the embeddings of store and item ids

In [None]:
from sklearn.manifold import TSNE

# NOTE(review): assumes get_weights() lists the store embedding matrix first and the
# item embedding matrix second — confirm against embedding_model_1.summary().
weights = embedding_model_1.get_weights()
store_embeddings = weights[0]
item_embeddings = weights[1]

def display_wordlist(store_enc, store_embeddings,name):
    """
    Project embedding vectors to 2-D with t-SNE and plot them labelled by original id.

    Args:
      store_enc(dict): original id -> row index in the embedding matrix
      store_embeddings(np.array): embedding matrix, one row per encoded id
      name(str): entity name, shown in the plot title
    """
    # A row is addressed by its encoded index (the dict value). Indexing with
    # `original_id - 1` only works when ids happen to be 1..n in order and runs out
    # of range for ids larger than the matrix.
    word_labels = list(store_enc)
    vectors = [store_embeddings[store_enc[word]] for word in word_labels]
    df = pd.DataFrame(vectors, index=word_labels)

    # Use tsne to reduce to 2 dimensions. scikit-learn requires the perplexity to be
    # smaller than the number of samples, so cap it for small vocabularies.
    perplexity = min(65, max(1, len(df) - 1))
    tsne = TSNE(perplexity=perplexity, n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(df)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display plot
    plt.figure(figsize=(16, 8))
    plt.plot(x_coords, y_coords, 'ro')

    for label, x, y in zip(df.index, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points')
    # The margin is subtracted below and added above so no extreme point is cut off.
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.title(f't-SNE projection of the {name} embeddings')
    plt.show()

display_wordlist(store_enc,store_embeddings,'store')
display_wordlist(item_enc,item_embeddings,'item')

We can see that there is no significant difference between the stores, in contrast to the items, where several groups of items are visible. These results are correlated with the SHAP results.

# 3g. "Feature extractor" for a classical ML algorithm

We extracted 50 features from the last hidden Dense layer of the first model and used them as input to a CatBoost model.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import classification_report
import time


# Sub-model that stops at layers[-3] of the first network (its Dense(50) layer) and
# therefore turns each row into 50 learned features.
extract = Model(embedding_model_1.inputs, embedding_model_1.layers[-3].output)
features = extract.predict(x_train_s)
# Despite the name, these are the features of the validation split.
features_test = extract.predict(x_val_s)

# Generate a print
print('------------------------------Cat Boost--------------------------------')

clf = CatBoostRegressor()

# measure time to train model:
start = time.time()

# Train CatBoost on the extracted features
clf.fit(features,Y_train_df)

end = time.time()
print(f' time for pretrained model to train: {end - start}')

# Floor at zero like the other models; a prediction below -1 would make the
# logarithm inside the metric undefined.
y_pred = np.clip(clf.predict(features_test), 0, None)

# Validation error (regression metric). Targets and predictions are in log1p space
# and NWRMSLE applies the logarithm itself, so both are mapped back with expm1.
score = NWRMSLE(np.expm1(Y_val_df), np.expm1(y_pred), wightes_validtion.values)
print('NWRMSLE:',score)





In [None]:
# Recompute the validation score of the CatBoost-on-extracted-features model.
# Predictions are floored at zero, then targets and predictions are mapped back from
# log1p space because NWRMSLE applies the logarithm itself.
score = NWRMSLE(np.expm1(Y_val_df), np.expm1(np.clip(y_pred, 0, None)), wightes_validtion.values)
print('NWRMSLE:',score)