# Inference

In [1]:
#imported python packages
import numpy as np
import pandas as pd
from math import sqrt

from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBRegressor
from xgboost import plot_importance
import lightgbm as lgb
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import mean_squared_error

import time
import pickle
import joblib

In [8]:
# Load the test set (columns: ID, shop_id, item_id); its rows define the IDs
# that the submission files below are built for.
test = pd.read_csv(filepath_or_buffer='Sales/test.csv')
test.info()
test.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   ID       214200 non-null  int64
 1   shop_id  214200 non-null  int64
 2   item_id  214200 non-null  int64
dtypes: int64(3)
memory usage: 4.9 MB


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [2]:
# Load the training table (contains date_block_num, the item_cnt_month target
# and the feature columns listed by .info() below).
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.info()
train_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150047 entries, 0 to 3150046
Data columns (total 28 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   date_block_num                  int64  
 1   shop_id                         int64  
 2   item_id                         int64  
 3   item_price                      float64
 4   item_cnt_day                    float64
 5   revenue                         float64
 6   item_cnt_month                  float64
 7   city_code                       int64  
 8   item_category_id                int64  
 9   type_code                       int64  
 10  subtype_code                    int64  
 11  date_avg_item_cnt               float64
 12  date_item_avg_item_cnt          float64
 13  date_shop_avg_item_cnt          float64
 14  date_cat_avg_item_cnt           float64
 15  date_shop_cat_avg_item_cnt      float64
 16  date_shop_type_avg_item_cnt     float64
 17  date_shop_subtype_avg_item_

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,revenue,item_cnt_month,city_code,item_category_id,type_code,...,date_city_avg_item_cnt,date_item_city_avg_item_cnt,date_type_avg_item_cnt,date_subtype_avg_item_cnt,item_avg_item_price,date_item_avg_item_price,date_shop_revenue,shop_avg_revenue,delta_revenue,month
0,0,59,22154,999.0,1.0,999.0,1.0,31,37,11,...,3.836,1.0,3.697,2.463,410.5,999.0,1633431.0,1292188.0,0.2642,0
1,0,25,2552,899.0,1.0,899.0,0.0,14,58,13,...,5.152,0.0,2.076,1.253,938.0,899.0,5376478.0,6185159.0,-0.1307,0
2,0,25,2552,899.0,-1.0,-899.0,0.0,14,58,13,...,5.152,0.0,2.076,1.253,938.0,899.0,5376478.0,6185159.0,-0.1307,0
3,0,25,2554,1709.05,1.0,1709.05,1.0,14,58,13,...,5.152,1.0,2.076,1.253,1709.0,1709.0,5376478.0,6185159.0,-0.1307,0
4,0,25,2555,1099.0,1.0,1099.0,1.0,14,56,13,...,5.152,1.0,2.076,1.226,1123.0,1099.0,5376478.0,6185159.0,-0.1307,0


In [3]:
# Remove the item_id column, as decided in the visualization notebook.
# A new frame is produced; train_df itself is left untouched.
train = train_df.drop(columns=['item_id'])

In [4]:
# Split by date_block_num:
#   blocks  < 33 -> training features / target
#   block  == 33 -> validation features / target
#   block  == 34 -> test features (no target is extracted for this block)
# The target column item_cnt_month is removed from every feature frame.
X_train_df = train.loc[train['date_block_num'] < 33].drop(columns=['item_cnt_month'])
Y_train_df = train.loc[train['date_block_num'] < 33, 'item_cnt_month']
X_valid = train.loc[train['date_block_num'] == 33].drop(columns=['item_cnt_month'])
Y_valid = train.loc[train['date_block_num'] == 33, 'item_cnt_month']
X_test = train.loc[train['date_block_num'] == 34].drop(columns=['item_cnt_month'])

In [5]:
# Restore the previously saved XGBoost model from disk.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
xgb_model = joblib.load(filename='xgb_model.joblib')

In [6]:
# XGBoost predictions for the test rows (X_test, i.e. date_block_num == 34),
# clipped to the range [0, 20].
Y_pred_xgb = xgb_model.predict(X_test).clip(min=0, max=20)

In [9]:
# Build the XGBoost submission: one predicted monthly item count per test ID.
# IDs are read from the test file's own ID column rather than the row index, so the
# submission stays correct even if the frame is ever filtered or re-indexed.
# NOTE(review): predictions are paired with IDs purely by position — this assumes the
# date_block_num == 34 rows of train are in the same order as the rows of test; confirm.
assert len(Y_pred_xgb) == len(test), "prediction count must match the number of test IDs"
submission_xgb = pd.DataFrame({
    "ID": test["ID"].to_numpy(),
    "item_cnt_month": Y_pred_xgb
})

In [10]:
# Display the XGBoost submission frame (ID, item_cnt_month) for a visual sanity check.
submission_xgb

Unnamed: 0,ID,item_cnt_month
0,0,0.007294
1,1,0.007135
2,2,0.007294
3,3,0.007294
4,4,0.007294
...,...,...
214195,214195,0.007135
214196,214196,0.007135
214197,214197,0.007135
214198,214198,0.007135


In [11]:
# Restore the previously saved LightGBM model from disk.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
lgb_model = joblib.load(filename='lgb_model.joblib')

In [12]:
# LightGBM predictions for the test rows (X_test, i.e. date_block_num == 34),
# clipped to the range [0, 20].
Y_pred_lgb = lgb_model.predict(X_test).clip(min=0, max=20)

In [13]:
# Build the LightGBM submission: one predicted monthly item count per test ID.
# IDs are read from the test file's own ID column rather than the row index, so the
# submission stays correct even if the frame is ever filtered or re-indexed.
# NOTE(review): predictions are paired with IDs purely by position — this assumes the
# date_block_num == 34 rows of train are in the same order as the rows of test; confirm.
assert len(Y_pred_lgb) == len(test), "prediction count must match the number of test IDs"
submission_lgb = pd.DataFrame({
    "ID": test["ID"].to_numpy(),
    "item_cnt_month": Y_pred_lgb
})

In [14]:
# Display the LightGBM submission frame (ID, item_cnt_month) for a visual sanity check.
submission_lgb

Unnamed: 0,ID,item_cnt_month
0,0,0.182818
1,1,0.182818
2,2,0.182818
3,3,0.182818
4,4,0.182818
...,...,...
214195,214195,0.183010
214196,214196,0.183010
214197,214197,0.183010
214198,214198,0.183010


In [15]:
# Reshape the data for the LSTM model: features become (samples, features, 1) and
# targets become (samples, 1).
# np.asarray() is used instead of `.values` so this cell can be re-run safely:
# X_valid, Y_valid and X_test are overwritten here, and on a second run they are
# already ndarrays (which have no `.values` attribute). The result of the first run
# is unchanged.
# NOTE: after this cell X_test is a 3-D array, not a DataFrame; re-run the split cell
# before re-running the XGB / LGBM prediction cells, which were fed the 2-D frame.
X_train = np.asarray(X_train_df).reshape((X_train_df.shape[0], X_train_df.shape[1], 1))
X_valid = np.asarray(X_valid).reshape((X_valid.shape[0], X_valid.shape[1], 1))

Y_train = np.asarray(Y_train_df).reshape((Y_train_df.shape[0], 1))
Y_valid = np.asarray(Y_valid).reshape((Y_valid.shape[0], 1))

X_test = np.asarray(X_test).reshape((X_test.shape[0], X_test.shape[1], 1))

In [16]:
# Restore the previously saved LSTM model from disk (loading it pulls in TensorFlow).
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): presumably the model was saved with joblib.dump in the training
# notebook — confirm, since pickled deep-learning models can be version-sensitive.
lstm_model = joblib.load(filename='lstm_model.joblib')

2022-12-12 21:00:28.455484: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 21:00:28.811066: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-12 21:00:28.952195: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-12 21:00:28.952231: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [17]:
# LSTM predictions for the reshaped test rows (X_test, i.e. date_block_num == 34),
# clipped to the range [0, 20].
Y_pred_lstm = lstm_model.predict(X_test).clip(min=0, max=20)



In [18]:
# Wrap the LSTM predictions in a single-column frame of monthly item counts.
# No explicit ID column is added here; the default row index serves as the ID
# when the frame is written out.
submission_lstm = pd.DataFrame(data=Y_pred_lstm, columns=['item_cnt_month'])

In [19]:
# Display the LSTM submission frame (row index plus item_cnt_month) for a visual sanity check.
submission_lstm

Unnamed: 0,item_cnt_month
0,0.666517
1,0.708629
2,0.670850
3,0.660386
4,0.710944
...,...
214195,0.640677
214196,0.630435
214197,0.640631
214198,0.653305


In [20]:
# Write the XGBoost submission; the row index is omitted because ID is already a column.
submission_xgb.to_csv(path_or_buf='submission_xgb.csv', index=False)

In [21]:
# Write the LSTM submission; this frame has no ID column, so the row index is
# written out under the header 'ID'.
submission_lstm.to_csv(path_or_buf='lstm_submission.csv', index_label='ID')

In [22]:
# Write the LightGBM submission; the row index is omitted because ID is already a column.
submission_lgb.to_csv(path_or_buf='submission_lgb.csv', index=False)