In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Libraries for Mathematical Computation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Library for Modelling 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error 

#Libraries for Deep Learning
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten

#Library for Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print them one per line in os.walk order.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Manipulation

In [None]:
#Reading the competition CSV files: sales history for training, rows to predict for testing
train_data=pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
)
test_data=pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
)

In [None]:
#Previewing the first five rows of the Training Data
train_data.head(n=5)

In [None]:
#Previewing the first five rows of the Test Data
test_data.head(n=5)

**Preparing Training Data**

In [None]:
#Displaying metadata about the Training Data: column names, dtypes, non-null counts and memory usage
train_data.info()

In [None]:
#Displaying summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the Training Data
train_data.describe()

Here, we can see that item_price and item_cnt_day contain negative numbers. A negative item price does not make sense, so these negative values are converted to their positive (absolute) values.

In [None]:
#Replacing negative prices and daily counts with their absolute values
#NOTE(review): negative item_cnt_day values may represent returns rather than data errors - confirm before flipping the sign
for column in ('item_price','item_cnt_day'):
    train_data[column]=train_data[column].abs()

In [None]:
#Computing the percentage of missing (Null) values in each column of the training data
missing_counts=train_data.isnull().sum()
null_percent=100*(missing_counts/len(train_data))
print(null_percent)

As the item count per month is to be predicted, the daily counts in item_cnt_day will be summed into a monthly total, item_cnt_month.

In [None]:
#Collapsing the rows to one per (date_block_num, shop_id, item_id) group:
#  item_price     -> last price seen within the group
#  item_cnt_month -> sum of the group's item_cnt_day values
group_keys=['date_block_num','shop_id','item_id']
train_data=(
    train_data
    .groupby(group_keys)
    .agg(item_price=('item_price','last'),item_cnt_month=('item_cnt_day','sum'))
    .reset_index()
)
print(train_data)

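**Correlation is a good way to check whether the data columns have a linear relationship with item_cnt_month. The closer the correlation value is to 1, the stronger the positive linear relationship; values near -1 indicate a strong negative relationship, and values near 0 indicate little or no linear relationship.**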

In [None]:
#Computing the pairwise correlation matrix of the training columns
correlation=train_data.corr()

#Drawing the matrix as an annotated heatmap on a 12x12 inch figure
fig,ax=plt.subplots(figsize=(12,12))
corr_heatmap=sns.heatmap(correlation,annot=True,cmap="GnBu",ax=ax)

# Preparing Test Data

Since the test data only provides the feature columns shop_id and item_id, the other columns the models are trained on, date_block_num and item_price, must be added to it.

In [None]:
#Building the Test feature frame: keep shop_id/item_id and put date_block_num first
#An explicit copy is taken so that columns assigned to test_data afterwards are written
#to an independent frame rather than a slice of the loaded one (avoids SettingWithCopyWarning)
test_data=test_data[['shop_id','item_id']].copy()
#NOTE(review): 34 is presumably the index of the month to predict (one past the last training month) - confirm
test_data.insert(0,'date_block_num',34)
print(test_data)

In [None]:
#Adding the latest item price from the training data into test data by item id
#last() takes each item's final row in train_data's current row order
#to_dict() keeps the item_id keys as integers; building the dict from .values would
#push ids and prices through one 2-D array and cast the ids to float when prices are float
item_price=train_data.groupby('item_id')['item_price'].last().to_dict()
#Items that have no entry in the lookup are left as NaN by map()
test_data['item_price']=test_data['item_id'].map(item_price)
print(test_data)

In [None]:
#Showing the distinct prices, then filling the missing ones with the median of the known test prices
print(test_data['item_price'].unique())
median_price=test_data['item_price'].median()
test_data['item_price']=test_data['item_price'].fillna(median_price)
test_data['item_price']

# Preparing for Modelling

In [None]:
#Splitting the training data into features (x_train) and target (y_train)
x_train=train_data.drop(columns='item_cnt_month')
y_train=train_data['item_cnt_month']
#Selecting the test features by the training column names guarantees the same columns
#in the same order, instead of relying on test_data's layout happening to match
x_test=test_data[x_train.columns]
x_train.shape,y_train.shape

# Training the Model

**Linear Regression**

In [None]:
#Fitting an ordinary least-squares Linear Regression on the training set
linear_model=LinearRegression()
linear_model.fit(X=x_train,y=y_train)

In [None]:
#Predicting on the test features with the fitted Linear Regression
linear_prediction=linear_model.predict(X=x_test)

In [None]:
#Displaying the Linear Regression output: the predicted item count per month for each test row
print(linear_prediction)

**Random Forest Regression**

In [None]:
#Building Model in Random Forest Regressor
#A fixed random_state makes the forest's random sampling repeatable,
#so re-running the notebook reproduces the same model and predictions
rr_model=RandomForestRegressor(n_estimators=50,random_state=42)
rr_model.fit(x_train,y_train)

In [None]:
#Predicting on the test features with the fitted Random Forest
rr_predict=rr_model.predict(X=x_test)
print(rr_predict)

**Lasso Regression**

In [None]:
#Creating a Lasso Regression model (alpha=1.0) and fitting it on the training set
lasso_model=Lasso(alpha=1.0).fit(x_train,y_train)

#Predicting on the test features
lasso_predict=lasso_model.predict(X=x_test)
print(lasso_predict)

**Ridge Regression**

In [None]:
#Creating a Ridge Regression model (alpha=1.0) and fitting it on the training set
r_model=Ridge(alpha=1.0).fit(x_train,y_train)

#Predicting on the test features
r_predict=r_model.predict(X=x_test)
print(r_predict)

# Deep Learning

**Building Model**

In [None]:
# Detect and initialise the TPU, then build a distribution strategy for it.
# tf.distribute.TPUStrategy is the stable replacement for the deprecated
# tf.distribute.experimental.TPUStrategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # NOTE(review): the resolver is expected to raise ValueError when no TPU is
    # attached - confirm for this runtime. Falling back to the default strategy
    # lets the notebook run on CPU/GPU instead of stopping here.
    tpu = None
    tpu_strategy = tf.distribute.get_strategy()
    print("No TPU detected; using the default distribution strategy")

# Instantiating the model inside the strategy scope creates its variables on the
# devices managed by that strategy
with tpu_strategy.scope():
    NN_model = Sequential()

    # The Input Layer: one input per training feature column
    NN_model.add(Dense(128, kernel_initializer='normal', input_dim=x_train.shape[1], activation='relu'))

    # The Hidden Layers
    NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))

    # The Output Layer: a single linear unit for the regression target
    NN_model.add(Dense(1, kernel_initializer='normal', activation='linear'))

    # Compile the network: mean absolute error is both the loss and the reported metric
    NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

**Define a checkpoint callback**

In [None]:
# File-name template; the epoch number and validation loss are filled in when a checkpoint is saved
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
# save_best_only=True: write a checkpoint only when the monitored val_loss improves
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

**Training Model**

In [None]:
# Train for 500 epochs, holding out 20% of the rows for validation and saving checkpoints via the callback.
# NOTE(review): 268,435,456 (2**28) samples per replica presumably exceeds the training set,
# which would make every epoch a single full-batch step - confirm this is intended
NN_model.fit(
    x_train,
    y_train,
    epochs=500,
    batch_size=268_435_456 * tpu_strategy.num_replicas_in_sync,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [None]:
#Predicting on the test features with the trained network
DNN_predict=NN_model.predict(x=x_test)
print(DNN_predict)

# Saving the Output in CSV file

In [None]:
#Loading the submission template and filling its item_cnt_month column with the network's predictions
#NOTE(review): the template's id column presumably identifies test rows (not items), and its rows are
#assumed to be in the same order as test.csv - confirm
sample_submission= pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv")
#The network has a single output unit, so [:,0] flattens the predictions to one value per row
#Bracket assignment always writes a DataFrame column; attribute assignment would silently create
#a plain Python attribute instead if the column were ever missing from the template
sample_submission['item_cnt_month']=DNN_predict[:,0]
DNN_result=sample_submission
print(DNN_result)

In [None]:
#Writing the predictions to a CSV file in the working directory, without the DataFrame index
output_path="Sales_Prediction.csv"
DNN_result.to_csv(output_path,index=False)
print("Completed")