In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR as sk_SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lightgbm import LGBMRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSVs; defined once so the location can be
# changed in a single place instead of in every read_csv call.
DATA_DIR = "../input/competitive-data-science-predict-future-sales"

items = pd.read_csv(f"{DATA_DIR}/items.csv")
shops = pd.read_csv(f"{DATA_DIR}/shops.csv")
categories = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
train = pd.read_csv(f"{DATA_DIR}/sales_train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Print column dtypes, non-null counts and memory usage of the sales frame as loaded.
train.info()

In [None]:
# Number of items listed under each category id.
item_category = (
    items
    .groupby('item_category_id')['item_id']
    .count()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(20, 4))
sns.barplot(
    x=item_category.item_category_id,
    y=item_category.item_id,
    color='mediumblue',
    ax=ax,
)
ax.set(
    xlabel="Item Category ID",
    ylabel="Number of Item",
    title="Total Item Per Category",
)
sns.despine()

In [None]:
# Box plot of item_price to eyeball extreme values before filtering.
sns.boxplot(x = train['item_price'])

In [None]:
# Box plot of item_cnt_day to eyeball extreme values before filtering.
sns.boxplot(x = train['item_cnt_day'])

In [None]:
# Fold shop ids that are treated as the same shop into one id
# (0 -> 57, 1 -> 58, 11 -> 10, 40 -> 39).
train["shop_id"] = train["shop_id"].replace({0: 57, 1: 58, 11: 10, 40: 39})

# Row filter, applied in one pass:
#   * keep only shops that also appear in the test set
#   * remove outliers: price must be in (0, 45000), daily count in (0, 800)
keep_rows = (
    train["shop_id"].isin(test["shop_id"].unique())
    & (train["item_price"] > 0) & (train["item_price"] < 45000)
    & (train["item_cnt_day"] > 0) & (train["item_cnt_day"] < 800)
)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
train = train.loc[keep_rows].copy()

# Add item_categories column to training dataset.
# The category is looked up by item_id *value* rather than by row position in
# `items`, and in a single vectorised call instead of a Python loop over every
# sales row. An item_id missing from `items` yields NaN.
category_by_item = items.set_index("item_id")["item_category_id"]
train["item_categories"] = train["item_id"].map(category_by_item)

train.info()

In [None]:
# Total units sold by each shop over the whole training period.
shop_item = (
    train
    .groupby('shop_id')['item_cnt_day']
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(20, 4))
sns.barplot(
    x=shop_item.shop_id,
    y=shop_item.item_cnt_day,
    color='mediumblue',
    ax=ax,
)
ax.set(
    xlabel="Shop ID",
    ylabel="Number of Sales",
    title="Total Sales Per Shop",
)
sns.despine()

In [None]:
def monthly_sales(data):
    """Aggregate daily sales rows into monthly sums.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``date`` (strings whose first three
        characters are the day part, e.g. ``'15.01.2013'``),
        ``date_block_num``, ``shop_id``, ``item_id``, ``item_price``,
        ``item_categories`` and ``item_cnt_day``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (month, date_block_num, shop_id, item_id,
        item_price, item_categories) combination, with ``item_cnt_day``
        summed over the group and ``date`` converted to a datetime at the
        first day of the month. Note that ``item_price`` is part of the
        grouping key, so the same shop/item sold at two prices in one month
        produces two rows.
    """
    data = data.copy()
    # Drop the leading 'dd.' so that only 'mm.yyyy' remains (vectorised).
    data['date'] = data['date'].astype(str).str[3:]
    # Sum item_cnt_day per month and per remaining key column.
    data = (
        data
        .groupby(['date', 'date_block_num', 'shop_id', 'item_id',
                  'item_price', 'item_categories'])['item_cnt_day']
        .sum()
        .reset_index()
    )
    # An explicit format avoids relying on per-element date inference.
    data['date'] = pd.to_datetime(data['date'], format='%m.%Y')
    return data

# Monthly aggregation of the cleaned training frame; input to the trend plot.
monthly_data = monthly_sales(train)

In [None]:
def time_plot(data, x_col, y_col, title):
    """Plot total sales over time together with the yearly mean of those totals.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``x_col`` and ``y_col``. It may hold several rows per
        ``x_col`` value; they are summed before plotting.
    x_col : str
        Name of a datetime column used for the x axis.
    y_col : str
        Name of the numeric column holding the sales counts.
    title : str
        Title of the figure.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib Axes as a side effect.
    """
    fig, ax = plt.subplots(figsize=(20, 5))

    # Collapse to one total per x value. Without this, seaborn would draw the
    # *mean* of the rows sharing a date (plus a bootstrap band) under a label
    # that says "Total Sales".
    totals = data.groupby(x_col)[y_col].sum().reset_index()
    sns.lineplot(x=x_col, y=y_col, data=totals, ax=ax,
                 color='mediumblue', label='Total Sales')

    # Yearly mean of the per-date totals, so both lines share the same scale.
    yearly = totals.groupby(totals[x_col].dt.year)[y_col].mean().reset_index()
    yearly[x_col] = pd.to_datetime(yearly[x_col].astype(str), format='%Y')
    # Shift each point by half a year so it sits in the middle of its year.
    mid_year = yearly[x_col] + datetime.timedelta(6*365/12)
    sns.lineplot(x=mid_year, y=yearly[y_col], ax=ax,
                 color='red', label='Mean Sales')

    ax.set(xlabel="Date",
           ylabel="Sales",
           title=title)

    sns.despine()

In [None]:
# Draw the sales trend from the monthly aggregation.
time_plot(monthly_data, 'date', 'item_cnt_day', "Sales Trend")

In [None]:
# Re-check dtypes and row counts of the training frame before building model inputs.
train.info()

In [None]:
# train_data = train[train['date_block_num']<33]
# test_data = train[train['date_block_num']>=33]
# dl_train = train_data.drop(['date','date_block_num','item_price','item_cnt_day'],axis=1)
# dl_target = train_data['item_cnt_day']
# test_train = test_data.drop(['date','date_block_num','item_price','item_cnt_day'],axis=1)
# test_target = test_data['item_cnt_day']

# dl_train = np.array(dl_train)
# dl_target = np.array(dl_target)
# test_train = np.array(test_train)
# test_target = np.array(test_target)

# from sklearn.preprocessing import StandardScaler
# dl_train = StandardScaler().fit_transform(dl_train)
# test_train = StandardScaler().fit_transform(test_train)

In [None]:
# Model inputs: every column of `train` except the date fields, the price and
# the target itself.
dl_features = train.drop(columns=['date', 'date_block_num', 'item_cnt_day', 'item_price'])
dl_target = train['item_cnt_day'].to_numpy()

# Standardise the inputs. The fitted scaler is kept in `feature_scaler`
# (instead of being thrown away) so the identical transform can be applied to
# any other data that is fed to the model.
feature_scaler = StandardScaler()
dl_train = feature_scaler.fit_transform(dl_features.to_numpy())

In [None]:
# from sklearn.linear_model import LinearRegression
# lr = LinearRegression()
# lr.fit(dl_train, dl_target)
# yhat_val_lr = lr.predict(test_train).clip(0, 20)
# print('Validation RMSE:', mean_squared_error(test_target, yhat_val_lr, squared=False))

In [None]:
# Small fully connected regression network:
# 16 (LeakyReLU) -> dropout 0.2 -> 8 (sigmoid) -> 4 (sigmoid) -> 1 linear output.
model = Sequential([
    Dense(16, activation='LeakyReLU', input_shape=(dl_train.shape[1],)),
    Dropout(rate=0.2),
    Dense(8, activation='sigmoid'),
    Dense(4, activation='sigmoid'),
    Dense(1),
])
# The target is a continuous count, so classification accuracy carries no
# information here; track mean absolute error next to the MSE loss instead.
model.compile(optimizer='adam',
              loss='mse',
              metrics=['mse', 'mae'])
model.summary()

# validation_split=0.1 holds out 10% of the rows for per-epoch validation.
history = model.fit(dl_train, dl_target, epochs=16, batch_size=1024, validation_split=0.1)

In [None]:
# prediction = model.predict(test_train)

In [None]:
# import math
# print(math.sqrt(mean_squared_error(test_target,prediction)))

In [None]:
# Turn the values recorded by model.fit (history.history) into a DataFrame
# and show its first rows.
df_DL = pd.DataFrame(history.history)
df_DL.head()

In [None]:
# Training loss per epoch. The model is compiled with loss='mse', so the
# y axis is the mean squared error (not a root).
fig, ax = plt.subplots()
ax.plot(df_DL.index, df_DL['loss'], label='loss')
ax.set(xlabel='Epochs',
       ylabel='Mean Squared Error',
       title='DL loss function')
ax.legend();

In [None]:
# Peek at the first rows of the test frame.
test.head()

In [None]:
# Columns fed to the network, selected by name so the order is explicit and
# re-running this cell never leaks the prediction column into the inputs.
# NOTE(review): assumes these are exactly the columns left in the training
# matrix after date, date_block_num, item_cnt_day and item_price are dropped
# from `train` -- confirm against the training cell.
feature_cols = ['shop_id', 'item_id', 'item_categories']

test_Data = test.copy()

# Add the item_categories column: look the category up by item_id value (not
# by row position in `items`) in one vectorised call.
test_Data['item_categories'] = test_Data['item_id'].map(
    items.set_index('item_id')['item_category_id']
)

# Scale the test features with statistics learned from the TRAINING features.
# Fitting a separate scaler on the test set would map the same raw value to a
# different input than the one the model saw during training.
feature_scaler = StandardScaler().fit(train[feature_cols].to_numpy())
test_Data = feature_scaler.transform(test_Data[feature_cols].to_numpy())

# model.predict returns an (n, 1) array; flatten it to one value per row.
test['item_cnt_month'] = model.predict(test_Data).ravel()

In [None]:
# Start from the sample submission, overwrite its item_cnt_month column with
# the predictions (aligned on the row index), and write the result to disk.
sample_path = '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
submission = pd.read_csv(sample_path).assign(item_cnt_month=test['item_cnt_month'])
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as the cell output.
submission