In [None]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
pd.options.display.max_columns = 1000
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)

# 1. Load Data

In [None]:
# Read every competition CSV into its own data frame
INPUT_DIR = "../input/competitive-data-science-predict-future-sales"


def _load_csv(file_name):
    """Read one comma-separated file from the competition input folder."""
    return pd.read_csv(f"{INPUT_DIR}/{file_name}", sep=",")


item_categories = _load_csv("item_categories.csv")
items = _load_csv("items.csv")
shops = _load_csv("shops.csv")
test = _load_csv("test.csv")
sales_train = _load_csv("sales_train.csv")

# 2. Explore Data

In [None]:
# Peek at the first row of every dataset
for frame in (item_categories, items, shops, test, sales_train):
    display(frame.head(1))

separator = "-------------------------------------------"

# Check for null values in sales_train
print(separator)
print("\nNull values:")
display(sales_train.isnull().sum())
print(separator)

# Check columns in sales_train.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
sales_train.info()
print(separator)
print("Max value in col 'item_cnt_day': ", sales_train['item_cnt_day'].max())
print("Min value in col 'item_cnt_day': ", sales_train['item_cnt_day'].min())
print(separator)

**Observations:**

No null values in the 'sales_train' dataset

# 3. Data Preprocessing

**3.1 Find Outliers**

In [None]:
# Box plots to spot outliers in sales_train['item_cnt_day'] and sales_train['item_price']
cols = ['item_cnt_day','item_price']
fig, ax = plt.subplots(ncols = len(cols), figsize = (5 * len(cols),6), sharex = True)
fig.subplots_adjust(wspace=0.5)

for axis, col in zip(ax, cols):
    axis.boxplot(sales_train[col])
    axis.set_xlabel(col)
    # The vertical axis of a box plot shows the column's values, not a count
    axis.set_ylabel("Value")

**Observations:**
* item_cnt_day has outlier > 2000
* item_price has outlier > 300000

**Next:**
* Remove observed outliers

In [None]:
# Remove Outliers
# Thresholds are the ones read off the box plots: item_cnt_day > 2000 and
# item_price > 300000.
ITEM_CNT_DAY_OUTLIER = 2000
ITEM_PRICE_OUTLIER = 300000

# Flag EVERY row above either threshold (not just the first match), so the cell
# keeps working if more than one row qualifies and is a harmless no-op when it
# is re-run after the outliers are already gone.
is_outlier = (
    (sales_train['item_cnt_day'] > ITEM_CNT_DAY_OUTLIER)
    | (sales_train['item_price'] > ITEM_PRICE_OUTLIER)
)

# Keep the remaining rows and rebuild a contiguous 0..n-1 index
sales_train = sales_train.loc[~is_outlier].reset_index(drop=True)

**3.2 Find Anomalies**

In [None]:
# Line plots to spot anomalies in sales_train['item_cnt_day'] and sales_train['item_price']
cols = ['item_cnt_day','item_price']
fig, ax = plt.subplots(ncols = len(cols), figsize = (5 * len(cols),6), sharex = True)
fig.subplots_adjust(wspace=0.5)

for axis, col in zip(ax, cols):
    # Plotting a Series draws its values (y) against its index (x)
    axis.plot(sales_train[col])
    axis.set_title(col)
    axis.set_xlabel("Row index")
    axis.set_ylabel("Value")

**Observations:**

* Column 'item_cnt_day' has some negative values, which could be items that were 'returned'. Since we will be considering monthly counts, we will leave the negative values in so that we get a correct count of items sold when we aggregate by month.

# 4. Feature Engineering

In [None]:
# Largest 'date_block_num' in the training data, i.e. the last month of history
sales_train_max_month = sales_train['date_block_num'].max()

# Tag every test row with the month that follows the training history
test['date_block_num'] = sales_train_max_month + 1

# Stack training and test rows so both go through the same monthly aggregation
sales_temp = pd.concat([sales_train, test])

# One row per (month, shop, item): sum the daily counts and call the result 'item_cnt_month'
month_shop_item = ['date_block_num', 'shop_id', 'item_id']
sales_monthly = (
    sales_temp
    .groupby(by=month_shop_item, as_index=False)
    .agg({'item_cnt_day': 'sum'})
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

**4.1 Add Lag Feature**

In [None]:
# Add lag column 'lag_item_cnt_month': the 'item_cnt_month' of the SAME shop/item
# pair in the previous month. A plain .shift(1) would instead copy the value of
# whatever row happens to sit above (a different item or shop), so the lag is
# built with a key-based merge.
lag_keys = ['date_block_num', 'shop_id', 'item_id']

# Re-label each month's totals as belonging to the following month so that they
# line up with the rows they are the "previous month" for.
prev_month = (
    sales_monthly[lag_keys + ['item_cnt_month']]
    .rename(columns={'item_cnt_month': 'lag_item_cnt_month'})
    .assign(date_block_num=lambda frame: frame['date_block_num'] + 1)
)

sales_monthly = (
    sales_monthly
    # Dropping a previous lag column keeps the cell safe to re-run
    .drop(columns='lag_item_cnt_month', errors='ignore')
    .merge(prev_month, on=lag_keys, how='left')
)

# Pairs without a row in the previous month get a lag of 0
sales_monthly = sales_monthly.fillna(0)
sales_monthly.isna().sum()

# 5. Data Preparation

In [None]:
# Split on 'date_block_num' so training and validation are contiguous runs of months
split_ratio = 0.8
train_valid_split = np.floor(sales_train_max_month*split_ratio)

month = sales_monthly['date_block_num']
test_month = sales_train_max_month + 1

# Training: months up to and including the cut-off
train_data = sales_monthly[month <= train_valid_split]

# Validation: months after the cut-off that still belong to the training history
valid_data = sales_monthly[(month > train_valid_split) & (month < test_month)]

# Test: the rows whose 'date_block_num' is sales_train_max_month+1
test_data = sales_monthly[month == test_month]

In [None]:
# Separate features (X) from the target (y) for the train, validation and test sets
target_col = 'item_cnt_month'

X_train, y_train = train_data.drop(columns=target_col), train_data[target_col]
X_valid, y_valid = valid_data.drop(columns=target_col), valid_data[target_col]
X_test, y_test = test_data.drop(columns=target_col), test_data[target_col]

# 6. Model Training and Prediction

**6.1 Model Prediction**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear regression fitted on the training set
model = LinearRegression().fit(X_train, y_train)

# Predictions for the training, validation and test features
train_pred, valid_pred, test_pred = (
    model.predict(features) for features in (X_train, X_valid, X_test)
)

# Error metrics: root of the mean squared error on the train and validation sets
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
print(f'Root Mean Square Train Data = {train_rmse}')
print(f'Root Mean Square Validation Data = {valid_rmse}')

**6.2 Model Evaluation using K-Fold cross-validation**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_score

# Candidate regressors (fixed random_state for reproducible tree ensembles)
reg1 = GradientBoostingRegressor(random_state=1)
reg2 = RandomForestRegressor(random_state=1)
reg3 = LinearRegression()

# Compare the regressors with 5-fold cross-validation.
# The scorer is 'neg_mean_squared_error', so the sign is flipped and the value is
# reported as a mean squared error (lower is better) - it is not an accuracy.
# NOTE(review): the folds are drawn from the validation split (X_valid, y_valid)
# only, not from the training months - confirm this is intended.
for reg, label in zip([reg1, reg2, reg3], ['GB Regression', 'Random Forest', 'Linear Regressor']):
    scores = cross_val_score(reg, X_valid, y_valid, scoring='neg_mean_squared_error', cv=5)
    print("Mean CV MSE: %0.2f [%s]" % (-scores.mean(), label))

**Submit Predictions**

In [None]:
# X_test comes out of a groupby on (date_block_num, shop_id, item_id), so its rows
# are ordered by shop_id/item_id rather than by the row order of `test`.
# Assigning the prediction array to test['ID'] by position could therefore pair
# predictions with the wrong IDs; match them on the shop/item keys instead.
key_cols = ['shop_id', 'item_id']
test_predictions = X_test[key_cols].assign(item_cnt_month=model.predict(X_test))

submission = test[['ID'] + key_cols].merge(
    test_predictions, on=key_cols, how='left', validate='many_to_one'
)

# Every test row must have received a prediction
assert submission['item_cnt_month'].notna().all(), "some test rows have no prediction"

submission = submission[['ID', 'item_cnt_month']]
submission.to_csv('submission.csv',index=False)