**Predicting Time Series Data Using Linear Regression and RandomForestRegressor**

Approach:

 - Clean the data
 - Remove outliers
 - Build a model
 - Apply the model to test data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from dateutil.parser import parse 
# Shared matplotlib defaults: label sizes and text colour for every figure.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color': 'k',
})

In [None]:
# Single place to change when the competition data lives elsewhere.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

# Load the four raw tables that the merge cell joins together.
item_categories = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
raw_sales = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')

In [None]:
# Load the sample submission file and show it.
sample = pd.read_csv(
    filepath_or_buffer='../input/competitive-data-science-predict-future-sales/sample_submission.csv'
)
sample

In [None]:
# Left-join item, category and shop attributes onto every sales row.
sales1 = raw_sales.merge(items, how='left', on='item_id')
sales2 = sales1.merge(item_categories, how='left', on='item_category_id')
sales3 = sales2.merge(shops, how='left', on='shop_id')
sales3

In [None]:
# Count missing values per column after the joins.
sales3.isnull().sum()

**Check for Correlations**

In [None]:
# Keep only the date and the daily item count.
y = sales3.loc[:, ['date', 'item_cnt_day']]
y.head(5)

In [None]:
# Total item count per date, with the date kept as a regular column.
y = y.groupby('date', as_index=False)['item_cnt_day'].sum()
y.head(5)

In [None]:
# Number of distinct dates.
len(y)

In [None]:
# Vectorised replacement for the per-row loop. The loop called `.str` on a
# plain Python string (AttributeError), passed Series positionally to
# pd.concat, sliced the month as [3:4] (one character short) and used chained
# assignment.
# Assumes the raw dates look like 'dd.mm.yyyy' (implied by the slices used).
# Kept non-mutating so the cell is safe to re-run and the raw strings stay
# available.
date_text = y['date']
date_iso = date_text.str[-4:] + '-' + date_text.str[3:5] + '-' + date_text.str[0:2]
date_iso.head()

In [None]:
# Show the year / month / day fragments side by side. The default axis=0
# stacked them end to end into one 3n-long Series, which hides which
# fragments belong to the same row.
pd.concat(
    [y['date'].str[-4:], y['date'].str[3:5], y['date'].str[0:2]],
    axis=1,
    keys=['year', 'month', 'day'],
)

In [None]:
# Characters 3-4 of each date string (the month, if dates are 'dd.mm.yyyy').
y['date'].str.slice(3, 5)

In [None]:
# First two characters of each date string (the day, if dates are 'dd.mm.yyyy').
y['date'].str.slice(0, 2)

In [None]:
# Parse the date strings into real timestamps. pd.concat cannot join string
# fragments (and its second positional argument is `axis`, not another
# Series), so the original call raised.
# Assumes 'dd.mm.yyyy' input, as implied by the slices explored above.
# Re-running is harmless: an already-datetime column is returned unchanged.
y['date'] = pd.to_datetime(y['date'], format='%d.%m.%Y')

In [None]:
# set_index returns a new frame; the original discarded it, so y kept its
# RangeIndex and the later resample could not work.
# The guard keeps the cell re-runnable once 'date' has moved into the index.
if 'date' in y.columns:
    y = (
        # No-op when the column is already datetime; assumes 'dd.mm.yyyy' otherwise.
        y.assign(date=pd.to_datetime(y['date'], format='%d.%m.%Y'))
         .set_index('date')
         .sort_index()  # chronological order instead of string order
    )
y.index

In [None]:
# Inspect y (item_cnt_day summed per date).
y

In [None]:
# Month-start ('MS') bins; mean of item_cnt_day within each bin.
x = y.resample('MS')['item_cnt_day'].mean()

In [None]:
# Wide line chart of y.
figure_size = (15, 6)
y.plot(kind='line', figsize=figure_size)
plt.show()

In [None]:
# Correlate numeric columns only: sales3 also holds string columns (date,
# names), and DataFrame.corr() raises on those in pandas >= 2.0.
sns.heatmap(sales3.select_dtypes(include='number').corr())

**Sum the daily item counts by month, item, and shop**

In [None]:
def _total_sold_by(frame, key):
    """Return one row per `key` value with the summed `item_cnt_day`.

    Selecting the column before summing avoids aggregating every other
    column (string columns are concatenated by `sum` in pandas >= 2.0) and
    removes the need to drop the unwanted totals afterwards.
    """
    return frame.groupby(key, as_index=False)['item_cnt_day'].sum()


cnt_by_month = _total_sold_by(sales3, 'date_block_num')
cnt_by_item = _total_sold_by(sales3, 'item_id')
cnt_by_shop = _total_sold_by(sales3, 'shop_id')

**Plot the data to see the outliers**

In [None]:
# Scatter of total sold per month.
fig1 = sns.relplot(data=cnt_by_month, x='date_block_num', y='item_cnt_day')
fig1a = fig1.fig
fig1a.suptitle(t="Total solds Time Series", fontsize=12)

# Scatter of total sold per item.
fig2 = sns.relplot(data=cnt_by_item, x='item_id', y='item_cnt_day')
fig2a = fig2.fig
fig2a.suptitle(t="Items sold", fontsize=12)

# Scatter of total sold per shop.
fig3 = sns.relplot(data=cnt_by_shop, x='shop_id', y='item_cnt_day')
fig3a = fig3.fig
fig3a.suptitle(t="Solds per Shop", fontsize=12)

In [None]:
print('Best Selling Months:')
# Months whose total exceeds this count are treated as outliers.
MONTH_OUTLIER_MIN = 160000
# Filter with a boolean mask. Re-merging on the float total (the previous
# approach) duplicates rows whenever two months share the same total.
date_outlier2 = (
    cnt_by_month.loc[cnt_by_month['item_cnt_day'] > MONTH_OUTLIER_MIN]
    .reset_index(drop=True)
)
date_outlier2

In [None]:
# Dates that belong to date_block_num 11.
sales3.loc[sales3['date_block_num'].eq(11), 'date']

In [None]:
# Dates that belong to date_block_num 23.
sales3.loc[sales3['date_block_num'].eq(23), 'date']

In [None]:
# Peak sales occurred just before the winter holidays; these months are seasonal outliers.

In [None]:
print('Best Selling Items:')
# Items whose total exceeds this count are treated as outliers.
ITEM_OUTLIER_MIN = 150000
# Filter with a boolean mask. Re-merging on the float total (the previous
# approach) duplicates rows whenever two items share the same total.
item_outlier2 = (
    cnt_by_item.loc[cnt_by_item['item_cnt_day'] > ITEM_OUTLIER_MIN]
    .reset_index(drop=True)
)
item_outlier2

In [None]:
print('Best Selling Shops:')
# Shops whose total exceeds this count are treated as outliers.
SHOP_OUTLIER_MIN = 180000
# Filter with a boolean mask. Re-merging on the float total (the previous
# approach) duplicates rows whenever two shops share the same total.
shop_outlier2 = (
    cnt_by_shop.loc[cnt_by_shop['item_cnt_day'] > SHOP_OUTLIER_MIN]
    .reset_index(drop=True)
)
shop_outlier2

In [None]:
# Drop the hard-coded outlier months, item and shops in one pass.
# The previous shop filter had a misplaced parenthesis:
# `sales4.shop_id != 31 & (sales4.shop_id != 54)` evaluates `31 & (...)` first,
# so shops 31 and 54 were kept and shop 1 was removed instead.
# NOTE(review): the IDs presumably come from the outlier tables above — confirm.
sales4 = sales3[
    ~sales3['date_block_num'].isin([11, 23])
    & (sales3['item_id'] != 20949)
    & ~sales3['shop_id'].isin([25, 28, 31, 54])
].copy()

In [None]:
# Inspect sales4 (sales3 after the month, item and shop exclusions).
sales4

In [None]:
# Remove the date and the descriptive name columns.
text_columns = ['date', 'item_name', 'item_category_name', 'shop_name']
sales5 = sales4.drop(columns=text_columns)
sales5

In [None]:
# drop() already returns a new frame, so sales5 is left untouched.
sales6 = sales5.drop(columns=['item_price', 'item_category_id'])

In [None]:
# Sum the remaining columns per (date_block_num, shop_id, item_id), keeping
# the keys as columns.
group_keys = ['date_block_num', 'shop_id', 'item_id']
sales6 = sales6.groupby(group_keys, as_index=False).sum()

In [None]:
# Inspect sales6 (sums per date_block_num, shop_id, item_id).
sales6

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise item_cnt_day; the fitted scaler is kept for the inverse
# transform applied to the predictions later.
scale = StandardScaler()
sales6['item_cnt_day'] = scale.fit_transform(sales6[['item_cnt_day']]).ravel()

In [None]:
# item_cnt_day has already been standardised, so a literal 0 would mean
# "average sales". Fill missing (shop, item, month) cells with the scaled
# value of zero sales instead.
zero_sales_scaled = float(
    scale.transform(pd.DataFrame({'item_cnt_day': [0.0]}))[0, 0]
)
# One row per (shop_id, item_id), one column per date_block_num.
sales7 = sales6.pivot_table(
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    values='item_cnt_day',
    fill_value=zero_sales_scaled,
).reset_index()

In [None]:
# Inspect the pivoted sales7 frame (one column per date_block_num).
sales7

In [None]:
# Sum sales6 per (shop_id, item_id), keeping the keys as columns.
sales8 = sales6.groupby(['shop_id', 'item_id'], as_index=False).sum()

In [None]:
# Inspect sales8 (sales6 summed per shop_id, item_id).
sales8

In [None]:
# Row-wise mean over the month columns only. Selecting by name (rather than
# iloc[:, 2:]) keeps the cell idempotent: on a re-run the existing 'mean'
# column is not folded into its own average.
month_columns = [
    col for col in sales7.columns if col not in ('shop_id', 'item_id', 'mean')
]
sales7['mean'] = sales7[month_columns].mean(axis=1)

In [None]:
# Show the per-row mean column.
sales7.loc[:, 'mean']

In [None]:
# Features: everything except the target column.
x = sales7.drop(columns='mean')
# The pivot leaves a mix of str ('shop_id', 'item_id') and int (month)
# column labels; scikit-learn >= 1.2 rejects mixed-type feature names, so
# normalise them to str.
x.columns = x.columns.astype(str)
# Target: the per-row mean.
y = sales7['mean']

In [None]:
# Inspect the feature frame x.
x

In [None]:
# Inspect the target series y.
y

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

**Build the Linear Regression Model**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Ordinary least squares fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Score of the linear model on the training split.
model.score(X=X_train, y=y_train)

In [None]:
# Linear-model predictions for the held-out split.
y_pred = model.predict(X=X_test)

In [None]:
# Training-split MSE of the linear model.
mean_squared_error(y_true=y_train, y_pred=model.predict(X_train))

In [None]:
# Held-out MSE of the most recent predictions.
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fix the seed so the forest (and every number derived from it) is
# reproducible across kernel restarts; 42 matches the train/test split seed.
RFR = RandomForestRegressor(n_estimators=10, random_state=42)
RFR.fit(X_train, y_train)

In [None]:
# Score of the random forest on the training split.
RFR.score(X=X_train, y=y_train)

In [None]:
# Random-forest predictions for the held-out split.
y_pred = RFR.predict(X=X_test)

In [None]:
# Held-out MSE of the most recent predictions.
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Load the competition test set.
test = pd.read_csv(
    filepath_or_buffer='../input/competitive-data-science-predict-future-sales/test.csv'
)

In [None]:
# Inspect the test frame.
test

In [None]:
# Attach the feature columns to each test row; unmatched pairs become NaN.
join_keys = ['shop_id', 'item_id']
test1 = pd.merge(test, x, on=join_keys, how='left')

In [None]:
# Inspect test1 (test left-merged with x on shop_id, item_id).
test1

In [None]:
# The features are in standardised units, where 0 means "average sales".
# Pairs missing from the training data should read as zero sales, so fill
# the NaNs with the scaled value of 0.
zero_sales_scaled = float(
    scale.transform(pd.DataFrame({'item_cnt_day': [0.0]}))[0, 0]
)
# ID is only needed for the submission file, not as a model feature.
test2 = test1.drop(columns='ID').fillna(zero_sales_scaled)
test2

In [None]:
# Random-forest predictions for the competition test rows.
y_pred1 = RFR.predict(X=test2)

In [None]:
# StandardScaler works on 2-D (n_samples, n_features) input; predict()
# returns a 1-D array, so reshape to a single column before undoing the
# scaling, then flatten back to 1-D.
y_new_inverse = scale.inverse_transform(
    np.asarray(y_pred1).reshape(-1, 1)
).ravel()

In [None]:
# Wrap the rescaled predictions in a one-column frame.
y_pred2 = pd.DataFrame(data=y_new_inverse, columns=['item_cnt_month'])
y_pred2

In [None]:
# Place the test IDs and the predictions side by side.
submission = pd.concat(objs=[test['ID'], y_pred2], axis='columns')

**Export the outcome**

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf='./submission.csv', index=False)