### Problem Statement

You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is part of the challenge.


## Importing Libraries

In [None]:
# Import all the important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import optimize, stats    
%matplotlib inline

### Importing Dataset

In [None]:
# All competition CSVs live in the same Kaggle input directory; keeping the
# directory in one constant means the location only has to change in one place.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

df_train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
df_shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
df_items = pd.read_csv(f'{DATA_DIR}/items.csv')
df_item_categories = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first four rows of the training data.
df_train.head(n=4)

In [None]:
# Preview the first four rows of the shops table.
df_shops.head(n=4)


In [None]:
# Preview the first four rows of the items table.
df_items.head(n=4)

In [None]:
# Preview the first four rows of the item-categories table.
df_item_categories.head(n=4)

In [None]:
# Preview the first four rows of the test set.
df_test.head(n=4)

In [None]:
# Print a summary of the training data: the dtype and non-null count of every column.
df_train.info()

In [None]:
# 'date' is read as plain strings (dtype object); convert it to datetime64.
# The competition's sales_train.csv stores dates day-first as dd.mm.yyyy, so the
# format is given explicitly: without it pandas may read a value such as
# "02.01.2013" month-first (1 February), which would later assign sales to the
# wrong month when the dates are collapsed to 'YYYY-MM'.
df_train['date'] = pd.to_datetime(df_train['date'], format='%d.%m.%Y')

In [None]:
# Count the missing values in every column of the training data.
df_train.isna().sum()

In [None]:
# Collapse every date to its 'YYYY-MM' month label (a string), so that all
# sales from the same month share one value in the 'date' column.
month_labels = df_train['date'].dt.strftime('%Y-%m')
df_train['date'] = month_labels

In [None]:
# Show the first five rows, ordered by their month label.
df_train.head().sort_values('date')

In [None]:
# Remove the two columns that are not referenced again in this notebook.
df_train.drop(columns=['date_block_num', 'item_price'], inplace=True)

In [None]:
# Preview the first ten rows after the column drop.
df_train.head(n=10)

In [None]:
# Sort the whole table by month label and preview the result. Sorting only the
# five rows returned by head() would reorder those five rows but would not show
# the earliest months in the data. The sorted frame is only displayed;
# df_train itself is left unchanged.
df_train.sort_values(by='date').head()

In [None]:
# Sum the remaining columns for every (month, shop, item) combination; the
# three grouping keys become the index of the result.
group_keys = ['date', 'shop_id', 'item_id']
df = df_train.groupby(group_keys).agg('sum')
df.head()

In [None]:
# Reshape to one row per (shop, item) pair with one column per month label;
# months with no entry for a pair are filled with 0. reset_index() turns
# shop_id / item_id back into ordinary columns.
df = df.pivot_table(
    values='item_cnt_day',
    index=['shop_id', 'item_id'],
    columns='date',
    fill_value=0,
).reset_index()
df.head()

In [None]:
# Attach the monthly history to every (shop, item) pair listed in the test set.
df_test = df_test.merge(df, how='left', on=['shop_id', 'item_id'])
# Drop the row identifier and the '2013-01' column. Presumably the latter keeps
# the test matrix as wide as the training features, which exclude '2015-10'.
df_test = df_test.drop(columns=['ID', '2013-01'])
# Pairs without any history come out of the left join as NaN; use 0 instead.
df_test = df_test.fillna(0)

In [None]:
# Preview the first five rows of the assembled test features.
df_test.head(n=5)

### Machine Learning Models

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.utils import resample
from sklearn.pipeline import Pipeline

### Train Test splitting

In [None]:
# The '2015-10' column is the regression target; every other column of the
# pivoted table (including shop_id and item_id) is used as a feature.
target_month = '2015-10'
Y_train = df[target_month].to_numpy()
X_train = df.drop(columns=[target_month])
# The merged test frame is used unchanged for the final prediction.
X_test = df_test

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=101,
)

### Linear Regression

In [None]:
# Fit a linear-regression baseline on the training split.
LR = LinearRegression()
LR.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, LR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, LR.predict(x_test)))
# score() returns R^2. Evaluate it on the held-out split so the number matches
# its 'Test set score' label (it was previously computed on the training split).
print('Test set score:', LR.score(x_test, y_test))

### Random Forest Regressor

In [None]:
# Fit a random forest with 100 trees on the training split.
RFR = RandomForestRegressor(n_estimators=100)
RFR.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, RFR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, RFR.predict(x_test)))
# score() returns R^2. Evaluate it on the held-out split so the number matches
# its 'Test set score' label (it was previously computed on the training split).
print('Test set score:', RFR.score(x_test, y_test))

In [None]:
# X_test does not carry the same column labels as the training features:
# '2013-01' was dropped from it, while '2015-10' was dropped from X_train.
# Recent scikit-learn versions reject a DataFrame whose feature names differ
# from those seen during fit, so pass the raw array and let the features be
# matched by position.
prediction = RFR.predict(X_test.to_numpy())

In [None]:
# Round every predicted count to the nearest whole number.
prediction = [round(value) for value in prediction]

In [None]:
# Load the sample submission so its layout can be reused for the final file.
submission_path = '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
df_submission = pd.read_csv(submission_path)
print(df_submission.shape)
df_submission.head()

In [None]:
# Store the rounded predictions in 'item_cnt_month' and write the submission
# file without the index column.
df_submission = df_submission.assign(item_cnt_month=prediction)
df_submission.to_csv('prediction.csv', index=False)
df_submission.head()