In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Dataset**

In [None]:
# Load each competition CSV into its own DataFrame.
item_categories = pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")
items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
sales_train = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
shops = pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")
test = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")

In [None]:
import datetime as dt

# Parse the raw date strings with an explicit day-first format.
# The competition's sales_train.csv writes dates as DD.MM.YYYY (e.g. "02.01.2013");
# without `format`, pandas may read ambiguous values month-first (or fail on
# day numbers > 12), which would put sales into the wrong month.
# NOTE(review): confirm the format against the raw CSV if the data source changes.
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

# Reduce every date to its "YYYY-MM" label so rows can later be grouped by month.
# The vectorised .dt accessor replaces a per-row Python lambda.
sales_train['date'] = sales_train['date'].dt.strftime('%Y-%m')

sales_train.head()

In [None]:
# Remove the two columns that are not used further on; the frame is modified in place.
sales_train.drop(columns=['date_block_num', 'item_price'], inplace=True)

# Display the rows ordered by month label (the result is shown, not stored).
sales_train.sort_values('date')

In [None]:
# Sum the remaining numeric columns for every (month, shop, item) combination.
group_keys = ['date', 'shop_id', 'item_id']
train_data = sales_train.groupby(group_keys).sum()

train_data

**Train data**

In [None]:
# Build one row per (shop, item) with one column per month label.
# aggfunc='sum' is required: pivot_table defaults to the *mean*, which would
# yield the average daily count instead of the monthly total being modelled.
# Shop/item pairs with no sales in a month are filled with 0.
train_data = (
    sales_train
    .pivot_table(
        index=['shop_id', 'item_id'],
        columns='date',
        values='item_cnt_day',
        aggfunc='sum',
        fill_value=0,
    )
    # Turn shop_id / item_id back into ordinary columns so they can be merge keys.
    .reset_index()
)

train_data.head()

**Test data:**

In [None]:
# Attach the monthly history to every test (shop, item) pair. A left join keeps
# all test rows; pairs without history get NaN, which is then replaced by 0.
# 'ID' and the oldest month '2013-01' are removed from the result.
test_data = (
    pd.merge(test, train_data, on=['shop_id', 'item_id'], how='left')
    .drop(columns=['ID', '2013-01'])
    .fillna(0)
)
test_data.head()

### Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Target: the sales column of the last month label; features: every other column.
Y_train = train_data['2015-10'].values
X_train = train_data.drop(columns=['2015-10'])

# test_data holds the same kind of features shifted forward by one month
# ('2013-01' was dropped, '2015-10' is kept), so its column *labels* differ from
# X_train even though the positions correspond. scikit-learn checks feature
# names at predict time, so relabel the test columns positionally to the names
# used for fitting.
assert test_data.shape[1] == X_train.shape[1], (
    "test_data and X_train must have the same number of feature columns"
)
X_test = test_data.copy()
X_test.columns = X_train.columns

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=101,
)

#### **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
LR = LinearRegression()
LR.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, LR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, LR.predict(x_test)))
# Score on the held-out split so the value matches its "Test set" label
# (previously the training split was scored here).
print('Test set score:', LR.score(x_test, y_test))

#### **Ridge Regression**

In [None]:
from sklearn import linear_model

# Ridge regression with regularisation strength alpha=0.5.
rid = linear_model.Ridge(alpha=.5)
rid.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, rid.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, rid.predict(x_test)))
# Score on the held-out split so the value matches its "Test set" label
# (previously the training split was scored here).
print('Test set score:', rid.score(x_test, y_test))

#### **Lasso Regression**

In [None]:
# Lasso regression with regularisation strength alpha=0.1.
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, lasso.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, lasso.predict(x_test)))
# Score on the held-out split so the value matches its "Test set" label
# (previously the training split was scored here).
print('Test set score:', lasso.score(x_test, y_test))

#### **Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees. The seed (same value as the train/test split)
# makes the fitted forest, and therefore the printed metrics, repeatable.
RFR = RandomForestRegressor(n_estimators=100, random_state=101)
RFR.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, RFR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, RFR.predict(x_test)))
# Score on the held-out split so the value matches its "Test set" label
# (previously the training split was scored here).
print('Test set score:', RFR.score(x_test, y_test))

#### **Voting Regressor**

In [None]:
from sklearn.ensemble import VotingRegressor

# Ensemble that combines the four regressors defined in the earlier cells.
model = VotingRegressor([("Linear Regression", LR),
                         ("Ridge Regression", rid),
                         ("Lasso Regression", lasso),
                         ("Random Forest Reressor", RFR)
                         ])

model.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, model.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, model.predict(x_test)))
# Score on the held-out split so the value matches its "Test set" label
# (previously the training split was scored here).
print('Test set score:', model.score(x_test, y_test))

#### **Gradient Boosting Regressor**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters; seeded so reruns give the
# same model and metrics.
gbr = GradientBoostingRegressor(random_state=101)
gbr.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, gbr.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, gbr.predict(x_test)))
# Score on the held-out split so the value matches its "Test set" label
# (previously the training split was scored here).
print('Test set score:', gbr.score(x_test, y_test))

#### **Hyperparameter Tuning of the Gradient Boosting Regressor**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Candidate values from which the randomized search draws its combinations.
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": range(5, 21, 2),
    "n_estimators": range(20, 101, 10),
    'min_samples_split': range(200, 1001, 200),
    'min_samples_leaf': range(30, 71, 10)
}

# Fresh, unfitted estimator for the search (rebinds the name `gbr`).
gbr = GradientBoostingRegressor(random_state=101)

# Try 5 sampled combinations with 5-fold cross-validation on all CPU cores.
# random_state fixes which combinations are sampled, so a rerun of the
# notebook selects the same candidates.
random_search = RandomizedSearchCV(
    gbr,
    param_distributions=params,
    n_iter=5,
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=101,
)
random_search.fit(x_train, y_train)

print('Train set mse:', mean_squared_error(y_train, random_search.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, random_search.predict(x_test)))
# Score on the held-out split so the value matches its "Test set" label
# (previously the training split was scored here).
print('Test set score:', random_search.score(x_test, y_test))


In [None]:
# Predict with the tuned search object on the competition test features (X_test).
prediction = random_search.predict(X_test)

### **Submission file**

In [None]:
# Round every predicted value with Python's built-in round().
prediction = [round(value) for value in prediction]

# Fill the sample submission's target column with the rounded predictions and
# save it as 'prediction.csv' in the working directory, without the index column.
submission = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv')
submission['item_cnt_month'] = prediction
submission.to_csv('prediction.csv', index=False)
submission.head()