
This challenge serves as the final project for the "How to win a data science competition" Coursera course.

In this competition you will work with a challenging time-series dataset consisting of daily sales data, kindly provided by one of the largest Russian software firms - 1C Company.

We are asking you to predict total sales for every product and store in the next month. By solving this competition you will be able to apply and enhance your data science skills.

Load libraries

In [None]:
#load modules
import pandas as pd
import numpy as np
from pandas import read_csv
# `pandas.datetime` was only a re-export of the standard-library class and is
# no longer importable from recent pandas releases; import it from its real
# home so the name `datetime` stays available to the rest of the notebook.
from datetime import datetime

import os
# List every file under the Kaggle input directory so the dataset paths used
# by the following cells can be checked by eye.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



Load datasets

In [None]:
#Load datasets
# Every competition file lives in the same Kaggle input directory; naming it
# once means the notebook can be pointed at another location with one edit.
DATA_DIR = "/kaggle/input/competitive-data-science-predict-future-sales"

train = pd.read_csv(f"{DATA_DIR}/sales_train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
items = pd.read_csv(f"{DATA_DIR}/items.csv")
item_cat = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
shops = pd.read_csv(f"{DATA_DIR}/shops.csv")

In [None]:
# Display the frame read from sample_submission.csv.
submission

Convert date to datetime format

In [None]:
# Parse the day.month.year strings in the `date` column into datetime values.
date_format = '%d.%m.%Y'
train['date'] = pd.to_datetime(train['date'], format=date_format)
train

Create dataset

In [None]:
#create pivot table
# One row per (shop_id, item_id) pair and one column per date_block_num,
# holding the summed item_cnt_day for that block (0 where there were no rows).
#
# `values` is passed as a scalar rather than a one-element list: a list
# produces two-level (MultiIndex) columns, and merging such a frame with the
# single-level `test` frame is rejected by recent pandas versions. With a
# scalar the block numbers become plain, single-level column labels.
dataset = train.pivot_table(
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    values='item_cnt_day',
    aggfunc='sum',
    fill_value=0,
)
# Turn the (shop_id, item_id) index back into ordinary columns so they can be
# used as merge keys.
dataset.reset_index(inplace=True)
dataset

Preprocess data

In [None]:
# Remove the ID column from the test frame; the remaining columns are kept.
test = test.drop(columns=['ID'])
test

In [None]:
# Left-join the pivot table onto the test rows: every test row is kept, and
# rows whose (item_id, shop_id) pair is absent from the pivot get NaN values.
merge_keys = ['item_id', 'shop_id']
dataset = test.merge(dataset, how='left', on=merge_keys)
dataset

In [None]:
# Total number of missing cells across the whole frame.
dataset.isna().to_numpy().sum()

In [None]:
# Replace every missing cell with 0, then recount the missing cells to
# confirm that none are left.
dataset = dataset.fillna(0)
dataset.isna().to_numpy().sum()

In [None]:
# Discard the two key columns so that only the remaining columns are left.
key_columns = ['shop_id', 'item_id']
dataset = dataset.drop(columns=key_columns)
dataset

Create X, y and X_test

In [None]:
# Positional split of the columns of `dataset`:
#   X_train - every column except the last
#   y_train - the last column only (kept as a one-column frame), the label
#   X_test  - every column except the first
# Both feature matrices therefore have the same number of columns.
X_train = dataset.iloc[:, :-1]
X_test = dataset.iloc[:, 1:]
y_train = dataset.iloc[:, -1:]
# Print the three shapes as a quick sanity check.
print(*(part.shape for part in (X_train, y_train, X_test)))

Define model

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# AdaBoost ensemble whose weak learner is a depth-limited random forest.
# The weak learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# first positional slot is the weak learner under either name.
model = AdaBoostRegressor(
    RandomForestRegressor(max_depth=10),
    random_state=0,
    n_estimators=3000,
)
# y_train is a one-column frame; flatten it to 1-D, the target shape the
# regressor expects, instead of relying on an implicit conversion.
model.fit(X_train, y_train.to_numpy().ravel())
# Score on the same data the model was fitted on (in-sample, not a
# validation estimate).
print(model.score(X_train, y_train))

In [None]:

# Predictions for the same rows the model was fitted on, shown for inspection.
y_pred = model.predict(X_train)
y_pred

Make predictions

In [None]:
# Predict from the X_test features, then overwrite any negative prediction
# with 0 in place.
pred = model.predict(X_test)
is_negative = pred < 0
pred[is_negative] = 0
pred

Submit file

In [None]:
# Store the predictions, limited to the range [0, 20], in the
# item_cnt_month column of the submission frame.
capped_pred = pred.clip(min=0, max=20)
submission['item_cnt_month'] = capped_pred
# Write the frame to submission.csv without the index column, then display it.
submission.to_csv('submission.csv', index=False)
submission