In [1]:
# install Kaggle Lib

! pip install kaggle --upgrade



In [2]:
from google.colab import files

uploaded = files.upload()
uploaded

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"bhargava39","key":"66bc8f96f9f32e3dc1b9962d882cfc99"}'}

In [3]:
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

! kaggle -v

Kaggle API 1.5.12


# Get data from kaggle

In [4]:
! kaggle competitions download -c competitive-data-science-predict-future-sales

Downloading competitive-data-science-predict-future-sales.zip to /content
 33% 5.00M/15.1M [00:00<00:00, 50.4MB/s]
100% 15.1M/15.1M [00:00<00:00, 96.2MB/s]


In [5]:
! unzip competitive-data-science-predict-future-sales.zip

Archive:  competitive-data-science-predict-future-sales.zip
  inflating: item_categories.csv     
  inflating: items.csv               
  inflating: sales_train.csv         
  inflating: sample_submission.csv   
  inflating: shops.csv               
  inflating: test.csv                


In [6]:
! ls

competitive-data-science-predict-future-sales.zip  sample_data
item_categories.csv				   sample_submission.csv
items.csv					   shops.csv
kaggle.json					   test.csv
sales_train.csv


# Import Dependencies


In [8]:
# basic packages
import numpy as np
import pandas as pd
import random as rd
import datetime

# visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot

# Models
from xgboost import XGBRegressor, plot_importance

# Scikit Learn
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# settings
import warnings
warnings.filterwarnings("ignore")

sns.set(style='darkgrid')
pd.set_option('display.float_format', lambda x: '%.2f' %x)
%matplotlib inline

# Load the data

In [35]:
train = pd.read_csv("sales_train.csv")
test = pd.read_csv("test.csv")
items = pd.read_csv("items.csv")
item_cat = pd.read_csv('item_categories.csv')
shops = pd.read_csv("shops.csv")

In [36]:
train['date'] = pd.to_datetime(train['date'], format = '%d.%m.%Y')
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [37]:
dataset = train.pivot_table(index = ['shop_id', 'item_id'], 
                            values = ['item_cnt_day'], columns = ['date_block_num'], fill_value=0,aggfunc = 'sum')

dataset.reset_index(inplace = True)
dataset.head()

Unnamed: 0_level_0,shop_id,item_id,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
test.drop(['ID'], axis = 1, inplace=True)
test.head()

Unnamed: 0,shop_id,item_id
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [39]:
dataset = pd.merge(test, dataset, on=['item_id', 'shop_id'], how='left')
dataset.head()

Unnamed: 0,shop_id,item_id,"(item_cnt_day, 0)","(item_cnt_day, 1)","(item_cnt_day, 2)","(item_cnt_day, 3)","(item_cnt_day, 4)","(item_cnt_day, 5)","(item_cnt_day, 6)","(item_cnt_day, 7)",...,"(item_cnt_day, 24)","(item_cnt_day, 25)","(item_cnt_day, 26)","(item_cnt_day, 27)","(item_cnt_day, 28)","(item_cnt_day, 29)","(item_cnt_day, 30)","(item_cnt_day, 31)","(item_cnt_day, 32)","(item_cnt_day, 33)"
0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,5,5320,,,,,,,,,...,,,,,,,,,,
2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5,5268,,,,,,,,,...,,,,,,,,,,


In [16]:
dataset.isna().sum()

shop_id                    0
item_id                    0
(item_cnt_day, 0)     102796
(item_cnt_day, 1)     102796
(item_cnt_day, 2)     102796
(item_cnt_day, 3)     102796
(item_cnt_day, 4)     102796
(item_cnt_day, 5)     102796
(item_cnt_day, 6)     102796
(item_cnt_day, 7)     102796
(item_cnt_day, 8)     102796
(item_cnt_day, 9)     102796
(item_cnt_day, 10)    102796
(item_cnt_day, 11)    102796
(item_cnt_day, 12)    102796
(item_cnt_day, 13)    102796
(item_cnt_day, 14)    102796
(item_cnt_day, 15)    102796
(item_cnt_day, 16)    102796
(item_cnt_day, 17)    102796
(item_cnt_day, 18)    102796
(item_cnt_day, 19)    102796
(item_cnt_day, 20)    102796
(item_cnt_day, 21)    102796
(item_cnt_day, 22)    102796
(item_cnt_day, 23)    102796
(item_cnt_day, 24)    102796
(item_cnt_day, 25)    102796
(item_cnt_day, 26)    102796
(item_cnt_day, 27)    102796
(item_cnt_day, 28)    102796
(item_cnt_day, 29)    102796
(item_cnt_day, 30)    102796
(item_cnt_day, 31)    102796
(item_cnt_day,

In [18]:
dataset.isnull().sum().sum()

3495064

In [19]:
dataset.fillna(0, inplace=True)
dataset.isnull().sum().sum()

0

In [41]:
dataset.drop(['shop_id', 'item_id'], inplace = True, axis=1)
dataset.head()

Unnamed: 0,"(item_cnt_day, 0)","(item_cnt_day, 1)","(item_cnt_day, 2)","(item_cnt_day, 3)","(item_cnt_day, 4)","(item_cnt_day, 5)","(item_cnt_day, 6)","(item_cnt_day, 7)","(item_cnt_day, 8)","(item_cnt_day, 9)",...,"(item_cnt_day, 24)","(item_cnt_day, 25)","(item_cnt_day, 26)","(item_cnt_day, 27)","(item_cnt_day, 28)","(item_cnt_day, 29)","(item_cnt_day, 30)","(item_cnt_day, 31)","(item_cnt_day, 32)","(item_cnt_day, 33)"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0


In [44]:
y_train = dataset.iloc[:,-1:] # the last column is our label
X_train = dataset.iloc[:,:-1] # drop the last column
X_test = dataset.iloc[:,1:] # drop first column of data

X_train.shape, y_train.shape, X_test.shape

((111404, 33), (111404, 1), (111404, 33))

In [45]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

model = AdaBoostRegressor(base_estimator = RandomForestRegressor(max_depth=10), 
                          random_state=0, n_estimators=3000)

model.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=RandomForestRegressor(max_depth=10),
                  n_estimators=3000, random_state=0)

In [46]:
y_pred = model.predict(X_train)
y_pred

array([0.6606194 , 1.3200661 , 0.39569305, ..., 0.23446086, 0.35026661,
       0.28576335])

In [47]:
model.score(X_train, y_train)

0.9476436046219625

In [48]:
pred = model.predict(X_train)
pred

array([0.6606194 , 1.3200661 , 0.39569305, ..., 0.23446086, 0.35026661,
       0.28576335])

In [None]:
sub = pd.read_csv("sample_submission.csv")  # Export Submission pd.read_csv("my_file.csv", nrows=5000)
sub['item_cnt_month'] = pd.DataFrame(pred)  # Format DataFrame

sub.head()

In [33]:
sub.to_csv("submission2.csv", index = False)

In [34]:
! kaggle competitions submit -c competitive-data-science-predict-future-sales -f Submission.csv -m "AdaBoost & Random"

100% 5.36M/5.36M [00:03<00:00, 1.63MB/s]
Successfully submitted to Predict Future Sales