In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training sales table (sales_train.csv) into a DataFrame.
sales_train_path = '../input/competitive-data-science-predict-future-sales/sales_train.csv'
salesdata = pd.read_csv(sales_train_path)

In [None]:
# Preview the first five rows of the raw sales table.
salesdata.head(n=5)

In [None]:
# Print column dtypes, non-null counts and memory usage for the raw sales table.
salesdata.info()

In [None]:
# Load the item table (items.csv); it is joined to the categories on item_category_id below.
items_path = '../input/competitive-data-science-predict-future-sales/items.csv'
itemdf = pd.read_csv(items_path)

In [None]:
# Preview the first five rows of the item table.
itemdf.head(n=5)

In [None]:
# Load the item-category table (item_categories.csv).
item_categories_path = '../input/competitive-data-science-predict-future-sales/item_categories.csv'
itemcat = pd.read_csv(item_categories_path)

In [None]:
# Attach category columns to every item (inner join on item_category_id).
iteminfo = itemdf.merge(itemcat, how='inner', on='item_category_id')

In [None]:
# Preview the first five rows of the item + category join.
iteminfo.head(n=5)

In [None]:
# Enrich each sales row with its item and category columns (inner join on item_id).
salesitemdata = salesdata.merge(iteminfo, how='inner', on='item_id')

In [None]:
# Preview the first five rows of the merged sales/item table.
salesitemdata.head(n=5)

In [None]:
# Discard the columns that are not used from here on (price, raw date, text names).
# NOTE: the result is bound to the same name, so re-running this cell on the
# already-trimmed frame raises KeyError because the columns are gone.
unused_columns = ['item_price', 'date', 'item_category_name', 'item_name']
salesitemdata = salesitemdata.drop(columns=unused_columns)

In [None]:
# Preview the trimmed sales/item table.
salesitemdata.head(n=5)

In [None]:
# Make sure the identifier columns are 64-bit integers.
# A single vectorized astype replaces three row-by-row `.apply(int)` passes,
# which call the Python `int` constructor once per row; the resulting dtype
# (int64) is the same.
id_columns = ['shop_id', 'item_id', 'item_category_id']
salesitemdata = salesitemdata.astype({column: 'int64' for column in id_columns})

In [None]:
# Aggregate to one row per (month, item, shop): sum the remaining columns
# within each group, expose the summed daily count as item_cnt_month, then
# flatten the group keys back into columns. item_category_id is part of the
# grouping key but is dropped from the final frame.
group_keys = ['date_block_num', 'item_id', 'item_category_id', 'shop_id']
groupSalesDf = (
    salesitemdata
    .groupby(group_keys)
    .sum()
    .assign(item_cnt_month=lambda summed: summed['item_cnt_day'])
    .drop(columns='item_cnt_day')
    .reset_index()
    .drop(columns='item_category_id')
)
groupSalesDf

In [None]:
# Print column dtypes, non-null counts and memory usage for the monthly aggregate.
groupSalesDf.info()

In [None]:
# Clamp monthly counts into [0, 20]: values above 20 become 20 and negative
# values become 0. The clipped column is assigned back explicitly, because
# `df[col].clip(..., inplace=True)` is chained assignment: it warns on recent
# pandas and leaves the frame unchanged under Copy-on-Write.
# NOTE(review): the later cells visible in this notebook train on
# groupSalesDf, not clippedSales — confirm whether the clipped target was
# meant to be used for training.
clippedSales = groupSalesDf.copy()
clippedSales['item_cnt_month'] = clippedSales['item_cnt_month'].clip(0, 20)
clippedSales['item_cnt_month'].max()

In [None]:
# Load the test rows (test.csv); the first CSV column becomes the index.
test_path = '../input/competitive-data-science-predict-future-sales/test.csv'
testdata = pd.read_csv(test_path, index_col=0)

In [None]:
# Display the test frame (pandas truncates the rendering for large frames).
testdata

In [None]:
# Tag every test row with month index 34, stored as an integer so the column
# has the same numeric type as the date_block_num feature used for training
# (previously the string '34' was assigned and cast afterwards).
# NOTE(review): 34 is presumably the month following the last training
# month — confirm against the training data.
testdata['date_block_num'] = 34
testdata

In [None]:
# Guarantee an int64 month-index column. A vectorized astype replaces the
# per-row `.apply(int)`; it also parses numeric strings such as '34'.
testdata['date_block_num'] = testdata['date_block_num'].astype('int64')

In [None]:
# Print column dtypes, non-null counts and memory usage for the test frame.
testdata.info()

In [None]:
# Monthly sales curve for a single item (item_id 22167). The frame holds one
# row per (month, item, shop), so there can be several rows per month here.
item_22167_sales = groupSalesDf[groupSalesDf['item_id'] == 22167]
sns.lineplot(data=item_22167_sales, x='date_block_num', y='item_cnt_month')

In [None]:
# Split Train Validation Data by Time
# groupSalesDf is sorted by its group keys with date_block_num first, so a
# positional cut puts the earlier rows in training and the later rows in
# validation.
# NOTE(review): the cut position is hard-coded; whether it falls exactly on a
# month boundary cannot be confirmed from this notebook.
split_row = 1126386
feature_frame = groupSalesDf.drop('item_cnt_month', axis=1)
target_series = groupSalesDf['item_cnt_month']

X_train, X_valid = feature_frame.iloc[:split_row], feature_frame.iloc[split_row:]
y_train, y_valid = target_series.iloc[:split_row], target_series.iloc[split_row:]

In [None]:
# Build Model
from xgboost import XGBRegressor

# Baseline regressor with default hyper-parameters and early stopping.
# `eval_set` and `verbose` are arguments of fit(), not of the constructor:
# passed to the constructor they are treated as unknown booster parameters,
# and early stopping is left without any validation data to monitor.
# Passing `early_stopping_rounds` to the constructor requires xgboost >= 1.6;
# on older releases pass it to fit() instead.
model = XGBRegressor(early_stopping_rounds=5,
                     objective='reg:squarederror')
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          verbose=False)

In [None]:
from sklearn import metrics

# Validation RMSE: square root of the mean squared error between the held-out
# targets and the model's predictions.
pred = model.predict(X_valid)
valid_mse = metrics.mean_squared_error(y_valid, pred)
np.sqrt(valid_mse)

In [None]:
#Tune XGBR
model = XGBRegressor(objective='reg:squarederror',
                    n_estimators=100,
                    learning_rate=0.01,
                    colsample_bytree=1,
                    gamma=1,
                    subsample=0.8,
                    max_depth=3,
                    early_stopping_rounds=5,
                    eval_set=[(X_valid, y_valid)],
                    n_jobs=-1,
                    random_state=101)

In [None]:
model.fit(X_train,y_train)

In [None]:
# RMSE of the current model on the validation rows.
from sklearn import metrics

pred = model.predict(X_valid)
np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred=pred))

In [None]:
# Merge Training and Validation Sets
# Stack the two partitions row-wise and restore index order, so features and
# targets stay aligned row for row.
full_X = pd.concat([X_train, X_valid], axis=0).sort_index()
full_y = pd.concat([y_train, y_valid], axis=0).sort_index()

In [None]:
# Train on full data
# Same hyper-parameters as the tuned model, but without early stopping: every
# row is used for training here, so there is no validation set to monitor, and
# current xgboost raises when early_stopping_rounds is set and fit() receives
# no eval_set. All n_estimators boosting rounds are trained.
model = XGBRegressor(objective='reg:squarederror',
                    n_estimators=100,
                    learning_rate=0.01,
                    colsample_bytree=1,
                    gamma=1,
                    subsample=0.8,
                    max_depth=3,
                    n_jobs=-1,
                    random_state=101)
model.fit(full_X,full_y)

In [None]:
# Put the test columns in exactly the order the model was trained on.
testdata = testdata.loc[:, full_X.columns]

In [None]:
# Predict the monthly item count for every test row with the model fitted on the full data.
fullPred = model.predict(testdata)

In [None]:
# Attach the predictions to the test rows positionally: fullPred holds one
# value per row of testdata, in the same order. This avoids building a
# separate 0..n-1 indexed frame and concatenating on axis=1, which aligns on
# index labels and would produce NaN or misplaced rows if the test IDs were
# not exactly 0..n-1.
presubmissionDf = testdata.assign(item_cnt_month=fullPred)
presubmissionDf

In [None]:
# Keep only the prediction column and label the index 'ID' for the submission file.
submissionDf = presubmissionDf[['item_cnt_month']].rename_axis('ID')
submissionDf

In [None]:
# Clamp the predictions into [0, 20] before writing the submission. The
# result is assigned back explicitly: `df[col].clip(..., inplace=True)` is
# chained assignment, which warns on recent pandas and is a no-op under
# Copy-on-Write, so the submission would otherwise be written unclipped.
submissionDf['item_cnt_month'] = submissionDf['item_cnt_month'].clip(0, 20)
submissionDf['item_cnt_month'].max()

In [None]:
# Write the submission file; the 'ID' index is written as the first column.
submission_path = 'futureSalesPredictions.csv'
submissionDf.to_csv(submission_path)