In [1]:
#Importing Libraries
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from itertools import product
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
import gc
from lightgbm import LGBMRegressor

In [2]:
# Read Files
train=pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
test=pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv",index_col='ID')
items=pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
shops=pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")
items_categories=pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")
sample_submission=pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")

In [3]:
train.head()

178508    1.0
178509    1.0
Name: item_cnt_day, dtype: float64

In [None]:
items.head()

In [None]:
shops.head()

In [None]:
items_categories.head()

# Data Cleaning and Exploration

In [None]:
plt.subplot(121)
sns.boxplot(train['item_cnt_day'])
plt.subplot(122)
sns.boxplot(train['item_price'])

In [None]:
# The item_cnt_day>1001 and item_price>100000 are outlier points.
# All Check for Negative item_price
print(train[train['item_price']<0])

In [4]:
train=train[train['item_cnt_day']<1001]
train=train[train['item_price']<100000]
train.loc[484683,'item_price']=train.loc[(train['date_block_num']==4) &(train['shop_id']==32)&(train['item_id']==2973)]['item_price'].median()
train.loc[train['item_cnt_day']<0,'item_cnt_day']=0

In [None]:
shops['shop_name'].apply(lambda x: x.split(" "))

In [5]:
train.loc[train['shop_id']==0,'shop_id']=57
test.loc[train['shop_id']==0,'shop_id']=57
train.loc[train['shop_id']==1,'shop_id']=58
test.loc[train['shop_id']==1,'shop_id']=58
train.loc[train['shop_id']==10,'shop_id']=11
test.loc[train['shop_id']==10,'shop_id']=11

In [6]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

items_categories['split'] = items_categories['item_category_name'].str.split('-')
items_categories['type'] =  items_categories['split'].map(lambda x: x[0].strip())
items_categories['type_code'] = LabelEncoder().fit_transform( items_categories['type'])

items_categories['subtype'] =  items_categories['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
items_categories['subtype_code'] = LabelEncoder().fit_transform(items_categories['subtype'])
items_categories = items_categories[['item_category_id','type_code', 'subtype_code']]

items.drop(['item_name'], axis=1, inplace=True)

In [None]:
# There are 363  items that are not present in train
print(len(set(test['item_id']))-len(set(test['item_id']).intersection(set(train['item_id']))))

In [7]:
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)

In [8]:
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) # NB clip target here
                                .astype(np.float16))
group=train.groupby('item_id').agg({'item_price':['mean']})
group.columns=['avg_item_price']
group.reset_index(inplace=True)
matrix=pd.merge(matrix,group,on=['item_id'],how='left').fillna(0)

In [9]:
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)
matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=cols)
matrix.fillna(0, inplace=True) # 34 month

In [10]:
matrix=pd.merge(matrix,shops,on=['shop_id'],how='left')
matrix=pd.merge(matrix,items,on=['item_id'],how='left')
matrix=pd.merge(matrix,items_categories,on=['item_category_id'],how='left')
matrix['city_code']=matrix['city_code'].astype(np.int8)
matrix['item_category_id']=matrix['item_category_id'].astype(np.int8)
matrix['type_code']=matrix['type_code'].astype(np.int8)
matrix['subtype_code']=matrix['subtype_code'].astype(np.int8)

# Feature Engineering

In [11]:
def shift_feature(df,lags,feature,drop=True):
    for i in lags:
        new_col_name=feature+"_"+str(i)
        df[new_col_name]=df.groupby(['shop_id','item_id'])[feature].shift(periods=i)
    if drop:
        df.drop(feature,1,inplace=True)
    return df
def add_feature(df,cols,feature_name):
    group=df.groupby(cols).agg({'item_cnt_month':['mean']})
    group.columns=[feature_name]
    group.reset_index(inplace=True)
    df=pd.merge(df,group,on=cols,how='left')
    return df


        

In [12]:
matrix=shift_feature(matrix,[1,2,3],'item_cnt_month',False)
matrix=add_feature(matrix,['date_block_num','item_id'],'date_item_avg_cnt')
matrix=shift_feature(matrix,[1,2,3],'date_item_avg_cnt')
matrix=add_feature(matrix,['date_block_num','shop_id'],'date_shop_avg_cnt')
matrix=shift_feature(matrix,[1,2,3],'date_shop_avg_cnt')
matrix=add_feature(matrix,['date_block_num','item_category_id'],'date_item_category_avg_cnt')
matrix=shift_feature(matrix,[1,2,3],'date_item_category_avg_cnt')
matrix=add_feature(matrix,['date_block_num'],'date_avg_cnt')
matrix=shift_feature(matrix,[1,2,3],'date_avg_cnt')
matrix.fillna(0,inplace=True)

In [13]:
matrix['month']=matrix['date_block_num']%12
days=pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days']=matrix['month'].map(days)

In [14]:
matrix=add_feature(matrix,['month'],'month_avg_cnt')
matrix=shift_feature(matrix,[1,2,3,12],'month_avg_cnt')
matrix=add_feature(matrix,['month','item_id'],'month_item_avg_cnt')
matrix=shift_feature(matrix,[1,2,3],'month_item_avg_cnt')
matrix=add_feature(matrix,['month','shop_id'],'month_shop_avg_cnt')
matrix=shift_feature(matrix,[1,2,3],'month_shop_avg_cnt')
matrix=add_feature(matrix,['month','item_category_id'],'month_item_category_avg_cnt')
matrix=shift_feature(matrix,[1,2,3,12],'month_item_category_avg_cnt')
matrix.fillna(0,inplace=True)

In [15]:
matrix['date_item_cnt']=matrix['item_cnt_month']/matrix['days']
matrix=shift_feature(matrix,[1,2,3,12],'date_item_cnt')
matrix=add_feature(matrix,['date_block_num','item_id','city_code'],'item_city_cnt')
matrix=shift_feature(matrix,[1,2,3,12],'item_city_cnt')
group=matrix.groupby(['shop_id','item_id']).agg({'avg_item_price':['mean']})
group.columns=['avg_pair_price']
group.reset_index(inplace=True)
matrix=pd.merge(matrix,group,on=['shop_id','item_id'],how='left').fillna(0)
matrix['delta_price']=(matrix['avg_pair_price']-matrix['avg_item_price'])/matrix['avg_item_price']
matrix=shift_feature(matrix,[1,2,3,12],'avg_item_price')
matrix=shift_feature(matrix,[1,2,3,12],'avg_pair_price')
matrix=shift_feature(matrix,[1,2,3,12],'delta_price')
matrix=matrix.fillna(0)

In [16]:
matrix.to_pickle("sales.pkl")

# Building Model

In [None]:
del group,matrix


In [None]:
matrix=pd.read_pickle('sales.pkl')

In [16]:
X_train=matrix[matrix['date_block_num']<33].drop('item_cnt_month',1).values
y_train=matrix[matrix['date_block_num']<33]['item_cnt_month'].values
X_valid=matrix[matrix['date_block_num']==33].drop('item_cnt_month',1).values
y_valid=matrix[matrix['date_block_num']==33]['item_cnt_month'].values
X_test=matrix[matrix['date_block_num']==34].drop('item_cnt_month',1).values

In [19]:
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

[0]	validation_0-rmse:1.11374	validation_1-rmse:1.05214
[1]	validation_0-rmse:1.01785	validation_1-rmse:0.99134
[2]	validation_0-rmse:0.96791	validation_1-rmse:0.96600
[3]	validation_0-rmse:0.94236	validation_1-rmse:0.95314
[4]	validation_0-rmse:0.92658	validation_1-rmse:0.94482
[5]	validation_0-rmse:0.91739	validation_1-rmse:0.94471
[6]	validation_0-rmse:0.91195	validation_1-rmse:0.94054
[7]	validation_0-rmse:0.90751	validation_1-rmse:0.93828
[8]	validation_0-rmse:0.90438	validation_1-rmse:0.93811
[9]	validation_0-rmse:0.90197	validation_1-rmse:0.93781
[10]	validation_0-rmse:0.89913	validation_1-rmse:0.93571
[11]	validation_0-rmse:0.89736	validation_1-rmse:0.93546
[12]	validation_0-rmse:0.89385	validation_1-rmse:0.93409
[13]	validation_0-rmse:0.89211	validation_1-rmse:0.93213
[14]	validation_0-rmse:0.89125	validation_1-rmse:0.93247
[15]	validation_0-rmse:0.88964	validation_1-rmse:0.93280
[16]	validation_0-rmse:0.88878	validation_1-rmse:0.93241
[17]	validation_0-rmse:0.88771	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=8,
             min_child_weight=300, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=0.8, tree_method='approx', validate_parameters=1,
             verbosity=None)

In [21]:
p1=model.predict(X_test)

In [25]:
model_lgbm = LGBMRegressor(
    max_depth = 8,
    n_estimators = 500,
    colsample_bytree=0.7,
    min_child_weight = 300,
    reg_alpha = 0.1,
    reg_lambda = 1,
    random_state = 42,
)

model_lgbm.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    verbose=10, 
    early_stopping_rounds = 40
    ) 

Training until validation scores don't improve for 40 rounds
[10]	training's rmse: 0.992893	training's l2: 0.985836	valid_1's rmse: 0.976544	valid_1's l2: 0.953637
[20]	training's rmse: 0.942808	training's l2: 0.888887	valid_1's rmse: 0.956375	valid_1's l2: 0.914653
[30]	training's rmse: 0.925059	training's l2: 0.855735	valid_1's rmse: 0.951516	valid_1's l2: 0.905383
[40]	training's rmse: 0.916863	training's l2: 0.840637	valid_1's rmse: 0.951532	valid_1's l2: 0.905413
[50]	training's rmse: 0.911625	training's l2: 0.83106	valid_1's rmse: 0.950006	valid_1's l2: 0.902511
[60]	training's rmse: 0.906939	training's l2: 0.822538	valid_1's rmse: 0.947526	valid_1's l2: 0.897806
[70]	training's rmse: 0.902635	training's l2: 0.814751	valid_1's rmse: 0.946284	valid_1's l2: 0.895454
[80]	training's rmse: 0.899734	training's l2: 0.809521	valid_1's rmse: 0.944147	valid_1's l2: 0.891414
[90]	training's rmse: 0.896845	training's l2: 0.804331	valid_1's rmse: 0.942922	valid_1's l2: 0.889103
[100]	trainin

LGBMRegressor(colsample_bytree=0.7, max_depth=8, min_child_weight=300,
              n_estimators=500, random_state=42, reg_alpha=0.1, reg_lambda=1)

In [27]:
p2=model_lgbm.predict(X_test)

In [None]:
p=(p1+p2)/2

In [30]:
sample_submission['item_cnt_month']=p
sample_submission.to_csv("submission.csv",index=False)