This notebook has been created as part of the Coursera project. Features and ideas have been
taken from multiple sources. 

Ensembling is done by stacking: a linear-regression meta-model is fit on the validation-month predictions of linear regression, LightGBM and XGBoost.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
import xgboost
import sklearn
from xgboost import XGBRegressor
from xgboost import plot_importance

import time
import sys
import gc
import pickle


In [None]:
# All competition CSV files live in the same Kaggle input directory.
_input_dir = "../input/competitive-data-science-predict-future-sales/"

train_path = _input_dir + "sales_train.csv"
test_path = _input_dir + "test.csv"
items_path = _input_dir + "items.csv"
shops_path = _input_dir + "shops.csv"
item_cat_path = _input_dir + "item_categories.csv"

In [None]:
# Load the five competition tables; these names are used throughout the notebook.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
items = pd.read_csv(items_path)
cats = pd.read_csv(item_cat_path)
shops = pd.read_csv(shops_path)

# **EDA**

In [None]:
# Total units sold in each month block, drawn as a line chart.
monthly_sales = train['item_cnt_day'].groupby(train['date_block_num']).sum()
monthly_sales.plot()

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the daily item counts, with the x axis fixed to (-100, 3000).
count_fig, count_ax = plt.subplots(figsize=(10, 4))
count_ax.set_xlim(-100, 3000)
sns.boxplot(x=train.item_cnt_day, ax=count_ax)

# Box plot of the item prices, with the x axis spanning the observed price range.
price_fig, price_ax = plt.subplots(figsize=(10, 4))
price_ax.set_xlim(train.item_price.min(), train.item_price.max())
sns.boxplot(x=train.item_price, ax=price_ax)
plt.show()



In [None]:
# Remove outliers based on the box plots: extreme prices and extreme daily counts.
# .copy() makes `train` an independent frame, so the assignment further down is
# not a write into a slice of the originally loaded data.
train = train[(train.item_price < 100000) & (train.item_cnt_day < 1100)].copy()

# Replace negative prices with the median price of item 2973.
# The median is taken over the valid (non-negative) prices only, so the bad
# value being repaired does not take part in its own replacement.
# NOTE(review): this assumes item 2973 is the only item with a negative price — confirm.
valid_2973_prices = train.loc[
    (train['item_id'] == 2973) & (train['item_price'] >= 0), 'item_price'
]
median = valid_2973_prices.median()
train.loc[train['item_price'] < 0, 'item_price'] = median

In [None]:
# Independent copy of the cleaned daily records; the monthly grid and the
# per-month aggregates in the following cells are built from `sales`.
sales= train.copy()

# **Feature Engineering**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Shops: the first space-separated word of the shop name is used as the city.
# One shop name is patched first so that its two-word city becomes a single
# token, and the '!Якутск' spelling is folded into 'Якутск'.
shop_names = shops['shop_name'].replace('Сергиев Посад ТЦ "7Я"', 'СергиевПосад ТЦ "7Я"')
shop_city = shop_names.str.split(' ').str[0].replace('!Якутск', 'Якутск')
shops['city_code'] = LabelEncoder().fit_transform(shop_city)
shops = shops[['shop_id', 'city_code']]

# Categories: names look like "<type> - <subtype>"; both parts are label-encoded.
category_parts = cats['item_category_name'].str.split('-')
category_type = category_parts.str[0].str.strip()
# A name without "-" has no subtype, so its type is reused as the subtype.
category_subtype = category_parts.map(
    lambda parts: parts[1 if len(parts) > 1 else 0].strip()
)
cats['type_code'] = LabelEncoder().fit_transform(category_type)
cats['subtype_code'] = LabelEncoder().fit_transform(category_subtype)
cats = cats[['item_category_id', 'type_code', 'subtype_code']]

# The item name is not used as a feature.
items = items.drop(columns=['item_name'])

In [None]:
def lag_feature(df, lags, cols):
    """Append lagged copies of the given columns to `df`.

    For every column in `cols` and every lag in `lags`, a column named
    ``<col>_lag_<lag>`` is added. It holds the value `<col>` had for the same
    (shop_id, item_id) pair `lag` month blocks earlier; rows with no such
    earlier entry get NaN (left merge).

    Parameters
    ----------
    df : DataFrame with "date_block_num", "shop_id", "item_id" and the columns in `cols`.
    lags : iterable of int month offsets.
    cols : iterable of column names to lag.

    Returns
    -------
    DataFrame: `df` with the lag columns appended, row order unchanged.
    """
    keys = ["date_block_num", "shop_id", "item_id"]
    for col in cols:
        print(col)
        source = df[keys + [col]]
        for lag in lags:
            # Moving the month forward by `lag` makes each past value line up
            # with the later month that should see it.
            lagged = source.rename(columns={col: col + "_lag_" + str(lag)}).assign(
                date_block_num=lambda frame, step=lag: frame["date_block_num"] + step
            )
            df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
def add_feature(df, grp_cols, feature):
    """Add the group-wise mean of `item_cnt_month` to `df` as column `feature`.

    Parameters
    ----------
    df : DataFrame containing `item_cnt_month` and the columns in `grp_cols`.
    grp_cols : list of column names to group by.
    feature : name of the new column.

    Returns
    -------
    DataFrame: `df` left-merged with the per-group mean, row order unchanged.
    """
    group_means = (
        df.groupby(grp_cols)['item_cnt_month']
        .mean()
        .rename(feature)
        .reset_index()
    )
    return df.merge(group_means, on=grp_cols, how='left')

In [None]:
# Build the training grid: for each of the 34 month blocks, every combination
# of the shops and the items that appear in that month's sales.
cols = ["date_block_num", "shop_id", "item_id"]
monthly_grids = []
for month in range(34):
    month_sales = sales[sales.date_block_num == month]
    combinations = product(
        [month], month_sales.shop_id.unique(), month_sales.item_id.unique()
    )
    monthly_grids.append(np.array(list(combinations), dtype=np.int16))

matrix = pd.DataFrame(np.vstack(monthly_grids), columns=cols)
# Narrow integer types keep the grid small in memory.
matrix = matrix.astype(
    {"date_block_num": np.int8, "shop_id": np.int8, "item_id": np.int16}
)
matrix = matrix.sort_values(cols)

In [None]:
# The test pairs are tagged as month block 34 and appended to the grid, so the
# same feature pipeline runs over the training months and the test month.
test['date_block_num'] = 34
matrix = (
    pd.concat([matrix, test], ignore_index=True)
    .drop(columns='ID')
    .fillna(0)
)

In [None]:
# Attach the encoded shop, item and category attributes to every grid row.
for lookup, key in ((shops, 'shop_id'), (items, 'item_id'), (cats, 'item_category_id')):
    matrix = matrix.merge(lookup, on=[key], how='left')

# The merged codes are stored as 8-bit integers.
matrix = matrix.astype({
    'city_code': np.int8,
    'item_category_id': np.int8,
    'type_code': np.int8,
    'subtype_code': np.int8,
})

In [None]:
# Preview the grid after the shop / item / category attributes were merged in.
matrix.head()

In [None]:

# Calendar features derived from the month block: month within the year
# (0-11) and the number of whole years since block 0.
month_block = matrix["date_block_num"]
matrix["month"] = month_block % 12
matrix['year'] = (month_block // 12).astype(np.int8)


In [None]:
# Target: units sold per (month, shop, item), summed from the daily records.
group = (
    sales.groupby(["date_block_num", "shop_id", "item_id"])["item_cnt_day"]
    .sum()
    .rename("item_cnt_month")
    .reset_index()
)
matrix = matrix.merge(group, on=cols, how="left")

# Grid rows without any sale count as 0; the target is clipped to [0, 20].
monthly_target = matrix["item_cnt_month"].fillna(0).clip(0, 20)
matrix["item_cnt_month"] = monthly_target.astype(np.float16)

In [None]:
# Total units sold by each shop in each month ("target_shop").
gb = (
    sales.groupby(['shop_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .rename('target_shop')
    .reset_index()
)
matrix = matrix.merge(gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)


In [None]:
# Total units sold of each item in each month ("target_item").
gb = (
    sales.groupby(['item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .rename('target_item')
    .reset_index()
)
matrix = matrix.merge(gb, how='left', on=['item_id', 'date_block_num']).fillna(0)


In [None]:
# Mean selling price per (shop, item, month); pairs without sales get 0.
gb = (
    sales.groupby(['shop_id', 'item_id', 'date_block_num'])['item_price']
    .mean()
    .rename('item_price_mean')
    .reset_index()
)
matrix = matrix.merge(gb, how='left', on=['shop_id', 'item_id', 'date_block_num']).fillna(0)

In [None]:
# Monthly revenue estimate per (shop, item): mean price times the monthly
# count (the count was already clipped to [0, 20] when it was created).
matrix["revenue"] = matrix["item_price_mean"]*matrix["item_cnt_month"]

In [None]:
# Preview the grid with the target, shop/item totals, mean price and revenue.
matrix.head()

In [None]:
# Lag the same-month quantities by 1, 2, 3, 6 and 12 months. One call handles
# the columns one after another, in the listed order.
matrix = lag_feature(
    matrix,
    [1, 2, 3, 6, 12],
    ["item_cnt_month", "target_shop", "target_item", "revenue", "item_price_mean"],
)






# **Mean Encoding**

In [None]:
# Mean encodings: the average monthly count within each grouping, followed by
# its 1, 2, 6 and 12 month lags.
mean_encodings = (
    (['date_block_num', 'shop_id'], 'date_shop_avg_cnt'),
    (['date_block_num', 'item_category_id'], 'date_cat_avg_cnt'),
    (['date_block_num'], 'date_avg_item_cnt'),
    (['date_block_num', 'item_id'], 'date_item_avg_cnt'),
    (['date_block_num', 'city_code'], 'date_city_avg_cnt'),
)
for group_columns, feature_name in mean_encodings:
    matrix = add_feature(matrix, group_columns, feature_name)
    matrix = lag_feature(matrix, [1, 2, 6, 12], [feature_name])

In [None]:
def optimize_memory(df):
    """Downcast the numeric columns of `df` to the smallest fitting dtype.

    Columns of dtype int8/int16/int32/int64 are downcast as integers and
    columns of dtype float32/float64 as floats, using `pd.to_numeric`. The
    frame is modified in place and also returned. The resulting memory usage
    and the percentage saved are printed.
    """
    bytes_per_mb = 1024 ** 2
    before_mb = df.memory_usage().sum() / bytes_per_mb

    # Both column lists are decided before any column is converted.
    kinds = {
        'integer': ['int8', 'int16', 'int32', 'int64'],
        'float': ['float32', 'float64'],
    }
    columns_by_kind = {
        kind: [name for name in df if df[name].dtype in dtype_names]
        for kind, dtype_names in kinds.items()
    }
    for kind, names in columns_by_kind.items():
        for name in names:
            df[name] = pd.to_numeric(df[name], downcast=kind)

    after_mb = df.memory_usage().sum() / bytes_per_mb
    saved_percent = 100 * (before_mb - after_mb) / before_mb
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(after_mb, saved_percent))
    return df

In [None]:
import gc

# The lag merges leave NaN where no earlier month exists; replace them with 0,
# then downcast the numeric columns and collect garbage to lower memory use.
matrix.fillna(0,inplace = True)
matrix = optimize_memory(matrix)
gc.collect()

In [None]:
# Mean encodings per category type and subtype, with 1, 2, 3 and 12 month lags.
for code_column, feature_name in (
    ('type_code', 'date_type_avg_cnt'),
    ('subtype_code', 'date_subtype_avg_cnt'),
):
    matrix = add_feature(matrix, ['date_block_num', code_column], feature_name)
    matrix = lag_feature(matrix, [1, 2, 3, 12], [feature_name])

# Fill the NaN left by the lag merges, then shrink the frame again.
matrix.fillna(0, inplace=True)
matrix = optimize_memory(matrix)
gc.collect()

In [None]:
# Days per calendar month, indexed 0-11 to match the `month` column
# (February is always taken as 28 days).
days_per_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
days = pd.Series(days_per_month)
matrix['days'] = matrix['month'].map(days)

# Average units sold per day in the month, with 1, 2, 3 and 12 month lags.
matrix['date_item_day'] = matrix['item_cnt_month'].div(matrix['days'])
matrix = lag_feature(matrix, [1, 2, 3, 12], ['date_item_day'])



In [None]:
# Replace the NaN introduced by the latest lag merges with 0, downcast the
# numeric columns again and collect garbage.
matrix.fillna(0,inplace = True)
matrix = optimize_memory(matrix)
gc.collect()

In [None]:
# Two price aggregates are merged in: the mean price of each item over the
# whole period, and its mean price within each month. Both are stored as float16.
price_aggregates = (
    (['item_id'], 'item_avg_item_price'),
    (['date_block_num', 'item_id'], 'date_item_avg_item_price'),
)
for group_columns, price_column in price_aggregates:
    group = (
        train.groupby(group_columns)['item_price']
        .mean()
        .rename(price_column)
        .reset_index()
    )
    matrix = matrix.merge(group, on=group_columns, how='left')
    matrix[price_column] = matrix[price_column].astype(np.float16)

# Lag the monthly mean price by one to six months.
lags = [1,2,3,4,5,6]
matrix = lag_feature(matrix, lags, ['date_item_avg_item_price'])

# Relative difference between each lagged monthly price and the overall mean price.
for lag in lags:
    lagged_price = matrix['date_item_avg_item_price_lag_' + str(lag)]
    overall_price = matrix['item_avg_item_price']
    matrix['delta_price_lag_' + str(lag)] = (lagged_price - overall_price) / overall_price

def select_trend(row):
    """Return the most recent usable price delta for one row.

    Walks the ``delta_price_lag_<i>`` values in the order of the notebook-level
    `lags` list and returns the first one that is neither missing nor zero.
    Returns 0 when no lag has a usable value.
    """
    for i in lags:
        delta = row['delta_price_lag_'+str(i)]
        # NaN is truthy in Python, so a plain `if delta:` would stop at a
        # missing lag and return NaN instead of moving on to the next lag.
        if pd.notna(delta) and delta:
            return delta
    return 0

# Only the delta columns are passed to apply(): select_trend reads nothing
# else, and building each row from a few columns is far cheaper than from all.
delta_columns = ['delta_price_lag_'+str(i) for i in lags]
matrix['delta_price_lag'] = matrix[delta_columns].apply(select_trend, axis=1)
# The filled column is assigned back. Calling fillna(inplace=True) on a selected
# column is chained assignment and may leave `matrix` itself unchanged.
matrix['delta_price_lag'] = matrix['delta_price_lag'].fillna(0).astype(np.float16)


# The price aggregates, their lags and the per-lag deltas were only needed to
# compute `delta_price_lag`; remove them from the grid.
features_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for lag in lags:
    suffix = str(lag)
    features_to_drop.extend([
        'date_item_avg_item_price_lag_' + suffix,
        'delta_price_lag_' + suffix,
    ])

matrix = matrix.drop(columns=features_to_drop)

In [None]:
# Shop revenue trend: how far each shop's monthly revenue is from that shop's
# average monthly revenue, used as a one-month lag.
train['revenue'] = train['item_cnt_day']*train['item_price']
group = train.groupby(['date_block_num','shop_id']).agg({'revenue': ['sum']})
group.columns = ['date_shop_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

# Average of the monthly revenues per shop.
group = group.groupby(['shop_id']).agg({'date_shop_revenue': ['mean']})
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['shop_id'], how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float32)

matrix['delta_revenue'] = (matrix['date_shop_revenue'] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']
matrix['delta_revenue'] = matrix['delta_revenue'].astype(np.float16)

matrix = lag_feature(matrix, [1], ['delta_revenue'])
# The left merge inside lag_feature leaves NaN for rows with no previous-month
# entry. No later cell fills missing values, and the linear regression is fit
# on the raw feature array, which must not contain NaN. Fill the new lag with 0
# here, as is done after every other feature group.
matrix['delta_revenue_lag_1'] = matrix['delta_revenue_lag_1'].fillna(0)

# Only the lagged trend is kept; the same-month revenue columns are dropped.
matrix.drop(['date_shop_revenue','shop_avg_revenue','delta_revenue'], axis=1, inplace=True)

In [None]:
# Preview the grid after all feature engineering steps.
matrix.head()

In [None]:
# Keep only the rows after month block 12 (presumably so that the 12-month lag
# features have a full year of history behind them), then work on a copy.
after_first_year = matrix['date_block_num'] > 12
matrix = matrix[after_first_year]
data = matrix.copy()

In [None]:
# Columns computed from the current month's own sales (including the target
# itself). They are excluded from the model inputs; only their lags are kept.
colsdrop = [
    'date_item_day',
    'date_item_avg_cnt',
    'date_shop_avg_cnt',
    'date_cat_avg_cnt',
    'date_avg_item_cnt',
]
colsdrop1 = [
    'target_shop',
    'target_item',
    'item_cnt_month',
    'revenue',
    'item_price_mean',
    'date_city_avg_cnt',
    'date_type_avg_cnt',
    'date_subtype_avg_cnt',
]

dropcols = [*colsdrop, *colsdrop1]

In [None]:
# Last rows of the feature frame; the test-month rows were appended at the end.
data.tail()

In [None]:
# (rows, columns) of the feature frame before it is split.
data.shape

In [None]:
# Time-based split: month blocks before 33 for training, block 33 for
# validation, block 34 (the appended test pairs) for the final prediction.
is_train = data.date_block_num < 33
is_valid = data.date_block_num == 33
is_test = data.date_block_num == 34

X_train = data.loc[is_train].drop(columns=dropcols)
Y_train = data.loc[is_train, 'item_cnt_month']
X_valid = data.loc[is_valid].drop(columns=dropcols)
Y_valid = data.loc[is_valid, 'item_cnt_month']
X_test = data.loc[is_test].drop(columns=dropcols)

In [None]:
# Preview the validation features (month block 33) that the models will see.
X_valid.head()

In [None]:
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# **Model Training**

In [None]:
# Baseline: ordinary least squares on the raw feature arrays, scored on the
# validation month.
lr = LinearRegression().fit(X_train.values, Y_train)
pred_lr = lr.predict(X_valid.values)

r2_lr = r2_score(Y_valid, pred_lr)
print('Test R-squared for linreg is %f' % r2_lr)

In [None]:
# LightGBM settings; the leaf count, minimum leaf size and bagging seed are all 2**7.
lgb_params = dict(
    objective='mse',
    metric='rmse',
    learning_rate=0.03,
    num_leaves=2**7,
    min_data_in_leaf=2**7,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    bagging_seed=2**7,
    nthread=1,
    verbose=0,
)

# Train for a fixed 100 boosting rounds, then score on the validation month.
lgb_train_set = lgb.Dataset(X_train, label=Y_train)
model = lgb.train(lgb_params, lgb_train_set, 100)
pred_lgb = model.predict(X_valid)

r2_lgb = r2_score(Y_valid, pred_lgb)
print('Test R-squared for LightGBM is %f' % r2_lgb)

In [None]:
# The split frames hold everything needed from here on; drop the full feature
# frame and collect garbage before training XGBoost.
del data
gc.collect();

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees: up to 1000 rounds, with row and column subsampling.
xgb = XGBRegressor(
    n_estimators=1000,
    eta=0.3,
    max_depth=8,
    min_child_weight=300,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=42,
)

# RMSE is reported every 10 rounds on both the training and validation sets,
# and early stopping is requested with a patience of 20 rounds.
evaluation_sets = [(X_train, Y_train), (X_valid, Y_valid)]
xgb.fit(
    X_train,
    Y_train,
    eval_set=evaluation_sets,
    eval_metric="rmse",
    early_stopping_rounds=20,
    verbose=10,
)

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

def plot_features(booster, figsize):
    """Plot the feature importances of `booster` on a new figure.

    Parameters
    ----------
    booster : fitted model accepted by xgboost's `plot_importance`.
    figsize : (width, height) of the figure.

    Returns
    -------
    The axes object returned by `plot_importance`.
    """
    figure, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    importance_axes = plot_importance(booster=booster, ax=axis)
    return importance_axes

plot_features(xgb, (10,14))

In [None]:
# Score the XGBoost model on the validation month.
pred_xgb = xgb.predict(X_valid)
r2_xgb = r2_score(Y_valid, pred_xgb)
print('Test R-squared for XGBoost is %f' % r2_xgb)

In [None]:
from sklearn.metrics import mean_squared_error

# Predictions are clipped to [0, 20], the same range the target was clipped to.
y_pred = xgb.predict(X_valid).clip(0,20)
y_pred_tr = xgb.predict(X_train).clip(0,20)
# RMSE is the square root of the MSE. Taking the root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases no longer accept.
rmse_tr = np.sqrt(mean_squared_error(Y_train, y_pred_tr))
rmse_val = np.sqrt(mean_squared_error(Y_valid, y_pred))
print("RMSE Validation: %.5f" % rmse_val)
print("RMSE Training: %.5f" % rmse_tr)

# **Ensembling**

In [None]:
# Test-month predictions of the three base models.
# `lr` was fitted on a plain array (X_train.values), so it is given a plain
# array here too, matching how it is called on the validation set.
test_pred_lr = lr.predict(X_test.values)
test_pred_lgb = model.predict(X_test)
test_pred_xgb = xgb.predict(X_test)

# One column per base model (linear regression, LightGBM, XGBoost); these
# arrays are the inputs of the stacking meta-model.
stacked_valid_predictions = np.column_stack((pred_lr, pred_lgb, pred_xgb))
stacked_test_predictions = np.column_stack((test_pred_lr, test_pred_lgb, test_pred_xgb))

In [None]:
# Stacking: a linear model learns how to combine the three base models from
# their validation-month predictions, then combines their test-month predictions.
meta_model = LinearRegression().fit(stacked_valid_predictions, Y_valid)
final_predictions = meta_model.predict(stacked_test_predictions)

In [None]:
# Clip the stacked predictions to [0, 20], the range the target was clipped to.
Y_test = final_predictions.clip(0, 20)

# Use the ID column of the test file rather than the row index, so the
# submission does not depend on the index happening to equal the IDs.
# The rows of X_test follow the order of `test`: it was appended to the grid
# as-is and only left merges were applied afterwards.
submission = pd.DataFrame({
    "ID": test["ID"].values,
    "item_cnt_month": Y_test
})
submission.to_csv('1c_submission4.csv', index=False)
