# Final Project Coursera

The first step is to load all the required libraries and load raw data files into memory.

In [None]:
import pandas as pd
import numpy as np
from itertools import product
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

**Validating packages versions**.

I'm going to check the versions of the installed packages.

In [None]:
# Print the name and version of each core library used below.
for module in (np, pd, lgb):
    print(module.__name__, module.__version__)

In [None]:
# Every competition file is read from the same input directory; keeping the
# prefix in one constant means the notebook can be pointed at another location
# by editing a single line.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

sales = pd.read_csv(DATA_DIR + '/sales_train.csv')
test_data = pd.read_csv(DATA_DIR + '/test.csv')
items = pd.read_csv(DATA_DIR + '/items.csv')
item_category = pd.read_csv(DATA_DIR + '/item_categories.csv')
shops = pd.read_csv(DATA_DIR + '/shops.csv')

sales.head()

# Basic functions

This function comes from one of the course assignments; it reduces the memory footprint of the data.

In [None]:
def downcast_dtypes(df):
    '''
        Halve the width of 64-bit numeric columns, in place.

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left untouched. The frame passed in
        is modified and also returned, so the call can be used either as a
        statement or in an assignment.
    '''

    # (wide dtype name, narrow numpy type) pairs to apply.
    narrowing = (("float64", np.float32), ("int64", np.int32))

    # Resolve every column list before casting anything, so the selection
    # reflects the dtypes the frame had on entry.
    selected = [
        ([name for name in df if df[name].dtype == wide], narrow)
        for wide, narrow in narrowing
    ]

    for names, narrow in selected:
        df[names] = df[names].astype(narrow)

    return df

It is required to add lagged data as new features

In [None]:
def lag_feature(all_data, list_lags, index_cols, cols_to_rename):
    '''
    Append lagged copies of selected columns to the frame.

    For every lag `k` in `list_lags`, the values of `cols_to_rename` are taken
    from the row that has the same `index_cols` key but a `date_block_num`
    that is `k` smaller, and are added as new columns named `<column>_lag_<k>`.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain every column in `index_cols` and `cols_to_rename`.
    list_lags : iterable of int
        Month offsets to generate. May be empty, in which case `all_data`
        is returned unchanged.
    index_cols : list of str
        Merge key; must include 'date_block_num'.
    cols_to_rename : list of str
        Columns to lag.

    Returns
    -------
    pandas.DataFrame
        The input rows (left join, so none are dropped) plus the lag columns.
        Note that after each merge NaN is replaced by 0 in *every* column of
        the frame, not only in the new lag columns.
    '''
    for month_shift in tqdm_notebook(list_lags):
        shifted = all_data[index_cols + cols_to_rename].copy()

        # Moving a row k months forward makes it line up with the row that
        # should see it as "k months ago".
        shifted['date_block_num'] = shifted['date_block_num'] + month_shift

        lagged_names = {
            col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename
        }
        shifted = shifted.rename(columns=lagged_names)

        all_data = pd.merge(all_data, shifted, on=index_cols, how='left').fillna(0)

    # No explicit `del` of the temporary: it is a local that goes away on
    # return, and deleting it unconditionally failed for an empty lag list.
    return all_data

# EDA

Here is what the data looks like.

I have plotted the monthly sales of each shop in order to see the general behaviour.

In [None]:
# Total units sold per (month, shop).
Monthly_sales = (
    sales
    .groupby(["date_block_num", "shop_id"])['item_cnt_day']
    .sum()
    .reset_index(name='item_cnt_month')
)

# One small panel per shop id 0..59, laid out on a 10 x 6 grid.
fig, axs = plt.subplots(10, 6)

# `axs.flat` walks the grid row by row, so panel i is axs[i // 6, i % 6].
for i, panel in enumerate(axs.flat):
    shop_sale_per_month = Monthly_sales.loc[Monthly_sales['shop_id'] == i]
    # Ticks and labels are hidden: only the shape of each curve matters here.
    panel.tick_params(axis='both', which='both', bottom=False, top=False, labelbottom=False, right=False, left=False, labelleft=False)
    panel.plot(shop_sale_per_month['date_block_num'], shop_sale_per_month['item_cnt_month'])


del Monthly_sales, shop_sale_per_month

In general, sales show seasonal behaviour, as expected. However, some shops behave abnormally; this turned out to be caused by duplicated shop ids, which are merged below. The outliers are also removed from the data.

In [None]:
# Shop ids that are relabelled to another id (old id -> id to keep); the same
# relabelling is applied to the training sales and to the test set.
duplicate_shops = {0: 57, 1: 58, 10: 11}

for old_id, kept_id in duplicate_shops.items():
    for frame in (sales, test_data):
        frame.loc[frame.shop_id == old_id, 'shop_id'] = kept_id


# remove the outliers: only rows with a daily count below 1001 are kept
sales = sales[sales.item_cnt_day < 1001]

# Data Leakage

Finding the number of unique shop-item combinations that only exist in test data

In [None]:
# Compare against the *distinct* (shop, item) pairs of the training data.
# Merging against the raw sales rows would repeat every matching test pair
# once per sale of that pair; the rows counted below ('left_only') are the
# same either way, but the intermediate frame stays small.
train_pairs = sales[['shop_id', 'item_id']].drop_duplicates()

temp_df = pd.merge(test_data[['shop_id','item_id']], train_pairs, on=['shop_id','item_id'], how='left', indicator='Exist')
# 'left_only' marks test pairs that have no counterpart in the training data.
temp_var = (temp_df['Exist']=='left_only').sum()
print('Number of unique shop-item combination in the test set that do not exist in the training set:',temp_var)

As we can see, about 52% of the test combinations already exist in the training set. I'm going to make use of this overlap if any leakage is found.

In [None]:
# Share of test rows whose (shop, item) pair also occurs in the training data.
n_test_rows = test_data.shape[0]
n_seen_in_train = n_test_rows - temp_var
Leakage_Percentage = (n_seen_in_train / n_test_rows) * 100
print('Percentage of shop-item combination in test data that are available in the training set:', Leakage_Percentage)

# Feature Engineering

The following code is based on the course assignments. It creates all possible shop-item combinations for every month and fills in the target column.

In [None]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For each month, pair every shop that sold something that month with every
# item that was sold that month.
grid = []
for block_num in sales['date_block_num'].unique():
    month_sales = sales[sales['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    month_grid = list(product(cur_shops, cur_items, [block_num]))
    grid.append(np.array(month_grid, dtype='int32'))

# Stack the monthly grids into a single dataframe.
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Monthly sales per (shop_id, item_id, month); this becomes the target.
gb = (
    sales
    .groupby(index_cols, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'target'})
)

# Attach the target to the grid; pairs without any sale get 0.
all_data = (
    pd.merge(grid, gb, how='left', on=index_cols)
    .fillna(0)
    .sort_values(['date_block_num', 'shop_id', 'item_id'])
)
# Limit the monthly target to the range [0, 20].
all_data['target'] = all_data['target'].clip(0, 20)

# Don't use old data from year 2013 (months 0-11)
all_data = all_data[all_data['date_block_num'] >= 12]
all_data

I'm going to add the following features:
* City code from the shops csv
* Category ID
* The text data in item category gives some extra info that can be used such as the basket of commodities that an item belongs to

In [None]:
# city_enc: label-encoded first word of the shop name ('!' characters removed)
shops['city'] = [
    str.replace(shop_name, '!', '').split(' ')[0] for shop_name in shops.shop_name
]
shops['city_enc'] = LabelEncoder().fit_transform(shops['city'])
shops_data = shops[['shop_id','city_enc']]
all_data = all_data.merge(shops_data, how='left', on=['shop_id'])

# item_category_id: every column of `items` except the free-text name
all_data = all_data.merge(items.drop('item_name', axis=1), how='left', on=['item_id'])

# basket_enc: label-encoded first word of the category name
item_category['basket'] = item_category['item_category_name'].map(
    lambda category_name: str(category_name).split(' ')[0]
)
item_category['basket_enc'] = LabelEncoder().fit_transform(item_category['basket'])
item_category = item_category[['item_category_id','basket_enc']]
all_data = all_data.merge(item_category, how='left', on=['item_category_id'])
all_data

Now, I'm going to add the test data

In [None]:
# The raw test file only has ID / shop_id / item_id. Before stacking it under
# the training grid it needs
#   * a month number: the test rows are selected further down with
#     `date_block_num == 34`; without this column they would get NaN (and
#     later 0 from fillna), ending up in the training split instead;
#   * the same static features as the training rows, because the lag merge
#     uses them as part of its key.
test_rows = (
    test_data
    .assign(date_block_num=34)
    .merge(shops[['shop_id', 'city_enc']], how='left', on='shop_id')
    .merge(items[['item_id', 'item_category_id']], how='left', on='item_id')
    .merge(item_category[['item_category_id', 'basket_enc']], how='left', on='item_category_id')
)

# `target` is unknown for the test rows and stays NaN here; `ID` is NaN for
# the training rows. (`keys=` was dropped: it has no effect together with
# ignore_index=True.)
all_data = pd.concat([all_data, test_rows], ignore_index=True, sort=False)
all_data = downcast_dtypes(all_data)
all_data

# Mean Encoding

Here, I'm going to aggregate data

In [None]:
# Each entry: (grouping keys, name of the resulting total-sales column).
monthly_totals = [
    (['shop_id', 'date_block_num'], 'target_shop'),  # shop-month aggregates
    (['item_id', 'date_block_num'], 'target_item'),  # item-month aggregates
]

for group_keys, feature_name in monthly_totals:
    gb = (
        sales
        .groupby(group_keys, as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': feature_name})
    )
    # Left join keeps every row; missing values (in any column) become 0.
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

all_data = downcast_dtypes(all_data)

all_data

Finally, I'm going to add lagged data. Based on trial and error, I found that lags of the previous 3 months have the highest impact.

In [None]:
from tqdm import tqdm_notebook

index_cols = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'basket_enc', 'city_enc']

# Lag every remaining column except 'ID'. 'ID' is the row id of the test file,
# not a signal: lagging it only produces extra columns that would later be
# picked up as model features.
cols_to_rename = list(all_data.columns.difference(index_cols + ['ID']))
list_lags = [1, 2, 3]
all_data = lag_feature(all_data, list_lags, index_cols, cols_to_rename)
all_data = downcast_dtypes(all_data)

all_data

# Training

I'm going to train two models, LightGBM and a random forest, and later I'll stack them.

In [None]:
shift_range = list_lags

# List of all lagged features: columns named '<something>_lag_<k>' for a lag k
# that was actually generated. Matching the whole suffix (rather than only the
# last character) keeps this correct for lags of 10 or more.
lag_suffixes = tuple('_lag_{}'.format(lag) for lag in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

# We will drop these at fitting stage: everything that is neither a lagged
# feature nor an index column, plus the month number itself.
to_drop_cols = list(set(all_data.columns) - (set(fit_cols) | set(index_cols))) + ['date_block_num']
del sales, grid
to_drop_cols

In [None]:
# Time-based split on the month number: before 33 -> train, 33 -> validation,
# 34 -> test.
is_train = all_data.date_block_num < 33
is_valid = all_data.date_block_num == 33
is_test = all_data.date_block_num == 34

X_train = all_data[is_train].drop(to_drop_cols, axis=1)
Y_train = all_data.loc[is_train, 'target']
X_valid = all_data[is_valid].drop(to_drop_cols, axis=1)
Y_valid = all_data.loc[is_valid, 'target']
X_test = all_data[is_test].drop(to_drop_cols, axis=1)

In [None]:
# Train + validation rows combined. `DataFrame.append` no longer exists in
# pandas 2.x; `pd.concat` gives the same result (original index kept).
X = pd.concat([X_train, X_valid])
Y = np.append(Y_train, Y_valid)

In [None]:
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.05, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

# The booster gets its own name: assigning it to `lgb` replaced the imported
# lightgbm module, so this cell could not be run a second time.
#
# It is fitted on the training months only. Fitting on train + validation made
# the "Val mse" below (and the predictions fed to the stacking step) in-sample.
lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=Y_train), num_boost_round=100)

pred_lgb_val = lgb_model.predict(X_valid)

print('Train mse is %f' % mean_squared_error(Y_train, lgb_model.predict(X_train)))
print('Val mse is %f' % mean_squared_error(Y_valid, pred_lgb_val))

In [None]:
# Only the non-default settings are passed. The previous call also spelled out
# library defaults, several of which current scikit-learn rejects:
# `bootstrap=0.7` is not a boolean, `criterion='mse'` was renamed, and
# `min_impurity_split` was removed. Leaving them at their defaults (bootstrap
# sampling, squared-error criterion) works across versions.
# A fixed `random_state` makes the forest reproducible between runs.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=10,
    max_features=6,
    n_jobs=4,
    random_state=0,
)
# Fit on the training months only so the validation score below is
# out-of-sample.
rf.fit(X_train, Y_train)

pred_rf_val = rf.predict(X_valid)
print('Train mse is %f' % mean_squared_error(Y_train, rf.predict(X_train)))
print('Val mse is %f' % mean_squared_error(Y_valid, pred_rf_val))

# Stacking

I'm going to stack two models in order to improve the model.

In [None]:
# Validation predictions of the two base models against each other
# (random forest on x, LightGBM on y); points far from the diagonal are rows
# where the two models disagree.
plt.scatter(pred_rf_val, pred_lgb_val)

In [None]:
# Level-2 features: one column per base model, holding its validation
# prediction (random forest first, LightGBM second).
X_val_level2 = np.c_[pred_rf_val, pred_lgb_val]

# Linear meta-model that learns how to weight the two base predictions.
lr = LinearRegression()
lr.fit(X_val_level2, Y_valid)
pred_lr_val =  lr.predict(X_val_level2)
# This score is computed on the same validation rows the meta-model was fitted
# on, so it is an in-sample figure, not a test score; the label says so.
print('Stacked val mse (in-sample) is %f' % mean_squared_error(Y_valid, pred_lr_val))

# Final Test

In [None]:
# Fit the linear meta-model on the level-2 validation features and score it on
# those same rows.
# NOTE(review): this repeats the stacking cell above and never uses X_test;
# presumably a prediction on the test rows was intended here - confirm.
lr = LinearRegression().fit(X_val_level2, Y_valid)
pred_lr_val = lr.predict(X_val_level2)
stacked_mse = mean_squared_error(Y_valid, pred_lr_val)
print('Test mse is %f' % stacked_mse)

In [None]:
# Write the full feature table to disk, without the row index.
output_path = 'mycsvfile.csv'
all_data.to_csv(output_path, index=False)