In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import time
import keras
import pandas as pd
import lightgbm as lgb
import time
import numpy as np
import pickle
import sklearn

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from keras.models import Sequential
from keras.layers import Dense

# Record the library versions this notebook was run with, one per line.
print(*(module.__version__ for module in (pd, keras, np, lgb, sklearn)), sep='\n')

Load required data into dataframe

In [None]:
# Read the input tables from the current working directory.
# The *_english.csv files are presumably translated versions of the original tables — verify provenance.
sales_train    = pd.read_csv('sales_train.csv')
items           = pd.read_csv('items_english.csv')
item_categories = pd.read_csv('categories_english.csv')
shops           = pd.read_csv('shops_english.csv')
test           = pd.read_csv('test.csv')

# Preview the first rows of the daily sales records.
sales_train.head(10)

Extract features from shops and categories. The first word of a shop name is usually the town, so we can use it as a feature. Since there are not many shops, a small function can handle the edge cases explicitly.

In [None]:
def get_town(shop_name):
    """Derive a town label from the first whitespace-separated token of a shop name.

    Exact tokens 'St.' and '!' map to 'St Petersburg' and 'Yakutsk'. Tokens
    containing 'Rostov' or 'Moscow' are normalised to those names, and tokens
    containing 'Internet' or 'Digital' are labelled 'Online'. Any other token
    is returned unchanged.
    """
    first_token = shop_name.split()[0]

    # Tokens that must match exactly.
    exact_towns = {'St.': 'St Petersburg', '!': 'Yakutsk'}
    if first_token in exact_towns:
        return exact_towns[first_token]

    # Tokens matched by substring, checked in priority order.
    fragment_towns = (
        ('Rostov', 'Rostov'),
        ('Moscow', 'Moscow'),
        ('Internet', 'Online'),
        ('Digital', 'Online'),
    )
    for fragment, town in fragment_towns:
        if fragment in first_token:
            return town

    return first_token

# Add the derived town label as a new column on the shops table.
shops['town'] = shops['shop_name'].apply(get_town)
# Show up to 60 rows so the town assignments can be checked by eye.
shops.head(60)

Perform similar preprocessing on categories. In most cases, the category name has the form "{category} - {subcategory}", so we can split it into two features.

In [None]:
def get_subcategories(category_name):
    """Split a category name of the form '{category} - {subcategory}'.

    Only the first ' - ' separator is used, so a name with further separators
    keeps everything after the first one as its subcategory instead of losing
    it.

    Returns:
        A (category, subcategory) tuple; subcategory is None when the name
        contains no ' - ' separator.
    """
    category, separator, subcategory = category_name.partition(' - ')
    if not separator:
        return category, None
    return category, subcategory
    
# apply() yields one (category, subcategory) tuple per row; zip(*...) transposes
# them into two sequences that become the two new columns.
item_categories['category_1'], item_categories['category_2'] = zip(*item_categories['category_name'].apply(get_subcategories))
item_categories.head(20)

Get unique items, duplicate rows and info on nulls

In [None]:
# Report the size of the training tables and the number of distinct shops/items in test.
print(f'{len(sales_train)} total sales {len(items)} items {len(item_categories)} categories in train')
print(f'{len(test)} pairs {test["shop_id"].nunique()} shops {test["item_id"].nunique()} items in test')
# Count null values per column of the sales records.
sales_train.isnull().sum()

No null rows to remove. Check duplicates

In [None]:
# The count is printed before the rows are dropped, so it reflects the rows about to be removed.
print(f'{sales_train.duplicated().sum()} duplicates found and removed')
# Drops fully identical rows in place, keeping the first occurrence (pandas default).
sales_train.drop_duplicates(inplace=True)

item_price should not be negative (returns are signified by item_cnt_day being negative). Check for these rows and remove

In [None]:
# Report how many rows carry a negative price, then keep only rows with item_price >= 0.
print(f'{(sales_train["item_price"] < 0).sum()} item_price < 0 removed')
# Note: a NaN price would also be excluded by this comparison.
sales_train = sales_train[sales_train['item_price'] >= 0]

Substitute duplicate shop ids so that each physical shop has a single id (57 → 0 and 58 → 1).

In [None]:
# Chained indexing (frame[mask]['shop_id'] = value) assigns into a temporary
# copy and leaves the frame unchanged, so the substitution must go through .loc.
shop_id_substitutions = {57: 0, 58: 1}
for old_shop_id, new_shop_id in shop_id_substitutions.items():
    sales_train.loc[sales_train['shop_id'] == old_shop_id, 'shop_id'] = new_shop_id
    # Apply the same mapping to the test pairs so they still join with the
    # training history on shop_id after the substitution.
    # NOTE(review): confirm the direction of the mapping against the shop ids
    # that actually occur in test.csv.
    test.loc[test['shop_id'] == old_shop_id, 'shop_id'] = new_shop_id

Sales over time

In [None]:
# Total units sold per date block, as a line plot.
sales_train[['date_block_num','item_cnt_day']].groupby('date_block_num').sum().plot()
plt.show()
# Horizontal box plots of price and daily count to expose outliers.
sales_train[['item_price']].boxplot(vert=False)
plt.show()
sales_train[['item_cnt_day']].boxplot(vert=False)
plt.show()

Sales seem to peak in November and December and steadily drop over time. We are almost certainly safe to remove the maximum value in both series. 

In [None]:
# Drop the row holding the single largest item_price, by index label.
sales_train.drop(sales_train['item_price'].idxmax(), inplace=True)
# The largest item_cnt_day is looked up after the previous drop has been applied.
sales_train.drop(sales_train['item_cnt_day'].idxmax(), inplace=True)

Month and year are also useful as features

In [None]:
# Build a lookup table date_block_num -> (month, year).
# Work on an explicit copy so the new columns are not written into a slice of sales_train.
dates = sales_train[['date', 'date_block_num']].copy()
dates['date'] = pd.to_datetime(dates['date'], format='%d.%m.%Y')
dates['month'] = dates['date'].dt.month
dates['year'] = dates['date'].dt.year
dates = dates.drop(columns='date').drop_duplicates()

#add test block num
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
test_block_row = pd.DataFrame([{'date_block_num': 34, 'month': 11, 'year': 2015}])
dates = pd.concat([dates, test_block_row], ignore_index=True)
dates.head(50)


Now start aggregating the data into the correct form for the task

In [None]:
# Sum the daily counts into one row per (date block, shop, item) and name the
# total item_cnt_month.
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
df = (
    sales_train[monthly_keys + ['item_cnt_day']]
    .groupby(monthly_keys, as_index=False)
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
df.tail(15)

Only concerned with pairs in the test set, so find these for every month. For each date block, join with the test data on shop_id and item_id. If a row has ID null, it doesn't appear in the test data and may therefore be dropped. If item_cnt_month is null, it indicates that the pair had no sales in the date block and this should be filled with 0. If a row has no null values, that indicates the pair appeared in both the train and test data.

In [None]:
# Collect one frame per date block and concatenate once at the end, instead of
# re-copying the accumulated frame on every iteration.
monthly_frames = []
for date_block_num in df['date_block_num'].unique():
    block = df[df['date_block_num'] == date_block_num]

    new = (
        block.merge(test, on=['shop_id','item_id'], how='outer')
        # rows without an ID are pairs that do not occur in the test set
        .dropna(subset=['ID'])
        # test pairs with no sales in this block: assign the block number and a count of 0.
        # Filling through DataFrame.fillna avoids column-level inplace fills, which
        # operate on a temporary object under copy-on-write.
        .fillna({'date_block_num': date_block_num, 'item_cnt_month': 0})
    )

    monthly_frames.append(new)

data = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

# Test rows (date block 34, item_cnt_month left null as the target) are appended separately.
data.tail()

In [None]:
# Tag the test pairs as date block 34 and append them; item_cnt_month is absent
# from test, so it is null for these rows after the concat.
# NOTE: this adds a column to `test` in place, and re-running the cell appends the test rows again.
test['date_block_num'] = 34
data = pd.concat([data, test])
data.tail()

Sanity check. Each pair should appear exactly 35 times (one for each date block)

In [None]:
# Key each row by its "shop-item" pair and look at how often each key occurs.
pair = data['shop_id'].astype(str) + '-' + data['item_id'].astype(str)
distinct_pair_counts = pair.value_counts().unique()

# Every pair must occur the same number of times, and that number must be 35.
assert len(distinct_pair_counts) == 1
assert distinct_pair_counts[0] == 35

We now have our training data. The first 34 blocks will be used to fit several time series models with the final one for testing. Merge item and shop names into our data for feature engineering

In [None]:
# Attach item, category, shop and calendar attributes step by step
# (inner joins, the pandas default), then order the rows chronologically.
item_data = pd.merge(data, items, on=['item_id'])
cat_data = pd.merge(item_data, item_categories, on=['category_id'])
full_data = pd.merge(cat_data, shops, on=['shop_id'])
full_data_dates = pd.merge(full_data, dates, on=['date_block_num']).sort_values('date_block_num')


We have done EDA and feature preprocessing (may do more in notebook depending on model). Save the resulting sheet for use later. We don't need to save shop category and item names as these correlate perfectly with their respective ids and we have already extracted features

In [None]:
# Remove the free-text name columns in place; the id columns and derived features are kept.
full_data_dates.drop(columns=['shop_name','category_name','item_name'],inplace=True)
full_data_dates.tail()

Clip monthly sales to the range [0, 20] and add 1-, 2-, 3-, 6- and 12-month lag features for each shop–item pair.

In [None]:
data = full_data_dates

# Replace the text features with integer category codes.
for text_col in ('category_1', 'category_2', 'town'):
    data[text_col] = data[text_col].astype('category').cat.codes
data['item_cnt_month'] = data['item_cnt_month'].clip(0, 20)

shift_range = [1,2,3,6,12]
for shift in shift_range:
    lag_col = f'item_cnt_month_lag_{shift}'

    # Move each row's block number forward by `shift` so that, after joining on
    # (ID, date_block_num), a row receives the count from `shift` blocks earlier.
    train_shift = (
        data[['ID','date_block_num','item_cnt_month']]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + shift)
        .rename(columns={'item_cnt_month': lag_col})
    )

    data = data.merge(train_shift, on=['ID','date_block_num'], how='left')
    # No matching earlier block: the lag is set to 0.
    data[lag_col] = data[lag_col].fillna(0)

data.head()

Add mean encoded features for 6 and 12 month intervals for relevant columns. Discard old data (from 2013) but still use it for mean encoding.

In [None]:
mae_cols = ['shop_id','item_id','category_id','category_1','category_2','town']
# Look-back windows, in date blocks, for the mean-encoded features.
mae_windows = (6, 12)
validation_block = 33

# Keep the block numbers under their own name rather than rebinding `dates`,
# which holds the date_block_num -> month/year lookup frame.
block_nums = data['date_block_num']

ts = time.time()

# One encoded frame per date block, concatenated once after the loop.
encoded_blocks = []

for date_block_num in block_nums.unique():

    subset = data.loc[block_nums == date_block_num]

    # Rows from the `window` blocks strictly before the current one. The lower
    # bound is inclusive so that a 6-month window really spans 6 blocks
    # (date_block_num - 6 ... date_block_num - 1). The masks do not depend on
    # the column, so they are built once per block.
    window_masks = {
        window: (block_nums < date_block_num) & (block_nums >= date_block_num - window)
        for window in mae_windows
    }

    for col in mae_cols:
        for window, in_window in window_masks.items():
            # Mean target per value of `col` over the look-back window only,
            # so the current block's own target never enters its encoding.
            avg = (
                data.loc[in_window, [col, 'item_cnt_month']]
                .groupby(col, as_index=False)
                .mean()
                .rename(columns={'item_cnt_month': f'{col}_cnt_{window}_month_average'})
            )
            avg['date_block_num'] = date_block_num

            subset = subset.merge(avg, on=[col, 'date_block_num'], how='left')

    encoded_blocks.append(subset)

# Values with no history in the window get 0; this also zero-fills the null
# target of the test block.
res = pd.concat(encoded_blocks).fillna(0)
print(f'calcualting mae features took {time.time() - ts} seconds')

#discard old data
res = res[res.date_block_num > 11]
res = res.sort_values(by = ['date_block_num','ID'], ascending = [True, True])
print(res.columns)
print(len(res))

#check ordering is the same for two date blocks as this property will be used later
# Compare positionally: the two slices carry different index labels, and a
# label-aligned Series comparison would raise instead of checking the order.
assert np.array_equal(
    res.loc[res.date_block_num == 15, 'ID'].to_numpy(),
    res.loc[res.date_block_num == 12, 'ID'].to_numpy(),
)

# Blocks before the validation block are used for training; the validation block is held out.
# Note that 'ID' remains among the feature columns.
is_train = res.date_block_num < validation_block
is_valid = res.date_block_num == validation_block
non_feature_cols = ['item_cnt_month','date_block_num']

X_train = res[is_train].drop(non_feature_cols, axis=1)
y_train = res[is_train]['item_cnt_month']
X_valid = res[is_valid].drop(non_feature_cols, axis=1)
y_valid = res[is_valid]['item_cnt_month']

Train lightGBM

In [None]:
# Train a LightGBM regressor on the training blocks and time the fit.
ts = time.time()
lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'bagging_fraction': 0.75, 
    'learning_rate': 0.03, 
    'objective': 'mse',
    'num_leaves': 1000,
    'max_depth' : 20,
    'bagging_freq':1,
    'verbose':-1,
}

train_data = lgb.Dataset(X_train, label = y_train)
valid_data = lgb.Dataset(X_valid, label = y_valid)

# No early stopping is configured, so all 1000 rounds are trained.
# NOTE(review): `verbose_eval` appears to have been removed from lgb.train in LightGBM 4.0
# (replaced by the lgb.log_evaluation callback) — confirm against the version printed above.
lgb_model = lgb.train(
    lgb_params,
    train_data,
    valid_sets=valid_data,
    num_boost_round=1000,
    verbose_eval=False
)
t = time.time() - ts

#save lgb model
with open('lgb_model.pkl','wb') as f:
    pickle.dump(lgb_model, f)

# In-sample fit quality (predictions on the training rows themselves).
y_pred_train = lgb_model.predict(X_train)

r2 = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_train, y_pred_train)

print(f'R-squared for training data is {r2}')
print(f'MSE for training data is {mse}')
print(f'Fitting time is {t} seconds')

Neural network

In [None]:
# Train a small fully connected regression network on the same features and time the fit.
ts = time.time()

nn_model = Sequential()

# Take the input width from the feature matrix instead of hard-coding it, so
# the network stays in sync when features are added or removed.
nn_model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
nn_model.add(Dense(100, activation='relu'))
nn_model.add(Dense(1, activation='linear'))

nn_model.compile(loss='mse', optimizer='adam')
nn_model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_valid, y_valid))

t = time.time() - ts

#save nn model
# NOTE(review): pickling Keras models is not supported by every Keras version;
# nn_model.save(...) is the documented way to persist one. Kept as pickle here
# because the model is read back with pickle.load.
with open('nn_model.pkl','wb') as f:
    pickle.dump(nn_model, f)

# In-sample fit quality (predictions on the training rows themselves).
y_pred_train = nn_model.predict(X_train)

r2 = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_train, y_pred_train)

print(f'R-squared for training data is {r2}')
print(f'MSE for training data is {mse}')
print(f'Fitting time is {t} seconds')

In [None]:
# Score both base models on the held-out validation block.
# y_pred_lgb and y_pred_nn are kept because they become the meta-model inputs.
y_pred_lgb = lgb_model.predict(X_valid)
r2 = r2_score(y_valid, y_pred_lgb)
mse = mean_squared_error(y_valid, y_pred_lgb)

print(f'Test R-squared for block {validation_block} is {r2} for lgb')
print(f'Test MSE for block {validation_block} is {mse} for lgb')

y_pred_nn = nn_model.predict(X_valid)
r2 = r2_score(y_valid, y_pred_nn)
mse = mean_squared_error(y_valid, y_pred_nn)

print(f'Test R-squared for block {validation_block} is {r2} for nn')
print(f'Test MSE for block {validation_block} is {mse} for nn')


Fit the meta model. Linear regression has no hyperparameters to tune here, so we can fit it on the validation-block predictions and then apply the resulting model to the test data.

In [None]:
# Stack the two base-model predictions column-wise: column 0 = nn, column 1 = lgb.
# squeeze(axis=1) drops the trailing length-1 axis of the nn output so both are 1-D.
meta_features = np.stack([y_pred_nn.squeeze(axis=1), y_pred_lgb], axis=1)

meta_model = LinearRegression()
meta_model.fit(meta_features, y_valid)

# Scored on the same rows the meta model was fitted on, so these are in-sample metrics.
y_pred_meta = meta_model.predict(meta_features)
r2 = r2_score(y_valid, y_pred_meta)
mse = mean_squared_error(y_valid, y_pred_meta)

#save meta model
with open('meta_model.pkl','wb') as f:
    pickle.dump(meta_model, f)
    
print(f'R-squared for training is {r2} for meta')
print(f'MSE for training is {mse} for meta')


Run this cell to load models to avoid training time

In [None]:
# Reload the three fitted models from the pickle files in the working directory.
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open('nn_model.pkl','rb') as f:
    nn_model = pickle.load(f)
with open('lgb_model.pkl','rb') as f:
    lgb_model = pickle.load(f)
with open('meta_model.pkl','rb') as f:
    meta_model = pickle.load(f)

Make predictions for submission file

In [None]:
test_block = 34

# Feature rows of the test block, with the same columns the models were trained on.
X_test = res[res.date_block_num == test_block].drop(['item_cnt_month','date_block_num'], axis=1)

# Base-model predictions, stacked in the order used to fit the meta model (nn, lgb).
meta_nn = nn_model.predict(X_test).squeeze(axis=1)
meta_lgb = lgb_model.predict(X_test)
meta_features = np.stack([meta_nn, meta_lgb], axis=1)

# Final blended prediction, clipped to the same [0, 20] range as the training target.
y_test = meta_model.predict(meta_features).clip(0, 20)

# Build the submission as its own frame: this leaves the feature matrix
# untouched and avoids assigning into a slice of it.
sub = pd.DataFrame({
    'ID': X_test['ID'].astype(int).to_numpy(),
    'item_cnt_month': y_test,
}).sort_values('ID')

sub.to_csv('submission.csv', index=False)