In [4]:
# NOTE(review): the `datetime` class imported on the next line does not survive
# this cell: the later `import datetime` statements rebind the name `datetime`
# to the module, so `datetime.now()` would fail while `datetime.datetime.now()`
# works. `timedelta` and `date` are unaffected by that rebinding.
from datetime import datetime, timedelta, date
from calendar import monthrange
# from pytz import timezone
import pandas as pd
import numpy as np
#import feather
import datetime 
import pytz
import csv
import os
#import seaborn as sns
#import matplotlib.pyplot as plt
import logging

import xgboost as xgb
import time
# NOTE(review): wildcard import; which names it adds is not visible from here.
from sklearn.pipeline import *

from joblib import dump, load
from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
# Second `import datetime`; redundant with the one above.
import datetime
from sklearn import preprocessing
from xgboost import plot_tree

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer

# Flag that is not referenced in the cells visible here -- TODO confirm where
# (or whether) it is consumed.
kernel_with_output = False


# Project-local helpers. This wildcard import runs after everything above, so
# it may rebind any of those names -- TODO confirm against utils.py.
from utils import *

#%matplotlib inline

# Root logger: INFO level, each record formatted as "time : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [2]:
# https://www.kaggle.com/szhou42/predict-future-sales-top-11-solution

# scp jini:/home/ubuntu/future-store-sales/submissions/xgb_base_submission.csv ~/Documents/reading_training/future-store-sales/submissions/.

In [5]:
# Directory layout used throughout the notebook (relative to the working dir).
DATA_DIR = './data/'
RESOURCES_DIR = './res/'
SUBMISSION_DIR = './submissions'
MODEL_DIR = './models'


def _read_csv_from(directory, file_name):
    """Read `file_name` located in `directory` into a DataFrame.

    Uses pandas' default settings, so `.gz` files are decompressed
    transparently based on their suffix.
    """
    return pd.read_csv(os.path.join(directory, file_name))


# Competition files.
sales = _read_csv_from(DATA_DIR, 'sales_train.csv.gz')
items = _read_csv_from(DATA_DIR, 'items.csv')
shops = _read_csv_from(DATA_DIR, 'shops.csv')
item_categories = _read_csv_from(DATA_DIR, 'item_categories.csv')
sample_submission = _read_csv_from(DATA_DIR, 'sample_submission.csv.gz')
test = _read_csv_from(DATA_DIR, 'test.csv.gz')

# Additional resources shipped with the project.
# NOTE(review): unpickling executes code from the file; only load a
# `russian_holidays.pkl` that comes from a trusted source.
date_df = pd.read_pickle(os.path.join(RESOURCES_DIR, "russian_holidays.pkl"))
geography = _read_csv_from(RESOURCES_DIR, 'geography.csv')
#holidays = feather.read_dataframe(os.path.join(ADD_RESOURCES, 'hol.feather'))

In [6]:
# For every month, build the full grid of (shop, item) combinations that were
# active in that month, so that pairs with no sales can later be filled with 0.

grid = []
for block_num in sales.date_block_num.unique():
    # Filter the month once and reuse it for both columns.
    block_sales = sales[sales.date_block_num == block_num]
    cur_shops = block_sales['shop_id'].unique()
    cur_items = block_sales['item_id'].unique()
    n_pairs = len(cur_shops) * len(cur_items)

    # Cartesian product of shops x items, built directly in numpy instead of
    # through a Python list of tuples. Row order is the same as
    # itertools.product(cur_shops, cur_items, [block_num]): shops vary slowest,
    # items vary fastest, e.g.
    #     array([[   45, 13315,    33],
    #            [   45, 13880,    33],
    #            [   45, 13881,    33],
    #            ...,
    #            [   21,  7440,    33]], dtype=int32)
    block_grid = np.column_stack([
        np.repeat(cur_shops, len(cur_items)),
        np.tile(cur_items, len(cur_shops)),
        np.full(n_pairs, block_num),
    ]).astype('int32')

    grid.append(block_grid)
    # at this point `grid` is a list with one (n_pairs, 3) array per month

index_cols = ['shop_id', 'item_id', 'date_block_num']
# np.vstack stacks the per-month arrays row-wise into a single 2-D array
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

In [7]:
# Preview the first rows of the raw sales records (one row per dated sale entry).
sales.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [8]:
# Aggregations
# Bound every daily count to the range [0, 20]: negative entries become 0 and
# anything above 20 is capped at 20.
sales['item_cnt_day'] = sales['item_cnt_day'].clip(0, 20)

groups = sales.groupby(['shop_id', 'item_id', 'date_block_num'])

# One row per (shop, item, month): summed counts and mean price.
monthly_sales = (
    groups
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month',
                     'item_price': 'avg_item_price'})
)

# The monthly total is bounded to [0, 20] as well.
monthly_sales['item_cnt_month'] = monthly_sales['item_cnt_month'].clip(0, 20)

# Project onto the full shop x item grid; pairs without sales get a count of 0
# (their avg_item_price stays NaN).
monthly_sales = grid.merge(monthly_sales, how='left', on=index_cols)
monthly_sales['item_cnt_month'] = monthly_sales['item_cnt_month'].fillna(0)

# Get category id
monthly_sales = monthly_sales.merge(items[['item_id', 'item_category_id']], on='item_id')
monthly_sales.to_csv('monthly_sales.csv', index_label=False)

monthly_sales.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,avg_item_price,item_category_id
0,59,22154,0,1.0,999.0,37
1,25,22154,0,5.0,999.0,37
2,24,22154,0,1.0,999.0,37
3,23,22154,0,0.0,,37
4,19,22154,0,0.0,,37


In [251]:
# Optional shortcut (disabled): reload the aggregation from the
# 'monthly_sales.csv' file written by the aggregation cell instead of
# recomputing it. Uncomment both lines to use it.
# monthly_sales = pd.read_csv("./monthly_sales.csv")
# monthly_sales.head() 

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,avg_item_price,item_category_id
0,59,22154,0,1.0,999.0,37
1,25,22154,0,5.0,999.0,37
2,24,22154,0,1.0,999.0,37
3,23,22154,0,0.0,,37
4,19,22154,0,0.0,,37


In [252]:
# Preview the test set: one (shop_id, item_id) pair per submission ID.
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


#### Benchmark submission

In [8]:
# Benchmark: predict, for every test pair, the monthly count observed in the
# last training month (date_block_num == 33); pairs not seen then get 0.
submission_filename = "benchmark_submission.csv"
train = monthly_sales
last_month = train[train.date_block_num==33]
last_month = last_month.groupby(['shop_id', 'item_id']).agg({'item_cnt_month':'mean'}).reset_index()

# Left join keeps every test row; `_merge` records whether a match was found.
submission = test.merge(last_month,on=['shop_id', 'item_id'] , how='left', indicator=True)
submission['item_cnt_month'] = submission.item_cnt_month.fillna(0)
print(submission.shape)

# Keep only ID (as index) and the prediction column.
submission = (
    submission
    .drop(columns=['shop_id', 'item_id', '_merge'])
    .set_index('ID')
)

print(submission.head())

# Create the output directory if needed so a fresh checkout does not fail on
# the write below.
os.makedirs(SUBMISSION_DIR, exist_ok=True)
submission.to_csv(os.path.join(SUBMISSION_DIR, submission_filename))

(214200, 5)
    item_cnt_month
ID                
0              0.0
1              0.0
2              1.0
3              0.0
4              0.0


##### Fix category

In [253]:

# Start from the original category names; each position listed below is then
# overwritten with a coarser English group label.
l_cat = list(item_categories.item_category_name)

# (first position, one past last position, group label).
# Positions refer to row positions in `item_categories`, not to the values of
# item_category_id -- NOTE(review): the two coincide only if the frame is
# ordered by id starting at 0; confirm.
_CATEGORY_GROUPS = [
    (0, 1, 'PC Headsets / Headphones'),
    (1, 8, 'Access'),
    (8, 9, 'Tickets (figure)'),
    (9, 10, 'Delivery of goods'),
    (10, 18, 'Consoles'),
    (18, 25, 'Consoles Games'),
    (25, 26, 'Accessories for games'),
    (26, 28, 'phone games'),
    (28, 32, 'CD games'),
    (32, 37, 'Card'),
    (37, 43, 'Movie'),
    (43, 55, 'Books'),
    (55, 61, 'Music'),
    (61, 73, 'Gifts'),
    (73, 79, 'Soft'),
    (79, 81, 'Office'),
    (81, 83, 'Clean'),
    (83, 84, 'Elements of a food'),
]

for first_pos, end_pos, group_label in _CATEGORY_GROUPS:
    # Element-wise assignment (not slice assignment) so that a list shorter
    # than expected raises IndexError rather than being silently extended.
    for pos in range(first_pos, end_pos):
        l_cat[pos] = group_label

item_categories['new_catg'] = l_cat
item_categories.head()

Unnamed: 0,item_category_name,item_category_id,new_catg
0,PC - Гарнитуры/Наушники,0,PC Headsets / Headphones
1,Аксессуары - PS2,1,Access
2,Аксессуары - PS3,2,Access
3,Аксессуары - PS4,3,Access
4,Аксессуары - PSP,4,Access


###### Build a train_test_set for combined feature engineering

In [256]:
# Training rows are the monthly grid (`train_set` is an alias, not a copy).
train_set = monthly_sales

# Test rows: attach the category id, tag them as month 34 and fill the columns
# that only exist for training months with the placeholder -1, so that both
# frames share the same columns. ID becomes the index and is discarded by the
# `ignore_index=True` concat below.
test_set = (
    test
    .merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
    .assign(date_block_num=34, item_cnt_month=-1, avg_item_price=-1)
    .set_index('ID')
)

# Stack train on top of test, swap the category id for the grouped category
# label, then attach the item name.
data = (
    pd.concat([train_set, test_set], axis=0, ignore_index=True, sort=False)
    .merge(item_categories, how='left', on='item_category_id', indicator=True)
    .drop(columns=['item_category_name', '_merge', 'item_category_id'])
    .merge(items[['item_id', 'item_name']], on=['item_id'], how='left')
)
print(data.shape)
data.head()  # recorded run: 11,128,050 rows x 7 columns

(11128050, 7)


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,avg_item_price,new_catg,item_name
0,59,22154,0,1.0,999.0,Movie,ЯВЛЕНИЕ 2012 (BD)
1,25,22154,0,5.0,999.0,Movie,ЯВЛЕНИЕ 2012 (BD)
2,24,22154,0,1.0,999.0,Movie,ЯВЛЕНИЕ 2012 (BD)
3,23,22154,0,0.0,,Movie,ЯВЛЕНИЕ 2012 (BD)
4,19,22154,0,0.0,,Movie,ЯВЛЕНИЕ 2012 (BD)


#### Adding features
1. previous shop/item sales (Lag feature)
2. previous item sales (Lag feature)
3. previous shop/item price (Lag feature)
4. previous item price (Lag feature)
5. Number of months since the last sale at the shop-item level
6. Number of months since the last sale at the item level

In [257]:
from feature_engg_utils import *

# Lag features, each called with a lag of 2 and applied in this order:
# shop/item sales, item sales, shop/item price, item price.
for add_lag_feature in (lagged_shop_item_sales,
                        lagged_item_sales,
                        lagged_shop_item_price,
                        lagged_item_price):
    data = add_lag_feature(data, 2)

# Months-since-last-sale features, first per shop/item pair, then per item.
for add_recency_feature in (months_from_lastsale_shop_item,
                            months_from_lastsale_item):
    data = add_recency_feature(data)

print(data.shape)
data.head(3)

(11128050, 13)


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,avg_item_price,new_catg,item_name,shop_item_sales_lag2,item_sales_lag2,shop_item_price_lag2,item_price_lag2,months_from_lastsale_shop_item,months_from_lastsale_item
0,59,22154,0,1.0,999.0,Movie,ЯВЛЕНИЕ 2012 (BD),0.0,0.0,0.0,0.0,-1,-1
1,25,22154,0,5.0,999.0,Movie,ЯВЛЕНИЕ 2012 (BD),0.0,0.0,0.0,0.0,-1,-1
2,24,22154,0,1.0,999.0,Movie,ЯВЛЕНИЕ 2012 (BD),0.0,0.0,0.0,0.0,-1,-1


###### Idea 8: Item name (Tfidf text feature)

###### Cross Validation (Hyperparameter tuning)