In [1]:
import gc
import pickle
from collections import deque
from datetime import datetime
from pprint import pprint
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm.notebook import tqdm

In [5]:
pd.set_option("display.max_columns",None)

In [7]:
train = pd.read_csv('sales_train_cleaned.csv')
item_df = pd.read_csv('items.csv')
shops_df = pd.read_csv('shops.csv')

# Merging Mislabelled Shops

Upon manually going through the shop data, we discovered the following possible cases where the same shop has been given different shop_ids, so we merge them together. The shops of interest are:
1. 0 => 57 (This is contentious)
2. 1 => 58 (This is contentious)
3. 10 => 11

In [8]:
shops_df

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


In [181]:
# Fold each duplicate shop id into the id that is kept: (duplicate id, kept id).
for duplicate_id, kept_id in ((0, 57), (1, 58), (10, 11)):
    train.loc[train['shop_id'] == duplicate_id, 'shop_id'] = kept_id

# Determine Monthly Sales

In [182]:
# Determine the monthly item count
train_grpd_mnthly = train.groupby(['shop_id','item_id','date_block_num'])['item_cnt_day'].sum().to_frame()
train_grpd_mnthly

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_day
shop_id,item_id,date_block_num,Unnamed: 3_level_1
2,27,0,1.0
2,27,17,1.0
2,30,2,1.0
2,30,5,1.0
2,30,15,1.0
...,...,...,...
59,22164,27,2.0
59,22164,30,1.0
59,22167,9,1.0
59,22167,11,2.0


In [183]:
# Have a variant of the train_grpd_monthly that has the index reset and only for features
train_features = train_grpd_mnthly.reset_index()
train_features.drop(columns=['item_cnt_day'],inplace=True)
train_features

Unnamed: 0,shop_id,item_id,date_block_num
0,2,27,0
1,2,27,17
2,2,30,2
3,2,30,5
4,2,30,15
...,...,...,...
1609117,59,22164,27
1609118,59,22164,30
1609119,59,22167,9
1609120,59,22167,11


# LSTM Feature Engineering
The first step in the feature engineering process is to pivot the data so that each (shop_id, item_id) pair becomes one row, with its monthly sales laid out as one column per date block.

## Creating a Time Series Variant of Training Data

In [6]:
# Pivot the monthly totals into one row per (shop_id, item_id) and one column per
# date_block_num; months without a record for a pair are filled with 0.
# pivot_table aggregates with its default (mean); train_grpd_mnthly comes from a groupby
# on exactly these three keys, so every cell is built from a single value.
# NOTE(review): the output stored for this cell is a NameError - it was last executed
# before the cell that defines train_grpd_mnthly. Re-run the notebook top to bottom.
time_seq_data = train_grpd_mnthly.pivot_table(index=['shop_id','item_id'], columns='date_block_num',
                                                  values='item_cnt_day',fill_value=0).reset_index()
time_seq_data

NameError: name 'train_grpd_mnthly' is not defined

In [185]:
time_seq_data.to_csv("time_sequenced_train_data.csv",index=False)

## Adding Features


### Item Category ID Feature

In [186]:
# Taking a look at the data
item_df

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40
...,...,...,...
22165,"Ядерный титбит 2 [PC, Цифровая версия]",22165,31
22166,Язык запросов 1С:Предприятия [Цифровая версия],22166,54
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,22167,49
22168,Яйцо для Little Inu,22168,62


In [187]:
# group the data by the item_id
train_features = pd.merge(item_df,train_features, on=['item_id'])
train_features.drop(columns=['item_name'],inplace=True)
train_features = train_features.sort_values(by=['shop_id','date_block_num'])

In [188]:
train_features

Unnamed: 0,item_id,item_category_id,shop_id,date_block_num
34,27,19,2,0
1933,33,37,2,0
9886,317,45,2,0
11301,438,45,2,0
11995,471,49,2,0
...,...,...,...,...
1602687,22087,83,59,33
1603456,22088,83,59,33
1603915,22091,83,59,33
1604583,22100,42,59,33


### Pricing Features

#### Total Average Cost of Item

In [189]:
avg_item_price = train.groupby(['item_id'])['item_price'].mean().to_frame().reset_index()
avg_item_price

Unnamed: 0,item_id,item_price
0,0,58.000000
1,1,4490.000000
2,2,58.000000
3,3,79.000000
4,4,58.000000
...,...,...
21801,22165,59.000000
21802,22166,150.000000
21803,22167,297.686490
21804,22168,798.916667


In [190]:
train_features = pd.merge(avg_item_price, train_features, on=['item_id'])

In [191]:
train_features.rename(columns={'item_price':'avg_item_price'},inplace=True)
train_features

Unnamed: 0,item_id,avg_item_price,item_category_id,shop_id,date_block_num
0,0,58.000000,40,54,20
1,1,4490.000000,76,55,15
2,1,4490.000000,76,55,18
3,1,4490.000000,76,55,19
4,1,4490.000000,76,55,20
...,...,...,...,...,...
1609117,22168,798.916667,62,12,8
1609118,22168,798.916667,62,16,1
1609119,22168,798.916667,62,42,1
1609120,22168,798.916667,62,43,2


#### Monthly Average Cost of Items

In [192]:
monthly_avg_price = train.groupby(['item_id','date_block_num'])['item_price'].mean().to_frame().reset_index()
monthly_avg_price

Unnamed: 0,item_id,date_block_num,item_price
0,0,20,58.0
1,1,15,4490.0
2,1,18,4490.0
3,1,19,4490.0
4,1,20,4490.0
...,...,...,...
233906,22168,0,799.0
233907,22168,1,799.0
233908,22168,2,798.5
233909,22168,8,799.0


In [193]:
train_features = pd.merge(train_features, monthly_avg_price, on=['item_id','date_block_num'])

In [194]:
train_features.rename(columns={'item_price':'avg_monthly_item_price'},inplace=True)
train_features

Unnamed: 0,item_id,avg_item_price,item_category_id,shop_id,date_block_num,avg_monthly_item_price
0,0,58.000000,40,54,20,58.0
1,1,4490.000000,76,55,15,4490.0
2,1,4490.000000,76,55,18,4490.0
3,1,4490.000000,76,55,19,4490.0
4,1,4490.000000,76,55,20,4490.0
...,...,...,...,...,...,...
1609117,22168,798.916667,62,12,8,799.0
1609118,22168,798.916667,62,16,1,799.0
1609119,22168,798.916667,62,42,1,799.0
1609120,22168,798.916667,62,43,2,798.5


#### Avg Price Of Shop Item Pair 
# IGNORE

In [195]:
avg_shop_item_price = train.groupby(['shop_id','item_id'])['item_price'].mean().reset_index()
avg_shop_item_price.rename(columns={'item_price':'avg_shop_item_price'},inplace=True)
avg_shop_item_price

Unnamed: 0,shop_id,item_id,avg_shop_item_price
0,2,27,1498.500000
1,2,30,274.000000
2,2,31,626.052500
3,2,32,146.272727
4,2,33,226.272727
...,...,...,...
418903,59,22154,999.000000
418904,59,22155,149.000000
418905,59,22162,389.625000
418906,59,22164,724.000000


In [196]:
train_features = pd.merge(train_features, avg_shop_item_price, on=['shop_id','item_id'])
train_features

Unnamed: 0,item_id,avg_item_price,item_category_id,shop_id,date_block_num,avg_monthly_item_price,avg_shop_item_price
0,0,58.000000,40,54,20,58.0,58.0
1,1,4490.000000,76,55,15,4490.0,4490.0
2,1,4490.000000,76,55,18,4490.0,4490.0
3,1,4490.000000,76,55,19,4490.0,4490.0
4,1,4490.000000,76,55,20,4490.0,4490.0
...,...,...,...,...,...,...,...
1609117,22168,798.916667,62,12,8,799.0,799.0
1609118,22168,798.916667,62,16,1,799.0,799.0
1609119,22168,798.916667,62,42,1,799.0,799.0
1609120,22168,798.916667,62,43,2,798.5,798.5


#### Average Monthly Shop Item Pair Price
# IGNORE

In [331]:
avg_monthly_shop_item_price = train.groupby(['shop_id','item_id','date_block_num'])['item_price'].mean().reset_index()
avg_monthly_shop_item_price.rename(columns={'item_price':'avg_monthly_shop_item_price'}, inplace=True)
avg_monthly_shop_item_price

Unnamed: 0,shop_id,item_id,date_block_num,avg_monthly_shop_item_price
0,2,27,0,2499.0
1,2,27,17,498.0
2,2,30,2,359.0
3,2,30,5,399.0
4,2,30,15,169.0
...,...,...,...,...
1609117,59,22164,27,699.0
1609118,59,22164,30,699.0
1609119,59,22167,9,299.0
1609120,59,22167,11,299.0


In [198]:
train_features = pd.merge(train_features, avg_monthly_shop_item_price, on=['shop_id','item_id','date_block_num'])
train_features

Unnamed: 0,item_id,avg_item_price,item_category_id,shop_id,date_block_num,avg_monthly_item_price,avg_shop_item_price,avg_monthly_shop_item_price
0,0,58.000000,40,54,20,58.0,58.0,58.0
1,1,4490.000000,76,55,15,4490.0,4490.0,4490.0
2,1,4490.000000,76,55,18,4490.0,4490.0,4490.0
3,1,4490.000000,76,55,19,4490.0,4490.0,4490.0
4,1,4490.000000,76,55,20,4490.0,4490.0,4490.0
...,...,...,...,...,...,...,...,...
1609117,22168,798.916667,62,12,8,799.0,799.0,799.0
1609118,22168,798.916667,62,16,1,799.0,799.0,799.0
1609119,22168,798.916667,62,42,1,799.0,799.0,799.0
1609120,22168,798.916667,62,43,2,798.5,798.5,798.5


In [204]:
train_features[(train_features.avg_monthly_item_price == train_features.avg_shop_item_price)]

Unnamed: 0,item_id,avg_item_price,item_category_id,shop_id,date_block_num,avg_monthly_item_price,avg_shop_item_price,avg_monthly_shop_item_price,days_on_sale
0,0,58.000000,40,54,20,58.0,58.0,58.0,1.0
1,1,4490.000000,76,55,15,4490.0,4490.0,4490.0,2.0
2,1,4490.000000,76,55,18,4490.0,4490.0,4490.0,1.0
3,1,4490.000000,76,55,19,4490.0,4490.0,4490.0,1.0
4,1,4490.000000,76,55,20,4490.0,4490.0,4490.0,1.0
...,...,...,...,...,...,...,...,...,...
1609117,22168,798.916667,62,12,8,799.0,799.0,799.0,1.0
1609118,22168,798.916667,62,16,1,799.0,799.0,799.0,1.0
1609119,22168,798.916667,62,42,1,799.0,799.0,799.0,1.0
1609120,22168,798.916667,62,43,2,798.5,798.5,798.5,1.0


### Number of Days Item Was Being Sold On a Particular Month

In [200]:
# I don't like this deep copy, but temporary and then will del and gc it
clipped = train.copy(deep=True)
clipped['item_cnt_day'] = clipped['item_cnt_day'].clip(-1,1)
days_on_sale = clipped.groupby(['shop_id','item_id','date_block_num'])['item_cnt_day'].sum().to_frame().reset_index()
del clipped; gc.collect()

9249

In [201]:
days_on_sale.rename(columns={'item_cnt_day':'days_on_sale'}, inplace=True)
days_on_sale

Unnamed: 0,shop_id,item_id,date_block_num,days_on_sale
0,2,27,0,1.0
1,2,27,17,1.0
2,2,30,2,1.0
3,2,30,5,1.0
4,2,30,15,1.0
...,...,...,...,...
1609117,59,22164,27,2.0
1609118,59,22164,30,1.0
1609119,59,22167,9,1.0
1609120,59,22167,11,2.0


In [202]:
train_features = pd.merge(train_features, days_on_sale, on=['shop_id','item_id','date_block_num'])

In [203]:
train_features

Unnamed: 0,item_id,avg_item_price,item_category_id,shop_id,date_block_num,avg_monthly_item_price,avg_shop_item_price,avg_monthly_shop_item_price,days_on_sale
0,0,58.000000,40,54,20,58.0,58.0,58.0,1.0
1,1,4490.000000,76,55,15,4490.0,4490.0,4490.0,2.0
2,1,4490.000000,76,55,18,4490.0,4490.0,4490.0,1.0
3,1,4490.000000,76,55,19,4490.0,4490.0,4490.0,1.0
4,1,4490.000000,76,55,20,4490.0,4490.0,4490.0,1.0
...,...,...,...,...,...,...,...,...,...
1609117,22168,798.916667,62,12,8,799.0,799.0,799.0,1.0
1609118,22168,798.916667,62,16,1,799.0,799.0,799.0,1.0
1609119,22168,798.916667,62,42,1,799.0,799.0,799.0,1.0
1609120,22168,798.916667,62,43,2,798.5,798.5,798.5,1.0


### Shop Inactivity
Check which (shop, item) pairs have sales recorded in fewer than n months and remove them from the dataset; this is one way of handling missing data.

In [209]:
train_grpd = train_grpd_mnthly.reset_index()
train_grpd = train_grpd.groupby(['shop_id','item_id'])

In [219]:
len(train_grpd.get_group((54,0)).index)
keys = list(train_grpd.groups.keys())

In [220]:
def get_inactive_shop_items(train_grpd, keys, thresh=3):
    """Return the group keys whose group holds fewer than ``thresh`` rows.

    Parameters
    ----------
    train_grpd : pandas GroupBy
        Grouped frame; in this notebook it is grouped by (shop_id, item_id).
    keys : sequence
        Group keys to inspect, e.g. ``list(train_grpd.groups.keys())``.
    thresh : int, default 3
        Minimum number of rows a group needs in order to be kept.

    Returns
    -------
    collections.deque
        The keys from ``keys`` whose group has fewer than ``thresh`` rows, in the
        order they appear in ``keys``.

    Raises
    ------
    KeyError
        If a key does not correspond to any group.
    """
    # Count all groups in one pass instead of building a DataFrame per key with
    # get_group() just to measure its length.
    rows_per_group = train_grpd.size().to_dict()
    return deque(key for key in tqdm(keys) if rows_per_group[key] < thresh)

In [221]:
koi = get_inactive_shop_items(train_grpd, keys)

HBox(children=(FloatProgress(value=0.0, max=418908.0), HTML(value='')))




In [232]:
# Store the store_id,item_id pairs in a pickle file
with open('sparce_data_thresh3.pikle','wb') as f:
    pickle.dump(koi,f)

In [258]:
# Remove the shop and item pairs that we don't have enough data
train_grpd_mnthly_LSTM = train_grpd_mnthly.reset_index()
train_grpd_mnthly_LSTM = train_grpd_mnthly_LSTM.set_index(['shop_id','item_id'])
train_features_LSTM = train_features.set_index(['shop_id','item_id'])

In [264]:
train_grpd_mnthly_LSTM.drop(index=koi, inplace=True)
train_grpd_mnthly_LSTM.reset_index(inplace=True)

In [265]:
train_features_LSTM.drop(index=koi, inplace=True)
train_features_LSTM.reset_index(inplace=True)

In [267]:
time_seq_data_LSTM = time_seq_data.set_index(['shop_id','item_id'])
time_seq_data_LSTM.drop(index=koi, inplace=True)
time_seq_data_LSTM.reset_index(inplace=True)

### Target Mean Embedding
**NOTE**: not sure whether this should take place before or after removing the sparse shop-item pairs in the previous step

Converting the categorical features into mean embeddings; the variables of concern are:
1. item_id
2. shop_id
3. item_category_id

In [339]:
item_id_embedding = train_grpd_mnthly_LSTM.groupby(['item_id'])['item_cnt_day'].mean().reset_index()
item_id_embedding.rename(columns={'item_cnt_day':'item_id_embed'},inplace=True)

In [340]:
shop_id_embedding = train_grpd_mnthly_LSTM.groupby(['shop_id'])['item_cnt_day'].mean().reset_index()
shop_id_embedding.rename(columns={'item_cnt_day':'shop_id_embed'},inplace=True)

In [342]:
val = pd.merge(item_df, train_grpd_mnthly_LSTM, on=['item_id'])
item_cat_id_embedding = val.groupby(['item_category_id'])['item_cnt_day'].mean().reset_index()
item_cat_id_embedding.rename(columns={'item_cnt_day':'item_cat_id_embed'},inplace=True)
item_cat_id_embedding

Unnamed: 0,item_category_id,item_cat_id_embed
0,2,1.921210
1,3,3.325635
2,4,1.873720
3,5,1.534349
4,6,2.530565
...,...,...
69,79,14.598338
70,80,24.295833
71,81,3.189003
72,82,12.936330


In [344]:
train_features_LSTM = pd.merge(item_id_embedding, train_features_LSTM, on=['item_id'])
train_features_LSTM = pd.merge(shop_id_embedding, train_features_LSTM, on=['shop_id'])
train_features_LSTM = pd.merge(item_cat_id_embedding, train_features_LSTM, on=['item_category_id'])

In [345]:
train_features_LSTM

Unnamed: 0,item_category_id,item_cat_id_embed,shop_id,shop_id_embed,item_id,item_id_embed,avg_item_price,date_block_num,avg_monthly_item_price,avg_shop_item_price,avg_monthly_shop_item_price,days_on_sale
0,2,1.921210,2,2.191155,5572,2.455458,1554.692618,0,1550.578534,1552.495312,1532.857143,7.0
1,2,1.921210,2,2.191155,5572,2.455458,1554.692618,1,1503.427121,1552.495312,1590.000000,1.0
2,2,1.921210,2,2.191155,5572,2.455458,1554.692618,2,1504.359848,1552.495312,1490.000000,1.0
3,2,1.921210,2,2.191155,5572,2.455458,1554.692618,3,1485.773333,1552.495312,1490.000000,2.0
4,2,1.921210,2,2.191155,5572,2.455458,1554.692618,4,1484.190086,1552.495312,1490.000000,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1308163,83,4.939431,59,2.147009,22092,3.906367,122.367566,17,109.000000,116.779808,109.000000,1.0
1308164,83,4.939431,59,2.147009,22092,3.906367,122.367566,12,109.000000,116.779808,109.000000,3.0
1308165,83,4.939431,59,2.147009,22092,3.906367,122.367566,13,109.000000,116.779808,109.000000,1.0
1308166,83,4.939431,59,2.147009,22092,3.906367,122.367566,15,109.000000,116.779808,109.000000,1.0


### Add Month and Year

In [348]:
# Parse the 'dd.mm.yyyy' strings in one vectorised call instead of a Python-level
# strptime per row. Values that are already datetimes pass through unchanged, so this
# cell can be re-run safely.
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

# Calendar parts come straight from the datetime accessor rather than from formatting
# each date back to a string and converting it to int.
train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

In [None]:
# Lookup table from date_block_num to its calendar month and year.
# Only these three columns are kept and duplicates are judged on date_block_num alone:
# de-duplicating the full `train` rows would leave many rows per block, and the later
# merge on date_block_num would then multiply the feature rows and drag in every
# `train` column.
month_years = train[['date_block_num', 'month', 'year']].drop_duplicates(
    subset='date_block_num', keep='first')
month_years = month_years.set_index('date_block_num')

In [378]:
month_years = month_years.reset_index()

In [388]:
train_features_LSTM = pd.merge(train_features_LSTM, month_years, on=['date_block_num'])

In [389]:
train_features_LSTM

Unnamed: 0,item_category_id,item_cat_id_embed,shop_id,shop_id_embed,item_id,item_id_embed,avg_item_price,date_block_num,avg_monthly_item_price,avg_shop_item_price,avg_monthly_shop_item_price,days_on_sale,month,year
0,2,1.921210,2,2.191155,5572,2.455458,1554.692618,0,1550.578534,1552.495312,1532.857143,7.0,1,2013
1,2,1.921210,2,2.191155,5573,1.657895,598.008738,0,907.004110,649.000000,924.000000,2.0,1,2013
2,2,1.921210,2,2.191155,5575,1.760417,1040.575735,0,983.143378,1053.412941,955.005000,4.0,1,2013
3,2,1.921210,2,2.191155,5576,1.413793,2410.619901,0,2448.605797,2461.428571,2490.000000,3.0,1,2013
4,2,1.921210,2,2.191155,5632,1.386207,2410.294570,0,2362.500000,2440.000000,2390.000000,1.0,1,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308163,83,4.939431,58,2.271148,22088,6.867014,91.721700,33,118.559633,89.265957,119.000000,2.0,10,2015
1308164,83,4.939431,58,2.271148,22091,2.626106,128.138778,33,176.370370,129.484848,179.000000,1.0,10,2015
1308165,83,4.939431,59,2.147009,22087,5.134128,88.036466,33,119.000000,87.571429,119.000000,3.0,10,2015
1308166,83,4.939431,59,2.147009,22088,6.867014,91.721700,33,118.559633,89.375940,119.000000,2.0,10,2015


#### One Hot Encoding of Years

In [390]:
# Lookup table for one-hot encoding the year: the year itself in the first column,
# followed by a 3x3 identity block (one indicator column per year).
years_encoded = pd.DataFrame(
    data=np.column_stack(([2013, 2014, 2015], np.identity(3, dtype=int))),
    columns=['year','Year-2013','Year-2014','Year-2015'],
)
years_encoded

Unnamed: 0,year,Year-2013,Year-2014,Year-2015
0,2013,1,0,0
1,2014,0,1,0
2,2015,0,0,1


In [392]:
train_features_LSTM = pd.merge(train_features_LSTM,years_encoded,on=['year'])

In [393]:
train_features_LSTM

Unnamed: 0,item_category_id,item_cat_id_embed,shop_id,shop_id_embed,item_id,item_id_embed,avg_item_price,date_block_num,avg_monthly_item_price,avg_shop_item_price,avg_monthly_shop_item_price,days_on_sale,month,year,Year-2013,Year-2014,Year-2015
0,2,1.921210,2,2.191155,5572,2.455458,1554.692618,0,1550.578534,1552.495312,1532.857143,7.0,1,2013,1,0,0
1,2,1.921210,2,2.191155,5573,1.657895,598.008738,0,907.004110,649.000000,924.000000,2.0,1,2013,1,0,0
2,2,1.921210,2,2.191155,5575,1.760417,1040.575735,0,983.143378,1053.412941,955.005000,4.0,1,2013,1,0,0
3,2,1.921210,2,2.191155,5576,1.413793,2410.619901,0,2448.605797,2461.428571,2490.000000,3.0,1,2013,1,0,0
4,2,1.921210,2,2.191155,5632,1.386207,2410.294570,0,2362.500000,2440.000000,2390.000000,1.0,1,2013,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308163,83,4.939431,58,2.271148,22088,6.867014,91.721700,33,118.559633,89.265957,119.000000,2.0,10,2015,0,0,1
1308164,83,4.939431,58,2.271148,22091,2.626106,128.138778,33,176.370370,129.484848,179.000000,1.0,10,2015,0,0,1
1308165,83,4.939431,59,2.147009,22087,5.134128,88.036466,33,119.000000,87.571429,119.000000,3.0,10,2015,0,0,1
1308166,83,4.939431,59,2.147009,22088,6.867014,91.721700,33,118.559633,89.375940,119.000000,2.0,10,2015,0,0,1


#### One Hot Encoding of Months

In [391]:
# Column labels: the merge key first, then one indicator column per calendar month.
months = ['month','Jan','Feb','March','April','May','June','July','Aug','Sept','Oct','Nov','Dec']
# Month numbers 1..12 as the first column, with a 12x12 identity block to their right.
data = np.hstack((np.arange(1,13).reshape(-1,1), np.identity(12)))
months_encoded = pd.DataFrame(data=data,columns=months)
months_encoded

Unnamed: 0,month,Jan,Feb,March,April,May,June,July,Aug,Sept,Oct,Nov,Dec
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [394]:
train_features_LSTM = pd.merge(train_features_LSTM, months_encoded, on=['month'])

In [395]:
train_features_LSTM

Unnamed: 0,item_category_id,item_cat_id_embed,shop_id,shop_id_embed,item_id,item_id_embed,avg_item_price,date_block_num,avg_monthly_item_price,avg_shop_item_price,...,March,April,May,June,July,Aug,Sept,Oct,Nov,Dec
0,2,1.921210,2,2.191155,5572,2.455458,1554.692618,0,1550.578534,1552.495312,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1.921210,2,2.191155,5573,1.657895,598.008738,0,907.004110,649.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1.921210,2,2.191155,5575,1.760417,1040.575735,0,983.143378,1053.412941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,1.921210,2,2.191155,5576,1.413793,2410.619901,0,2448.605797,2461.428571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,1.921210,2,2.191155,5632,1.386207,2410.294570,0,2362.500000,2440.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308163,83,4.939431,57,2.479158,22092,3.906367,122.367566,22,113.838710,107.482022,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1308164,83,4.939431,59,2.147009,22087,5.134128,88.036466,22,79.260000,87.571429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1308165,83,4.939431,59,2.147009,22088,6.867014,91.721700,22,82.055000,89.375940,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1308166,83,4.939431,59,2.147009,22091,2.626106,128.138778,22,110.764706,126.586207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Re-order Columns
The purpose is to standardise the column order so that the columns we want to remove come first.

In [396]:
# Keep only the listed columns, in this order: the eight columns named in
# features_to_drop come first, followed by the columns kept as model inputs.
# NOTE(review): 'days_on_sale' (merged into the features earlier) is not in this list,
# so it is dropped by this selection - confirm that is intentional.
train_features_LSTM = train_features_LSTM[
    ['shop_id','item_id','item_category_id',
     'avg_shop_item_price','avg_monthly_shop_item_price',
     'date_block_num','month','year',
     'shop_id_embed','item_id_embed','item_cat_id_embed',
     'avg_item_price','avg_monthly_item_price',
     'Year-2013','Year-2014','Year-2015',
     'Jan','Feb','March','April','May','June','July','Aug','Sept','Oct','Nov','Dec']]

# Identifier / bookkeeping columns that are meant to be removed before training.
# NOTE(review): no visible cell applies this list, so the windows built further down
# still contain these columns - verify before training.
features_to_drop = ['shop_id','item_id','item_category_id',
                    'avg_shop_item_price','avg_monthly_shop_item_price',
                    'date_block_num','month','year']

# Reshape Data for LSTM

In [397]:
train_features_LSTM

Unnamed: 0,shop_id,item_id,item_category_id,avg_shop_item_price,avg_monthly_shop_item_price,date_block_num,month,year,shop_id_embed,item_id_embed,...,March,April,May,June,July,Aug,Sept,Oct,Nov,Dec
0,2,5572,2,1552.495312,1532.857143,0,1,2013,2.191155,2.455458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,5573,2,649.000000,924.000000,0,1,2013,2.191155,1.657895,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5575,2,1053.412941,955.005000,0,1,2013,2.191155,1.760417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,5576,2,2461.428571,2490.000000,0,1,2013,2.191155,1.413793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,5632,2,2440.000000,2390.000000,0,1,2013,2.191155,1.386207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1308163,57,22092,83,107.482022,109.000000,22,11,2014,2.479158,3.906367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1308164,59,22087,83,87.571429,79.000000,22,11,2014,2.147009,5.134128,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1308165,59,22088,83,89.375940,84.000000,22,11,2014,2.147009,6.867014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1308166,59,22091,83,126.586207,109.000000,22,11,2014,2.147009,2.626106,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [287]:
# Looking at time_seq data and generating data for one store, just to get a feel of how to develop
# Assuming that these are the given inputs into the function that generates a single frame of data
# time_seq_indexed = time_seq_data_LSTM.set_index(['shop_id','item_id'])
# shop_monthly_sales = time_seq_indexed.groupby(['shop_id','item_id']).get_group((55,1)).values
features_ = train_features_LSTM.groupby(['shop_id','item_id']).get_group((55,1)).sort_values(by=['date_block_num'])
train_grpd_mnthly_LSTM_indexed = train_grpd_mnthly_LSTM.set_index(['shop_id','item_id'])
shop_monthly_sales = train_grpd_mnthly_LSTM_indexed.groupby(['shop_id','item_id']).get_group((55,1)).values
shop_monthly_sales = np.delete(shop_monthly_sales,0,1)

In [296]:
def create_batch(shop_monthly_sales_of_item, shop_item_features):
    """Combine the monthly sales and features of one (shop, item) pair into one table.

    Parameters
    ----------
    shop_monthly_sales_of_item : numpy.ndarray
        Sales values; reshaped into a single column. It needs one value per distinct
        ``date_block_num`` in ``shop_item_features``.
        NOTE(review): assumed to be ordered by ascending month - confirm at the caller.
    shop_item_features : pandas.DataFrame
        Feature rows of the pair; must contain a ``date_block_num`` column.

    Returns
    -------
    batch : numpy.ndarray
        One row per month in ascending ``date_block_num`` order: the sales value
        followed by the feature values of that month.
    df : pandas.DataFrame
        ``batch`` with column labels; the first column is ``item_cnt_month`` and the
        rest are the columns of ``shop_item_features``.
    """
    sales_column = shop_monthly_sales_of_item.reshape(-1, 1)

    # groupby yields the months in ascending order. Iterating a set of block numbers
    # has no guaranteed order and could pair a month's features with another month's
    # sales.
    rows_by_month = [
        month_rows.values.flatten()
        for _, month_rows in shop_item_features.groupby('date_block_num')
    ]
    batch = np.append(sales_column, np.array(rows_by_month), axis=1)

    # Take the labels from the argument itself, not from the notebook-level `features_`
    # variable that happened to be defined when this function was first written.
    column_names = ['item_cnt_month'] + list(shop_item_features.columns)
    df = pd.DataFrame(columns=column_names, data=batch)
    return batch, df

In [297]:
batch,df = create_batch(shop_monthly_sales, features_)

In [70]:
def create_high_lvl_dataset(time_seq_data,train_features,keys):
    """Run ``create_batch`` for every key and collect the results.

    ``time_seq_data`` and ``train_features`` are grouped objects exposing
    ``get_group``; ``keys`` is the sequence of group keys to process.

    Returns a tuple ``(dataset, df_database)``: ``dataset`` is the array made from
    all batches in key order, and ``df_database`` maps each key to the labelled
    DataFrame that ``create_batch`` returned for it.
    """
    frames_by_key = {}
    batches = []
    for position in tqdm(range(len(keys))):
        group_key = keys[position]
        sales_values = time_seq_data.get_group(group_key).values
        batch, frame = create_batch(sales_values, train_features.get_group(group_key))
        frames_by_key[group_key] = frame
        batches.append(batch)
    return np.asarray(batches), frames_by_key

In [None]:
def create_dataset(shops_monthly_sales, features, keys, time_steps=2):
    """Build one batch per key, with the features sorted by month first.

    Parameters
    ----------
    shops_monthly_sales, features : pandas GroupBy
        Grouped sales and grouped feature rows, both addressed by the same keys.
    keys : sequence
        Group keys to process.
    time_steps : int, default 2
        Accepted for interface compatibility; not used yet.
        NOTE(review): this function was left as a draft - windowing over
        ``time_steps`` still has to be added here or done by the caller.

    Returns
    -------
    data_set : list of numpy.ndarray
        One batch per key, in key order.
    df_database : dict
        Maps each key to the labelled DataFrame returned by ``create_batch``.
    """
    df_database = {}
    data_set = []
    for key_id in tqdm(range(len(keys))):
        # Index with the loop variable; the draft used the builtin `id` here, which
        # raises TypeError on the first iteration.
        key = keys[key_id]
        sales = shops_monthly_sales.get_group(key).values
        feats = features.get_group(key).sort_values(by=['date_block_num'])
        batch, df = create_batch(sales, feats)
        df_database[key] = df
        data_set.append(batch)
    return data_set, df_database
        

In [65]:
# Group the pivoted sales and the feature rows by (shop_id, item_id) so that each pair
# can be fetched with get_group().
# NOTE(review): `time_seq_indexed` is not assigned in any visible cell (its only
# definition is commented out in an earlier cell), so this line raises NameError on a
# fresh kernel - confirm which frame it should be built from.
time_seq_indexed_grpd = time_seq_indexed.groupby(['shop_id','item_id'])
train_features_grpd = train_features.groupby(['shop_id','item_id'])
# Rebinds `keys` to the (shop_id, item_id) pairs present in train_features.
keys = list(train_features_grpd.groups.keys())

In [71]:
dataset, df_databse = create_high_lvl_dataset(time_seq_indexed_grpd, train_features_grpd, keys)

HBox(children=(FloatProgress(value=0.0, max=424123.0), HTML(value='')))




In [81]:
with open('formatted_training_data.pickle', 'wb') as f:
    pickle.dump(dataset, f)

In [82]:
with open('batches_dict.pickle', 'wb') as f:
    pickle.dump(df_databse, f)

In [97]:
def create_timestep_dataset(dataset, time_step=3):
    """Slice every series in ``dataset`` into sliding windows with a next-step target.

    ``dataset`` is indexed as ``dataset[series][row, column]``; column 0 holds the
    target value and the remaining columns hold the features.

    For each series and each start row ``s`` in ``range(n_rows - time_step)`` the
    window is rows ``s .. s + time_step - 1`` of the feature columns, and its target
    is column 0 of row ``s + time_step``.

    Returns ``(windows, targets)`` as numpy arrays, ordered series by series.
    """
    windows = []
    targets = []
    for series_idx in tqdm(range(dataset.shape[0])):
        series = dataset[series_idx]
        for start in range(dataset.shape[1] - time_step):
            stop = start + time_step
            windows.append(series[start:stop, 1:])
            targets.append(series[stop, 0])
    return np.asarray(windows), np.asarray(targets)
    

In [83]:
v = dataset[0]

In [98]:
time_step_feat, y = create_timestep_dataset(dataset)

HBox(children=(FloatProgress(value=0.0, max=424123.0), HTML(value='')))




# Reshape Data V2

In [407]:
features_ = train_features_LSTM.groupby(['shop_id','item_id']).get_group((55,1)).sort_values(by=['date_block_num'])
train_grpd_mnthly_LSTM_indexed = train_grpd_mnthly_LSTM.set_index(['shop_id','item_id'])
shop_monthly_sales = train_grpd_mnthly_LSTM_indexed.groupby(['shop_id','item_id']).get_group((55,1)).values
shop_monthly_sales = np.delete(shop_monthly_sales,0,1)

In [326]:
keys_lstm = list(train_features_LSTM.groupby(['shop_id','item_id']).groups.keys())

In [None]:
def fill_data_sequence(sales, feats):
    """Insert all-zero feature rows for the months missing inside the observed range.

    Parameters
    ----------
    sales : any
        Currently unused. NOTE(review): the draft of this function stopped before it
        touched the sales - decide whether they need the same gap filling.
    feats : pandas.DataFrame
        Feature rows of one (shop, item) pair; must contain ``date_block_num``.

    Returns
    -------
    pandas.DataFrame
        ``feats`` sorted by ``date_block_num`` with a fresh 0..n-1 index. Every block
        number strictly between the first and last observed one that had no row gets
        a row whose columns are all 0 except ``date_block_num``.
        NOTE(review): the draft was cut off while setting a second column on the
        filler row; identifier columns are therefore left at 0 - adjust if they should
        be carried over.
    """
    if len(feats) == 0:
        return feats.copy()

    observed_blocks = set(feats['date_block_num'].values)
    first_block = int(feats['date_block_num'].min())
    last_block = int(feats['date_block_num'].max())
    missing_blocks = [
        block_num
        for block_num in range(first_block + 1, last_block)
        if block_num not in observed_blocks
    ]

    filled = feats
    if missing_blocks:
        filler = pd.DataFrame(
            data=np.zeros((len(missing_blocks), len(feats.columns))),
            columns=feats.columns,
        )
        # Use the exact column label; a differently-cased label would create a new
        # column instead of setting the month.
        filler['date_block_num'] = missing_blocks
        filled = pd.concat([feats, filler], ignore_index=True)
    return filled.sort_values(by=['date_block_num']).reset_index(drop=True)
            

In [426]:
def create_data_set(shop_item_monthly_sales, features, keys, time_step=2):
    """Build sliding feature windows and next-month targets for every key.

    Parameters
    ----------
    shop_item_monthly_sales : pandas GroupBy
        Grouped monthly sales; every group needs the columns ``date_block_num`` and
        ``item_cnt_day``.
    features : pandas GroupBy
        Grouped feature rows; every group needs a ``date_block_num`` column.
    keys : sequence
        Group keys to process.
    time_step : int, default 2
        Number of consecutive rows per window.

    Returns
    -------
    dataset : numpy.ndarray
        One entry per window: ``time_step`` consecutive feature rows (all columns of
        the group, in ``date_block_num`` order).
    item_cnt_month : numpy.ndarray
        For each window, the ``item_cnt_day`` value of the sales row that directly
        follows the window.

    Notes
    -----
    Windows are positional: row ``i`` of the sorted features is paired with row ``i``
    of the sorted sales, so both groups must cover the same months.
    """
    dataset = []
    item_cnt_month = []
    for key in tqdm(keys):
        sales = shop_item_monthly_sales.get_group(key).sort_values(by=['date_block_num'])
        feats = features.get_group(key).sort_values(by=['date_block_num'])

        # Extract both arrays once per key instead of once per window.
        monthly_counts = sales['item_cnt_day'].values
        feature_rows = feats.values

        for start in range(len(feats) - time_step):
            stop = start + time_step
            dataset.append(feature_rows[start:stop])
            item_cnt_month.append(monthly_counts[stop])
    return np.asarray(dataset), np.asarray(item_cnt_month)
            

In [427]:
dataset, item_counts = create_data_set(train_grpd_mnthly_LSTM_indexed.groupby(['shop_id','item_id']), 
                                      train_features_LSTM.groupby(['shop_id','item_id']),
                                      keys_lstm)

HBox(children=(FloatProgress(value=0.0, max=197290.0), HTML(value='')))




In [430]:
# Persist the windowed feature array returned by create_data_set.
with open('train_LSTM_V4.pickle','wb') as f:
    pickle.dump(dataset,f)

# Persist the matching targets.
# NOTE(review): the '.piickle' extension looks like a typo; it is left as is because
# code that loads this file may rely on the exact name.
with open('item_cnt_V4.piickle', 'wb') as f:
    pickle.dump(item_counts,f)