In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
import matplotlib.pyplot as plt
from pprint import pprint
import gc
from collections import deque
from tqdm.notebook import tqdm
from time import sleep
import pickle
from datetime import datetime

In [2]:
train = pd.read_csv('sales_train_cleaned.csv')
item_df = pd.read_csv('items.csv')

with open('shop_id_embedding_weights.pkl','rb') as f:
    shop_id_embedding_weights = pickle.load(f)

with open('month_embedding_weights.pkl','rb') as f:
    month_embedding_weights = pickle.load(f)
    
with open('year_embedding_weights.pkl','rb') as f:
    year_embedding_weights = pickle.load(f)

with open('item_cat_embedding_weights.pkl','rb') as f:
    item_cat_embedding_weights = pickle.load(f)

# Merging Mislabelled Shops

upon manually going through the data of the shops, discoveredthe following possible cases where the same shop have been given different shop_id, hence merging them together. The shops of interest are:
1. 0 => 57 (This is contentious)
2. 1 => 58 (This is contentious)
3. 10 => 11

In [3]:
train.loc[(train.shop_id == 0), 'shop_id'] = 57
train.loc[(train.shop_id == 1), 'shop_id'] = 58
train.loc[(train.shop_id == 10),'shop_id'] = 11

# Adding Month and Years

In [4]:
train['date'] = train['date'].apply(lambda x: datetime.strptime(x, '%d.%m.%Y'))

train['day'] = train.date.apply(lambda x: int(x.strftime('%d')))
train['month'] = train.date.apply(lambda x: int(x.strftime('%m')))
train['year'] = train.date.apply(lambda x: int(x.strftime('%Y')))

# Adding Features

## Montly Sales

In [5]:
features = train.groupby(['shop_id','item_id','date_block_num'])['item_cnt_day'].sum().reset_index()
features.rename(columns={'item_cnt_day':'item_cnt_month'},inplace=True)
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,2,27,0,1.0
1,2,27,17,1.0
2,2,30,2,1.0
3,2,30,5,1.0
4,2,30,15,1.0
...,...,...,...,...
1609117,59,22164,27,2.0
1609118,59,22164,30,1.0
1609119,59,22167,9,1.0
1609120,59,22167,11,2.0


## Temporal

In [6]:
block_num_representation = train[['date_block_num','month','year']]
block_num_representation = block_num_representation.drop_duplicates()

In [7]:
features = pd.merge(features,block_num_representation,on=['date_block_num'])


In [8]:
features = features[['shop_id','item_id','date_block_num','year','month',
                    'avg_item_price','avg_item_price_per_shop']]
features

KeyError: "['avg_item_price_per_shop', 'avg_item_price'] not in index"

## Pricing Information

### Average Price of Item

Adding average pricing information for a given item across all time

In [9]:
pricing = train.groupby(['item_id'])['item_price'].mean().reset_index()
pricing

Unnamed: 0,item_id,item_price
0,0,58.000000
1,1,4490.000000
2,2,58.000000
3,3,79.000000
4,4,58.000000
...,...,...
21801,22165,59.000000
21802,22166,150.000000
21803,22167,297.686490
21804,22168,798.916667


In [10]:
# Adding the item average price
features = pd.merge(features,pricing,on=['item_id'])
features.rename(columns={'item_price':'avg_item_price'},inplace=True)

In [11]:
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year,avg_item_price
0,2,27,0,1.0,1,2013,1461.228571
1,11,27,0,1.0,1,2013,1461.228571
2,19,27,0,1.0,1,2013,1461.228571
3,26,27,0,1.0,1,2013,1461.228571
4,31,27,0,1.0,1,2013,1461.228571
...,...,...,...,...,...,...,...
1609117,55,863,29,1.0,6,2015,216.000000
1609118,55,2435,29,1.0,6,2015,199.000000
1609119,55,3004,29,1.0,6,2015,1872.000000
1609120,55,5373,29,2.0,6,2015,12576.000000


### Average Price of Item Per Month

In [12]:
avg_price_per_month = train.groupby(['month','item_id'])['item_price'].mean().reset_index()
avg_price_per_month

Unnamed: 0,month,item_id,item_price
0,1,10,28.000000
1,1,19,28.000000
2,1,27,2159.125000
3,1,28,549.000000
4,1,29,2397.500000
...,...,...,...
162467,12,22153,299.000000
162468,12,22154,999.000000
162469,12,22155,149.000000
162470,12,22166,150.000000


In [13]:
features = pd.merge(features,avg_price_per_month,on=['item_id','month'])
features.rename(columns={'item_price':'avg_item_price_per_month'},inplace=True)
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year,avg_item_price,avg_item_price_per_month
0,2,27,0,1.0,1,2013,1461.228571,2159.125
1,11,27,0,1.0,1,2013,1461.228571,2159.125
2,19,27,0,1.0,1,2013,1461.228571,2159.125
3,26,27,0,1.0,1,2013,1461.228571,2159.125
4,31,27,0,1.0,1,2013,1461.228571,2159.125
...,...,...,...,...,...,...,...,...
1609117,55,863,29,1.0,6,2015,216.000000,216.000
1609118,55,2435,29,1.0,6,2015,199.000000,199.000
1609119,55,3004,29,1.0,6,2015,1872.000000,1872.000
1609120,55,5373,29,2.0,6,2015,12576.000000,12576.000


### Average Item Price Per Shop 

In [14]:
avg_item_price_per_shop = train.groupby(['item_id','shop_id'])['item_price'].mean().reset_index()
avg_item_price_per_shop

Unnamed: 0,item_id,shop_id,item_price
0,0,54,58.0
1,1,55,4490.0
2,2,54,58.0
3,3,54,79.0
4,4,54,58.0
...,...,...,...
418903,22168,12,799.0
418904,22168,16,799.0
418905,22168,42,799.0
418906,22168,43,798.5


In [15]:
features = pd.merge(features, avg_item_price_per_shop, 
                    on=['item_id','shop_id'])
features.rename(columns={'item_price':'avg_item_price_per_shop'},
               inplace=True)
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year,avg_item_price,avg_item_price_per_month,avg_item_price_per_shop
0,2,27,0,1.0,1,2013,1461.228571,2159.125000,1498.5
1,2,27,17,1.0,6,2014,1461.228571,858.000000,1498.5
2,11,27,0,1.0,1,2013,1461.228571,2159.125000,1496.5
3,11,27,9,1.0,10,2013,1461.228571,942.266667,1496.5
4,19,27,0,1.0,1,2013,1461.228571,2159.125000,1965.0
...,...,...,...,...,...,...,...,...,...
1609117,55,863,29,1.0,6,2015,216.000000,216.000000,216.0
1609118,55,2435,29,1.0,6,2015,199.000000,199.000000,199.0
1609119,55,3004,29,1.0,6,2015,1872.000000,1872.000000,1872.0
1609120,55,5373,29,2.0,6,2015,12576.000000,12576.000000,12576.0


### Avg Item price per shop per month

In [16]:
avg_item_price_per_month_per_shop = train.groupby(['item_id',
                            'shop_id',
                            'month'])['item_price'].mean().reset_index()
avg_item_price_per_month_per_shop

Unnamed: 0,item_id,shop_id,month,item_price
0,0,54,9,58.0
1,1,55,4,4490.0
2,1,55,7,4490.0
3,1,55,8,4490.0
4,1,55,9,4490.0
...,...,...,...,...
1395040,22168,12,9,799.0
1395041,22168,16,2,799.0
1395042,22168,42,2,799.0
1395043,22168,43,3,798.5


In [17]:
features = pd.merge(features, avg_item_price_per_month_per_shop, 
                    on=['item_id','shop_id','month'])

In [18]:
features.rename(columns={'item_price':'avg_item_price_per_shop_per_month'},inplace=True)


In [19]:
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year,avg_item_price,avg_item_price_per_month,avg_item_price_per_shop,avg_item_price_per_shop_per_month
0,2,27,0,1.0,1,2013,1461.228571,2159.125000,1498.5,2499.0
1,2,27,17,1.0,6,2014,1461.228571,858.000000,1498.5,498.0
2,11,27,0,1.0,1,2013,1461.228571,2159.125000,1496.5,1890.0
3,11,27,9,1.0,10,2013,1461.228571,942.266667,1496.5,1103.0
4,19,27,0,1.0,1,2013,1461.228571,2159.125000,1965.0,2499.0
...,...,...,...,...,...,...,...,...,...,...
1609117,55,863,29,1.0,6,2015,216.000000,216.000000,216.0,216.0
1609118,55,2435,29,1.0,6,2015,199.000000,199.000000,199.0,199.0
1609119,55,3004,29,1.0,6,2015,1872.000000,1872.000000,1872.0,1872.0
1609120,55,5373,29,2.0,6,2015,12576.000000,12576.000000,12576.0,12576.0


## Sales Information

### Avg Number of Sales Per Item

In [20]:
avg_num_item_sales = train.groupby(['item_id'])['item_cnt_day'].mean().reset_index()
avg_num_item_sales

Unnamed: 0,item_id,item_cnt_day
0,0,1.000000
1,1,1.000000
2,2,1.000000
3,3,1.000000
4,4,1.000000
...,...,...
21801,22165,1.000000
21802,22166,1.288889
21803,22167,1.184919
21804,22168,1.000000


In [21]:
features = pd.merge(features,avg_num_item_sales,on=['item_id'])

In [22]:
features.rename(columns={'item_cnt_day':'avg_num_item_sales'},inplace=True)

In [23]:
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year,avg_item_price,avg_item_price_per_month,avg_item_price_per_shop,avg_item_price_per_shop_per_month,avg_num_item_sales
0,2,27,0,1.0,1,2013,1461.228571,2159.125000,1498.5,2499.0,1.0
1,2,27,17,1.0,6,2014,1461.228571,858.000000,1498.5,498.0,1.0
2,11,27,0,1.0,1,2013,1461.228571,2159.125000,1496.5,1890.0,1.0
3,11,27,9,1.0,10,2013,1461.228571,942.266667,1496.5,1103.0,1.0
4,19,27,0,1.0,1,2013,1461.228571,2159.125000,1965.0,2499.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1609117,55,863,29,1.0,6,2015,216.000000,216.000000,216.0,216.0,1.0
1609118,55,2435,29,1.0,6,2015,199.000000,199.000000,199.0,199.0,1.0
1609119,55,3004,29,1.0,6,2015,1872.000000,1872.000000,1872.0,1872.0,1.0
1609120,55,5373,29,2.0,6,2015,12576.000000,12576.000000,12576.0,12576.0,2.0


### Avg Montly Item Sales

In [24]:
avg_monthly_item_num = train.groupby(['item_id','month'])['item_cnt_day'].mean().reset_index()


In [25]:
avg_monthly_item_num.rename(columns={'item_cnt_day':'avg_monthly_item_num'},inplace=True)


In [26]:
avg_monthly_item_num

Unnamed: 0,item_id,month,avg_monthly_item_num
0,0,9,1.0
1,1,4,1.0
2,1,7,1.0
3,1,8,1.0
4,1,9,1.0
...,...,...,...
162467,22168,1,1.0
162468,22168,2,1.0
162469,22168,3,1.0
162470,22168,9,1.0


In [27]:
features = pd.merge(features, avg_monthly_item_num, 
                    on=['item_id','month'])

In [28]:
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year,avg_item_price,avg_item_price_per_month,avg_item_price_per_shop,avg_item_price_per_shop_per_month,avg_num_item_sales,avg_monthly_item_num
0,2,27,0,1.0,1,2013,1461.228571,2159.125,1498.5,2499.0,1.0,1.0
1,11,27,0,1.0,1,2013,1461.228571,2159.125,1496.5,1890.0,1.0,1.0
2,19,27,0,1.0,1,2013,1461.228571,2159.125,1965.0,2499.0,1.0,1.0
3,26,27,0,1.0,1,2013,1461.228571,2159.125,1565.0,2499.0,1.0,1.0
4,31,27,0,1.0,1,2013,1461.228571,2159.125,2499.0,2499.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1609117,55,863,29,1.0,6,2015,216.000000,216.000,216.0,216.0,1.0,1.0
1609118,55,2435,29,1.0,6,2015,199.000000,199.000,199.0,199.0,1.0,1.0
1609119,55,3004,29,1.0,6,2015,1872.000000,1872.000,1872.0,1872.0,1.0,1.0
1609120,55,5373,29,2.0,6,2015,12576.000000,12576.000,12576.0,12576.0,2.0,2.0


### Avg Sales Per Shop Item Pair

In [29]:
avg_item_sales_per_shop = train.groupby(['item_id','shop_id'])['item_cnt_day'].mean().reset_index()
avg_item_sales_per_shop.rename(columns={'item_cnt_day':'avg_item_sales_per_shop'},inplace=True)


In [30]:
avg_item_sales_per_shop

Unnamed: 0,item_id,shop_id,avg_item_sales_per_shop
0,0,54,1.0
1,1,55,1.0
2,2,54,1.0
3,3,54,1.0
4,4,54,1.0
...,...,...,...
418903,22168,12,1.0
418904,22168,16,1.0
418905,22168,42,1.0
418906,22168,43,1.0


In [31]:
features = pd.merge(features,avg_item_sales_per_shop,
                    on=['item_id','shop_id'])

In [32]:
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year,avg_item_price,avg_item_price_per_month,avg_item_price_per_shop,avg_item_price_per_shop_per_month,avg_num_item_sales,avg_monthly_item_num,avg_item_sales_per_shop
0,2,27,0,1.0,1,2013,1461.228571,2159.125000,1498.5,2499.0,1.0,1.0,1.0
1,2,27,17,1.0,6,2014,1461.228571,858.000000,1498.5,498.0,1.0,1.0,1.0
2,11,27,0,1.0,1,2013,1461.228571,2159.125000,1496.5,1890.0,1.0,1.0,1.0
3,11,27,9,1.0,10,2013,1461.228571,942.266667,1496.5,1103.0,1.0,1.0,1.0
4,19,27,0,1.0,1,2013,1461.228571,2159.125000,1965.0,2499.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609117,55,863,29,1.0,6,2015,216.000000,216.000000,216.0,216.0,1.0,1.0,1.0
1609118,55,2435,29,1.0,6,2015,199.000000,199.000000,199.0,199.0,1.0,1.0,1.0
1609119,55,3004,29,1.0,6,2015,1872.000000,1872.000000,1872.0,1872.0,1.0,1.0,1.0
1609120,55,5373,29,2.0,6,2015,12576.000000,12576.000000,12576.0,12576.0,2.0,2.0,2.0


In [33]:
features.drop(['item_cnt_month'],axis=1,inplace=True)

### Item Sales Per Shop, Item, BlockNum (item_cnt_month)

In [34]:
item_cnt_month = train.groupby(['item_id','shop_id','date_block_num'])['item_cnt_day'].sum().reset_index()
item_cnt_month.rename(columns={'item_cnt_day':'item_cnt_month'},inplace=True)

In [35]:
features = pd.merge(features,item_cnt_month,
                    on=['item_id','shop_id','date_block_num'])

In [36]:
features

Unnamed: 0,shop_id,item_id,date_block_num,month,year,avg_item_price,avg_item_price_per_month,avg_item_price_per_shop,avg_item_price_per_shop_per_month,avg_num_item_sales,avg_monthly_item_num,avg_item_sales_per_shop,item_cnt_month
0,2,27,0,1,2013,1461.228571,2159.125000,1498.5,2499.0,1.0,1.0,1.0,1.0
1,2,27,17,6,2014,1461.228571,858.000000,1498.5,498.0,1.0,1.0,1.0,1.0
2,11,27,0,1,2013,1461.228571,2159.125000,1496.5,1890.0,1.0,1.0,1.0,1.0
3,11,27,9,10,2013,1461.228571,942.266667,1496.5,1103.0,1.0,1.0,1.0,1.0
4,19,27,0,1,2013,1461.228571,2159.125000,1965.0,2499.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609117,55,863,29,6,2015,216.000000,216.000000,216.0,216.0,1.0,1.0,1.0,1.0
1609118,55,2435,29,6,2015,199.000000,199.000000,199.0,199.0,1.0,1.0,1.0,1.0
1609119,55,3004,29,6,2015,1872.000000,1872.000000,1872.0,1872.0,1.0,1.0,1.0,1.0
1609120,55,5373,29,6,2015,12576.000000,12576.000000,12576.0,12576.0,2.0,2.0,2.0,2.0


## Item Category ID

In [37]:
item_df_no_name = item_df.drop(['item_name'],axis=1)

In [38]:
features = pd.merge(features,item_df_no_name,on=['item_id'])

In [39]:
features = features[['shop_id','item_id','item_category_id','date_block_num','year','month','avg_item_price',
            'avg_item_price_per_shop','avg_item_price_per_month','avg_item_price_per_shop_per_month',
            'avg_num_item_sales','avg_monthly_item_num','avg_item_sales_per_shop','item_cnt_month']]

In [40]:
features

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,year,month,avg_item_price,avg_item_price_per_shop,avg_item_price_per_month,avg_item_price_per_shop_per_month,avg_num_item_sales,avg_monthly_item_num,avg_item_sales_per_shop,item_cnt_month
0,2,27,19,0,2013,1,1461.228571,1498.5,2159.125000,2499.0,1.0,1.0,1.0,1.0
1,2,27,19,17,2014,6,1461.228571,1498.5,858.000000,498.0,1.0,1.0,1.0,1.0
2,11,27,19,0,2013,1,1461.228571,1496.5,2159.125000,1890.0,1.0,1.0,1.0,1.0
3,11,27,19,9,2013,10,1461.228571,1496.5,942.266667,1103.0,1.0,1.0,1.0,1.0
4,19,27,19,0,2013,1,1461.228571,1965.0,2159.125000,2499.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609117,55,863,78,29,2015,6,216.000000,216.0,216.000000,216.0,1.0,1.0,1.0,1.0
1609118,55,2435,31,29,2015,6,199.000000,199.0,199.000000,199.0,1.0,1.0,1.0,1.0
1609119,55,3004,76,29,2015,6,1872.000000,1872.0,1872.000000,1872.0,1.0,1.0,1.0,1.0
1609120,55,5373,76,29,2015,6,12576.000000,12576.0,12576.000000,12576.0,2.0,2.0,2.0,2.0


# Format Data

In [41]:
def update_date_block_num(row,date_block_num):
    return date_block_num
def update_to_zero(row):
    return 0.0

In [42]:
features_grpd = features.groupby(['shop_id','item_id'])
feature_list = [c for c in features.columns]
feature_idx_dict = {}
for idx in range(len(feature_list)):
    feature_idx_dict[feature_list[idx]] = idx 

In [43]:
def generate_filled_data(features,feature_dict, feat_list):
    frames = []
    keys = list(features_grpd.groups.keys())
    for idx in tqdm(range(len(keys))):
        key = keys[idx]
        data = features.get_group(key)
        df = naive_fill(data,feature_dict, feat_list)
        frames.append(df)
    return pd.concat(frames)

In [44]:
# Create a data frame that fills in the blanks between missing dates
def intelligent_fill(data):
    frames = []
    block_nums_given = data['date_block_num'].values
    missing_block_nums = list(set(range(34))-set(data['date_block_num'].values))
    for block_num in missing_block_nums:
        # print("block_num is:",block_num)
        dist = np.abs(block_nums_given - block_num)
        # print("dist is:",dist)
        min_dist = np.amin(dist)
        # print("min_dist is:",min_dist)
        idx = (np.where(dist == min_dist))[0]
        closest_block_num = block_nums_given[idx[0]]
        df = data[data.date_block_num == closest_block_num]
        
        # This is garbage code, but for not just to get things working
        df['date_block_num'] = df.apply(update_date_block_num, args=(block_num,),axis=1)
        df['avg_monthly_item_num'] = df.apply(update_to_zero,axis=1)
        df['item_cnt_month'] = df.apply(update_to_zero,axis=1)
        df['avg_item_price_per_month'] = df.apply(update_to_zero,axis=1)
        df['avg_item_price_per_shop_per_month'] = df.apply(update_to_zero,axis=1)
        frames.append(df)
    return pd.concat(frames)
            

In [45]:
def naive_fill(data, feature_idx_dict, feat_list):
    block_nums_given = data['date_block_num'].values
    missing_block_nums = np.asarray(list(set(range(34)) - set(block_nums_given)))
    df_data = np.zeros((len(missing_block_nums),len(feature_idx_dict)))
    df_data = df_data.astype(int)
    shop_id = int(data['shop_id'].values[0])
    item_id = int(data['item_id'].values[0])
    item_category_id = int(data['item_category_id'].values[0])
    df_data[:,feature_idx_dict['date_block_num']] = missing_block_nums
    df_data[:,feature_idx_dict['shop_id']] = shop_id
    df_data[:,feature_idx_dict['item_id']] = item_id
    df_data[:,feature_idx_dict['item_category_id']] = item_category_id
    df = pd.DataFrame(data=df_data,columns=feat_list)
    return df

In [46]:
df = generate_filled_data(features_grpd, feature_idx_dict, feature_list)

HBox(children=(FloatProgress(value=0.0, max=418908.0), HTML(value='')))




In [47]:
df.drop(['year','month'],inplace=True,axis=1)

In [48]:
missing_df = pd.merge(df,block_num_representation,on=['date_block_num'])

In [49]:
missing_df = missing_df[['shop_id','item_id','item_category_id','date_block_num','year','month','avg_item_price',
            'avg_item_price_per_shop','avg_item_price_per_month','avg_item_price_per_shop_per_month',
            'avg_num_item_sales','avg_monthly_item_num','avg_item_sales_per_shop','item_cnt_month']]
missing_df

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,year,month,avg_item_price,avg_item_price_per_shop,avg_item_price_per_month,avg_item_price_per_shop_per_month,avg_num_item_sales,avg_monthly_item_num,avg_item_sales_per_shop,item_cnt_month
0,2,27,19,1,2013,2,0,0,0,0,0,0,0,0
1,2,30,40,1,2013,2,0,0,0,0,0,0,0,0
2,2,32,40,1,2013,2,0,0,0,0,0,0,0,0
3,2,33,37,1,2013,2,0,0,0,0,0,0,0,0
4,2,53,57,1,2013,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12633745,59,22153,37,17,2014,6,0,0,0,0,0,0,0,0
12633746,59,22154,37,17,2014,6,0,0,0,0,0,0,0,0
12633747,59,22155,37,17,2014,6,0,0,0,0,0,0,0,0
12633748,59,22162,40,17,2014,6,0,0,0,0,0,0,0,0


In [52]:
final_training_df = pd.concat([features,missing_df])

In [53]:
final_training_df

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,year,month,avg_item_price,avg_item_price_per_shop,avg_item_price_per_month,avg_item_price_per_shop_per_month,avg_num_item_sales,avg_monthly_item_num,avg_item_sales_per_shop,item_cnt_month
0,2,27,19,0,2013,1,1461.228571,1498.5,2159.125000,2499.0,1.0,1.0,1.0,1.0
1,2,27,19,17,2014,6,1461.228571,1498.5,858.000000,498.0,1.0,1.0,1.0,1.0
2,11,27,19,0,2013,1,1461.228571,1496.5,2159.125000,1890.0,1.0,1.0,1.0,1.0
3,11,27,19,9,2013,10,1461.228571,1496.5,942.266667,1103.0,1.0,1.0,1.0,1.0
4,19,27,19,0,2013,1,1461.228571,1965.0,2159.125000,2499.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12633745,59,22153,37,17,2014,6,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
12633746,59,22154,37,17,2014,6,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
12633747,59,22155,37,17,2014,6,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
12633748,59,22162,40,17,2014,6,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [364]:
final_training_df.to_csv('LSTM_Training_Data_V2.csv',index=False)

# Inserting Embeddings

## Shop ID embedding

In [54]:
unique_shop_id = features.shop_id.unique()
unique_shop_id = list(unique_shop_id.astype(int))
unique_shop_id.sort()
le = LabelEncoder()
le.fit(unique_shop_id)

LabelEncoder()

In [55]:
data = np.concatenate((le.classes_.reshape(-1,1),shop_id_embedding_weights),axis=1)

In [56]:
columns = ['shop_id']
for i in range(26):
    columns.append('shop_embedding_'+str(i))

shop_id_embedding_df = pd.DataFrame(data=data,columns=columns)

In [57]:
shop_id_embedding_df = shop_id_embedding_df.astype({'shop_id':'int64'})

## Item Category Embedding

In [58]:
unique_item_cat = list(features.item_category_id.unique())
unique_item_cat.sort()
unique_item_cat = np.asarray(unique_item_cat)
unique_item_cat

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83])

In [59]:
cat_data = np.concatenate((unique_item_cat.reshape(-1,1),item_cat_embedding_weights),axis=1)
cat_data.shape

(84, 43)

In [60]:
columns = ['item_category_id']
for i in range(42):
    columns.append('item_category_id_embedding_'+str(i))

item_cat_id_embedding_df = pd.DataFrame(data=cat_data, columns=columns)
item_cat_id_embedding_df = item_cat_id_embedding_df.astype({'item_category_id':'int64'})

In [61]:
item_cat_id_embedding_df

Unnamed: 0,item_category_id,item_category_id_embedding_0,item_category_id_embedding_1,item_category_id_embedding_2,item_category_id_embedding_3,item_category_id_embedding_4,item_category_id_embedding_5,item_category_id_embedding_6,item_category_id_embedding_7,item_category_id_embedding_8,...,item_category_id_embedding_32,item_category_id_embedding_33,item_category_id_embedding_34,item_category_id_embedding_35,item_category_id_embedding_36,item_category_id_embedding_37,item_category_id_embedding_38,item_category_id_embedding_39,item_category_id_embedding_40,item_category_id_embedding_41
0,0,-0.237994,-0.080414,0.849572,0.194955,0.759653,-0.163054,-0.165503,0.682277,0.100477,...,-1.513865,-0.454181,0.508111,-0.335773,-0.128002,-1.189908,0.235172,0.402190,-1.401626,0.110708
1,1,0.158821,0.357169,0.767914,0.930373,-0.561077,-0.081719,-0.453086,0.085200,0.190384,...,0.130665,0.069990,-0.301426,-0.162649,-0.289075,-1.561382,0.491715,-0.490381,-0.955593,-0.091784
2,2,0.528947,0.603718,0.435343,0.613169,0.536616,-0.103844,-0.547318,0.627502,-0.256596,...,0.053889,0.076195,0.259457,-0.500726,-0.606526,-2.140256,1.298924,-0.153488,-0.938191,0.580219
3,3,0.155460,0.172748,0.438695,0.294159,-0.429204,0.450114,0.082607,0.018304,0.043943,...,-0.095646,0.032121,0.074723,0.119041,-0.089815,-1.358339,-0.030708,0.257370,-1.131572,0.143188
4,4,0.015226,1.020966,0.617417,-1.027962,0.680634,1.020311,-0.914483,0.160749,1.867281,...,-2.355460,-0.069158,-0.405441,-1.335154,0.147784,-0.780687,2.485381,-1.038629,-1.288584,-0.524562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,79,0.057731,0.020087,0.113421,-0.043076,0.143730,0.124753,-0.510363,-0.025641,0.017499,...,-0.255234,0.401725,-0.046965,-0.034062,0.090075,-1.708200,-2.047479,-0.007060,-0.643218,-0.062878
80,80,-0.026260,0.109714,0.044631,0.228472,0.117430,-0.356205,-0.495142,-0.108378,-0.023228,...,-0.480521,0.823873,-0.739041,-0.516399,0.006648,-1.045492,-1.302128,0.055033,-0.350265,-0.403543
81,81,0.002721,-0.067506,-0.502804,-0.645631,0.440223,-0.184377,-0.016769,0.023051,-0.094362,...,-0.121298,-0.865571,-0.001384,0.211389,-0.041620,-2.380969,-0.284099,0.242165,0.011268,0.084934
82,82,-0.183534,-0.020649,0.273060,-0.058902,0.420845,0.400296,-0.451424,-0.018463,-0.079306,...,0.141840,-1.667350,0.024903,-0.228979,-0.061779,-1.621317,-1.827847,-0.069815,0.175290,-0.056048


## Month Embedding

In [62]:
months = np.array([[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12]])
month_data = np.concatenate((months,month_embedding_weights),axis=1)
month_data.shape

(12, 7)

In [63]:
columns = ['month']
for i in range(6):
    columns.append('month_embedding_'+str(i))

month_embedding_df = pd.DataFrame(data=month_data, columns=columns)
month_embedding_df = month_embedding_df.astype({'month':'int64'})

In [64]:
month_embedding_df

Unnamed: 0,month,month_embedding_0,month_embedding_1,month_embedding_2,month_embedding_3,month_embedding_4,month_embedding_5
0,1,0.211709,1.657333,-0.019381,1.103483,-1.286952,-2.872084
1,2,0.241376,2.08118,-0.172409,1.922793,-1.32747,-2.214527
2,3,0.677484,2.748065,-0.122578,1.451593,-1.241271,-2.719061
3,4,-0.0474,3.873985,-0.903815,1.344345,-1.586019,-4.483506
4,5,0.548098,3.91799,-0.747472,0.951144,-2.578927,-3.875617
5,6,0.040027,3.192486,-0.821037,1.230072,-2.909809,-4.081171
6,7,-0.275926,3.560286,-0.39784,1.759957,-2.282169,-4.717065
7,8,0.356471,2.754771,0.271973,1.199562,-2.941004,-3.542341
8,9,-0.101906,3.925657,0.270718,1.73044,-1.967278,-4.293863
9,10,0.211367,4.223983,0.021049,1.049399,-1.685574,-5.161833


## Year Embedding

In [65]:
years = np.array([[2013],[2014],[2015]])
years_data = np.concatenate((years,year_embedding_weights),axis=1)
years_data.shape

(3, 3)

In [66]:
columns = ['year']
for i in range(2):
    columns.append('year_embedding_'+str(i))
years_embedding_df = pd.DataFrame(data=years_data, columns=columns)
years_embedding_df = years_embedding_df.astype({'year':'int64'})

In [67]:
years_embedding_df

Unnamed: 0,year,year_embedding_0,year_embedding_1
0,2013,2.40236,-0.980483
1,2014,2.692846,-2.707979
2,2015,5.859278,-2.785006


## Merge the Embeddings

In [68]:
embedded_training_df = pd.merge(final_training_df, month_embedding_df, on=['month'])
embedded_training_df = pd.merge(embedded_training_df, years_embedding_df, on=['year'])
embedded_training_df = pd.merge(embedded_training_df, item_cat_id_embedding_df, on=['item_category_id'])
embedded_training_df = pd.merge(embedded_training_df, shop_id_embedding_df, on=['shop_id'])

In [69]:
embedded_training_df

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,year,month,avg_item_price,avg_item_price_per_shop,avg_item_price_per_month,avg_item_price_per_shop_per_month,...,shop_embedding_16,shop_embedding_17,shop_embedding_18,shop_embedding_19,shop_embedding_20,shop_embedding_21,shop_embedding_22,shop_embedding_23,shop_embedding_24,shop_embedding_25
0,2,27,19,0,2013,1,1461.228571,1498.500000,2159.125000,2499.000000,...,-0.519515,0.589719,-1.259106,0.977608,2.524584,-0.084711,0.614360,-0.218721,1.054061,0.344803
1,2,1409,19,0,2013,1,1058.565455,807.310000,1270.085714,1398.500000,...,-0.519515,0.589719,-1.259106,0.977608,2.524584,-0.084711,0.614360,-0.218721,1.054061,0.344803
2,2,1467,19,0,2013,1,957.239055,932.361111,963.444711,899.000000,...,-0.519515,0.589719,-1.259106,0.977608,2.524584,-0.084711,0.614360,-0.218721,1.054061,0.344803
3,2,1471,19,0,2013,1,1661.040967,1463.705882,2064.657252,1819.000000,...,-0.519515,0.589719,-1.259106,0.977608,2.524584,-0.084711,0.614360,-0.218721,1.054061,0.344803
4,2,1832,19,0,2013,1,939.034265,849.348519,1446.971688,1265.833333,...,-0.519515,0.589719,-1.259106,0.977608,2.524584,-0.084711,0.614360,-0.218721,1.054061,0.344803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14242867,55,13093,36,32,2015,9,250.000000,250.000000,250.000000,250.000000,...,0.622923,0.186599,-0.440730,0.843658,2.167138,-0.101304,1.226746,0.373412,0.844474,-0.231308
14242868,55,13095,36,32,2015,9,500.000000,500.000000,500.000000,500.000000,...,0.622923,0.186599,-0.440730,0.843658,2.167138,-0.101304,1.226746,0.373412,0.844474,-0.231308
14242869,55,13091,36,32,2015,9,1000.000000,1000.000000,1000.000000,1000.000000,...,0.622923,0.186599,-0.440730,0.843658,2.167138,-0.101304,1.226746,0.373412,0.844474,-0.231308
14242870,55,13094,36,32,2015,9,2500.000000,2500.000000,2500.000000,2500.000000,...,0.622923,0.186599,-0.440730,0.843658,2.167138,-0.101304,1.226746,0.373412,0.844474,-0.231308


In [70]:
embedded_training_df.to_csv('LSTM_Embedded_Train_Data_V2.csv',index=False)

In [71]:
features.to_csv('LSTM_Features.csv',index=False)