# Model based on embeddings

The idea is to learn embeddings for the categorical ids, as explained by Jeremy Howard of [fast.ai](https://www.fast.ai/).

The code is adapted from the [3rd-place solution](https://github.com/entron/entity-embedding-rossmann) of a similar Kaggle [competition](https://www.kaggle.com/c/rossmann-store-sales/).

In [5]:
import pandas as pd
import numpy as np

%matplotlib inline
%config Completer.use_jedi = False


In [6]:
from keras.models import Sequential
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Reshape, Dot, Add
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras import optimizers


In [7]:
# Load the prepared training table from a Feather file
# (the path is relative to the notebook's working directory).
ds = pd.read_feather('../data/train_for_test.ftr')

In [8]:
# Summarise the columns, their dtypes and the memory footprint of the loaded frame.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3646331 entries, 0 to 3646330
Data columns (total 10 columns):
date_block_num           category
shop_id                  category
item_id                  category
item_cnt_month           float64
item_price               float64
year                     category
month                    category
item_cnt_month_lag_1     float64
item_cnt_month_lag_2     float64
item_cnt_month_lag_12    float64
dtypes: category(5), float64(5)
memory usage: 160.8 MB


In [9]:
# Keep only the ids, the calendar columns and the target; the price and the
# lagged counts are not inputs of the embedding model built below.
# Reassigning (instead of inplace=True) together with errors='ignore' makes
# the cell idempotent: re-running it no longer raises KeyError.
unused_columns = ['item_price', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2',
                  'item_cnt_month_lag_12']
ds = ds.drop(columns=unused_columns, errors='ignore')

In [10]:
# Preview the first rows of the frame after the column drop.
ds.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,year,month
0,0,2,30,0.0,2013,1
1,0,42,4352,0.0,2013,1
2,0,42,4354,0.0,2013,1
3,0,42,4356,0.0,2013,1
4,0,42,4357,0.0,2013,1


In [11]:
# Columns that get an embedding. Their categoricals may still list levels that
# never occur in the data; dropping those keeps `.cat.codes` dense (0..n-1),
# which is what the Embedding layers' vocabulary size (n_levels) assumes.
# `remove_unused_categories(inplace=True)` was deprecated in pandas 1.3 and
# removed in 2.0, so the result is assigned back instead.
embedding_columns = ['shop_id', 'item_id']
for c in embedding_columns:
    ds[c] = ds[c].cat.remove_unused_categories()

In [12]:
# Number of distinct values per embedding column; used below as the
# vocabulary size (first argument) of each Embedding layer.
n_levels = ds[embedding_columns].nunique()
n_levels

shop_id      42
item_id    4716
dtype: int64

In [13]:
# Largest raw id per column. These exceed the level counts above, which is why
# the model is later fed the dense category codes rather than the raw ids.
ds[embedding_columns].max()

shop_id       59.0
item_id    22167.0
dtype: float64

In [79]:
# Embedding width per column: same index as n_levels, with the first two
# entries (shop_id, item_id) set to 3.
n_dimensions = n_levels.copy()
n_dimensions.iloc[:2] = 3
n_dimensions

shop_id    3
item_id    3
dtype: int64

In [80]:
# Shop branch: one integer code in, a dense vector plus a scalar bias out.
input_shop = Input(shape=(1,))

# The Embedding output has an extra length-1 axis; Reshape flattens it.
shop_vector_layer = Embedding(n_levels.shop_id, n_dimensions.shop_id, name='shop_embedding')
output_shop = Reshape(target_shape=(n_dimensions.shop_id,))(shop_vector_layer(input_shop))

shop_bias_layer = Embedding(n_levels.shop_id, 1, name='bias_shop')
bias_shop = Reshape(target_shape=(1,))(shop_bias_layer(input_shop))



In [81]:
# Item branch: one integer code in, a dense vector plus a scalar bias out.
input_item = Input(shape=(1,))

# The Embedding output has an extra length-1 axis; Reshape flattens it.
item_vector_layer = Embedding(n_levels.item_id, n_dimensions.item_id, name='item_embedding')
output_item = Reshape(target_shape=(n_dimensions.item_id,))(item_vector_layer(input_item))

item_bias_layer = Embedding(n_levels.item_id, 1, name='bias_item')
bias_item = Reshape(target_shape=(1,))(item_bias_layer(input_item))



In [82]:

# Group the two branches' input tensors and embedding outputs into lists.
# NOTE(review): neither list is referenced again in the cells visible here;
# the model is built from explicit lists instead.
input_model = [input_shop ,input_item]
output_embeddings = [output_shop,output_item]



In [83]:
# Prediction = dot(shop vector, item vector) + shop bias + item bias.
shop_item_interaction = Dot(axes=-1, normalize=False)([output_shop, output_item])
output_model = Add()([shop_item_interaction, bias_shop, bias_item])

model = KerasModel(inputs=[input_shop, input_item], outputs=output_model)

In [84]:
# One training row per (shop, item) pair: the mean monthly count over all rows.
# Both keys are categoricals, so with the (deprecated) default observed=False
# pandas materialises every shop x item combination and the never-observed ones
# only disappear in dropna(). observed=True skips them up front; dropna() is
# kept so pairs whose mean is NaN are still removed exactly as before.
dsmall = (
    ds.groupby(['shop_id', 'item_id'], as_index=False, observed=True)['item_cnt_month']
      .mean()
      .dropna()
)

# Model inputs are the dense category codes, one array per embedding column;
# the target is the mean count clipped to [0, 40].
x = [dsmall[c].cat.codes.values for c in ['shop_id', 'item_id']]
y = dsmall.item_cnt_month.clip(lower=0, upper=40)
x, len(x[0])

([array([ 0,  0,  0, ..., 41, 41, 41], dtype=int8),
  array([   0,    1,    2, ..., 4711, 4713, 4715], dtype=int16)],
 111404)

In [85]:
# SGD with Nesterov momentum; the training objective is mean squared error.
sgd = optimizers.SGD(
    lr=0.6,
    decay=1e-6,
    momentum=0.9,
    nesterov=True,
)
model.compile(optimizer=sgd, loss='mean_squared_error')

Quick summary of fit results after a few restarts (the model now includes the shop and item bias embeddings):
- 1 dimension MSE 0.17
- 2 dims MSE 0.12
- 3 dims MSE 0.10
- 4 dims MSE 0.08
- 10 dims MSE 0.03

In [88]:
# Train silently for 300 epochs, then run one more epoch with verbose output
# so the current training loss is printed.
fit_options = dict(validation_split=0, batch_size=2**14)
model.fit(x, y, epochs=300, verbose=0, **fit_options)
model.fit(x, y, epochs=1, verbose=2, **fit_options)



Epoch 1/1
 - 0s - loss: 0.1003


<keras.callbacks.History at 0x12f3515f8>

In [None]:
from sklearn import manifold
import matplotlib.pyplot as plt

# Item metadata, indexed by item_id.
items = pd.read_csv('../readonly/final_project_data/items.csv', index_col='item_id')

# Trained item vectors; row i belongs to category code i, because the model
# was fed `.cat.codes` as input.
full_embeddings = model.get_layer('item_embedding').get_weights()[0]
n_full = len(full_embeddings)

# Plot a random subset. A fixed seed makes the figure reproducible and
# replace=False avoids drawing the same item several times.
n_plot = min(500, n_full)
pick_idx = np.random.RandomState(0).choice(n_full, size=n_plot, replace=False)
plot_embeddings = full_embeddings[pick_idx]

# Category code -> original item_id -> item_category_id (used as point colour).
# The item ids are index *labels* of `items`, so they must be looked up with
# .loc; positional .iloc is only correct if the CSV rows happen to be ordered
# 0..N-1 by item_id.
picked_item_ids = dsmall.item_id.cat.categories[pick_idx]
item_cats = items.loc[picked_item_ids, 'item_category_id'].values

# Project the embedding vectors to 2-D and colour each point by item category.
tsne = manifold.TSNE(init='pca', random_state=0, method='exact')
Y = tsne.fit_transform(plot_embeddings)
plt.figure(figsize=(8,8))
plt.scatter(-Y[:, 0], -Y[:, 1],c=item_cats,cmap='tab20')
#for i, txt in enumerate(ds.item_id.cat.categories[pick_idx]):
#    plt.annotate(txt, (-Y[i, 0],-Y[i, 1]), xytext = (-1, 1), textcoords = 'offset points')



In [None]:
# `shops` used to be loaded only in a later cell, so on a fresh kernel this
# cell raised NameError. Load it here so the notebook runs top to bottom.
shops = pd.read_csv('../readonly/final_project_data/shops.csv', index_col='shop_id')

# The categories are shop_id *labels*, hence .loc rather than positional .iloc.
shop_cats = shops.loc[dsmall.shop_id.cat.categories].index.values


In [None]:

# Shop metadata, indexed by shop_id.
shops = pd.read_csv('../readonly/final_project_data/shops.csv', index_col='shop_id')

# Trained shop vectors; row i belongs to category code i.
shop_embeddings_array = model.get_layer('shop_embedding').get_weights()[0]

plot_embeddings = shop_embeddings_array
# The categories are shop_id *labels*, so look them up with .loc; positional
# .iloc is only correct if the CSV rows happen to be ordered 0..N-1 by shop_id.
shop_cats = shops.loc[dsmall.shop_id.cat.categories].index.values

# Project the shop vectors to 2-D and colour each point by its shop id.
tsne = manifold.TSNE(init='pca', random_state=0, method='exact')
Y = tsne.fit_transform(plot_embeddings)
plt.figure(figsize=(8,8))
plt.scatter(-Y[:, 0], -Y[:, 1],c=shop_cats,cmap='tab20')
#for i, txt in enumerate(shop_cats):
#    plt.annotate(txt, (-Y[i, 0],-Y[i, 1]), xytext = (-1, 1), textcoords = 'offset points')



In [None]:
# Load the rows to predict; the `ID` column is written to the submission file later.
test=pd.read_csv('../readonly/final_project_data/test.csv')

In [None]:
# Re-encode the test ids with the *training* categories so that `.cat.codes`
# lines up with the rows of the trained embedding matrices. Ids that never
# occurred in training become NaN (category code -1).
test['shop_id'] = pd.Categorical(test['shop_id'], categories=dsmall.shop_id.cat.categories)
test['item_id'] = pd.Categorical(test['item_id'], categories=dsmall.item_id.cat.categories)

# Rows the model cannot score. An unseen shop is just as unscorable as an
# unseen item (its code would be -1, an invalid embedding index), so mask both.
new_items_idx = test['item_id'].isna() | test['shop_id'].isna()
test.info()

In [None]:
# Model inputs for the scorable test rows: one array of category codes per
# embedding column, in the same column order used for training.
known_rows = test.loc[~new_items_idx]
x_test = [known_rows[c].cat.codes.values for c in ['shop_id', 'item_id']]
x_test, len(x_test[0])



In [None]:
# The model's single output unit yields predictions of shape (n, 1); flatten
# them so the assignment to one column does not depend on how pandas treats a
# 2-D value. Rows excluded by the mask stay NaN.
test.loc[~new_items_idx, 'item_cnt_month'] = model.predict(x_test).ravel()

In [None]:
# Distribution of the predicted counts, with a log-scaled count axis.
test.item_cnt_month.hist(bins=100,log=True)

In [None]:
# Rows without a prediction get 0. Assign the result back:
# `test.item_cnt_month.fillna(..., inplace=True)` is chained assignment, which
# warns in recent pandas and leaves `test` unchanged under Copy-on-Write.
test['item_cnt_month'] = test['item_cnt_month'].fillna(0.0)

In [None]:
# Clip the predictions to [0, 20]. Assign the result back rather than clipping
# in place on an attribute-accessed column, which is chained assignment and
# does not modify `test` under pandas Copy-on-Write.
test['item_cnt_month'] = test['item_cnt_month'].clip(lower=0, upper=20)

In [None]:
# Write the submission file: only the ID and the predicted count, without the row index.
submission = test[['ID', 'item_cnt_month']]
submission.to_csv('../data/submit_shop_item_embed_fill0.csv', index=False, header=True)

In [None]:
# Number of (shop, item) training rows and columns.
dsmall.shape

In [None]:
# Export the raw item vectors as a tab-separated file (no header, no index)
# for the TensorFlow projector.
item_vectors_table = pd.DataFrame(full_embeddings)
item_vectors_table.to_csv('../data/item_embeddings.tsv', sep='\t', index=False, header=False)

In [None]:
# Category id and item id for every embedding row, in category-code order.
# The categories are item_id *labels*, so look them up with .loc; positional
# .iloc is only correct if the CSV rows happen to be ordered 0..N-1 by item_id.
all_item_ids = dsmall.item_id.cat.categories
item_cats = items.loc[all_item_ids, 'item_category_id'].values
item_ids = items.loc[all_item_ids].index.values


In [None]:
# Metadata for the exported item vectors: one category id per embedding row,
# written with a header line and the default row index.
item_meta = pd.DataFrame({'cat_id': item_cats})
item_meta.to_csv('../data/item_embeddings_meta.tsv', sep='\t', header=True)

In [None]:
# Item vectors as a frame indexed by item_id. The column names are derived from
# the actual embedding width, so changing the number of embedding dimensions no
# longer breaks this cell (three names used to be hard-coded).
item_emb_columns = ['item_emb_{}'.format(i + 1) for i in range(full_embeddings.shape[1])]
item_embeddings = pd.DataFrame(full_embeddings, index=item_ids, columns=item_emb_columns)
item_embeddings.index.name = 'item_id'
item_embeddings.hist(bins=50, log=True, figsize=(8, 8))

In [None]:
# Save the item vectors; the item_id index is written as the first column.
item_embeddings.to_csv('../data/item_embeddings.csv')

In [None]:
# Shop vectors as a frame indexed by shop_id. The column names are derived from
# the actual embedding width, so changing the number of embedding dimensions no
# longer breaks this cell (three names used to be hard-coded).
shop_emb_columns = ['shop_emb_{}'.format(i + 1) for i in range(shop_embeddings_array.shape[1])]
shop_embeddings = pd.DataFrame(shop_embeddings_array, index=shop_cats, columns=shop_emb_columns)

shop_embeddings.index.name = 'shop_id'

shop_embeddings.hist(bins=50, log=True, figsize=(8, 8))

In [None]:
# Save the shop vectors; the shop_id index is written as the first column.
shop_embeddings.to_csv('../data/shop_embeddings.csv')