In [7]:
import pandas as pd
import numpy as np
import gc
from tqdm.notebook import tqdm
import pickle
from datetime import datetime
from collections import deque
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [3]:
standard_scaler = StandardScaler()

In [4]:
train = pd.read_csv('sales_train_cleaned.csv')
item_df = pd.read_csv('items.csv')
kaggle_data = pd.read_csv('test.csv')

with open('shop_id_embedding_weights_v3.pkl','rb') as f:
    shop_id_embedding_weights = pickle.load(f)

with open('month_embedding_weights_v3.pkl','rb') as f:
    month_embedding_weights = pickle.load(f)
    
with open('year_embedding_weights_v3.pkl','rb') as f:
    year_embedding_weights = pickle.load(f)

with open('item_cat_embedding_weights_v3.pkl','rb') as f:
    item_cat_embedding_weights = pickle.load(f)

# Merging Mislabelled Shops

upon manually going through the data of the shops, discoveredthe following possible cases where the same shop have been given different shop_id, hence merging them together. The shops of interest are:
1. 0 => 57 (This is contentious)
2. 1 => 58 (This is contentious)
3. 10 => 11

In [5]:
train.loc[(train.shop_id == 0), 'shop_id'] = 57
train.loc[(train.shop_id == 1), 'shop_id'] = 58
train.loc[(train.shop_id == 10),'shop_id'] = 11

kaggle_data.loc[kaggle_data.shop_id == 10] = 11 

In [6]:
train['date'] = train['date'].apply(lambda x: datetime.strptime(x, '%d.%m.%Y'))

train['day'] = train.date.apply(lambda x: int(x.strftime('%d')))
train['month'] = train.date.apply(lambda x: int(x.strftime('%m')))
train['year'] = train.date.apply(lambda x: int(x.strftime('%Y')))

In [7]:
train = pd.merge(train,item_df,on=['item_id'])

In [None]:
# setting up the kaggle data


# Feature Adding

## Montly Sales

In [7]:
features = train.groupby(['shop_id','item_id','date_block_num'])['item_cnt_day'].sum().reset_index()
features.rename(columns={'item_cnt_day':'item_cnt_month'},inplace=True)

## Temporal

In [8]:
block_num_representation = train[['date_block_num','month','year']]
block_num_representation = block_num_representation.drop_duplicates()
features = pd.merge(features,block_num_representation,on=['date_block_num'])

In [9]:
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year
0,2,27,0,1.0,1,2013
1,2,33,0,1.0,1,2013
2,2,317,0,1.0,1,2013
3,2,438,0,1.0,1,2013
4,2,471,0,2.0,1,2013
...,...,...,...,...,...,...
1609117,59,22058,29,1.0,6,2015
1609118,59,22087,29,1.0,6,2015
1609119,59,22088,29,4.0,6,2015
1609120,59,22102,29,1.0,6,2015


## Item Category ID

In [10]:
item_df.drop(['item_name'], axis=1, inplace=True)
features = pd.merge(features, item_df, on=['item_id'])
features = features[['shop_id','item_id','item_category_id',
                    'date_block_num','month','year','item_cnt_month']]

## Total Shop Sales

In [11]:
total_shop_sales = train.groupby(['shop_id'])['item_cnt_day'].sum().reset_index()
total_shop_sales.rename(columns={'item_cnt_day':'total_shop_sales'},inplace=True)
val = total_shop_sales['total_shop_sales'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
total_shop_sales['total_shop_sales_scaled'] = val
del val; gc.collect();

In [12]:
# Merge the features for the scaled sales
val = total_shop_sales.drop(['total_shop_sales'],axis=1)
features = pd.merge(features,val,on=['shop_id'])
del val; gc.collect();

## Average Shop Sales

In [15]:
average_shop_sales = train.groupby(['shop_id'])['item_cnt_day'].mean().reset_index()
average_shop_sales.rename(columns={'item_cnt_day':'avg_shop_sales'}, inplace=True)
val = average_shop_sales['avg_shop_sales'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
average_shop_sales['avg_shop_sales_scaled'] = val
del val; gc.collect();

In [16]:
val = average_shop_sales.drop(['avg_shop_sales'],axis=1)
features = pd.merge(features,val,on=['shop_id'])
del val; gc.collect();

## Total Item Sales

In [17]:
total_item_sales = train.groupby(['item_id'])['item_cnt_day'].sum().reset_index()
total_item_sales.rename(columns={'item_cnt_day':'total_item_sales'},inplace=True)
val = total_item_sales['total_item_sales'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
total_item_sales['total_item_sales_scaled'] = val
del val; gc.collect();

In [18]:
val = total_item_sales.drop(['total_item_sales'],axis=1)
features = pd.merge(features,val,on=['item_id'])
del val; gc.collect();

## Average Item Sales

In [19]:
average_item_sales = train.groupby(['item_id'])['item_cnt_day'].mean().reset_index()
average_item_sales.rename(columns={'item_cnt_day':'avg_item_sales'},inplace=True)
val = average_item_sales['avg_item_sales'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
average_item_sales['avg_item_sales_scaled'] = val
del val; gc.collect();

In [20]:
val = average_item_sales.drop(['avg_item_sales'],axis=1)
features = pd.merge(features, val, on=['item_id'])
del val; gc.collect();

## Total Item Category ID Sales

In [21]:
total_item_cat_sales = train.groupby(['item_category_id'])['item_cnt_day'].sum().reset_index()
total_item_cat_sales.rename(columns={'item_cnt_day':'total_item_cat_sales'},inplace=True)
val = total_item_cat_sales['total_item_cat_sales'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
total_item_cat_sales['total_item_cat_sales_scaled'] = val
del val; gc.collect();

In [22]:
val = total_item_cat_sales.drop(['total_item_cat_sales'],axis=1)
features = pd.merge(features, val, on=['item_category_id'])
del val; gc.collect();

## Average Item Category ID Sales

In [24]:
average_item_cat_sales = train.groupby(['item_category_id'])['item_cnt_day'].mean().reset_index()
average_item_cat_sales.rename(columns={'item_cnt_day':"avg_item_cat_sales"},inplace=True)
val = average_item_cat_sales['avg_item_cat_sales'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
average_item_cat_sales['avg_item_cat_sales_scaled'] = val
del val; gc.collect();

In [27]:
val = average_item_cat_sales.drop(['avg_item_cat_sales'],axis=1)
features = pd.merge(features, val, on=['item_category_id'])
del val; gc.collect();

## Average Item Price

In [28]:
average_item_price = train.groupby(['item_id'])['item_price'].mean().reset_index()
average_item_price.rename(columns={'item_price':'avg_item_price'},inplace=True)
val = average_item_price['avg_item_price'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
average_item_price['avg_item_price_scaled'] = val
del val; gc.collect();

In [29]:
val = average_item_price.drop(['avg_item_price'],axis=1)
features = pd.merge(features, val, on=['item_id'])
del val; gc.collect();

## Average Item Category Price

In [30]:
average_item_cat_price = train.groupby(['item_category_id'])['item_price'].mean().reset_index()
average_item_cat_price.rename(columns={'item_price':'avg_item_cat_price'},inplace=True)
val = average_item_cat_price['avg_item_cat_price'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
average_item_cat_price['avg_item_cat_price_scaled'] = val
del val; gc.collect();

In [31]:
val = average_item_cat_price.drop(['avg_item_cat_price'],axis=1)
features = pd.merge(features, val, on=['item_category_id'])
del val; gc.collect();

## Average Item Price For Shop

In [32]:
avg_item_price_for_shop = train.groupby(['shop_id','item_id'])['item_price'].mean().reset_index()
avg_item_price_for_shop.rename(columns={'item_price':'avg_item_price_for_shop'},inplace=True)
val = avg_item_price_for_shop['avg_item_price_for_shop'].values
val = standard_scaler.fit_transform(val.reshape(-1,1))
val = val.reshape(-1)
avg_item_price_for_shop['avg_item_price_for_shop_scaled'] = val
del val; gc.collect();

In [33]:
val = avg_item_price_for_shop.drop(['avg_item_price_for_shop'],axis=1)
features = pd.merge(features, val, on=['shop_id','item_id'])
del val; gc.collect();

In [34]:
features

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,total_shop_sales_scaled,avg_shop_sales_scaled,total_item_sales_scaled,avg_item_sales_scaled,total_item_cat_sales_scaled,avg_item_cat_sales_scaled,avg_item_price_scaled,avg_item_cat_price_scaled,avg_item_price_for_shop_scaled
0,2,27,19,0,1,2013,1.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
1,2,27,19,17,6,2014,1.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
2,11,27,19,0,1,2013,1.0,-0.671075,-0.300821,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.405674
3,11,27,19,9,10,2013,1.0,-0.671075,-0.300821,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.405674
4,19,27,19,0,1,2013,1.0,0.164206,-0.294365,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.679222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609117,55,13095,36,33,10,2015,3.0,-0.009672,1.050984,-0.119274,0.171142,-0.437036,-0.253611,-0.167401,-0.323964,-0.176162
1609118,55,13095,36,31,8,2015,1.0,-0.009672,1.050984,-0.119274,0.171142,-0.437036,-0.253611,-0.167401,-0.323964,-0.176162
1609119,55,13095,36,32,9,2015,1.0,-0.009672,1.050984,-0.119274,0.171142,-0.437036,-0.253611,-0.167401,-0.323964,-0.176162
1609120,55,13091,36,32,9,2015,1.0,-0.009672,1.050984,-0.122216,-0.099279,-0.437036,-0.253611,0.102771,-0.323964,0.115778


# Embedding

## Shop ID Embedding

In [35]:
unique_shop_id = train.shop_id.unique()
unique_shop_id = list(unique_shop_id.astype(int))
unique_shop_id.sort()
le = LabelEncoder()
le.fit(unique_shop_id)
data = np.concatenate((le.classes_.reshape(-1,1),shop_id_embedding_weights),axis=1)
columns = ['shop_id']
for i in range(shop_id_embedding_weights.shape[1]):
    columns.append('shop_embedding_'+str(i))

shop_id_embedding_df = pd.DataFrame(data=data,columns=columns)

## Item ID Embedding

In [36]:
item_ids = train.item_id.unique()
item_id_embedding_df = train.groupby(['item_id'])['item_cnt_day'].mean().reset_index()
item_id_embedding_df.rename(
    columns={'item_cnt_day':'item_id_embedding'},inplace=True)


## Item Category ID Embedding

In [37]:
unique_item_cat = list(train.item_category_id.unique())
unique_item_cat.sort()
unique_item_cat = np.asarray(unique_item_cat)
cat_data = np.concatenate((unique_item_cat.reshape(-1,1),
                           item_cat_embedding_weights),axis=1)
columns = ['item_category_id']
for i in range(item_cat_embedding_weights.shape[1]):
    columns.append('item_category_id_embedding_'+str(i))

item_cat_id_embedding_df = pd.DataFrame(data=cat_data, columns=columns)

## Month Embedding

In [38]:
months = np.array([[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12]])
month_data = np.concatenate((months,month_embedding_weights),axis=1)
columns = ['month']
for i in range(month_embedding_weights.shape[1]):
    columns.append('month_embedding_'+str(i))

month_embedding_df = pd.DataFrame(data=month_data, columns=columns)

## Year Embedding

In [39]:
years = np.array([[2013],[2014],[2015]])
years_data = np.concatenate((years,year_embedding_weights),axis=1)
columns = ['year']
for i in range(year_embedding_weights.shape[1]):
    columns.append('year_embedding_'+str(i))
years_embedding_df = pd.DataFrame(data=years_data, columns=columns)

# Ignoring Data

In [40]:
# Ignore data that has less then time_step number of data points
time_step = 2
features_nice = features.groupby(['shop_id','item_id'])
features_nice = features_nice.filter(lambda x: x.shape[0] > time_step)

In [41]:
features_nice

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,total_shop_sales_scaled,avg_shop_sales_scaled,total_item_sales_scaled,avg_item_sales_scaled,total_item_cat_sales_scaled,avg_item_cat_sales_scaled,avg_item_price_scaled,avg_item_cat_price_scaled,avg_item_price_for_shop_scaled
4,19,27,19,0,1,2013,1.0,0.164206,-0.294365,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.679222
5,19,27,19,6,7,2013,1.0,0.164206,-0.294365,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.679222
6,19,27,19,4,5,2013,1.0,0.164206,-0.294365,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.679222
7,26,27,19,0,1,2013,1.0,0.068087,-0.074751,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.445670
8,26,27,19,14,3,2014,1.0,0.068087,-0.074751,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.445670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609115,55,13093,36,31,8,2015,4.0,-0.009672,1.050984,-0.114125,-0.000944,-0.437036,-0.253611,-0.302487,-0.323964,-0.322132
1609116,55,13093,36,32,9,2015,4.0,-0.009672,1.050984,-0.114125,-0.000944,-0.437036,-0.253611,-0.302487,-0.323964,-0.322132
1609117,55,13095,36,33,10,2015,3.0,-0.009672,1.050984,-0.119274,0.171142,-0.437036,-0.253611,-0.167401,-0.323964,-0.176162
1609118,55,13095,36,31,8,2015,1.0,-0.009672,1.050984,-0.119274,0.171142,-0.437036,-0.253611,-0.167401,-0.323964,-0.176162


# Fill Data

In [42]:
feat_nice_grpd = features_nice.groupby(['shop_id','item_id'])
keys = set(list(feat_nice_grpd.groups.keys()))

In [43]:
feat_list = [c for c in features.columns]
feat_idx_dict = {}
for idx in range(len(feat_list)):
    feat_idx_dict[feat_list[idx]] = idx

In [56]:
def filler_data(data, feat_list, blk_num_rep, feat_dict, keys):
    frames = []
    for key in tqdm(keys):
        shop_id = key[0]
        item_id = key[1]
        df = data.get_group(key)
        df = df.sort_values(by=['date_block_num'])
        given_blk = df['date_block_num'].values
        min_blk = given_blk.min()
        max_blk = given_blk.max()
        missing_blks = list(set(range(min_blk,max_blk+1)) - set(given_blk))
        # TODO: NOT HANDLING THE CASE IF missing_blk is empty
        if len(missing_blks) == 0:
            # print(key)
            continue
        new_data = np.zeros((len(missing_blks),len(feat_list)))
        new_data[:] = df.iloc[0].values
        new_data[:,feat_dict['date_block_num']] = np.asarray(missing_blks)
        new_data[:,feat_dict['month']] = blk_num_rep.iloc[missing_blks,1].values
        new_data[:,feat_dict['year']] = blk_num_rep.iloc[missing_blks,2].values
        new_data[:,feat_dict['item_cnt_month']] = 0
        
        new_df = pd.DataFrame(data=new_data, columns=feat_list)
        frames.append(new_df)
    return pd.concat(frames)

In [57]:
df_filler = filler_data(feat_nice_grpd, feat_list, block_num_representation, feat_idx_dict, keys)


HBox(children=(FloatProgress(value=0.0, max=197290.0), HTML(value='')))




In [58]:
final_df = pd.concat([features_nice,df_filler])

# Merge The Embeddings

In [59]:
final_df_to_train = pd.merge(final_df,shop_id_embedding_df,on=['shop_id'])
final_df_to_train = pd.merge(final_df_to_train,item_id_embedding_df,on=['item_id'])
final_df_to_train = pd.merge(final_df_to_train,item_cat_id_embedding_df,on=['item_category_id'])
final_df_to_train = pd.merge(final_df_to_train,month_embedding_df,on=['month'])
final_df_to_train = pd.merge(final_df_to_train,years_embedding_df,on=['year'])

In [60]:
final_df_to_train

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,total_shop_sales_scaled,avg_shop_sales_scaled,total_item_sales_scaled,...,item_id_embedding,item_category_id_embedding_0,item_category_id_embedding_1,item_category_id_embedding_2,month_embedding_0,month_embedding_1,month_embedding_2,month_embedding_3,year_embedding_0,year_embedding_1
0,19.0,27.0,19.0,0.0,1.0,2013.0,1.0,0.164206,-0.294365,-0.092059,...,1.000000,0.097284,-0.027431,-0.519356,0.302301,-0.004570,0.316923,0.207118,1.162543,-0.347262
1,26.0,27.0,19.0,0.0,1.0,2013.0,1.0,0.068087,-0.074751,-0.092059,...,1.000000,0.097284,-0.027431,-0.519356,0.302301,-0.004570,0.316923,0.207118,1.162543,-0.347262
2,25.0,1467.0,19.0,0.0,1.0,2013.0,2.0,3.073960,0.006309,0.687616,...,1.021316,0.097284,-0.027431,-0.519356,0.302301,-0.004570,0.316923,0.207118,1.162543,-0.347262
3,2.0,1467.0,19.0,0.0,1.0,2013.0,1.0,-0.575646,-0.236925,0.687616,...,1.021316,0.097284,-0.027431,-0.519356,0.302301,-0.004570,0.316923,0.207118,1.162543,-0.347262
4,31.0,1467.0,19.0,0.0,1.0,2013.0,1.0,4.263268,0.044182,0.687616,...,1.021316,0.097284,-0.027431,-0.519356,0.302301,-0.004570,0.316923,0.207118,1.162543,-0.347262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2643916,55.0,7966.0,34.0,29.0,6.0,2015.0,73.0,-0.009672,1.050984,0.838403,...,3.590659,1.622386,3.401919,0.941345,0.211681,0.729324,-0.228092,-0.715832,0.135294,-2.274620
2643917,55.0,7967.0,34.0,29.0,6.0,2015.0,96.0,-0.009672,1.050984,1.616607,...,5.449309,1.622386,3.401919,0.941345,0.211681,0.729324,-0.228092,-0.715832,0.135294,-2.274620
2643918,55.0,13097.0,34.0,29.0,6.0,2015.0,84.0,-0.009672,1.050984,0.666286,...,2.884409,1.622386,3.401919,0.941345,0.211681,0.729324,-0.228092,-0.715832,0.135294,-2.274620
2643919,55.0,13098.0,34.0,29.0,6.0,2015.0,44.0,-0.009672,1.050984,0.421350,...,2.387097,1.622386,3.401919,0.941345,0.211681,0.729324,-0.228092,-0.715832,0.135294,-2.274620


# Generate The Numpy Tensor

In [75]:
def generate_tensor(data, time_step, keys):
    Xs = deque()
    ys = deque()
    for key in tqdm(keys):
        df = data.get_group(key)
        df = df.sort_values(by=['date_block_num'])
        df.drop(['shop_id','item_id','item_category_id',
                 'date_block_num','month','year'],axis=1, inplace=True)
        y = df['item_cnt_month'].values
        for i in range(len(df) - time_step):
            v = df.iloc[i:(i+time_step)].values
            Xs.append(v)
            ys.append(y[i+time_step])
    return np.asarray(Xs), np.asarray(ys)

In [158]:
final_df_to_train_grpd = final_df_to_train.groupby(['shop_id','item_id'])
final_keys = final_df_to_train_grpd.groups.keys()

In [163]:
X, y = generate_tensor(final_df_to_train_grpd, 2, final_keys)

HBox(children=(FloatProgress(value=0.0, max=197290.0), HTML(value='')))




In [167]:
X.shape

(2249341, 2, 19)

In [168]:
y.shape

(2249341,)

In [169]:
with open('simplified_data_v2/train_features.pkl','wb') as f:
    pickle.dump(X,f)

with open('simplified_data_v2/train_labels.pkl','wb') as f:
    pickle.dump(y,f)

# Handling Spare Cases

In [61]:
# Ignoring cases where we only have one data point
features_sparse = features.groupby(['shop_id','item_id'])
features_sparse = features_sparse.filter(lambda x: x.shape[0] == time_step)

In [62]:
features_sparse

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,total_shop_sales_scaled,avg_shop_sales_scaled,total_item_sales_scaled,avg_item_sales_scaled,total_item_cat_sales_scaled,avg_item_cat_sales_scaled,avg_item_price_scaled,avg_item_cat_price_scaled,avg_item_price_for_shop_scaled
0,2,27,19,0,1,2013,1.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
1,2,27,19,17,6,2014,1.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
2,11,27,19,0,1,2013,1.0,-0.671075,-0.300821,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.405674
3,11,27,19,9,10,2013,1.0,-0.671075,-0.300821,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.405674
11,45,27,19,0,1,2013,1.0,-0.380903,-0.258521,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.699074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609020,55,4953,74,28,5,2015,2.0,-0.009672,1.050984,-0.120745,-0.099279,-0.436664,-0.343536,0.205436,0.423277,0.226715
1609023,55,7115,27,33,10,2015,1.0,-0.009672,1.050984,-0.121480,-0.099279,-0.437178,-0.389714,0.075213,-0.322132,0.086000
1609024,55,7115,27,31,8,2015,1.0,-0.009672,1.050984,-0.121480,-0.099279,-0.437178,-0.389714,0.075213,-0.322132,0.086000
1609111,55,7971,34,20,9,2014,11.0,-0.009672,1.050984,-0.095001,0.982405,-0.370803,1.707591,0.643115,-0.152239,0.699658


In [63]:
features_sparse.groupby(['shop_id','item_id']).get_group((2,27))

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,total_shop_sales_scaled,avg_shop_sales_scaled,total_item_sales_scaled,avg_item_sales_scaled,total_item_cat_sales_scaled,avg_item_cat_sales_scaled,avg_item_price_scaled,avg_item_cat_price_scaled,avg_item_price_for_shop_scaled
0,2,27,19,0,1,2013,1.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
1,2,27,19,17,6,2014,1.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842


## Fill Data

In [64]:
def sparse_filler_data(data, feat_list, blk_num_rep, feat_dict, keys):
    frames = []
    for key in tqdm(keys):
        shop_id = key[0]
        item_id = key[1]
        
        df = data.get_group(key)
        df = df.sort_values(by=['date_block_num'])
        
        given_blk = df['date_block_num'].values
        min_blk = given_blk.min()
        max_blk = given_blk.max()
        missing_blks = list(set(range(min_blk,max_blk+1)) - set(given_blk))
        # TODO: NOT HANDLING THE CASE IF missing_blk is empty
        if len(missing_blks) == 0:
            if min_blk == 0:
                missing_blks.append(max_blk+1)
            else:
                missing_blks.append(min_blk-1)
        new_data = np.zeros((len(missing_blks),len(feat_list)))
        
        new_data[:] = df.iloc[0].values
        new_data[:,feat_dict['date_block_num']] = np.asarray(missing_blks)
        new_data[:,feat_dict['month']] = blk_num_rep.iloc[missing_blks,1].values
        new_data[:,feat_dict['year']] = blk_num_rep.iloc[missing_blks,2].values
        new_data[:,feat_dict['item_cnt_month']] = 0
        
        new_df = pd.DataFrame(data=new_data, columns=feat_list)
        frames.append(new_df)
    return pd.concat(frames)

In [65]:
feat_sparse_grpd = features_sparse.groupby(['shop_id','item_id'])
sparse_keys = list(feat_sparse_grpd.groups.keys())

In [66]:
df_sparse_filler = sparse_filler_data(feat_sparse_grpd, feat_list, 
                                      block_num_representation, feat_idx_dict, sparse_keys)

HBox(children=(FloatProgress(value=0.0, max=79336.0), HTML(value='')))




In [67]:
df_sparse_filler

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,total_shop_sales_scaled,avg_shop_sales_scaled,total_item_sales_scaled,avg_item_sales_scaled,total_item_cat_sales_scaled,avg_item_cat_sales_scaled,avg_item_price_scaled,avg_item_cat_price_scaled,avg_item_price_for_shop_scaled
0,2.0,27.0,19.0,1.0,2.0,2013.0,0.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
1,2.0,27.0,19.0,2.0,3.0,2013.0,0.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
2,2.0,27.0,19.0,3.0,4.0,2013.0,0.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
3,2.0,27.0,19.0,4.0,8.0,2013.0,0.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
4,2.0,27.0,19.0,5.0,9.0,2013.0,0.0,-0.575646,-0.236925,-0.092059,-0.099279,2.131144,-0.196518,0.351993,-0.093984,0.406842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,59.0,22145.0,37.0,17.0,11.0,2013.0,0.0,-0.258305,-0.266061,0.112422,0.029313,1.611160,-0.342247,-0.152824,-0.388709,-0.147552
3,59.0,22145.0,37.0,18.0,1.0,2014.0,0.0,-0.258305,-0.266061,0.112422,0.029313,1.611160,-0.342247,-0.152824,-0.388709,-0.147552
4,59.0,22145.0,37.0,19.0,2.0,2014.0,0.0,-0.258305,-0.266061,0.112422,0.029313,1.611160,-0.342247,-0.152824,-0.388709,-0.147552
0,59.0,22150.0,37.0,5.0,9.0,2013.0,0.0,-0.258305,-0.266061,-0.080290,-0.099279,1.611160,-0.342247,-0.332587,-0.388709,-0.351910


## Merge Embeddings

In [68]:
final_df_sparse = pd.merge(df_sparse_filler,shop_id_embedding_df,on=['shop_id'])
final_df_sparse = pd.merge(final_df_sparse,item_id_embedding_df,on=['item_id'])
final_df_sparse = pd.merge(final_df_sparse,item_cat_id_embedding_df,on=['item_category_id'])
final_df_sparse = pd.merge(final_df_sparse,month_embedding_df,on=['month'])
final_df_sparse = pd.merge(final_df_sparse,years_embedding_df,on=['year'])

In [69]:
final_df_sparse

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,total_shop_sales_scaled,avg_shop_sales_scaled,total_item_sales_scaled,...,item_id_embedding,item_category_id_embedding_0,item_category_id_embedding_1,item_category_id_embedding_2,month_embedding_0,month_embedding_1,month_embedding_2,month_embedding_3,year_embedding_0,year_embedding_1
0,2.0,27.0,19.0,1.0,2.0,2013.0,0.0,-0.575646,-0.236925,-0.092059,...,1.000000,0.097284,-0.027431,-0.519356,-0.175444,0.098907,-0.404893,0.243646,1.162543,-0.347262
1,11.0,27.0,19.0,1.0,2.0,2013.0,0.0,-0.671075,-0.300821,-0.092059,...,1.000000,0.097284,-0.027431,-0.519356,-0.175444,0.098907,-0.404893,0.243646,1.162543,-0.347262
2,45.0,27.0,19.0,1.0,2.0,2013.0,0.0,-0.380903,-0.258521,-0.092059,...,1.000000,0.097284,-0.027431,-0.519356,-0.175444,0.098907,-0.404893,0.243646,1.162543,-0.347262
3,5.0,2249.0,19.0,1.0,2.0,2013.0,0.0,-0.365928,-0.352855,-0.057488,...,0.967391,0.097284,-0.027431,-0.519356,-0.175444,0.098907,-0.404893,0.243646,1.162543,-0.347262
4,2.0,2715.0,19.0,1.0,2.0,2013.0,0.0,-0.575646,-0.236925,-0.092059,...,1.000000,0.097284,-0.027431,-0.519356,-0.175444,0.098907,-0.404893,0.243646,1.162543,-0.347262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285806,55.0,766.0,78.0,18.0,1.0,2014.0,0.0,-0.009672,1.050984,-0.120745,...,1.500000,-0.916870,-0.122195,-0.984632,0.302301,-0.004570,0.316923,0.207118,0.262466,-0.452195
285807,55.0,769.0,78.0,18.0,1.0,2014.0,0.0,-0.009672,1.050984,-0.120745,...,1.000000,-0.916870,-0.122195,-0.984632,0.302301,-0.004570,0.316923,0.207118,0.262466,-0.452195
285808,55.0,12230.0,78.0,18.0,1.0,2014.0,0.0,-0.009672,1.050984,-0.119274,...,1.000000,-0.916870,-0.122195,-0.984632,0.302301,-0.004570,0.316923,0.207118,0.262466,-0.452195
285809,55.0,17705.0,78.0,18.0,1.0,2014.0,0.0,-0.009672,1.050984,-0.121480,...,1.000000,-0.916870,-0.122195,-0.984632,0.302301,-0.004570,0.316923,0.207118,0.262466,-0.452195


In [70]:
final_df_nice_and_sparse = pd.concat([final_df_to_train, final_df_sparse])

In [72]:
final_df_nice_and_sparse.to_pickle("simplified_data_v2/train_features_df.pkl")

In [73]:
final_df_nice_sparse_grpd = final_df_nice_and_sparse.groupby(['shop_id','item_id'])
spare_nice_keys = list(final_df_nice_sparse_grpd.groups.keys())

In [76]:
X, y = generate_tensor(final_df_nice_sparse_grpd, 2, spare_nice_keys)

HBox(children=(FloatProgress(value=0.0, max=276626.0), HTML(value='')))




In [77]:
X.shape

(2415271, 2, 23)

In [78]:
with open('simplified_data_v2/train_features_v3.pkl','wb') as f:
    pickle.dump(X,f)

with open('simplified_data_v2/train_labels_v3.pkl','wb') as f:
    pickle.dump(y,f)

In [2]:
kushal_data = pd.read_pickle('training_data_kushal_v2.pkl')

In [5]:
kushal_data.drop(['item_name'],inplace=True,axis=1)

In [9]:
scaler = MinMaxScaler()
kushal_data[['date_item_avg_item_cnt', 'date_avg_item_cnt', 
    'date_shop_avg_item_cnt', 
    'date_cat_avg_item_cnt', 
    'date_shop_cat_avg_item_cnt', 
    'date_shop_type_avg_item_cnt','date_shop_subtype_avg_item_cnt', 
    'date_city_avg_item_cnt', 
    'date_item_city_avg_item_cnt', 'date_type_avg_item_cnt', 
    'date_subtype_avg_item_cnt', 'item_avg_item_price', 
    'date_item_avg_item_price', 'date_shop_revenue', 
    'shop_avg_revenue', 'item_cnt_month']] = scaler.fit_transform(kushal_data[['date_item_avg_item_cnt', 
                                                                                        'date_avg_item_cnt', 
                                                                               'date_shop_avg_item_cnt', 
                                                                               'date_cat_avg_item_cnt', 
                                                                               'date_shop_cat_avg_item_cnt', 
                                                                               'date_shop_type_avg_item_cnt', 
                                                                               'date_shop_subtype_avg_item_cnt', 
                                                                               'date_city_avg_item_cnt',
                                                                               'date_item_city_avg_item_cnt', 
                                                                               'date_type_avg_item_cnt', 
                                                                               'date_subtype_avg_item_cnt', 
                                                                               'item_avg_item_price', 
                                                                               'date_item_avg_item_price', 
                                                                               'date_shop_revenue', 
                                                                               'shop_avg_revenue', 'item_cnt_month']])

Unnamed: 0,date_block_num,shop_id,item_id,city_code,item_category_id,type_code,subtype_code,date_item_avg_item_cnt,date_avg_item_cnt,date_shop_avg_item_cnt,...,date_type_avg_item_cnt,date_subtype_avg_item_cnt,item_avg_item_price,date_item_avg_item_price,date_shop_revenue,shop_avg_revenue,days,month,year,item_cnt_month
0,0,2,27,0,19,6,32,0.013806,0.202840,0.166246,...,0.005368,0.004961,0.028549,0.045560,0.069551,0.150392,31,1,1,0.013806
1,0,2,33,0,37,9,6,0.014426,0.202840,0.166246,...,0.003312,0.003312,0.004862,0.009575,0.069551,0.150392,31,1,1,0.013806
2,0,2,317,0,45,2,4,0.013806,0.202840,0.166246,...,0.003263,0.002961,0.005609,0.005860,0.069551,0.150392,31,1,1,0.013806
3,0,2,438,0,45,2,4,0.013806,0.202840,0.166246,...,0.003263,0.002961,0.005604,0.005321,0.069551,0.150392,31,1,1,0.013806
4,0,2,471,0,49,2,28,0.014535,0.202840,0.166246,...,0.003263,0.004239,0.007687,0.007820,0.069551,0.150392,31,1,1,0.014406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609118,33,59,22087,29,83,19,45,0.015214,0.312373,0.166625,...,0.006015,0.006015,0.001630,0.002331,0.062402,0.155623,31,10,3,0.016807
1609119,33,59,22088,29,83,19,45,0.016471,0.312373,0.166625,...,0.006015,0.006015,0.001703,0.002322,0.062402,0.155623,31,10,3,0.014406
1609120,33,59,22091,29,83,19,45,0.014106,0.312373,0.166625,...,0.006015,0.006015,0.002416,0.003456,0.062402,0.155623,31,10,3,0.013806
1609121,33,59,22100,29,42,2,44,0.016080,0.312373,0.166625,...,0.005083,0.006478,0.012207,0.012329,0.062402,0.155623,31,10,3,0.013806


In [12]:
def shift_columns(data):
    keys = set(data.groups.keys())
    frames = []
    for key in tqdm(keys):
        df = data.get_group(key)
        df =  df.sort_values(by=['date_block_num'])
        df['item_cnt_next'] = df['item_cnt_month'].values
        df['item_cnt_next'].shift(-1)
        df['item_cnt_next'].fillna(0)
        frames.append(df)
    return pd.concat(frames)

new_df = shift_columns(kushal_data.groupby(['shop_id','item_id']))

HBox(children=(FloatProgress(value=0.0, max=418908.0), HTML(value='')))


