In [2]:
import pandas as pd
import numpy as np
import gc
from tqdm.notebook import tqdm
import pickle
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

In [7]:
# Raw inputs: cleaned daily sales and the item catalogue.
train = pd.read_csv('sales_train_cleaned.csv')
item_df = pd.read_csv('items.csv')


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Pre-computed embedding weight matrices for each categorical feature.
shop_id_embedding_weights = _load_pickle('shop_id_embedding_weights_v2.pkl')
month_embedding_weights = _load_pickle('month_embedding_weights_v2.pkl')
year_embedding_weights = _load_pickle('year_embedding_weights_v2.pkl')
item_cat_embedding_weights = _load_pickle('item_cat_embedding_weights_v2.pkl')

# Merging Mislabelled Shops

Upon manually going through the shop data, I discovered the following possible cases where the same shop has been given different shop_id values, so they are merged together. The shops of interest are:
1. 0 => 57 (This is contentious)
2. 1 => 58 (This is contentious)
3. 10 => 11

In [8]:
# Re-label each duplicate shop id with the id it is merged into.
for duplicate_id, canonical_id in ((0, 57), (1, 58), (10, 11)):
    train.loc[train['shop_id'] == duplicate_id, 'shop_id'] = canonical_id

In [4]:
# Parse the 'dd.mm.YYYY' strings with the vectorized parser rather than a
# per-row strptime call; the explicit format keeps the parsing strict.
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

# Calendar parts come straight from the datetime accessor (no strftime
# round-trip); cast to int64 so the dtype matches plain Python ints.
train['day'] = train['date'].dt.day.astype('int64')
train['month'] = train['date'].dt.month.astype('int64')
train['year'] = train['date'].dt.year.astype('int64')

# Feature Adding

## Monthly Sales

In [5]:
# Total units sold per (shop, item, month block).
features = (
    train
    .groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,2,27,0,1.0
1,2,27,17,1.0
2,2,30,2,1.0
3,2,30,5,1.0
4,2,30,15,1.0
...,...,...,...,...
1609117,59,22164,27,2.0
1609118,59,22164,30,1.0
1609119,59,22167,9,1.0
1609120,59,22167,11,2.0


## Temporal

In [6]:
# One row per distinct (date_block_num, month, year) combination seen in train.
block_num_representation = (
    train[['date_block_num', 'month', 'year']].drop_duplicates()
)

In [7]:
# Attach month and year to every monthly record via its date block.
features = features.merge(block_num_representation, on='date_block_num')
features

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,month,year
0,2,27,0,1.0,1,2013
1,2,33,0,1.0,1,2013
2,2,317,0,1.0,1,2013
3,2,438,0,1.0,1,2013
4,2,471,0,2.0,1,2013
...,...,...,...,...,...,...
1609117,59,22058,29,1.0,6,2015
1609118,59,22087,29,1.0,6,2015
1609119,59,22088,29,4.0,6,2015
1609120,59,22102,29,1.0,6,2015


## Item Category ID

In [15]:
# Item catalogue without the free-text name column.
item_df_no_name = item_df.drop(columns='item_name')

In [16]:
# Bring in the remaining catalogue columns for each item.
features = features.merge(item_df_no_name, on='item_id')

In [18]:
# Keep only the modelling columns, identifiers first and the target last.
features = features.loc[:, [
    'shop_id', 'item_id', 'item_category_id',
    'date_block_num', 'month', 'year', 'item_cnt_month',
]]

# Insert Embedding

## Shop ID Embedding

In [9]:
# Sorted integer shop ids present in the training data.
unique_shop_id = sorted(train['shop_id'].unique().astype(int))

# Fit the encoder on those ids; the fitted encoder is the cell's displayed value.
le = LabelEncoder()
le.fit(unique_shop_id)

LabelEncoder()

In [11]:
# First column: encoder classes (shop ids); remaining columns: embedding weights.
# NOTE(review): assumes weight row i belongs to the i-th class -- confirm
# against the notebook that trained the embedding.
shop_id_column = le.classes_.reshape(-1, 1)
data = np.hstack((shop_id_column, shop_id_embedding_weights))

In [12]:
# Key column followed by one named column per embedding dimension.
columns = ['shop_id'] + [
    'shop_embedding_' + str(dim)
    for dim in range(shop_id_embedding_weights.shape[1])
]

shop_id_embedding_df = pd.DataFrame(data=data, columns=columns)

In [13]:
# Persist the shop-id embedding lookup table.
# to_pickle does not create directories: "embeddings/" must already exist.
shop_id_embedding_df.to_pickle("embeddings/shop_id_embedding.pkl")

## Item Category ID Embedding

In [15]:
# Sorted distinct category ids from the item catalogue.
unique_item_cat = np.sort(item_df['item_category_id'].unique())

# Category id column followed by the embedding weights.
# NOTE(review): assumes weight row i belongs to the i-th sorted category id -- confirm.
cat_data = np.hstack((unique_item_cat.reshape(-1, 1), item_cat_embedding_weights))
cat_data.shape

(84, 11)

In [16]:
# Key column followed by one named column per embedding dimension.
columns = ['item_category_id'] + [
    'item_category_id_embedding_' + str(dim)
    for dim in range(item_cat_embedding_weights.shape[1])
]

item_cat_id_embedding_df = pd.DataFrame(data=cat_data, columns=columns)

In [18]:
# Persist the item-category embedding lookup table.
# to_pickle does not create directories: "embeddings/" must already exist.
item_cat_id_embedding_df.to_pickle("embeddings/item_cat_id_embedding.pkl")

## Month Embedding

In [21]:
# Calendar months 1..12 as a column vector, followed by the embedding weights.
months = np.arange(1, 13).reshape(-1, 1)
month_data = np.hstack((months, month_embedding_weights))
month_data.shape

(12, 7)

In [22]:
# Key column followed by one named column per embedding dimension. The width
# is read from the weight matrix (as in the shop and category cells) instead
# of being hard-coded, so a retrained embedding of another size still works.
columns = ['month'] + [
    'month_embedding_' + str(i)
    for i in range(month_embedding_weights.shape[1])
]

month_embedding_df = pd.DataFrame(data=month_data, columns=columns)

In [23]:
# Persist the month embedding lookup table.
# to_pickle does not create directories: "embeddings/" must already exist.
month_embedding_df.to_pickle("embeddings/month_embedding.pkl")

## Year Embedding

In [24]:
# Years 2013..2015 as a column vector, followed by the embedding weights.
years = np.arange(2013, 2016).reshape(-1, 1)
years_data = np.hstack((years, year_embedding_weights))
years_data.shape

(3, 4)

In [25]:
# Key column followed by one named column per embedding dimension. The width
# is read from the weight matrix (as in the shop and category cells) instead
# of being hard-coded.
columns = ['year'] + [
    'year_embedding_' + str(i)
    for i in range(year_embedding_weights.shape[1])
]
years_embedding_df = pd.DataFrame(data=years_data, columns=columns)

In [27]:
# Persist the year embedding lookup table.
# to_pickle does not create directories: "embeddings/" must already exist.
years_embedding_df.to_pickle("embeddings/year_embedding.pkl")

## Item ID Embedding

In [28]:
item_ids = item_df['item_id'].unique()

# The item "embedding" is one scalar: the item's mean item_cnt_day over all
# of its rows in train.
item_id_embedding = (
    train
    .groupby('item_id', as_index=False)['item_cnt_day']
    .mean()
    .rename(columns={'item_cnt_day': 'item_id_embedding'})
)

In [29]:
# Range check: the largest value in the item_id_embedding column.
item_id_embedding['item_id_embedding'].max()

65.47368421052632

In [30]:
# Persist the per-item lookup table.
# to_pickle does not create directories: "embeddings/" must already exist.
item_id_embedding.to_pickle("embeddings/item_id_embedding.pkl")

# Generate Missing Data

In [34]:
# One group per (shop, item) pair.
features_grpd = features.groupby(['shop_id', 'item_id'])

# Column names in order, plus a name -> position lookup over the same order.
feature_list = list(features.columns)
feature_idx_dict = {name: position for position, name in enumerate(feature_list)}

In [30]:
def naive_fill(data, feature_idx_dict, feat_list, n_blocks=34):
    """Build all-zero rows for the date blocks in which a shop/item pair has no record.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single (shop_id, item_id) group. Must be non-empty and
        contain 'shop_id', 'item_id', 'item_category_id' and 'date_block_num'.
    feature_idx_dict : dict
        Maps each output column name to its column position.
    feat_list : list
        Output column names, ordered consistently with ``feature_idx_dict``.
    n_blocks : int, optional
        Number of date blocks (0 .. n_blocks - 1) every pair should cover.
        Defaults to 34, the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One integer row per missing block, ascending by 'date_block_num'.
        The three id columns are copied from the first row of ``data``; every
        other column (e.g. the sales count) is 0.
    """
    present_blocks = set(data['date_block_num'].values)
    # Sort explicitly: set iteration order is not a documented guarantee.
    missing_block_nums = np.asarray(
        sorted(set(range(n_blocks)) - present_blocks), dtype=int
    )

    df_data = np.zeros((len(missing_block_nums), len(feature_idx_dict)), dtype=int)
    df_data[:, feature_idx_dict['date_block_num']] = missing_block_nums
    # The ids are constant within a group, so the first row supplies them.
    for id_column in ('shop_id', 'item_id', 'item_category_id'):
        df_data[:, feature_idx_dict[id_column]] = int(data[id_column].values[0])

    return pd.DataFrame(data=df_data, columns=feat_list)

In [37]:
def generate_fill_data(data, feature_idx_dict, feat_list):
    """Create the zero-sales filler rows for every group and stack them.

    Parameters
    ----------
    data : pandas DataFrameGroupBy
        The feature frame grouped by shop/item pair.
    feature_idx_dict : dict
        Column name -> column position, passed through to ``naive_fill``.
    feat_list : list
        Output column names, passed through to ``naive_fill``.

    Returns
    -------
    pandas.DataFrame
        The concatenation of ``naive_fill``'s output for each group. The
        per-group indices are kept, so index labels repeat across groups.
    """
    # Iterate the groupby directly rather than calling get_group for every
    # key, which avoids one lookup per group.
    frames = [
        naive_fill(group, feature_idx_dict, feat_list)
        for _, group in tqdm(data, total=data.ngroups)
    ]
    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=feat_list)
    return pd.concat(frames)

In [47]:
# Filler (zero-sales) rows for every shop/item pair.
df = generate_fill_data(
    data=features_grpd,
    feature_idx_dict=feature_idx_dict,
    feat_list=feature_list,
)

HBox(children=(FloatProgress(value=0.0, max=418908.0), HTML(value='')))




In [48]:
# Remove the existing month/year columns from the filler rows.
df = df.drop(columns=['month', 'year'])

In [49]:
# Look up month and year for each filler row from its date block.
df = df.merge(block_num_representation, on='date_block_num')

In [50]:
# Select the filler columns in this fixed order.
df = df.loc[:, [
    'shop_id', 'item_id', 'item_category_id',
    'date_block_num', 'month', 'year', 'item_cnt_month',
]]
df

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month
0,2,27,19,1,2,2013,0
1,2,30,40,1,2,2013,0
2,2,32,40,1,2,2013,0
3,2,33,37,1,2,2013,0
4,2,53,57,1,2,2013,0
...,...,...,...,...,...,...,...
12633745,59,22153,37,17,6,2014,0
12633746,59,22154,37,17,6,2014,0
12633747,59,22155,37,17,6,2014,0
12633748,59,22162,40,17,6,2014,0


In [62]:
# Add the embeddings: join each lookup table on its key column.
features = (
    features
    .merge(shop_id_embedding_df, on='shop_id')
    .merge(item_id_embedding, on='item_id')
    .merge(item_cat_id_embedding_df, on='item_category_id')
    .merge(month_embedding_df, on='month')
    .merge(years_embedding_df, on='year')
)

In [63]:
# Display the feature frame after the embedding columns were merged in.
features

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,shop_embedding_0,shop_embedding_1,shop_embedding_2,...,item_category_id_embedding_9,month_embedding_0,month_embedding_1,month_embedding_2,month_embedding_3,month_embedding_4,month_embedding_5,year_embedding_0,year_embedding_1,year_embedding_2
0,2,27,19,0,1,2013,1.0,-0.713061,-0.670228,1.463010,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
1,11,27,19,0,1,2013,1.0,0.572422,-0.292263,1.089834,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
2,19,27,19,0,1,2013,1.0,-0.967941,-0.688419,-0.363679,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
3,26,27,19,0,1,2013,1.0,0.552903,-1.125504,0.263646,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
4,31,27,19,0,1,2013,1.0,-0.221072,0.422647,-0.197449,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609117,55,4208,74,32,9,2015,1.0,-0.592677,0.050103,-0.552469,...,-0.236099,0.683028,0.027217,-0.029337,0.296879,-0.114201,0.586602,-2.487426,-0.417784,0.616164
1609118,55,13093,36,32,9,2015,4.0,-0.592677,0.050103,-0.552469,...,-0.502669,0.683028,0.027217,-0.029337,0.296879,-0.114201,0.586602,-2.487426,-0.417784,0.616164
1609119,55,13095,36,32,9,2015,1.0,-0.592677,0.050103,-0.552469,...,-0.502669,0.683028,0.027217,-0.029337,0.296879,-0.114201,0.586602,-2.487426,-0.417784,0.616164
1609120,55,13091,36,32,9,2015,1.0,-0.592677,0.050103,-0.552469,...,-0.502669,0.683028,0.027217,-0.029337,0.296879,-0.114201,0.586602,-2.487426,-0.417784,0.616164


In [71]:
# Persist the feature frame.
# to_pickle does not create directories: "simplified_data/" must already exist.
features.to_pickle("simplified_data/features.pkl")

In [65]:
# Add the embedding for the filler data: join each lookup table on its key column.
df = (
    df
    .merge(shop_id_embedding_df, on='shop_id')
    .merge(item_id_embedding, on='item_id')
    .merge(item_cat_id_embedding_df, on='item_category_id')
    .merge(month_embedding_df, on='month')
    .merge(years_embedding_df, on='year')
)

In [69]:
# Display the filler frame after the embedding columns were merged in.
df

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,shop_embedding_0,shop_embedding_1,shop_embedding_2,...,item_category_id_embedding_9,month_embedding_0,month_embedding_1,month_embedding_2,month_embedding_3,month_embedding_4,month_embedding_5,year_embedding_0,year_embedding_1,year_embedding_2
0,2,27,19,1,2,2013,0,-0.713061,-0.670228,1.463010,...,-0.013860,-0.178128,0.009054,0.665452,-0.028808,0.299549,-0.671503,-0.256262,-1.834511,-0.331701
1,3,27,19,1,2,2013,0,-0.457793,-0.163692,0.667918,...,-0.013860,-0.178128,0.009054,0.665452,-0.028808,0.299549,-0.671503,-0.256262,-1.834511,-0.331701
2,4,27,19,1,2,2013,0,-0.055994,-0.767383,0.145066,...,-0.013860,-0.178128,0.009054,0.665452,-0.028808,0.299549,-0.671503,-0.256262,-1.834511,-0.331701
3,5,27,19,1,2,2013,0,-0.775711,0.062276,-0.186296,...,-0.013860,-0.178128,0.009054,0.665452,-0.028808,0.299549,-0.671503,-0.256262,-1.834511,-0.331701
4,6,27,19,1,2,2013,0,0.437256,-0.032174,0.552518,...,-0.013860,-0.178128,0.009054,0.665452,-0.028808,0.299549,-0.671503,-0.256262,-1.834511,-0.331701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12633745,55,13091,36,24,1,2015,0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164
12633746,55,13092,36,24,1,2015,0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164
12633747,55,13093,36,24,1,2015,0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164
12633748,55,13094,36,24,1,2015,0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164


In [70]:
# Split the filler frame into two files at its midpoint. The midpoint is
# derived from the real row count, so no rows are silently dropped if the
# frame's length ever differs from the previously hard-coded 12633750.
# to_pickle does not create directories: "simplified_data/" must already exist.
half = len(df) // 2
df.iloc[:half].to_pickle("simplified_data/filler_data_part_1.pkl")
df.iloc[half:].to_pickle("simplified_data/filler_data_part_2.pkl")



In [72]:
# Stack the features frame on top of the filler frame. The index is not
# reset, so each part keeps its own 0-based labels.
final_data = pd.concat((features, df), axis=0)
final_data

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,month,year,item_cnt_month,shop_embedding_0,shop_embedding_1,shop_embedding_2,...,item_category_id_embedding_9,month_embedding_0,month_embedding_1,month_embedding_2,month_embedding_3,month_embedding_4,month_embedding_5,year_embedding_0,year_embedding_1,year_embedding_2
0,2,27,19,0,1,2013,1.0,-0.713061,-0.670228,1.463010,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
1,11,27,19,0,1,2013,1.0,0.572422,-0.292263,1.089834,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
2,19,27,19,0,1,2013,1.0,-0.967941,-0.688419,-0.363679,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
3,26,27,19,0,1,2013,1.0,0.552903,-1.125504,0.263646,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
4,31,27,19,0,1,2013,1.0,-0.221072,0.422647,-0.197449,...,-0.013860,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-0.256262,-1.834511,-0.331701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12633745,55,13091,36,24,1,2015,0.0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164
12633746,55,13092,36,24,1,2015,0.0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164
12633747,55,13093,36,24,1,2015,0.0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164
12633748,55,13094,36,24,1,2015,0.0,-0.592677,0.050103,-0.552469,...,-0.502669,-0.188419,-0.012766,-0.111624,-0.500017,-0.368897,-0.856338,-2.487426,-0.417784,0.616164


In [77]:
# Write the final frame as four consecutive chunks. The chunk size is the
# ceiling of len/4 computed from the real row count, so every row lands in
# some part even when the length is not the previously hard-coded 14242872
# or is not divisible by four (the last part is then shorter).
n_parts = 4
split = -(-len(final_data) // n_parts)  # ceiling division
for part in range(n_parts):
    chunk = final_data.iloc[split * part:split * (part + 1)]
    chunk.to_pickle("final_df_part_" + str(part + 1) + ".pkl")

In [76]:
# Scratch arithmetic: a quarter of 14242872, using integer division.
14242872//4

3560718.0

# Setup The Kaggle Test 

In [1]:
# Load the Kaggle test set. The import cell must have run in the current
# kernel session first; otherwise `pd` is undefined and this raises NameError.
kaggle_test = pd.read_csv('test.csv')

NameError: name 'pd' is not defined