In [1]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt


## Load Data

In [11]:
data_path = '../data/ECommerce/Multi-Store'
train_file = 'train_down.parquet'
val_file = 'val_down.parquet'
test_file = 'test_down.parquet'


In [3]:
train_data = pd.read_parquet(os.path.join(data_path, train_file))


## Prep Label

In [4]:
user_ids = train_data['user_id'].unique()

In [9]:
def prep_label(filename, event_months=None):
    label = pd.read_parquet(os.path.join(data_path, filename), columns=['event_time', 'event_type', 'user_id', 'cat_0', 'price'])
    label['event_month'] = label['event_time'].str[:7]
    if event_months is None:
        event_months = label['event_month'].unique()
    print('total event months:', event_months)
    cond = ((label['event_month'].isin(event_months)) | (label['event_type']=='purchase'))
    label = label[cond]
    
    label = label.pivot_table(index=['user_id', 'event_month'], columns=['cat_0'], values=['event_time'], aggfunc='count')
    flat_cols = ['count_' + col[1] for col in label.columns]
    label.columns = flat_cols
    label.reset_index(inplace=True)
    
    for m in event_months:
        m_user_ids = set(label[label['event_month']==m]['user_id'].unique())
        nopurchase_ids = set(user_ids).difference(m_user_ids)
        print(f'add {len(nopurchase_ids)} users with no purchase to label df for month {m}')
        
        nopurchase_label = pd.DataFrame(nopurchase_ids, columns=['user_id'])
        nopurchase_label['event_month'] = m
        for col in label.columns[2:]:
            nopurchase_label[col] = np.nan
        label = pd.concat([label, nopurchase_label], ignore_index=True)
    return label

In [7]:
train_data['event_month'] = train_data['event_time'].str[:7]

In [8]:
train_data['event_month'].value_counts(dropna=True)

2019-12    527046
2020-02    500763
2019-11    492652
2020-01    416087
2019-10    145312
Name: event_month, dtype: int64

### Prep Train Label

In [12]:
train_label = prep_label(train_file, event_months=['2020-01', '2020-02'])

total event months: ['2020-01', '2020-02']
add 327390 users with no purchase to label df for month 2020-01
add 318028 users with no purchase to label df for month 2020-02


In [50]:
train_label.iloc[:, 2:].isnull().sum().sort_values()

count_construction     940258
count_electronics     1005682
count_appliances      1028956
count_NA              1046109
count_apparel         1060096
count_sport           1077225
count_furniture       1091108
count_computers       1092688
count_kids            1103388
count_auto            1108740
count_accessories     1109020
count_country_yard    1113135
count_medicine        1114068
count_stationery      1114130
dtype: int64

In [49]:
train_label.iloc[:, 2:].isnull().sum().sort_values().index

Index(['count_construction', 'count_electronics', 'count_appliances',
       'count_NA', 'count_apparel', 'count_sport', 'count_furniture',
       'count_computers', 'count_kids', 'count_auto', 'count_accessories',
       'count_country_yard', 'count_medicine', 'count_stationery'],
      dtype='object')

In [51]:
select_cats = ['count_construction', 'count_electronics', 'count_appliances',
               'count_NA', 'count_apparel', 'count_sport', 'count_furniture',
               'count_computers', 'count_kids', 'count_auto']

## Prep Val and Test Label

In [52]:
val_label = prep_label(val_file)

total event months: ['2020-03']
add 393905 users with no purchase to label df for month 2020-03


In [53]:
test_label = prep_label(test_file)

total event months: ['2020-04']
add 407640 users with no purchase to label df for month 2020-04


## Save Label

In [55]:
index_cols = ['user_id', 'event_month']

In [54]:
train_label.head()

Unnamed: 0,user_id,event_month,count_NA,count_accessories,count_apparel,count_appliances,count_auto,count_computers,count_construction,count_country_yard,count_electronics,count_furniture,count_kids,count_medicine,count_sport,count_stationery
0,100037567,2020-01,,,3.0,,,,,,,,,,,
1,100140882,2020-01,,,,2.0,,,,,,,,,,
2,146333366,2020-01,,,,,,,1.0,,,,,,,
3,151417990,2020-01,,,,,,1.0,,,,,,,,
4,154128341,2020-02,1.0,,,2.0,,,,,,,,,,


In [57]:
train_label[index_cols+select_cats].to_parquet(os.path.join(data_path, 'train_label.parquet'))

In [58]:
val_label[index_cols+select_cats].to_parquet(os.path.join(data_path, 'val_label.parquet'))

In [59]:
test_label[index_cols+select_cats].to_parquet(os.path.join(data_path, 'test_label.parquet'))