In [1]:
import datetime as dt
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt
from dateutil.relativedelta import relativedelta

# Define Data Path

In [2]:
# Directory holding every input and output parquet file (relative path).
data_path = '../data/ECommerce/Multi-Store'
# Event data files for the train / validation / test splits.
train_file = 'train_down.parquet'
val_file = 'val_down.parquet'
test_file = 'test_down.parquet'

# Label tables for the same three splits.
train_label_file = 'train_label.parquet'
val_label_file = 'val_label.parquet'
test_label_file = 'test_label.parquet'

## Prep Train Feature

In [3]:
# Training events and the label table whose (user_id, event_month) rows get features below.
train_data = pd.read_parquet(os.path.join(data_path, train_file))
train_label = pd.read_parquet(os.path.join(data_path, train_label_file))

In [14]:
# Class names taken from every label column after the first two, with 'count_' stripped
# (str.replace removes every occurrence, not only a leading prefix).
# Presumably the first two columns are user_id and event_month — TODO confirm.
classes = [col.replace('count_', '') for col in train_label.columns[2:]]

In [4]:
# Month key = first seven characters of the event timestamp string.
train_data['event_month'] = train_data['event_time'].str.slice(stop=7)

# Number of priced events per user and month, one column per (cat_0, event_type) pair.
agg_count = train_data.pivot_table(
    index=['user_id', 'event_month'],
    columns=['cat_0', 'event_type'],
    values='price',
    aggfunc='count',
)

### Aggregate data

In [5]:
# Collapse the two-level (cat_0, event_type) header into '<event_type>_<cat_0>_cnt' names.
cols = ['_'.join((event_type, category, 'cnt')) for category, event_type in agg_count.columns]
agg_count.columns = cols

In [6]:
# Convert the whole column in one vectorised call instead of invoking
# pd.to_numeric once per row through Series.apply.
train_data['price'] = pd.to_numeric(train_data['price'])

In [7]:
# Total price per user and month, one column per (cat_0, event_type) pair.
agg_price = train_data.pivot_table(
    index=['user_id', 'event_month'],
    columns=['cat_0', 'event_type'],
    values='price',
    aggfunc='sum',
)
# Collapse the two-level header into '<event_type>_<cat_0>_sum' names.
cols = ['_'.join((event_type, category, 'sum')) for category, event_type in agg_price.columns]
agg_price.columns = cols
agg_price.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cart_NA_sum,purchase_NA_sum,cart_accessories_sum,purchase_accessories_sum,cart_apparel_sum,purchase_apparel_sum,cart_appliances_sum,purchase_appliances_sum,cart_auto_sum,purchase_auto_sum,...,cart_furniture_sum,purchase_furniture_sum,cart_kids_sum,purchase_kids_sum,cart_medicine_sum,purchase_medicine_sum,cart_sport_sum,purchase_sport_sum,cart_stationery_sum,purchase_stationery_sum
user_id,event_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
100037567,2020-01,,,,,771.45,,,,,,...,,,,,,,,,,
100140882,2020-01,,,,,,,411.8,,,,...,,,,,,,,,,
107837897,2019-11,,,,,,,,,600.74,,...,,,,,,,,,,
124298297,2019-11,95.24,,,,,,,,,,...,,,,,,,,,,
125917727,2019-11,343.0,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Key columns identifying one label row; also read as a global by get_feature.
index_cols = ['user_id', 'event_month']

In [9]:
def get_previos_month(current, p=1):
    """Return the 'YYYY-MM' string for the month ``p`` months before ``current``.

    ``current`` is parsed with the "%Y-%m" format, shifted back by ``p``
    calendar months, and formatted back with the same pattern.
    """
    month_format = "%Y-%m"
    shifted = dt.datetime.strptime(current, month_format) - relativedelta(months=p)
    return shifted.strftime(month_format)

In [10]:
def get_feature(index_df, feature_df, p=1):
    """Attach the features observed ``p`` months before each row's month.

    Parameters
    ----------
    index_df : DataFrame
        One row per label with 'user_id' and 'event_month' ('YYYY-MM'
        string) columns. It is not modified.
    feature_df : DataFrame
        Monthly features addressable by 'user_id' and 'event_month'
        (the merge looks both names up on the right-hand frame).
    p : int, default 1
        How many months to look back.

    Returns
    -------
    DataFrame
        The index columns followed by every feature column renamed with a
        ``_P{p}M`` suffix. The join is a left join, so rows with no match in
        ``feature_df`` keep NaN features.
    """
    # `p` must be passed by keyword: the second positional parameter of
    # Series.apply is not forwarded to the function, so `apply(func, p)`
    # silently evaluated every lookback with the default p=1.
    lookup = index_df.assign(
        join_month=index_df['event_month'].apply(get_previos_month, p=p)
    )
    tmp_feature = lookup.merge(feature_df, left_on=['user_id', 'join_month'],
                               right_on=['user_id', 'event_month'], how='left')
    tmp_feature = tmp_feature.drop(columns='join_month')
    # Assumes the first two columns of the merged frame are the index columns;
    # everything after them is a feature and gets the lookback suffix.
    feature_cols = [f'{col}_P{p}M' for col in tmp_feature.columns[2:]]
    tmp_feature.columns = index_cols + feature_cols
    return tmp_feature

In [11]:
# Start from the label keys and append lagged feature blocks column-wise.
train_index = train_label[index_cols].copy()
index_df = train_index.set_index(['user_id', 'event_month'])
for i in range(1, 4):
    # For each lookback: event counts first, then price sums.
    for agg_table in (agg_count, agg_price):
        feature = get_feature(train_index[index_cols], agg_table, p=i)
        feature = feature.set_index(['user_id', 'event_month'])
        index_df = pd.concat([index_df, feature], join='inner', axis=1)

In [35]:
def get_purchase_freq(df):
    """Add one ``<class>_freq`` column per entry of the global ``classes``.

    Each new column counts, per row, how many of the three lagged columns
    ``purchase_<class>_cnt_P1M`` .. ``_P3M`` hold a value greater than zero.
    The frame is modified in place and the same object is returned.
    """
    lookbacks = range(1, 4)
    for category in classes:
        lagged_cols = [f'purchase_{category}_cnt_P{month}M' for month in lookbacks]
        # NaN > 0 evaluates to False, so months without data count as no purchase.
        had_purchase = df[lagged_cols] > 0
        df[f'{category}_freq'] = had_purchase.sum(axis=1)
    return df

In [36]:
# get_purchase_freq adds its columns to index_df in place and returns that same
# object, so train_feature and index_df refer to one frame.
train_feature = get_purchase_freq(index_df)

In [40]:
# Move user_id / event_month back into ordinary columns, then save next to the inputs.
# NOTE: the reset is in place, so this cell should be run only once per build.
train_feature.reset_index(inplace=True)
train_feature.to_parquet(os.path.join(data_path, 'train_feature.parquet'))

In [69]:
# Drop the names of the saved training features before the next split is built.
del index_df, train_feature

## Prep Val Feature

In [79]:
# Events per month; sorting the 'YYYY-MM' strings gives chronological order.
train_data['event_month'].value_counts().sort_index()

2019-10    145312
2019-11    492652
2019-12    527046
2020-01    416087
2020-02    500763
Name: event_month, dtype: int64

In [42]:
# Label table for the validation split.
val_label = pd.read_parquet(os.path.join(data_path, val_label_file))

In [44]:
# Same assembly as for training, keyed on the validation labels; the aggregates
# in scope here are still the ones computed from train_data.
val_index = val_label[index_cols].copy()
index_df = val_index.set_index(['user_id', 'event_month'])
for i in range(1, 4):
    # For each lookback: event counts first, then price sums.
    for agg_table in (agg_count, agg_price):
        feature = get_feature(val_index[index_cols], agg_table, p=i)
        feature = feature.set_index(['user_id', 'event_month'])
        index_df = pd.concat([index_df, feature], join='inner', axis=1)

In [45]:
# Adds the per-class purchase frequency columns; val_feature is the same object as index_df.
val_feature = get_purchase_freq(index_df)

In [46]:
# Move user_id / event_month back into ordinary columns, then save next to the inputs.
# NOTE: the reset is in place, so this cell should be run only once per build.
val_feature.reset_index(inplace=True)
val_feature.to_parquet(os.path.join(data_path, 'val_feature.parquet'))

## Prep Test Feature

In [47]:
# Label table for the test split.
test_label = pd.read_parquet(os.path.join(data_path, test_label_file))

In [88]:
# Which month(s) the test labels cover; this determines the lookback window needed below.
test_label['event_month'].unique()

array(['2020-04'], dtype=object)

In [48]:
# Validation-period events with the same month key as train_data
# (first seven characters of the timestamp string).
val_data = (
    pd.read_parquet(os.path.join(data_path, val_file))
    .assign(event_month=lambda frame: frame['event_time'].str[:7])
)

In [49]:
# Put the Jan/Feb 2020 training events in front of the validation events so the
# frame covers the months preceding the test labels.
# NOTE: val_data appears on both sides, so re-running this cell stacks the
# training rows again.
val_data = pd.concat(
    [train_data.loc[train_data['event_month'].isin(['2020-01', '2020-02'])], val_data],
    axis=0,
)
val_data['event_month'].value_counts(dropna=False)

2020-02    500763
2020-01    416087
2020-03    242725
Name: event_month, dtype: int64

In [51]:
# Rebuild both monthly aggregates from val_data, the window that precedes the
# test labels.
agg_count = pd.pivot_table(data=val_data, index=['user_id', 'event_month'],
                           columns=['cat_0', 'event_type'], values='price',
                           aggfunc='count'
                          )
cols = [col[1]+'_'+col[0]+'_cnt' for col in agg_count.columns]
agg_count.columns = cols

# The price sums must use the same frame as the counts. Pivoting train_data
# here left the test price features without the most recent month, which only
# val_data contains. The rows read from the validation file never went through
# the numeric conversion applied to train_data, so coerce 'price' before
# summing (a no-op when the column is already numeric).
agg_price = pd.pivot_table(data=val_data.assign(price=pd.to_numeric(val_data['price'])),
                           index=['user_id', 'event_month'],
                           columns=['cat_0', 'event_type'], values='price',
                           aggfunc='sum'
                          )
cols = [col[1]+'_'+col[0]+'_sum' for col in agg_price.columns]
agg_price.columns = cols

In [52]:
# Same assembly as for the other splits, keyed on the test labels and using the
# aggregates rebuilt in the previous cell.
test_index = test_label[index_cols].copy()
index_df = test_index.set_index(['user_id', 'event_month'])
for i in range(1, 4):
    # For each lookback: event counts first, then price sums.
    for agg_table in (agg_count, agg_price):
        feature = get_feature(test_index[index_cols], agg_table, p=i)
        feature = feature.set_index(['user_id', 'event_month'])
        index_df = pd.concat([index_df, feature], join='inner', axis=1)

In [53]:
# Adds the per-class purchase frequency columns; test_feature is the same object as index_df.
test_feature = get_purchase_freq(index_df)

In [54]:
# Move user_id / event_month back into ordinary columns, then save next to the inputs.
# NOTE: the reset is in place, so this cell should be run only once per build.
test_feature.reset_index(inplace=True)
test_feature.to_parquet(os.path.join(data_path, 'test_feature.parquet'))