In [1]:
# Directory that holds the input CSV files (orders.csv and events.csv are read from here).
path = '.'

In [2]:
import os
import numpy as np
import pandas as pd

import pickle
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import category_encoders as ce

In [3]:
# Load the orders table; order_id and clientId are read as strings.
orders_df = pd.read_csv(
    os.path.join(path, 'orders.csv'),
    header=0,
    dtype=dict.fromkeys(['order_id', 'clientId'], 'str'),
)
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169708 entries, 0 to 169707
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   order_id  169708 non-null  object 
 1   userid    169708 non-null  object 
 2   class     169708 non-null  int64  
 3   age       169350 non-null  float64
 4   clientId  169708 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 6.5+ MB


In [4]:
# Load the raw event log; '@timestamp' is parsed into datetimes at read time.
events_df = pd.read_csv(
    os.path.join(path, 'events.csv'),
    header=0,
    dtype=dict.fromkeys(['order_id', 'clientId'], 'str'),
    parse_dates=['@timestamp'],
)
# Optional: uncomment to iterate on a 100k-row sample instead of the full log.
# events_df = events_df.head(100000)
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564308 entries, 0 to 9564307
Data columns (total 24 columns):
 #   Column                    Dtype              
---  ------                    -----              
 0   @timestamp                datetime64[ns, UTC]
 1   order_id                  object             
 2   sessionId                 object             
 3   screenResolution          object             
 4   viewportSize              object             
 5   screenColors              object             
 6   lclt                      object             
 7   lcng                      object             
 8   hitType                   object             
 9   eventCategory             object             
 10  eventAction               object             
 11  referrer                  object             
 12  value                     float64            
 13  headers.http_user_agent   object             
 14  headers.x_forwarded_for   object             
 15  headers.clienti

In [5]:
def preprocess_orders(ao):
  """Clean the raw orders table.

  Keeps the last row for every ``order_id``, casts ``order_id`` and ``class``
  to strings, and fills missing ``age`` values with the mean age of the
  de-duplicated rows before rounding to two decimals.

  Args:
    ao: DataFrame with at least ``order_id``, ``class`` and ``age`` columns.

  Returns:
    A new DataFrame with the cleaned columns; the input frame is not modified.
  """
  # drop_duplicates() returns a filtered view-like slice of the input; take an
  # explicit copy so the column assignments below operate on our own frame and
  # do not trigger pandas' SettingWithCopyWarning.
  ao = ao.drop_duplicates(subset=['order_id'], keep='last').copy()
  ao['order_id'] = ao['order_id'].astype('str')
  ao['class'] = ao['class'].astype('str')
  # Cast once and derive the fill value from the same float series.
  age = ao['age'].astype('float')
  ao['age'] = age.fillna(age.mean()).round(2)
  return ao

In [6]:
# Clean the orders table (de-duplicate on order_id, cast ids/class to str, fill missing age).
orders_dfx = preprocess_orders(orders_df)
orders_dfx.head()

Unnamed: 0,order_id,userid,class,age,clientId
0,OxWZY,3KBV,0,1.2,M7Dnqzw
1,kw6yN,3KBV,0,1.2,GL97Pdp
4,br867,3KBV,0,1.2,j9PqmkG
6,ndE51,3KBV,0,1.2,4Ojk768
8,2w5lG,3KBV,0,1.2,4Ojk768


In [7]:
def preprocess_events(df, timeout=300):
    """Turn the raw event log into per-order aggregates and a per-event activity table.

    Only rows with ``hitType == 'event'`` whose ``eventAction`` is one of the
    tracked actions (``action_vals`` below) are used.

    Args:
        df: Raw events frame. Must contain ``hitType`` plus every column listed
            in ``df_cols`` below. It is not modified.
        timeout: Upper bound, in seconds, applied to the gap between
            consecutive events of the same order (``time_spent``). Defaults to
            300, the value that was previously hard-coded.

    Returns:
        A tuple ``(agg_df, activity_df)``:

        * ``agg_df`` is indexed by ``order_id`` and holds the summed
          ``items.quantity``, ``eventCount`` and ``conversions``, the
          ``browser`` / ``os`` / ``device`` parsed from the most recent
          non-null user agent of the order, and the ``value`` of the most
          recent conversion event. Orders without a conversion event are
          dropped by the inner merge on ``value``.
        * ``activity_df`` has one row per event, newest first, with the
          columns ``order_id``, ``timedelta``, ``hourofday``, ``dayofmonth``,
          ``weekofyear``, ``time_spent``, ``eventAction``, ``items.category``
          and ``items.price``.
    """
    # Imported here (as in the original cell) because the notebook's import
    # cell does not bring ua_parser into scope.
    from ua_parser import user_agent_parser

    action_vals = ['add_to_cart', 'begin_checkout', 'checkout_progress', 'conversion',
                   'purchase', 'remove_from_cart', 'view_item', 'view_item_list']
    df_cols = ['@timestamp', 'order_id', 'eventAction', 'value', 'headers.http_user_agent',
               'items.quantity', 'items.category', 'items.price']

    wanted = (df['hitType'] == 'event') & df['eventAction'].isin(action_vals)
    # Explicit copy: the assignments below must not act on a slice of the
    # caller's frame (avoids SettingWithCopyWarning and accidental write-through).
    df = df.loc[wanted, df_cols].copy()

    # Newest event first. A stable sort keeps events that share a timestamp in
    # a reproducible relative order, so the frame only has to be sorted once.
    df['@timestamp'] = pd.to_datetime(df['@timestamp'])
    df = df.sort_values(by='@timestamp', ascending=False, kind='mergesort')

    df['order_id'] = df['order_id'].fillna('undefined').astype('str')
    df['items.category'] = df['items.category'].fillna('undefined')
    df['items.quantity'] = df['items.quantity'].fillna(0).astype('int')
    df['items.price'] = df['items.price'].fillna(0).astype('float')
    df['value'] = df['value'].fillna(0).astype('int')
    df['eventCount'] = 1
    df['conversions'] = (df['eventAction'] == 'conversion').astype('int')

    # ---- per-order aggregates -------------------------------------------
    by_order = df.groupby('order_id')
    # A list (double brackets) is required to select several columns from a
    # groupby; the bare-tuple form raises on pandas >= 2.0.
    agg_df = by_order[['items.quantity', 'items.price', 'eventCount', 'conversions']].sum()

    # first() returns the first non-null value per group, i.e. the user agent
    # of the most recent event that carried one.
    first_df = by_order[['headers.http_user_agent']].first()
    # Orders whose events never carried a user agent would hand NaN to the
    # parser; pass an empty string instead.
    # NOTE(review): assumes ua_parser accepts '' and returns its fallback family - confirm.
    user_agents = first_df['headers.http_user_agent'].fillna('')
    first_df['browser'] = user_agents.apply(
        lambda ua: user_agent_parser.ParseUserAgent(ua).get('family', 'unknown'))
    first_df['os'] = user_agents.apply(
        lambda ua: user_agent_parser.ParseOS(ua).get('family', 'unknown'))
    first_df['device'] = user_agents.apply(
        lambda ua: user_agent_parser.ParseDevice(ua).get('brand', 'unknown'))
    # The 'brand' key can be present with a null value, hence the extra fillna.
    first_df['device'] = first_df['device'].fillna('unknown').astype('str')

    # Value of the most recent conversion event of each order.
    value_df = df.loc[df['eventAction'] == 'conversion'].groupby('order_id')['value'].first()

    agg_df = pd.merge(agg_df, first_df, on='order_id')
    agg_df = pd.merge(agg_df, value_df, on='order_id')  # inner join: drops orders without a conversion
    agg_df = agg_df.drop(columns=['headers.http_user_agent', 'items.price'])

    # ---- per-event (time dynamic) variables --------------------------------
    activity_df = df[['order_id', '@timestamp', 'eventAction', 'items.category', 'items.price']].copy()
    stamps = activity_df['@timestamp']
    activity_df['hourofday'] = stamps.dt.hour
    activity_df['dayofmonth'] = stamps.dt.day
    # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
    # replacement. Cast back to int64 to keep the previous column dtype.
    activity_df['weekofyear'] = stamps.dt.isocalendar().week.astype('int64')

    order_stamps = activity_df.groupby('order_id')['@timestamp']
    span = order_stamps.transform('max') - order_stamps.transform('min')
    # NOTE(review): this is the *hours component* of the order's time span, so
    # whole days are discarded (a 1 day 3 h span yields 3). Kept as-is to
    # preserve the existing feature definition - confirm this is intended.
    activity_df['timedelta'] = span.dt.components['hours']

    # Seconds until the next (older) row of the same order; rows are newest
    # first, so diff(periods=-1) is current minus older and is non-negative.
    gap_seconds = order_stamps.diff(periods=-1) / np.timedelta64(1, 's')
    # Cap at `timeout`; the oldest event of an order has no gap and gets 0.
    activity_df['time_spent'] = gap_seconds.clip(upper=timeout).fillna(0).astype('int')

    activity_cols = ['order_id', 'timedelta', 'hourofday', 'dayofmonth', 'weekofyear',
                     'time_spent', 'eventAction', 'items.category', 'items.price']
    return agg_df, activity_df[activity_cols]

In [35]:
# Build the per-order aggregates and the per-event activity table from the raw events.
agg_df, activity_df = preprocess_events(events_df)

In [36]:
# Preview the per-order aggregate features.
agg_df.head()

Unnamed: 0_level_0,items.quantity,eventCount,conversions,browser,os,device,value
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
111zV,1,101,1,Chrome Mobile,Android,Generic_Android,524
1127Y,2,103,1,Chrome Mobile,Android,Generic_Android,1462
1129B,4,23,1,Instagram,Android,vivo,2986
112RV,1,20,1,Instagram,Android,vivo,574
112bB,4,61,1,Chrome Mobile,Android,Generic_Android,1933


In [41]:
# Number of orders per parsed device brand.
agg_df['device'].value_counts()

Apple                     16902
Generic_Android           13289
XiaoMi                    12968
Samsung                   11114
vivo                       7189
unknown                    3150
OnePlus                    3002
Nokia                       529
Generic                     474
Asus                        409
Huawei                      376
Lenovo                      278
Motorola                    247
Spider                      226
Tecno                       142
Gionee                       97
Google                       88
Infinix                      69
Generic_Android_Tablet       36
LG                           31
HTC                          28
Coolpad                      13
Oppo                         12
Intex                         5
Panasonic                     3
Micromax                      2
Lava                          2
iBall                         1
HP                            1
Sony                          1
SonyEricsson                  1
Pipo    

In [42]:
# Number of orders per parsed browser family.
agg_df['browser'].value_counts()

Instagram                     25623
Chrome Mobile                 24557
Mobile Safari                  6732
Facebook                       5096
Chrome                         3161
MiuiBrowser                    1120
Chrome Mobile iOS              1064
Chrome Mobile WebView           766
Google                          693
Samsung Internet                566
UC Browser                      434
Safari                          251
Googlebot                       226
Firefox                         143
Opera Mobile                    112
Edge                             82
Opera                            19
Firefox Mobile                   16
IE                               12
Mint Browser                      6
Mobile Safari UI/WKWebView        3
Puffin                            1
Firefox iOS                       1
Android                           1
Opera Mini                        1
Name: browser, dtype: int64

In [43]:
# Number of orders per parsed operating-system family.
agg_df['os'].value_counts()

Android      50408
iOS          16429
Windows       3052
Mac OS X       473
Other          226
Linux           62
Chrome OS       28
Ubuntu           8
Name: os, dtype: int64

In [10]:
# Preview the per-event activity table returned by preprocess_events.
activity_df.head()

Unnamed: 0,order_id,timedelta,hourofday,dayofmonth,weekofyear,time_spent,eventAction,items.category,items.price
5811679,1Xv8J,3,16,4,36,300,conversion,undefined,0.0
6156696,yno3O,1,19,2,36,0,purchase,undefined,399.0
6156692,yno3O,1,19,2,36,0,purchase,undefined,499.0
6156694,yno3O,1,19,2,36,0,purchase,undefined,399.0
6156690,yno3O,1,19,2,36,0,purchase,undefined,499.0


In [11]:
# Number of events kept per order; shorter per-order sequences are zero-padded to this length below.
ts_size = 50

In [12]:
# Keep at most ts_size rows per order (the first ones in the frame's current
# row order), then move order_id into the index.
leading_events = activity_df.groupby('order_id').head(ts_size)
activity_df = leading_events.set_index('order_id')

In [13]:
# Preview after truncation; order_id is now the index.
activity_df.head()

Unnamed: 0_level_0,timedelta,hourofday,dayofmonth,weekofyear,time_spent,eventAction,items.category,items.price
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1Xv8J,3,16,4,36,300,conversion,undefined,0.0
yno3O,1,19,2,36,0,purchase,undefined,399.0
yno3O,1,19,2,36,0,purchase,undefined,499.0
yno3O,1,19,2,36,0,purchase,undefined,399.0
yno3O,1,19,2,36,0,purchase,undefined,499.0


In [14]:
def attr_encoder(df):
    """Label-encode the categorical attributes and standardise the numeric ones.

    Fits one ``LabelEncoder`` each for ``browser``, ``os`` and ``device`` and a
    single ``StandardScaler`` for ``items.quantity``, ``value`` and
    ``eventCount``. Every fitted transformer is pickled into the current
    working directory as ``v3_3_le_<column>.pkl`` / ``v3_3_scaler_value.pkl``.

    Args:
        df: Frame containing the six columns named above.

    Returns:
        A transformed copy of ``df`` (the scaled ``value`` column is rounded
        to two decimals). The input frame is left unchanged, so the calling
        cell can be re-run safely.
    """
    def _save(obj, filename):
        # The context manager guarantees the file is flushed and closed; the
        # previous pickle.dump(obj, open(...)) form leaked the handle.
        with open(filename, 'wb') as fh:
            pickle.dump(obj, fh)

    # Work on a copy so the caller's frame is not encoded in place.
    df = df.copy()

    for col in ('browser', 'os', 'device'):
        encoder = preprocessing.LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        _save(encoder, 'v3_3_le_{}.pkl'.format(col))

    s_cols = ['items.quantity', 'value', 'eventCount']
    scaler_value = StandardScaler()
    df[s_cols] = scaler_value.fit_transform(df[s_cols])
    _save(scaler_value, 'v3_3_scaler_value.pkl')

    df['value'] = df['value'].round(2)

    return df

In [15]:
# Label-encode browser/os/device and standardise the numeric per-order aggregates.
agg_dfx = attr_encoder(agg_df)
agg_dfx.head()

Unnamed: 0_level_0,items.quantity,eventCount,conversions,browser,os,device,value
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
111zV,-0.171502,-0.073124,1,2,0,4,-1.14
1127Y,-0.140061,-0.062725,1,2,0,4,0.78
1129B,-0.077179,-0.478708,1,13,0,31,3.9
112RV,-0.171502,-0.494308,1,13,0,31,-1.04
112bB,-0.077179,-0.281116,1,2,0,4,1.75


In [16]:
def ts_encoder(df):
    """One-hot encode ``eventAction`` and standardise the per-event numeric columns.

    ``eventAction`` is replaced by ``eventAction_<action>`` indicator columns
    (the first level is dropped via ``drop_first=True``). ``items.price`` and
    ``time_spent`` are standardised with a ``StandardScaler`` that is pickled
    to ``v3_3_scaler_price.pkl`` in the current working directory.

    Args:
        df: Per-event frame with ``eventAction``, ``items.price`` and
            ``time_spent`` columns.

    Returns:
        A new frame with the encoded and scaled columns; the scaled
        ``items.price`` is rounded to two decimals.
    """
    # pandas >= 2.0 returns bool indicator columns by default; pin uint8 so the
    # 0/1 integer columns are the same on every pandas version.
    # NOTE(review): the set of indicator columns depends on which actions occur
    # in `df`; downstream cells reference them by name.
    df = pd.get_dummies(df, columns=['eventAction'], drop_first=True, dtype='uint8')

    ts_cols = ['items.price', 'time_spent']
    scaler_price = StandardScaler()
    scaler_price.fit(df[ts_cols])
    # Context manager closes the file; the previous open(...) handle leaked.
    with open('v3_3_scaler_price.pkl', 'wb') as fh:
        pickle.dump(scaler_price, fh)

    df[ts_cols] = scaler_price.transform(df[ts_cols])
    df['items.price'] = df['items.price'].round(2)

    return df

In [17]:
# One-hot encode eventAction and scale items.price / time_spent.
# NOTE: this overwrites activity_df, so the cell is not safe to re-run on the already-encoded frame.
activity_df = ts_encoder(activity_df)
activity_df.head()

Unnamed: 0_level_0,timedelta,hourofday,dayofmonth,weekofyear,time_spent,items.category,items.price,eventAction_begin_checkout,eventAction_checkout_progress,eventAction_conversion,eventAction_purchase,eventAction_remove_from_cart,eventAction_view_item,eventAction_view_item_list
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1Xv8J,3,16,4,36,3.956653,undefined,-0.96,0,0,1,0,0,0,0
yno3O,1,19,2,36,-0.512666,undefined,0.41,0,0,0,1,0,0,0
yno3O,1,19,2,36,-0.512666,undefined,0.76,0,0,0,1,0,0,0
yno3O,1,19,2,36,-0.512666,undefined,0.41,0,0,0,1,0,0,0
yno3O,1,19,2,36,-0.512666,undefined,0.76,0,0,0,1,0,0,0


In [18]:
# Check the column dtypes after encoding.
activity_df.dtypes

timedelta                          int64
hourofday                          int64
dayofmonth                         int64
weekofyear                         int64
time_spent                       float64
items.category                    object
items.price                      float64
eventAction_begin_checkout         uint8
eventAction_checkout_progress      uint8
eventAction_conversion             uint8
eventAction_purchase               uint8
eventAction_remove_from_cart       uint8
eventAction_view_item              uint8
eventAction_view_item_list         uint8
dtype: object

In [19]:
# One fixed-length sequence per order: collect the per-event 'timedelta'
# values and right-pad with zeros up to ts_size.
timedelta_df = (
    activity_df.groupby('order_id')['timedelta']
    .apply(list)
    .apply(lambda seq: np.pad(seq, (0, ts_size - len(seq)), 'constant'))
)
timedelta_df.head()

order_id
111zV    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1127Y    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1129B    [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
112RV    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
112bB    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
Name: timedelta, dtype: object

In [23]:
# Per-order sequence of scaled 'time_spent' values, right-padded with zeros to ts_size.
time_spent_df = (
    activity_df.groupby('order_id')['time_spent']
    .apply(list)
    .apply(lambda seq: np.pad(seq, (0, ts_size - len(seq)), 'constant'))
)
time_spent_df.head()

order_id
111zV    [-0.5126655240255152, 1.4836300164277096, -0.2...
1127Y    [-0.5126655240255152, -0.5126655240255152, 3.9...
1129B    [-0.5126655240255152, -0.5126655240255152, -0....
112RV    [-0.5126655240255152, 3.956652850123496, -0.49...
112bB    [-0.5126655240255152, -0.5126655240255152, -0....
Name: time_spent, dtype: object

In [24]:
def _padded_sequences(column):
    """Return, per order_id, the values of `column` as an array right-padded with zeros to ts_size."""
    per_order = activity_df.groupby('order_id')[column].apply(list)
    return per_order.apply(lambda seq: np.pad(seq, (0, ts_size - len(seq)), 'constant'))


# One padded indicator sequence per one-hot encoded event action.
eventAction_begin_checkout_df = _padded_sequences('eventAction_begin_checkout')
eventAction_checkout_progress_df = _padded_sequences('eventAction_checkout_progress')
eventAction_conversion_df = _padded_sequences('eventAction_conversion')
eventAction_purchase_df = _padded_sequences('eventAction_purchase')
eventAction_remove_from_cart_df = _padded_sequences('eventAction_remove_from_cart')
eventAction_view_item_df = _padded_sequences('eventAction_view_item')
eventAction_view_item_list_df = _padded_sequences('eventAction_view_item_list')

In [25]:
# itemcat_df = activity_df.groupby('order_id')['items.category'].apply(list)
# itemcat_df = itemcat_df.apply(lambda x: np.pad(x, (0,ts_size-len(x)), 'constant'))
# itemcat_df.head()

In [26]:
# Per-order sequence of scaled 'items.price' values, right-padded with zeros to ts_size.
itemprice_df = (
    activity_df.groupby('order_id')['items.price']
    .apply(list)
    .apply(lambda seq: np.pad(seq, (0, ts_size - len(seq)), 'constant'))
)
itemprice_df.head()

order_id
111zV    [0.76, -0.96, 1.79, 1.79, 1.79, -0.96, -0.96, ...
1127Y    [1.44, 1.1, -0.96, 3.85, -0.96, 3.85, -0.96, -...
1129B    [1.44, 1.44, 1.44, 1.44, -0.96, -0.96, 1.44, -...
112RV    [0.76, -0.96, 0.76, 0.76, 0.76, 0.76, 0.76, -0...
112bB    [1.1, 0.76, 0.76, 1.1, -0.96, 0.76, 0.76, 0.76...
Name: items.price, dtype: object

In [27]:
# Assemble the padded per-order sequences side by side (one column per feature).
sequence_features = [
    timedelta_df,
    time_spent_df,
    eventAction_begin_checkout_df,
    eventAction_checkout_progress_df,
    eventAction_conversion_df,
    eventAction_purchase_df,
    eventAction_remove_from_cart_df,
    eventAction_view_item_df,
    eventAction_view_item_list_df,
    itemprice_df,
]
activity_dfx = pd.concat(sequence_features, axis=1)
activity_dfx.head()

Unnamed: 0_level_0,timedelta,hourofday,dayofmonth,weekofyear,time_spent,eventAction_begin_checkout,eventAction_checkout_progress,eventAction_conversion,eventAction_purchase,eventAction_remove_from_cart,eventAction_view_item,eventAction_view_item_list,items.price
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
111zV,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 1...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 1...","[-0.5126655240255152, 1.4836300164277096, -0.2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","[0.76, -0.96, 1.79, 1.79, 1.79, -0.96, -0.96, ..."
1127Y,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, -0.5126655240255152, 3.9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...","[1.44, 1.1, -0.96, 3.85, -0.96, 3.85, -0.96, -..."
1129B,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, -0.5126655240255152, -0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, ...","[1.44, 1.44, 1.44, 1.44, -0.96, -0.96, 1.44, -..."
112RV,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1...","[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, 3.956652850123496, -0.49...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, ...","[0.76, -0.96, 0.76, 0.76, 0.76, 0.76, 0.76, -0..."
112bB,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, -0.5126655240255152, -0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, ...","[1.1, 0.76, 0.76, 1.1, -0.96, 0.76, 0.76, 0.76..."


In [28]:
# Summarise the assembled sequence table (row count, columns, dtypes).
activity_dfx.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71927 entries, 111zV to zxzqN
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   timedelta                      71927 non-null  object
 1   hourofday                      71927 non-null  object
 2   dayofmonth                     71927 non-null  object
 3   weekofyear                     71927 non-null  object
 4   time_spent                     71927 non-null  object
 5   eventAction_begin_checkout     71927 non-null  object
 6   eventAction_checkout_progress  71927 non-null  object
 7   eventAction_conversion         71927 non-null  object
 8   eventAction_purchase           71927 non-null  object
 9   eventAction_remove_from_cart   71927 non-null  object
 10  eventAction_view_item          71927 non-null  object
 11  eventAction_view_item_list     71927 non-null  object
 12  items.price                    71927 non-null  object
dtypes:

In [29]:
# Attach the order attributes to the encoded aggregates (inner join on order_id).
# NOTE: agg_dfx is overwritten, so this cell should be run only once per kernel session.
agg_dfx = pd.merge(agg_dfx, orders_dfx, on='order_id')
agg_dfx.head()

Unnamed: 0,order_id,items.quantity,eventCount,conversions,browser,os,device,value,userid,class,age,clientId
0,111zV,-0.171502,-0.073124,1,2,0,4,-1.14,Z2BMZ,0,2.95,rPPx6nm
1,1127Y,-0.140061,-0.062725,1,2,0,4,0.78,Wan6y,0,3.49,d15N5bj
2,1129B,-0.077179,-0.478708,1,13,0,31,3.9,GJzVp,0,3.73,pxRrvbB
3,112RV,-0.171502,-0.494308,1,13,0,31,-1.04,pNAoX,0,3.72,PqPYmnR
4,112bB,-0.077179,-0.281116,1,2,0,4,1.75,Vz7PM,0,3.73,Dj4yM6m


In [30]:
# Join the per-order features with the padded per-order sequences (inner join on order_id).
dfx = pd.merge(agg_dfx, activity_dfx, on='order_id')
dfx.columns

Index(['order_id', 'items.quantity', 'eventCount', 'conversions', 'browser',
       'os', 'device', 'value', 'userid', 'class', 'age', 'clientId',
       'timedelta', 'hourofday', 'dayofmonth', 'weekofyear', 'time_spent',
       'eventAction_begin_checkout', 'eventAction_checkout_progress',
       'eventAction_conversion', 'eventAction_purchase',
       'eventAction_remove_from_cart', 'eventAction_view_item',
       'eventAction_view_item_list', 'items.price'],
      dtype='object')

In [31]:
# Columns written to the output, in order: scalar per-order columns first,
# then the padded per-order sequences. Identifier columns are left out.
scalar_cols = [
    'value', 'items.quantity', 'eventCount', 'conversions',
    'browser', 'os', 'device', 'class', 'age',
]
sequence_cols = [
    'timedelta', 'time_spent',
    'eventAction_begin_checkout', 'eventAction_checkout_progress',
    'eventAction_conversion', 'eventAction_purchase',
    'eventAction_remove_from_cart', 'eventAction_view_item',
    'eventAction_view_item_list', 'items.price',
]
dfx_cols = scalar_cols + sequence_cols
dfx = dfx.loc[:, dfx_cols]
dfx.head()

Unnamed: 0,value,items.quantity,eventCount,conversions,browser,os,device,class,age,timedelta,...,weekofyear,time_spent,eventAction_begin_checkout,eventAction_checkout_progress,eventAction_conversion,eventAction_purchase,eventAction_remove_from_cart,eventAction_view_item,eventAction_view_item_list,items.price
0,-1.14,-0.171502,-0.073124,1,2,0,4,0,2.95,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 1...","[-0.5126655240255152, 1.4836300164277096, -0.2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","[0.76, -0.96, 1.79, 1.79, 1.79, -0.96, -0.96, ..."
1,0.78,-0.140061,-0.062725,1,2,0,4,0,3.49,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",...,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, -0.5126655240255152, 3.9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...","[1.44, 1.1, -0.96, 3.85, -0.96, 3.85, -0.96, -..."
2,3.9,-0.077179,-0.478708,1,13,0,31,0,3.73,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...",...,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, -0.5126655240255152, -0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, ...","[1.44, 1.44, 1.44, 1.44, -0.96, -0.96, 1.44, -..."
3,-1.04,-0.171502,-0.494308,1,13,0,31,0,3.72,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, 3.956652850123496, -0.49...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, ...","[0.76, -0.96, 0.76, 0.76, 0.76, 0.76, 0.76, -0..."
4,1.75,-0.077179,-0.281116,1,2,0,4,0,3.73,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",...,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.5126655240255152, -0.5126655240255152, -0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, ...","[1.1, 0.76, 0.76, 1.1, -0.96, 0.76, 0.76, 0.76..."


In [32]:
# Persist the final feature table to dfx.pkl in the current working directory.
dfx.to_pickle('dfx.pkl')