In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [87]:
def is_na_df(X):
    lst_1 = []
    for k in X.keys():
        lst_1.append(sum(X[k].isna()))
    df = pd.DataFrame(lst_1)
    return df

def data_analyze_df(X):
    df_new = pd.DataFrame(X.keys()).set_index(X.keys())
    df_new['count'] = X.count().values
    df_new['nunique'] = X.nunique().values
    df_new['NaN'] = is_na_df(X).values
    df_new['NaN %'] = round(df_new['NaN']/df_new['count']*100, 2)
#     df_new['sum'] = X.sum() > too even to run
    del df_new[0]
    return df_new.style.format('{:,}')

def preprocessing(X):
    X_preprocessed = X.drop_duplicates()
    X_preprocessed = X_preprocessed.dropna(subset = ['category_code', 'brand', 'user_session'])
    X_preprocessed['event_time'] = pd.to_datetime(X_preprocessed['event_time'])
    
    #change id columns to string type
    columns = ['category_id','user_id','product_id']
    X[columns] = X[columns].apply(lambda x: str(x))
    
    X_preprocessed.reset_index(inplace=True)
    ohe = OneHotEncoder(sparse = False)
    ohe.fit(X_preprocessed[['event_type']])
    event_encoded = pd.DataFrame(ohe.transform(X_preprocessed[['event_type']],), columns=ohe.get_feature_names_out())
    X_preprocessed = pd.concat([X_preprocessed, pd.DataFrame(event_encoded)], axis=1)
    del X_preprocessed['index']
    return X_preprocessed

def feature_eng(X_preprocessed):
    X_preprocessed['event_weekday'] = X_preprocessed['event_time'].apply(lambda s: s.weekday())
    X_preprocessed['category_code_level1'] = X_preprocessed['category_code'].str.split('.',expand=True)[0].astype('category')
    X_preprocessed['category_code_level2'] = X_preprocessed['category_code'].str.split('.',expand=True, n=1)[1].astype('category')
    return X_preprocessed

def group_df(X_eng):
    X_eng['activities'] = X_eng['user_session'].map(X_eng.groupby(by='user_session').count()['event_time'])
    X_eng['purchased'] = X_eng['user_session'].map(X_eng.groupby(by='user_session')['event_type_purchase'].sum())
    return X_eng

def count_events(X):
    return pd.DataFrame(X[['event_type_cart', 'event_type_purchase', 'event_type_view']].sum()).style.format('{:,}')

def purchased(X):
    y = pd.DataFrame(X['purchased'].value_counts())
    y['%']= (y['purchased']/y['purchased'].sum())*100
    y.loc['Total']= y.sum()
    return y.style.format({"purchased":"{:,.0f}","%":"{:,.0f}"})

## January 2020 - 10%

In [3]:
df_j = pd.read_csv('10%sample/2020-Jan.csv_10%.csv')
df_j.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-01-01 00:00:01 UTC,view,100063693,2053013552427434207,apparel.shirt,turtle,136.43,519046195,d1e2f343-84bb-49bd-b13d-ca0f1ed9910e
1,2020-01-01 00:00:04 UTC,view,3600666,2232732092297380188,appliances.kitchen.washer,samsung,321.73,556820148,8748d326-2623-42b7-b422-df03db98fa58
2,2020-01-01 00:00:11 UTC,cart,1201565,2232732101407408685,apparel.shoes.slipons,apple,385.34,581430108,7d7687c4-b613-4467-8a81-54c7600e0ca9
3,2020-01-01 00:00:11 UTC,view,22700725,2232732091643068746,,ombra,342.09,513398384,991eff38-b6f4-4503-9a88-f41edfdd5912
4,2020-01-01 00:00:13 UTC,view,4801028,2232732079706079299,sport.bicycle,xiaomi,14.95,574723072,2f967c6e-b6c4-4971-8599-d34f86570e29


In [4]:
df1 = data_analyze_df(df_j)
df1

Unnamed: 0,count,nunique,NaN,NaN %
event_time,5599385,2067053,0,0.0
event_type,5599385,3,0,0.0
product_id,5599385,166187,0,0.0
category_id,5599385,1166,0,0.0
category_code,5095044,135,504341,9.9
brand,4945081,4690,654304,13.23
price,5599385,57190,0,0.0
user_id,5599385,1840519,0,0.0
user_session,5599383,3682528,2,0.0


In [5]:
df_j_pro = preprocessing(df_j)
df_j_pro

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view
0,2020-01-01 00:00:01+00:00,view,100063693,2053013552427434207,apparel.shirt,turtle,136.43,519046195,d1e2f343-84bb-49bd-b13d-ca0f1ed9910e,0.0,0.0,1.0
1,2020-01-01 00:00:04+00:00,view,3600666,2232732092297380188,appliances.kitchen.washer,samsung,321.73,556820148,8748d326-2623-42b7-b422-df03db98fa58,0.0,0.0,1.0
2,2020-01-01 00:00:11+00:00,cart,1201565,2232732101407408685,apparel.shoes.slipons,apple,385.34,581430108,7d7687c4-b613-4467-8a81-54c7600e0ca9,1.0,0.0,0.0
3,2020-01-01 00:00:13+00:00,view,4801028,2232732079706079299,sport.bicycle,xiaomi,14.95,574723072,2f967c6e-b6c4-4971-8599-d34f86570e29,0.0,0.0,1.0
4,2020-01-01 00:00:18+00:00,view,1005115,2232732093077520756,construction.tools.light,apple,869.46,531140669,84c838d4-6e10-4b7d-8d29-90749e577a6b,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4532199,2020-01-31 23:59:45+00:00,view,100001549,2053013558978937451,sport.bicycle,puma,38.15,608821784,600f4508-f562-4b26-9750-49ae69938222,0.0,0.0,1.0
4532200,2020-01-31 23:59:47+00:00,view,100031704,2232732090980368698,furniture.living_room.sofa,bella,1222.68,519398356,e2d8e72b-afee-41bf-94e4-8804364e4f94,0.0,0.0,1.0
4532201,2020-01-31 23:59:48+00:00,view,1005160,2232732093077520756,construction.tools.light,xiaomi,164.98,519236281,e512f514-dc7f-4fc9-9042-e3955989d395,0.0,0.0,1.0
4532202,2020-01-31 23:59:51+00:00,purchase,1004839,2232732093077520756,construction.tools.light,oppo,178.82,513349988,ad43fda6-0401-4bc6-b5a5-f045906c197e,0.0,1.0,0.0


In [6]:
dfj = data_analyze_df(df_j)
dfj

Unnamed: 0,count,nunique,NaN,NaN %
event_time,5599385,2067053,0,0.0
event_type,5599385,3,0,0.0
product_id,5599385,1,0,0.0
category_id,5599385,1,0,0.0
category_code,5095044,135,504341,9.9
brand,4945081,4690,654304,13.23
price,5599385,57190,0,0.0
user_id,5599385,1,0,0.0
user_session,5599383,3682528,2,0.0


In [7]:
df_j_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4532204 entries, 0 to 4532203
Data columns (total 12 columns):
 #   Column               Dtype              
---  ------               -----              
 0   event_time           datetime64[ns, UTC]
 1   event_type           object             
 2   product_id           int64              
 3   category_id          int64              
 4   category_code        object             
 5   brand                object             
 6   price                float64            
 7   user_id              int64              
 8   user_session         object             
 9   event_type_cart      float64            
 10  event_type_purchase  float64            
 11  event_type_view      float64            
dtypes: datetime64[ns, UTC](1), float64(4), int64(3), object(4)
memory usage: 414.9+ MB


In [8]:
dfjj = feature_eng(df_j_pro)
dfjj

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2
0,2020-01-01 00:00:01+00:00,view,100063693,2053013552427434207,apparel.shirt,turtle,136.43,519046195,d1e2f343-84bb-49bd-b13d-ca0f1ed9910e,0.0,0.0,1.0,2,apparel,shirt
1,2020-01-01 00:00:04+00:00,view,3600666,2232732092297380188,appliances.kitchen.washer,samsung,321.73,556820148,8748d326-2623-42b7-b422-df03db98fa58,0.0,0.0,1.0,2,appliances,kitchen.washer
2,2020-01-01 00:00:11+00:00,cart,1201565,2232732101407408685,apparel.shoes.slipons,apple,385.34,581430108,7d7687c4-b613-4467-8a81-54c7600e0ca9,1.0,0.0,0.0,2,apparel,shoes.slipons
3,2020-01-01 00:00:13+00:00,view,4801028,2232732079706079299,sport.bicycle,xiaomi,14.95,574723072,2f967c6e-b6c4-4971-8599-d34f86570e29,0.0,0.0,1.0,2,sport,bicycle
4,2020-01-01 00:00:18+00:00,view,1005115,2232732093077520756,construction.tools.light,apple,869.46,531140669,84c838d4-6e10-4b7d-8d29-90749e577a6b,0.0,0.0,1.0,2,construction,tools.light
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4532199,2020-01-31 23:59:45+00:00,view,100001549,2053013558978937451,sport.bicycle,puma,38.15,608821784,600f4508-f562-4b26-9750-49ae69938222,0.0,0.0,1.0,4,sport,bicycle
4532200,2020-01-31 23:59:47+00:00,view,100031704,2232732090980368698,furniture.living_room.sofa,bella,1222.68,519398356,e2d8e72b-afee-41bf-94e4-8804364e4f94,0.0,0.0,1.0,4,furniture,living_room.sofa
4532201,2020-01-31 23:59:48+00:00,view,1005160,2232732093077520756,construction.tools.light,xiaomi,164.98,519236281,e512f514-dc7f-4fc9-9042-e3955989d395,0.0,0.0,1.0,4,construction,tools.light
4532202,2020-01-31 23:59:51+00:00,purchase,1004839,2232732093077520756,construction.tools.light,oppo,178.82,513349988,ad43fda6-0401-4bc6-b5a5-f045906c197e,0.0,1.0,0.0,4,construction,tools.light


In [9]:
df2 = count_events(df_j_pro)
df2

Unnamed: 0,0
event_type_cart,225092.0
event_type_purchase,74183.0
event_type_view,4232929.0


In [10]:
df_j_pro = group_df(df_j_pro)
df_j_pro

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2,activities,purchased
0,2020-01-01 00:00:01+00:00,view,100063693,2053013552427434207,apparel.shirt,turtle,136.43,519046195,d1e2f343-84bb-49bd-b13d-ca0f1ed9910e,0.0,0.0,1.0,2,apparel,shirt,1,0.0
1,2020-01-01 00:00:04+00:00,view,3600666,2232732092297380188,appliances.kitchen.washer,samsung,321.73,556820148,8748d326-2623-42b7-b422-df03db98fa58,0.0,0.0,1.0,2,appliances,kitchen.washer,1,0.0
2,2020-01-01 00:00:11+00:00,cart,1201565,2232732101407408685,apparel.shoes.slipons,apple,385.34,581430108,7d7687c4-b613-4467-8a81-54c7600e0ca9,1.0,0.0,0.0,2,apparel,shoes.slipons,1,0.0
3,2020-01-01 00:00:13+00:00,view,4801028,2232732079706079299,sport.bicycle,xiaomi,14.95,574723072,2f967c6e-b6c4-4971-8599-d34f86570e29,0.0,0.0,1.0,2,sport,bicycle,1,0.0
4,2020-01-01 00:00:18+00:00,view,1005115,2232732093077520756,construction.tools.light,apple,869.46,531140669,84c838d4-6e10-4b7d-8d29-90749e577a6b,0.0,0.0,1.0,2,construction,tools.light,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4532199,2020-01-31 23:59:45+00:00,view,100001549,2053013558978937451,sport.bicycle,puma,38.15,608821784,600f4508-f562-4b26-9750-49ae69938222,0.0,0.0,1.0,4,sport,bicycle,1,0.0
4532200,2020-01-31 23:59:47+00:00,view,100031704,2232732090980368698,furniture.living_room.sofa,bella,1222.68,519398356,e2d8e72b-afee-41bf-94e4-8804364e4f94,0.0,0.0,1.0,4,furniture,living_room.sofa,2,0.0
4532201,2020-01-31 23:59:48+00:00,view,1005160,2232732093077520756,construction.tools.light,xiaomi,164.98,519236281,e512f514-dc7f-4fc9-9042-e3955989d395,0.0,0.0,1.0,4,construction,tools.light,2,0.0
4532202,2020-01-31 23:59:51+00:00,purchase,1004839,2232732093077520756,construction.tools.light,oppo,178.82,513349988,ad43fda6-0401-4bc6-b5a5-f045906c197e,0.0,1.0,0.0,4,construction,tools.light,1,1.0


In [51]:
df_j_pro['purchased'].value_counts()

0.0     4401866
1.0      123539
2.0        5909
3.0         637
4.0         115
19.0         94
5.0          29
7.0          15
Name: purchased, dtype: int64

In [40]:
df_j_final = group_df(df_j_pro)
df_j_final

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2,activities,purchased
0,2020-01-01 00:00:01+00:00,view,100063693,2053013552427434207,apparel.shirt,turtle,136.43,519046195,d1e2f343-84bb-49bd-b13d-ca0f1ed9910e,0.0,0.0,1.0,2,apparel,shirt,1,0.0
1,2020-01-01 00:00:04+00:00,view,3600666,2232732092297380188,appliances.kitchen.washer,samsung,321.73,556820148,8748d326-2623-42b7-b422-df03db98fa58,0.0,0.0,1.0,2,appliances,kitchen.washer,1,0.0
2,2020-01-01 00:00:11+00:00,cart,1201565,2232732101407408685,apparel.shoes.slipons,apple,385.34,581430108,7d7687c4-b613-4467-8a81-54c7600e0ca9,1.0,0.0,0.0,2,apparel,shoes.slipons,1,0.0
3,2020-01-01 00:00:13+00:00,view,4801028,2232732079706079299,sport.bicycle,xiaomi,14.95,574723072,2f967c6e-b6c4-4971-8599-d34f86570e29,0.0,0.0,1.0,2,sport,bicycle,1,0.0
4,2020-01-01 00:00:18+00:00,view,1005115,2232732093077520756,construction.tools.light,apple,869.46,531140669,84c838d4-6e10-4b7d-8d29-90749e577a6b,0.0,0.0,1.0,2,construction,tools.light,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4532199,2020-01-31 23:59:45+00:00,view,100001549,2053013558978937451,sport.bicycle,puma,38.15,608821784,600f4508-f562-4b26-9750-49ae69938222,0.0,0.0,1.0,4,sport,bicycle,1,0.0
4532200,2020-01-31 23:59:47+00:00,view,100031704,2232732090980368698,furniture.living_room.sofa,bella,1222.68,519398356,e2d8e72b-afee-41bf-94e4-8804364e4f94,0.0,0.0,1.0,4,furniture,living_room.sofa,2,0.0
4532201,2020-01-31 23:59:48+00:00,view,1005160,2232732093077520756,construction.tools.light,xiaomi,164.98,519236281,e512f514-dc7f-4fc9-9042-e3955989d395,0.0,0.0,1.0,4,construction,tools.light,2,0.0
4532202,2020-01-31 23:59:51+00:00,purchase,1004839,2232732093077520756,construction.tools.light,oppo,178.82,513349988,ad43fda6-0401-4bc6-b5a5-f045906c197e,0.0,1.0,0.0,4,construction,tools.light,1,1.0


In [91]:
purchased(df_j_final)

Unnamed: 0,purchased,%
0.000000,4401866,97
1.000000,123539,3
2.000000,5909,0
3.000000,637,0
4.000000,115,0
19.000000,94,0
5.000000,29,0
7.000000,15,0
Total,4532204,100


## February 2020 - 10%

In [12]:
df_f = pd.read_csv('10%sample/2020-Feb.csv_10%.csv')
df_f.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-02-01 00:00:01 UTC,view,1002544,2232732093077520756,construction.tools.light,apple,410.42,530488542,21f46782-fb36-4d5c-82d4-6dc26ae93fad
1,2020-02-01 00:00:02 UTC,view,1005115,2232732093077520756,construction.tools.light,apple,806.61,608822150,50d1339f-561e-41f9-944b-f3571af57b05
2,2020-02-01 00:00:03 UTC,view,15901490,2053013566142809077,construction.tools.generator,,46.33,518459407,c2a1a5e4-7bf2-4985-a726-11e135c9a533
3,2020-02-01 00:00:03 UTC,view,1004648,2232732093077520756,construction.tools.light,samsung,563.7,608821035,3e271f43-db51-46bd-b215-c2907ed1a204
4,2020-02-01 00:00:06 UTC,view,1005239,2232732093077520756,construction.tools.light,xiaomi,228.83,516144776,a168223b-0da7-47f0-ac17-b20faf4bfb03


In [13]:
df3 = data_analyze_df(df_f)
df3

Unnamed: 0,count,nunique,NaN,NaN %
event_time,5531632,1912878,0,0.0
event_type,5531632,3,0,0.0
product_id,5531632,182277,0,0.0
category_id,5531632,1104,0,0.0
category_code,5038457,138,493175,9.79
brand,4671890,4087,859742,18.4
price,5531632,56356,0,0.0
user_id,5531632,1791534,0,0.0
user_session,5531632,3605921,0,0.0


In [14]:
df_f_pro = preprocessing(df_f)
df_f_pro

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view
0,2020-02-01 00:00:01+00:00,view,1002544,2232732093077520756,construction.tools.light,apple,410.42,530488542,21f46782-fb36-4d5c-82d4-6dc26ae93fad,0.0,0.0,1.0
1,2020-02-01 00:00:02+00:00,view,1005115,2232732093077520756,construction.tools.light,apple,806.61,608822150,50d1339f-561e-41f9-944b-f3571af57b05,0.0,0.0,1.0
2,2020-02-01 00:00:03+00:00,view,1004648,2232732093077520756,construction.tools.light,samsung,563.70,608821035,3e271f43-db51-46bd-b215-c2907ed1a204,0.0,0.0,1.0
3,2020-02-01 00:00:06+00:00,view,1005239,2232732093077520756,construction.tools.light,xiaomi,228.83,516144776,a168223b-0da7-47f0-ac17-b20faf4bfb03,0.0,0.0,1.0
4,2020-02-01 00:00:07+00:00,cart,100001549,2053013558978937451,sport.bicycle,puma,38.15,608821784,600f4508-f562-4b26-9750-49ae69938222,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4284789,2020-02-29 23:59:39+00:00,view,5100767,2232732103831716449,apparel.shoes,samsung,25.46,616137068,a98cdc37-6124-4977-8108-b22bb6eb0837,0.0,0.0,1.0
4284790,2020-02-29 23:59:41+00:00,view,4804055,2232732079706079299,sport.bicycle,apple,193.75,614770853,16f253ff-f315-40c7-87bf-b331dbe0b0ef,0.0,0.0,1.0
4284791,2020-02-29 23:59:44+00:00,view,100068488,2232732093077520756,construction.tools.light,samsung,293.13,517021211,0c34308d-c455-40bb-9992-3e44920bc2b9,0.0,0.0,1.0
4284792,2020-02-29 23:59:49+00:00,view,18700066,2232732100056842769,appliances.personal.massager,sv,205.90,523152557,20df931e-62d7-4c3b-bf24-0a49acefd7ef,0.0,0.0,1.0


In [15]:
dff = data_analyze_df(df_f_pro)
dff

Unnamed: 0,count,nunique,NaN,NaN %
event_time,4284794,1773088,0,0.0
event_type,4284794,3,0,0.0
product_id,4284794,110442,0,0.0
category_id,4284794,865,0,0.0
category_code,4284794,138,0,0.0
brand,4284794,3675,0,0.0
price,4284794,49645,0,0.0
user_id,4284794,1538332,0,0.0
user_session,4284794,2929939,0,0.0
event_type_cart,4284794,2,0,0.0


In [16]:
dfff = feature_eng(df_f_pro)
dfff

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2
0,2020-02-01 00:00:01+00:00,view,1002544,2232732093077520756,construction.tools.light,apple,410.42,530488542,21f46782-fb36-4d5c-82d4-6dc26ae93fad,0.0,0.0,1.0,5,construction,tools.light
1,2020-02-01 00:00:02+00:00,view,1005115,2232732093077520756,construction.tools.light,apple,806.61,608822150,50d1339f-561e-41f9-944b-f3571af57b05,0.0,0.0,1.0,5,construction,tools.light
2,2020-02-01 00:00:03+00:00,view,1004648,2232732093077520756,construction.tools.light,samsung,563.70,608821035,3e271f43-db51-46bd-b215-c2907ed1a204,0.0,0.0,1.0,5,construction,tools.light
3,2020-02-01 00:00:06+00:00,view,1005239,2232732093077520756,construction.tools.light,xiaomi,228.83,516144776,a168223b-0da7-47f0-ac17-b20faf4bfb03,0.0,0.0,1.0,5,construction,tools.light
4,2020-02-01 00:00:07+00:00,cart,100001549,2053013558978937451,sport.bicycle,puma,38.15,608821784,600f4508-f562-4b26-9750-49ae69938222,1.0,0.0,0.0,5,sport,bicycle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4284789,2020-02-29 23:59:39+00:00,view,5100767,2232732103831716449,apparel.shoes,samsung,25.46,616137068,a98cdc37-6124-4977-8108-b22bb6eb0837,0.0,0.0,1.0,5,apparel,shoes
4284790,2020-02-29 23:59:41+00:00,view,4804055,2232732079706079299,sport.bicycle,apple,193.75,614770853,16f253ff-f315-40c7-87bf-b331dbe0b0ef,0.0,0.0,1.0,5,sport,bicycle
4284791,2020-02-29 23:59:44+00:00,view,100068488,2232732093077520756,construction.tools.light,samsung,293.13,517021211,0c34308d-c455-40bb-9992-3e44920bc2b9,0.0,0.0,1.0,5,construction,tools.light
4284792,2020-02-29 23:59:49+00:00,view,18700066,2232732100056842769,appliances.personal.massager,sv,205.90,523152557,20df931e-62d7-4c3b-bf24-0a49acefd7ef,0.0,0.0,1.0,5,appliances,personal.massager


In [17]:
df_f_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4284794 entries, 0 to 4284793
Data columns (total 15 columns):
 #   Column                Dtype              
---  ------                -----              
 0   event_time            datetime64[ns, UTC]
 1   event_type            object             
 2   product_id            int64              
 3   category_id           int64              
 4   category_code         object             
 5   brand                 object             
 6   price                 float64            
 7   user_id               int64              
 8   user_session          object             
 9   event_type_cart       float64            
 10  event_type_purchase   float64            
 11  event_type_view       float64            
 12  event_weekday         int64              
 13  category_code_level1  category           
 14  category_code_level2  category           
dtypes: category(2), datetime64[ns, UTC](1), float64(4), int64(4), object(4)
memory usag

In [18]:
df4 = count_events(df_f_pro)
df4

Unnamed: 0,0
event_type_cart,239290.0
event_type_purchase,104042.0
event_type_view,3941462.0


In [41]:
df_f_final = group_df(df_f_pro)
df_f_final

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2,activities,purchased
0,2020-02-01 00:00:01+00:00,view,1002544,2232732093077520756,construction.tools.light,apple,410.42,530488542,21f46782-fb36-4d5c-82d4-6dc26ae93fad,0.0,0.0,1.0,5,construction,tools.light,3,0.0
1,2020-02-01 00:00:02+00:00,view,1005115,2232732093077520756,construction.tools.light,apple,806.61,608822150,50d1339f-561e-41f9-944b-f3571af57b05,0.0,0.0,1.0,5,construction,tools.light,2,0.0
2,2020-02-01 00:00:03+00:00,view,1004648,2232732093077520756,construction.tools.light,samsung,563.70,608821035,3e271f43-db51-46bd-b215-c2907ed1a204,0.0,0.0,1.0,5,construction,tools.light,15,0.0
3,2020-02-01 00:00:06+00:00,view,1005239,2232732093077520756,construction.tools.light,xiaomi,228.83,516144776,a168223b-0da7-47f0-ac17-b20faf4bfb03,0.0,0.0,1.0,5,construction,tools.light,1,0.0
4,2020-02-01 00:00:07+00:00,cart,100001549,2053013558978937451,sport.bicycle,puma,38.15,608821784,600f4508-f562-4b26-9750-49ae69938222,1.0,0.0,0.0,5,sport,bicycle,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4284789,2020-02-29 23:59:39+00:00,view,5100767,2232732103831716449,apparel.shoes,samsung,25.46,616137068,a98cdc37-6124-4977-8108-b22bb6eb0837,0.0,0.0,1.0,5,apparel,shoes,1,0.0
4284790,2020-02-29 23:59:41+00:00,view,4804055,2232732079706079299,sport.bicycle,apple,193.75,614770853,16f253ff-f315-40c7-87bf-b331dbe0b0ef,0.0,0.0,1.0,5,sport,bicycle,1,0.0
4284791,2020-02-29 23:59:44+00:00,view,100068488,2232732093077520756,construction.tools.light,samsung,293.13,517021211,0c34308d-c455-40bb-9992-3e44920bc2b9,0.0,0.0,1.0,5,construction,tools.light,3,0.0
4284792,2020-02-29 23:59:49+00:00,view,18700066,2232732100056842769,appliances.personal.massager,sv,205.90,523152557,20df931e-62d7-4c3b-bf24-0a49acefd7ef,0.0,0.0,1.0,5,appliances,personal.massager,2,0.0


In [90]:
purchased(df_f_final)

Unnamed: 0,purchased,%
0.000000,4106224,96
1.000000,163267,4
2.000000,13093,0
3.000000,1561,0
4.000000,428,0
5.000000,127,0
18.000000,60,0
6.000000,12,0
9.000000,12,0
7.000000,10,0


## March 2020 - 10%

In [19]:
df_m = pd.read_csv('10%sample/2020-Mar.csv_10%.csv')
df_m.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-03-01 00:00:08 UTC,view,26205155,2232732081585127530,construction.components.faucet,,103.48,622090375,cc8fb9ef-9ae7-41bc-bf39-709065ee44f7
1,2020-03-01 00:00:13 UTC,view,2401253,2232732100769874463,appliances.personal.massager,,89.84,595972218,bde65786-cc67-4523-97f7-577a5836ebb6
2,2020-03-01 00:00:15 UTC,view,100036175,2053013553056579841,computers.peripherals.printer,sokolov,38.61,587790209,07fc5203-a0d5-47b8-934c-909c9e9108cf
3,2020-03-01 00:00:16 UTC,view,10301847,2232732104888681081,apparel.scarf,mattel,6.92,592023505,1967ad00-d3c8-417f-a377-100c21a2bbc4
4,2020-03-01 00:00:19 UTC,view,3600145,2232732092297380188,appliances.kitchen.washer,indesit,179.71,621538299,189da4a9-76c0-4ede-8d64-cfdf1a3bb76c


In [20]:
df5 = data_analyze_df(df_m)
df5

Unnamed: 0,count,nunique,NaN,NaN %
event_time,5633699,2045251,0,0.0
event_type,5633699,3,0,0.0
product_id,5633699,184024,0,0.0
category_id,5633699,1075,0,0.0
category_code,5039896,138,593803,11.78
brand,4822039,4130,811660,16.83
price,5633699,52865,0,0.0
user_id,5633699,1779896,0,0.0
user_session,5633697,3614168,2,0.0


In [21]:
df_m_pro = preprocessing(df_m)
df_m_pro

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view
0,2020-03-01 00:00:15+00:00,view,100036175,2053013553056579841,computers.peripherals.printer,sokolov,38.61,587790209,07fc5203-a0d5-47b8-934c-909c9e9108cf,0.0,0.0,1.0
1,2020-03-01 00:00:16+00:00,view,10301847,2232732104888681081,apparel.scarf,mattel,6.92,592023505,1967ad00-d3c8-417f-a377-100c21a2bbc4,0.0,0.0,1.0
2,2020-03-01 00:00:19+00:00,view,3600145,2232732092297380188,appliances.kitchen.washer,indesit,179.71,621538299,189da4a9-76c0-4ede-8d64-cfdf1a3bb76c,0.0,0.0,1.0
3,2020-03-01 00:00:21+00:00,view,1307526,2053013554658804075,electronics.audio.headphone,asus,926.64,521644904,4a1763fe-d476-4b54-b3ce-795f4e013ef9,0.0,0.0,1.0
4,2020-03-01 00:00:28+00:00,view,1004856,2232732093077520756,construction.tools.light,samsung,130.39,521098149,a014d15c-f4b9-45fa-b007-8e15c88389cd,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4341994,2020-03-31 23:59:01+00:00,view,1005100,2232732093077520756,construction.tools.light,samsung,148.49,516838609,210abaa5-c93f-449f-8dab-12996014d64c,0.0,0.0,1.0
4341995,2020-03-31 23:59:03+00:00,view,100056556,2053013553325015316,appliances.kitchen.toster,altair,73.36,635166085,c041b4f2-4b05-4053-a397-425b7b02aa21,0.0,0.0,1.0
4341996,2020-03-31 23:59:05+00:00,view,100155723,2232732134441746764,apparel.tshirt,glissade,156.51,601154152,0f46540a-834f-4433-aacc-180768395c3b,0.0,0.0,1.0
4341997,2020-03-31 23:59:21+00:00,view,5500111,2232732093941547400,furniture.bedroom.blanket,vitek,17.99,514584035,ad1cc6c5-b018-44c7-bdd5-62e2582c058e,0.0,0.0,1.0


In [22]:
dfm = data_analyze_df(df_m_pro)
dfm

Unnamed: 0,count,nunique,NaN,NaN %
event_time,4341999,1881524,0,0.0
event_type,4341999,3,0,0.0
product_id,4341999,109963,0,0.0
category_id,4341999,843,0,0.0
category_code,4341999,138,0,0.0
brand,4341999,3697,0,0.0
price,4341999,45700,0,0.0
user_id,4341999,1519941,0,0.0
user_session,4341999,2913740,0,0.0
event_type_cart,4341999,2,0,0.0


In [23]:
dfmm = feature_eng(df_m_pro)
dfmm

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2
0,2020-03-01 00:00:15+00:00,view,100036175,2053013553056579841,computers.peripherals.printer,sokolov,38.61,587790209,07fc5203-a0d5-47b8-934c-909c9e9108cf,0.0,0.0,1.0,6,computers,peripherals.printer
1,2020-03-01 00:00:16+00:00,view,10301847,2232732104888681081,apparel.scarf,mattel,6.92,592023505,1967ad00-d3c8-417f-a377-100c21a2bbc4,0.0,0.0,1.0,6,apparel,scarf
2,2020-03-01 00:00:19+00:00,view,3600145,2232732092297380188,appliances.kitchen.washer,indesit,179.71,621538299,189da4a9-76c0-4ede-8d64-cfdf1a3bb76c,0.0,0.0,1.0,6,appliances,kitchen.washer
3,2020-03-01 00:00:21+00:00,view,1307526,2053013554658804075,electronics.audio.headphone,asus,926.64,521644904,4a1763fe-d476-4b54-b3ce-795f4e013ef9,0.0,0.0,1.0,6,electronics,audio.headphone
4,2020-03-01 00:00:28+00:00,view,1004856,2232732093077520756,construction.tools.light,samsung,130.39,521098149,a014d15c-f4b9-45fa-b007-8e15c88389cd,0.0,0.0,1.0,6,construction,tools.light
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4341994,2020-03-31 23:59:01+00:00,view,1005100,2232732093077520756,construction.tools.light,samsung,148.49,516838609,210abaa5-c93f-449f-8dab-12996014d64c,0.0,0.0,1.0,1,construction,tools.light
4341995,2020-03-31 23:59:03+00:00,view,100056556,2053013553325015316,appliances.kitchen.toster,altair,73.36,635166085,c041b4f2-4b05-4053-a397-425b7b02aa21,0.0,0.0,1.0,1,appliances,kitchen.toster
4341996,2020-03-31 23:59:05+00:00,view,100155723,2232732134441746764,apparel.tshirt,glissade,156.51,601154152,0f46540a-834f-4433-aacc-180768395c3b,0.0,0.0,1.0,1,apparel,tshirt
4341997,2020-03-31 23:59:21+00:00,view,5500111,2232732093941547400,furniture.bedroom.blanket,vitek,17.99,514584035,ad1cc6c5-b018-44c7-bdd5-62e2582c058e,0.0,0.0,1.0,1,furniture,bedroom.blanket


In [24]:
df_m_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4341999 entries, 0 to 4341998
Data columns (total 15 columns):
 #   Column                Dtype              
---  ------                -----              
 0   event_time            datetime64[ns, UTC]
 1   event_type            object             
 2   product_id            int64              
 3   category_id           int64              
 4   category_code         object             
 5   brand                 object             
 6   price                 float64            
 7   user_id               int64              
 8   user_session          object             
 9   event_type_cart       float64            
 10  event_type_purchase   float64            
 11  event_type_view       float64            
 12  event_weekday         int64              
 13  category_code_level1  category           
 14  category_code_level2  category           
dtypes: category(2), datetime64[ns, UTC](1), float64(4), int64(4), object(4)
memory usag

In [42]:
df_m_final = group_df(df_m_pro)
df_m_final

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2,activities,purchased
0,2020-03-01 00:00:15+00:00,view,100036175,2053013553056579841,computers.peripherals.printer,sokolov,38.61,587790209,07fc5203-a0d5-47b8-934c-909c9e9108cf,0.0,0.0,1.0,6,computers,peripherals.printer,2,0.0
1,2020-03-01 00:00:16+00:00,view,10301847,2232732104888681081,apparel.scarf,mattel,6.92,592023505,1967ad00-d3c8-417f-a377-100c21a2bbc4,0.0,0.0,1.0,6,apparel,scarf,1,0.0
2,2020-03-01 00:00:19+00:00,view,3600145,2232732092297380188,appliances.kitchen.washer,indesit,179.71,621538299,189da4a9-76c0-4ede-8d64-cfdf1a3bb76c,0.0,0.0,1.0,6,appliances,kitchen.washer,1,0.0
3,2020-03-01 00:00:21+00:00,view,1307526,2053013554658804075,electronics.audio.headphone,asus,926.64,521644904,4a1763fe-d476-4b54-b3ce-795f4e013ef9,0.0,0.0,1.0,6,electronics,audio.headphone,1,0.0
4,2020-03-01 00:00:28+00:00,view,1004856,2232732093077520756,construction.tools.light,samsung,130.39,521098149,a014d15c-f4b9-45fa-b007-8e15c88389cd,0.0,0.0,1.0,6,construction,tools.light,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4341994,2020-03-31 23:59:01+00:00,view,1005100,2232732093077520756,construction.tools.light,samsung,148.49,516838609,210abaa5-c93f-449f-8dab-12996014d64c,0.0,0.0,1.0,1,construction,tools.light,1,0.0
4341995,2020-03-31 23:59:03+00:00,view,100056556,2053013553325015316,appliances.kitchen.toster,altair,73.36,635166085,c041b4f2-4b05-4053-a397-425b7b02aa21,0.0,0.0,1.0,1,appliances,kitchen.toster,1,0.0
4341996,2020-03-31 23:59:05+00:00,view,100155723,2232732134441746764,apparel.tshirt,glissade,156.51,601154152,0f46540a-834f-4433-aacc-180768395c3b,0.0,0.0,1.0,1,apparel,tshirt,1,0.0
4341997,2020-03-31 23:59:21+00:00,view,5500111,2232732093941547400,furniture.bedroom.blanket,vitek,17.99,514584035,ad1cc6c5-b018-44c7-bdd5-62e2582c058e,0.0,0.0,1.0,1,furniture,bedroom.blanket,1,0.0


In [89]:
purchased(df_m_final)

Unnamed: 0,purchased,%
0.000000,4185490,96
1.000000,148375,3
2.000000,7038,0
3.000000,746,0
4.000000,168,0
12.000000,62,0
8.000000,39,0
5.000000,32,0
9.000000,30,0
6.000000,19,0


## April 2020 - 10%

In [26]:
df_a = pd.read_csv('10%sample/2020-Apr.csv_10%.csv')
df_a.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-04-01 00:00:03 UTC,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635165435,861f2378-076f-4ddd-85e3-9844923d03a9
1,2020-04-01 00:00:09 UTC,view,9200640,2232732104343421549,apparel.scarf,defender,20.19,533896443,6a220235-f4d6-4987-a51e-8f315b3027fc
2,2020-04-01 00:00:14 UTC,view,5100375,2232732103101907535,electronics.clocks,xiaomi,84.94,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f
3,2020-04-01 00:00:19 UTC,view,13200853,2232732061804790604,furniture.bedroom.bed,,354.96,635166340,f94c2c0d-abee-4a9b-ad1b-47b85b57c036
4,2020-04-01 00:00:31 UTC,view,2702580,2232732091718566220,appliances.kitchen.refrigerators,midea,710.44,514203043,5adc4e92-9475-407a-ac1b-8c5529f3f23e


In [27]:
df7 = data_analyze_df(df_a)
df7

Unnamed: 0,count,nunique,NaN,NaN %
event_time,6664139,2136596,0,0.0
event_type,6664139,3,0,0.0
product_id,6664139,189276,0,0.0
category_id,6664139,1134,0,0.0
category_code,5988642,140,675497,11.28
brand,5765451,4234,898688,15.59
price,6664139,46311,0,0.0
user_id,6664139,2044745,0,0.0
user_session,6664125,3952202,14,0.0


In [28]:
df_a_pro = preprocessing(df_a)
df_a_pro

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view
0,2020-04-01 00:00:03+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635165435,861f2378-076f-4ddd-85e3-9844923d03a9,0.0,0.0,1.0
1,2020-04-01 00:00:09+00:00,view,9200640,2232732104343421549,apparel.scarf,defender,20.19,533896443,6a220235-f4d6-4987-a51e-8f315b3027fc,0.0,0.0,1.0
2,2020-04-01 00:00:14+00:00,view,5100375,2232732103101907535,electronics.clocks,xiaomi,84.94,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,0.0,0.0,1.0
3,2020-04-01 00:00:31+00:00,view,2702580,2232732091718566220,appliances.kitchen.refrigerators,midea,710.44,514203043,5adc4e92-9475-407a-ac1b-8c5529f3f23e,0.0,0.0,1.0
4,2020-04-01 00:00:35+00:00,cart,5100328,2232732103101907535,electronics.clocks,xiaomi,117.12,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5225561,2020-04-30 23:59:47+00:00,view,100079844,2232732093077520756,construction.tools.light,realme,192.80,649760110,56472da4-6d85-4770-9fe4-87a5990502b6,0.0,0.0,1.0
5225562,2020-04-30 23:59:48+00:00,view,1004229,2232732093077520756,construction.tools.light,apple,952.15,529225489,adc5652c-cae8-49ae-88a8-1d192d60f6fa,0.0,0.0,1.0
5225563,2020-04-30 23:59:50+00:00,view,100101230,2232732093077520756,construction.tools.light,samsung,465.93,646463987,3b2022ca-0cc7-4152-bd54-62aaa23daf92,0.0,0.0,1.0
5225564,2020-04-30 23:59:51+00:00,view,28300873,2053013554751078769,appliances.kitchen.grill,karya,32.18,550540468,a9afddeb-f9ed-4eb5-a157-d903dd6ae124,0.0,0.0,1.0


In [29]:
dfa = data_analyze_df(df_a_pro)
dfa

Unnamed: 0,count,nunique,NaN,NaN %
event_time,5225566,2002292,0,0.0
event_type,5225566,3,0,0.0
product_id,5225566,117190,0,0.0
category_id,5225566,859,0,0.0
category_code,5225566,140,0,0.0
brand,5225566,3750,0,0.0
price,5225566,40943,0,0.0
user_id,5225566,1758012,0,0.0
user_session,5225566,3247375,0,0.0
event_type_cart,5225566,2,0,0.0


In [30]:
dfaa = feature_eng(df_a_pro)
dfaa

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2
0,2020-04-01 00:00:03+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635165435,861f2378-076f-4ddd-85e3-9844923d03a9,0.0,0.0,1.0,2,construction,tools.light
1,2020-04-01 00:00:09+00:00,view,9200640,2232732104343421549,apparel.scarf,defender,20.19,533896443,6a220235-f4d6-4987-a51e-8f315b3027fc,0.0,0.0,1.0,2,apparel,scarf
2,2020-04-01 00:00:14+00:00,view,5100375,2232732103101907535,electronics.clocks,xiaomi,84.94,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,0.0,0.0,1.0,2,electronics,clocks
3,2020-04-01 00:00:31+00:00,view,2702580,2232732091718566220,appliances.kitchen.refrigerators,midea,710.44,514203043,5adc4e92-9475-407a-ac1b-8c5529f3f23e,0.0,0.0,1.0,2,appliances,kitchen.refrigerators
4,2020-04-01 00:00:35+00:00,cart,5100328,2232732103101907535,electronics.clocks,xiaomi,117.12,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,1.0,0.0,0.0,2,electronics,clocks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5225561,2020-04-30 23:59:47+00:00,view,100079844,2232732093077520756,construction.tools.light,realme,192.80,649760110,56472da4-6d85-4770-9fe4-87a5990502b6,0.0,0.0,1.0,3,construction,tools.light
5225562,2020-04-30 23:59:48+00:00,view,1004229,2232732093077520756,construction.tools.light,apple,952.15,529225489,adc5652c-cae8-49ae-88a8-1d192d60f6fa,0.0,0.0,1.0,3,construction,tools.light
5225563,2020-04-30 23:59:50+00:00,view,100101230,2232732093077520756,construction.tools.light,samsung,465.93,646463987,3b2022ca-0cc7-4152-bd54-62aaa23daf92,0.0,0.0,1.0,3,construction,tools.light
5225564,2020-04-30 23:59:51+00:00,view,28300873,2053013554751078769,appliances.kitchen.grill,karya,32.18,550540468,a9afddeb-f9ed-4eb5-a157-d903dd6ae124,0.0,0.0,1.0,3,appliances,kitchen.grill


In [31]:
df_a_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5225566 entries, 0 to 5225565
Data columns (total 15 columns):
 #   Column                Dtype              
---  ------                -----              
 0   event_time            datetime64[ns, UTC]
 1   event_type            object             
 2   product_id            int64              
 3   category_id           int64              
 4   category_code         object             
 5   brand                 object             
 6   price                 float64            
 7   user_id               int64              
 8   user_session          object             
 9   event_type_cart       float64            
 10  event_type_purchase   float64            
 11  event_type_view       float64            
 12  event_weekday         int64              
 13  category_code_level1  category           
 14  category_code_level2  category           
dtypes: category(2), datetime64[ns, UTC](1), float64(4), int64(4), object(4)
memory usag

In [32]:
df8 = count_events(df_a_pro)
df8

Unnamed: 0,0
event_type_cart,270717.0
event_type_purchase,82191.0
event_type_view,4872658.0


In [43]:
df_a_final = group_df(df_a_pro)
df_a_final

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2,activities,purchased
0,2020-04-01 00:00:03+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635165435,861f2378-076f-4ddd-85e3-9844923d03a9,0.0,0.0,1.0,2,construction,tools.light,1,0.0
1,2020-04-01 00:00:09+00:00,view,9200640,2232732104343421549,apparel.scarf,defender,20.19,533896443,6a220235-f4d6-4987-a51e-8f315b3027fc,0.0,0.0,1.0,2,apparel,scarf,1,0.0
2,2020-04-01 00:00:14+00:00,view,5100375,2232732103101907535,electronics.clocks,xiaomi,84.94,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,0.0,0.0,1.0,2,electronics,clocks,2,0.0
3,2020-04-01 00:00:31+00:00,view,2702580,2232732091718566220,appliances.kitchen.refrigerators,midea,710.44,514203043,5adc4e92-9475-407a-ac1b-8c5529f3f23e,0.0,0.0,1.0,2,appliances,kitchen.refrigerators,3,0.0
4,2020-04-01 00:00:35+00:00,cart,5100328,2232732103101907535,electronics.clocks,xiaomi,117.12,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,1.0,0.0,0.0,2,electronics,clocks,2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5225561,2020-04-30 23:59:47+00:00,view,100079844,2232732093077520756,construction.tools.light,realme,192.80,649760110,56472da4-6d85-4770-9fe4-87a5990502b6,0.0,0.0,1.0,3,construction,tools.light,207,0.0
5225562,2020-04-30 23:59:48+00:00,view,1004229,2232732093077520756,construction.tools.light,apple,952.15,529225489,adc5652c-cae8-49ae-88a8-1d192d60f6fa,0.0,0.0,1.0,3,construction,tools.light,1,0.0
5225563,2020-04-30 23:59:50+00:00,view,100101230,2232732093077520756,construction.tools.light,samsung,465.93,646463987,3b2022ca-0cc7-4152-bd54-62aaa23daf92,0.0,0.0,1.0,3,construction,tools.light,1,0.0
5225564,2020-04-30 23:59:51+00:00,view,28300873,2053013554751078769,appliances.kitchen.grill,karya,32.18,550540468,a9afddeb-f9ed-4eb5-a157-d903dd6ae124,0.0,0.0,1.0,3,appliances,kitchen.grill,1,0.0


In [88]:
purchased(df_a_final)

Unnamed: 0,purchased,%
0.000000,5064358,97
1.000000,152560,3
2.000000,7527,0
3.000000,758,0
5.000000,128,0
4.000000,127,0
7.000000,63,0
6.000000,23,0
10.000000,22,0
Total,5225566,100


In [None]:
# df_m[df_m.duplicated() == True]

In [37]:
# data.user_session = '9e56c827-59e0-4c1f-9c5d-cf189c3ba19e'

df_a_pro.query("product_id == 100068493")

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2
0,2020-04-01 00:00:03+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635165435,861f2378-076f-4ddd-85e3-9844923d03a9,0.0,0.0,1.0,2,construction,tools.light
361,2020-04-01 00:33:15+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,625670559,ed3868b8-d557-4502-9c57-31bce3205fe2,0.0,0.0,1.0,2,construction,tools.light
436,2020-04-01 00:39:52+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635043119,a2856b1d-1dd5-4d66-9fbe-6e6236b99eed,0.0,0.0,1.0,2,construction,tools.light
678,2020-04-01 01:00:53+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,627485148,0edcc654-7c09-4d2b-81da-4b49050e7516,0.0,0.0,1.0,2,construction,tools.light
800,2020-04-01 01:09:04+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635168853,f8619298-5328-4b67-8880-69915306697e,0.0,0.0,1.0,2,construction,tools.light
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5225199,2020-04-30 23:52:03+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,301.20,637620227,759cef3e-4f19-4226-b289-e705c06d868f,0.0,0.0,1.0,3,construction,tools.light
5225375,2020-04-30 23:55:33+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,301.20,622163358,0b016bbe-8146-44ae-b2fe-0575c49b7c3e,0.0,0.0,1.0,3,construction,tools.light
5225393,2020-04-30 23:55:52+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,301.20,647594253,d4c95868-ac82-4251-a0e2-e9ad0a333209,0.0,0.0,1.0,3,construction,tools.light
5225411,2020-04-30 23:56:14+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,301.20,627374507,d9a6f263-9dab-4320-aba3-dc85334a2fd1,0.0,0.0,1.0,3,construction,tools.light


In [38]:
df_m.query("user_session == 'bde905b5-20f1-40ff-8ef6-a60d875e31e0'")

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
9376,2020-03-01 03:48:42 UTC,cart,0 26205155\n1 2401253\n2 ...,0 2232732081585127530\n1 223...,electronics.clocks,apple,516.56,0 622090375\n1 595972218\n2 ...,bde905b5-20f1-40ff-8ef6-a60d875e31e0
9377,2020-03-01 03:48:42 UTC,cart,0 26205155\n1 2401253\n2 ...,0 2232732081585127530\n1 223...,electronics.clocks,apple,516.56,0 622090375\n1 595972218\n2 ...,bde905b5-20f1-40ff-8ef6-a60d875e31e0


In [94]:
df_a_final['month'] = pd.to_datetime(df_a_final['event_time']).dt.to_period('M')
df_a_final



Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2,activities,purchased,month
0,2020-04-01 00:00:03+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635165435,861f2378-076f-4ddd-85e3-9844923d03a9,0.0,0.0,1.0,2,construction,tools.light,1,0.0,2020-04
1,2020-04-01 00:00:09+00:00,view,9200640,2232732104343421549,apparel.scarf,defender,20.19,533896443,6a220235-f4d6-4987-a51e-8f315b3027fc,0.0,0.0,1.0,2,apparel,scarf,1,0.0,2020-04
2,2020-04-01 00:00:14+00:00,view,5100375,2232732103101907535,electronics.clocks,xiaomi,84.94,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,0.0,0.0,1.0,2,electronics,clocks,2,0.0,2020-04
3,2020-04-01 00:00:31+00:00,view,2702580,2232732091718566220,appliances.kitchen.refrigerators,midea,710.44,514203043,5adc4e92-9475-407a-ac1b-8c5529f3f23e,0.0,0.0,1.0,2,appliances,kitchen.refrigerators,3,0.0,2020-04
4,2020-04-01 00:00:35+00:00,cart,5100328,2232732103101907535,electronics.clocks,xiaomi,117.12,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,1.0,0.0,0.0,2,electronics,clocks,2,0.0,2020-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5225561,2020-04-30 23:59:47+00:00,view,100079844,2232732093077520756,construction.tools.light,realme,192.80,649760110,56472da4-6d85-4770-9fe4-87a5990502b6,0.0,0.0,1.0,3,construction,tools.light,207,0.0,2020-04
5225562,2020-04-30 23:59:48+00:00,view,1004229,2232732093077520756,construction.tools.light,apple,952.15,529225489,adc5652c-cae8-49ae-88a8-1d192d60f6fa,0.0,0.0,1.0,3,construction,tools.light,1,0.0,2020-04
5225563,2020-04-30 23:59:50+00:00,view,100101230,2232732093077520756,construction.tools.light,samsung,465.93,646463987,3b2022ca-0cc7-4152-bd54-62aaa23daf92,0.0,0.0,1.0,3,construction,tools.light,1,0.0,2020-04
5225564,2020-04-30 23:59:51+00:00,view,28300873,2053013554751078769,appliances.kitchen.grill,karya,32.18,550540468,a9afddeb-f9ed-4eb5-a157-d903dd6ae124,0.0,0.0,1.0,3,appliances,kitchen.grill,1,0.0,2020-04


In [96]:
df_a_final.groupby(['user_id'])['user_session'].count().reset_index()

Unnamed: 0,user_id,user_session
0,29515875,3
1,42896738,1
2,86517859,3
3,94566147,1
4,96369466,2
...,...,...
1758007,649775252,1
1758008,649775296,1
1758009,649775362,1
1758010,649775534,1


In [103]:
df_a_ = df_a_final.copy()
df_a_.set_index(df_a_final['user_id'], inplace=True)
df_a_

Unnamed: 0_level_0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_type_cart,event_type_purchase,event_type_view,event_weekday,category_code_level1,category_code_level2,activities,purchased,month
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
635165435,2020-04-01 00:00:03+00:00,view,100068493,2232732093077520756,construction.tools.light,samsung,319.41,635165435,861f2378-076f-4ddd-85e3-9844923d03a9,0.0,0.0,1.0,2,construction,tools.light,1,0.0,2020-04
533896443,2020-04-01 00:00:09+00:00,view,9200640,2232732104343421549,apparel.scarf,defender,20.19,533896443,6a220235-f4d6-4987-a51e-8f315b3027fc,0.0,0.0,1.0,2,apparel,scarf,1,0.0,2020-04
635164513,2020-04-01 00:00:14+00:00,view,5100375,2232732103101907535,electronics.clocks,xiaomi,84.94,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,0.0,0.0,1.0,2,electronics,clocks,2,0.0,2020-04
514203043,2020-04-01 00:00:31+00:00,view,2702580,2232732091718566220,appliances.kitchen.refrigerators,midea,710.44,514203043,5adc4e92-9475-407a-ac1b-8c5529f3f23e,0.0,0.0,1.0,2,appliances,kitchen.refrigerators,3,0.0,2020-04
635164513,2020-04-01 00:00:35+00:00,cart,5100328,2232732103101907535,electronics.clocks,xiaomi,117.12,635164513,c40d1b96-90aa-4cee-b9aa-9475d9c4f17f,1.0,0.0,0.0,2,electronics,clocks,2,0.0,2020-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649760110,2020-04-30 23:59:47+00:00,view,100079844,2232732093077520756,construction.tools.light,realme,192.80,649760110,56472da4-6d85-4770-9fe4-87a5990502b6,0.0,0.0,1.0,3,construction,tools.light,207,0.0,2020-04
529225489,2020-04-30 23:59:48+00:00,view,1004229,2232732093077520756,construction.tools.light,apple,952.15,529225489,adc5652c-cae8-49ae-88a8-1d192d60f6fa,0.0,0.0,1.0,3,construction,tools.light,1,0.0,2020-04
646463987,2020-04-30 23:59:50+00:00,view,100101230,2232732093077520756,construction.tools.light,samsung,465.93,646463987,3b2022ca-0cc7-4152-bd54-62aaa23daf92,0.0,0.0,1.0,3,construction,tools.light,1,0.0,2020-04
550540468,2020-04-30 23:59:51+00:00,view,28300873,2053013554751078769,appliances.kitchen.grill,karya,32.18,550540468,a9afddeb-f9ed-4eb5-a157-d903dd6ae124,0.0,0.0,1.0,3,appliances,kitchen.grill,1,0.0,2020-04


In [None]:
# df_m_pro.query("category_id == 2232732103101907535")

In [None]:
# cart_purchase_users = df_m_pro.loc[df_m_pro["event_type"].isin(["cart","purchase"])].drop_duplicates(subset=['user_id'])
# cart_purchase_users

In [None]:
# cart_purchase_users.dropna(how='any', inplace=True)