In [1]:
!pip3 install polars


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm
import glob
import numpy as np
import multiprocessing
import os
import pickle
import glob
from collections import Counter
import itertools
import sys
import gc
import polars as pl

In [3]:
DISK_PIECES = 4
data_dir = "../data/"
type_labels = {'clicks':0, 'carts':1, 'orders':2}
type_weight = {0:1, 1:6, 2:3}

In [4]:
def load_data(dfs,path):    
    for e, chunk_file in enumerate(glob.glob(path)):
        chunk = pd.read_parquet(chunk_file)
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

In [None]:
train = load_data([],f'{data_dir}train_pqt_chunks/*')
test = load_data([],f'{data_dir}test_pqt_chunks/*')
print(train.shape,test.shape)
df = pd.concat([train,test])
del train,test
df = df.sort_values(['session','ts']).reset_index()
print(df.shape)

(216716096, 4) (6928123, 4)


In [None]:
## The below code generates, 
1. user features - these are session based features such as length of the session, avg hour,day of the session, number of adids that are clicked, carted and ordered within the session, along with how long was the session streached, first day and last day. 
2. user item features - these are based on sesion and adidd
3. Item features 

## Generating User Features

In [5]:
dts = pd.to_datetime(df['ts'], unit='s') 
df['day'] = (dts.dt.weekday).astype('int8')
df['hour'] = (dts.dt.hour).astype('int8')
df['hm'] = (dts.dt.hour*100 + dts.dt.minute*100//60).astype('int16')

In [6]:
df["prev_ts"]=df.sort_values(['session','ts']).groupby(['session'])['ts'].shift(1)
df["session_break"] = ((df.ts-df.prev_ts)>24*60*60)*1.0
df.loc[df.prev_ts.isnull(),"session_break"]=1

df["time_gap"] = df["ts"] - df["prev_ts"]

In [7]:
user_features = (df.groupby('session').agg({'session':'count',
                                            'aid':'nunique',
                                            'type':'mean',
                                            'hour':'mean',
                                            'day':'nunique',
                                            'session_break':'sum'})
                .rename(columns = {"session":"user_session_length",
                                   "aid":"unique_aid",
                                   "type":"user_type_score",
                                   'hour':'user_hh_mean',
                                   'day':'unique_days',
                                   'session_break':'unique_sessions'})
                .reset_index()
                )
user_features.head()

Unnamed: 0,session,user_session_length,unique_aid,user_type_score,user_hh_mean,unique_days,unique_sessions
0,0,276,183,0.09058,14.181159,7,8.0
1,1,32,22,0.25,19.1875,5,8.0
2,2,33,29,0.030303,15.818182,2,3.0
3,3,226,140,0.137168,17.367257,7,7.0
4,4,19,12,0.263158,10.526316,4,3.0


In [8]:
time_gap = df[df.session_break==0].groupby("session").agg({"time_gap":"mean"}).reset_index()

first_day = df.groupby('session').first().reset_index()[["session","day"]]
first_day.columns=["session",'first_day']

last_day = df.groupby('session').last().reset_index()[["session","day"]]
last_day.columns=["session",'last_day']

click_hour = df[df.type==0].groupby("session").agg({"hour":"mean"}).reset_index()
click_hour.columns=["session",'avg_clk_hr']

cart_hour = df[df.type==1].groupby("session").agg({"hour":"mean"}).reset_index()
cart_hour.columns=["session",'avg_cart_hr']

order_hour = df[df.type==2].groupby("session").agg({"hour":"mean"}).reset_index()
order_hour.columns=["session",'avg_ord_hr']

print(first_day.shape,last_day.shape,click_hour.shape,cart_hour.shape,order_hour.shape)

interim = (first_day
           .merge(last_day,how="left",on="session")
           .merge(click_hour,how="left",on="session")
           .merge(cart_hour,how="left",on="session")
           .merge(order_hour,how="left",on="session")
)
interim = interim.fillna(-1)
print(interim.shape)

del first_day,last_day,click_hour,cart_hour,order_hour,

user_features = user_features.merge(interim,how="left",on="session").fillna(-1)

del interim

(14571582, 2) (14571582, 2) (14569970, 2) (4053539, 2) (1662187, 2)
(14571582, 6)


In [9]:
def getclk_car_order_counts(df):
    x = pd.pivot_table(df, values='aid', index='session', columns='type',aggfunc='count').reset_index()
    x = x.rename(columns={0:"user_click_cnt",1:"user_cart_cnt",2:"user_order_cnt"})
    return x.fillna(0)

temp = getclk_car_order_counts(df)
del df

print(user_features.shape,temp.shape)

user_features = user_features.merge(temp,on="session",how="inner")
del temp

(14571582, 12) (14571582, 4)


In [10]:
user_features["user_click_rate"] = user_features["user_click_cnt"] / user_features["user_session_length"]
user_features["user_cart_rate"] = user_features["user_cart_cnt"] / user_features["user_session_length"]
user_features["user_order_rate"] = user_features["user_order_cnt"] / user_features["user_session_length"]
user_features["user_click_rate_per_session"] = user_features["user_click_cnt"] / user_features["unique_sessions"]
user_features["user_cart_rate_per_session"] = user_features["user_cart_cnt"] / user_features["unique_sessions"]
user_features["user_order_rate_per_session"] = user_features["user_order_cnt"] / user_features["unique_sessions"]
user_features.head()

Unnamed: 0,session,user_session_length,unique_aid,user_type_score,user_hh_mean,unique_days,unique_sessions,first_day,last_day,avg_clk_hr,...,avg_ord_hr,user_click_cnt,user_cart_cnt,user_order_cnt,user_click_rate,user_cart_rate,user_order_rate,user_click_rate_per_session,user_cart_rate_per_session,user_order_rate_per_session
0,0,276,183,0.09058,14.181159,7,8.0,6,6,13.913725,...,19.0,255.0,17.0,4.0,0.923913,0.061594,0.014493,31.875,2.125,0.5
1,1,32,22,0.25,19.1875,5,8.0,6,6,18.666667,...,-1.0,24.0,8.0,0.0,0.75,0.25,0.0,3.0,1.0,0.0
2,2,33,29,0.030303,15.818182,2,3.0,6,6,15.9375,...,-1.0,32.0,1.0,0.0,0.969697,0.030303,0.0,10.666667,0.333333,0.0
3,3,226,140,0.137168,17.367257,7,7.0,6,6,17.25,...,21.8,200.0,21.0,5.0,0.884956,0.09292,0.022124,28.571429,3.0,0.714286
4,4,19,12,0.263158,10.526316,4,3.0,6,5,10.2,...,22.0,15.0,3.0,1.0,0.789474,0.157895,0.052632,5.0,1.0,0.333333


In [11]:
print(user_features.shape)
user_features.to_parquet(f"{data_dir}features/user_feat.pqt")

(14571582, 21)


In [12]:
del user_features

## Generating user Item Features

In [5]:
train = load_data([],f'{data_dir}train_pqt_chunks/*')
test = load_data([],f'{data_dir}test_pqt_chunks/*')
df = pd.concat([train,test])
del train,test
df = df.sort_values(['session','ts']).reset_index()

In [6]:
print("start")
df["count_prev_click_action"] = df[df.type==0].groupby(["session","aid"]).cumcount()
df["count_prev_cart_action"] = df[df.type==1].groupby(["session","aid"]).cumcount()
df["count_prev_order_action"] = df[df.type==2].groupby(["session","aid"]).cumcount()

print("ph1")
df["click_cg_rank"] = df[df.type==0].groupby(["session"]).cumcount(ascending=False)
df['prev_click_cg_rank'] = df.groupby(['session','aid'])['click_cg_rank'].shift()
df['hist_click'] = df["prev_click_cg_rank"] - df["click_cg_rank"]
df.drop(columns=['click_cg_rank', 'prev_click_cg_rank'],inplace=True)

print("ph2")
df["cart_cg_rank"] = df[df.type==1].groupby(["session"]).cumcount(ascending=False)
df['prev_cart_cg_rank'] = df.groupby(['session','aid'])['cart_cg_rank'].shift()
df['hist_cart'] = df["prev_cart_cg_rank"] - df["cart_cg_rank"]
df.drop(columns=['cart_cg_rank', 'prev_cart_cg_rank'],inplace=True)

print("ph3")
df["order_cg_rank"] = df[df.type==2].groupby(["session"]).cumcount(ascending=False)
df['prev_order_cg_rank'] = df.groupby(['session','aid'])['order_cg_rank'].shift()
df['hist_order'] = df["prev_order_cg_rank"] - df["order_cg_rank"]
df.drop(columns=['order_cg_rank', 'prev_order_cg_rank'],inplace=True)

print("ph4")
df.fillna(value={"count_prev_click_action":-1,"count_prev_cart_action":-1,"count_prev_order_action":-1},inplace=True)
df.fillna(value={"hist_click":-1,"hist_cart":-1,"hist_order":-1},inplace=True)

start
ph1
ph2
ph3
ph4


In [7]:
def add_action_num_reverse_chrono(df):
        return df.select([
            pl.col('*'),
            pl.col('session').cumcount().reverse().over('session').alias('action_num_reverse_chrono')
        ])

def add_session_length(df):
    return df.select([
        pl.col('*'),
        pl.col('session').count().over('session').alias('session_length')
    ])

def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1-0.1) / (df['session_length']-1)) * (df['session_length']-df['action_num_reverse_chrono']-1)
    return df.with_columns(pl.Series(2**linear_interpolation - 1).alias('log_recency_score')).fill_nan(1)

def add_type_weighted_log_recency_score(df):
    type_weights = {0:1, 1:6, 2:3}
    type_weighted_log_recency_score = pl.Series(df['type'].apply(lambda x: type_weights[x]) * df['log_recency_score'])
    return df.with_columns(type_weighted_log_recency_score.alias('type_weighted_log_recency_score'))

def apply(df, pipeline):
    for f in pipeline:
        df = f(df)
    return df

pipeline = [add_action_num_reverse_chrono, add_session_length, add_log_recency_score, add_type_weighted_log_recency_score]
df = apply(pl.from_pandas(df), pipeline).select(pl.exclude(["sesssion_length"])).to_pandas()
df.drop(columns=["session_length"],inplace=True)
df.head()

Series.apply is significantly slower than the native series API.
Only use if you absolutely CANNOT implement your logic otherwise.
In this case, you can replace your `apply` with the following:
  - s.apply(lambda x: ...)
  + s.map_dict(type_weights)

  type_weighted_log_recency_score = pl.Series(df['type'].apply(lambda x: type_weights[x]) * df['log_recency_score'])


Unnamed: 0,index,session,aid,ts,type,count_prev_click_action,count_prev_cart_action,count_prev_order_action,hist_click,hist_cart,hist_order,action_num_reverse_chrono,log_recency_score,type_weighted_log_recency_score
0,1029224,0,1517085,1659304800,0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,275,0.071773,0.071773
1,1029225,0,1563459,1659304904,0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,274,0.074208,0.074208
2,1029226,0,1309446,1659367439,0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,273,0.076647,0.076647
3,1029227,0,16246,1659367719,0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,272,0.079092,0.079092
4,1029228,0,1781822,1659367871,0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,271,0.081543,0.081543


In [9]:
df.drop(columns=["index"]).to_parquet("../data/features/user_item.pqt",index=False)

## Generating Item Features

In [8]:
train = load_data([],f'{data_dir}train_pqt_chunks/*')
test = load_data([],f'{data_dir}test_pqt_chunks/*')
df = pd.concat([train,test])
del train,test
df = df.sort_values(['session','ts']).reset_index()
df.head()

Unnamed: 0,index,session,aid,ts,type
0,1029224,0,1517085,1659304800,0
1,1029225,0,1563459,1659304904,0
2,1029226,0,1309446,1659367439,0
3,1029227,0,16246,1659367719,0
4,1029228,0,1781822,1659367871,0


In [9]:
dts = pd.to_datetime(df['ts'], unit='s') 
df['day'] = (dts.dt.weekday).astype('int8')
df['hour'] = (dts.dt.hour).astype('int8')
df['hm'] = (dts.dt.hour*100 + dts.dt.minute*100//60).astype('int16')
df = df[["session","aid","type","ts","day","hour","hm"]]

In [20]:
def first_mode(s):
    modes = s.mode()  # Compute the modes
    if not modes.empty:
        return modes.iloc[0]  # Return the first mode value
    else:
        return None 

item_features = (df.groupby('aid').agg({'aid':'count',
                                        'session':'nunique',
                                        'type':'mean',
                                        'day':first_mode,
                                        'hour':'mean'})
                .rename(columns = {"aid":"item_overall_count",
                                   "session":"item_unique_session",
                                   "type":"item_type_score",
                                   "day":"item_common_day",
                                   "hour":"item_mean_hour"})
                .reset_index()
                )

In [21]:
def getitem_clk_car_order_counts(df):
    x = pd.pivot_table(df, values='session', index='aid', columns='type',aggfunc='count').reset_index()
    x = x.rename(columns={0:"item_click_cnt",1:"item_cart_cnt",2:"item_order_cnt"})
    return x.fillna(0)

temp = getitem_clk_car_order_counts(df)
item_features = item_features.merge(temp,on="aid",how="inner")
del temp

In [22]:
item_features["item_click_rate"] = item_features["item_click_cnt"] / item_features["item_overall_count"]
item_features["item_cart_rate"] = item_features["item_cart_cnt"] / item_features["item_overall_count"]
item_features["item_order_rate"] = item_features["item_order_cnt"] / item_features["item_overall_count"]
item_features.head()

Unnamed: 0,aid,item_overall_count,item_unique_session,item_type_score,item_common_day,item_mean_hour,item_click_cnt,item_cart_cnt,item_order_cnt,item_click_rate,item_cart_rate,item_order_rate
0,0,48,40,0.0,2,13.666667,48.0,0.0,0.0,1.0,0.0,0.0
1,1,34,30,0.029412,6,13.323529,33.0,1.0,0.0,0.970588,0.029412,0.0
2,2,17,16,0.0,6,11.529412,17.0,0.0,0.0,1.0,0.0,0.0
3,3,2759,1392,0.104023,6,13.472273,2513.0,205.0,41.0,0.910837,0.074302,0.01486
4,4,221,143,0.040724,4,14.606335,212.0,9.0,0.0,0.959276,0.040724,0.0


In [24]:
item_features.to_parquet("../data/features/item_feat.pqt")