In [1]:
!pip3 install polars


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm
import glob
import numpy as np
import multiprocessing
import os
import pickle
import glob
from collections import Counter
import itertools
import sys
import gc
import polars as pl

In [3]:
# Not referenced by any of the visible cells — presumably a leftover; verify before removing.
DISK_PIECES = 4
# Root folder holding the input parquet chunks and the generated `features/` files.
data_dir = "../data/"
# Integer codes used in the `type` column for each kind of event.
type_labels = {'clicks':0, 'carts':1, 'orders':2}
# Per-type weights keyed by the codes above; the recency-score helper further
# down defines its own identical local copy instead of reading this one.
type_weight = {0:1, 1:6, 2:3}

In [4]:
def load_data(dfs, path):
    """Read every parquet file matching ``path`` and stack them into one frame.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames placed ahead of the ones read from disk (callers in this
        notebook pass ``[]``). The list itself is not modified.
    path : str
        Glob pattern selecting the parquet files to read.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``dfs`` and the files read, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``dfs`` is empty and no file matches ``path``.
    """
    # glob yields files in arbitrary, filesystem-dependent order; sorting makes
    # the row order of the result reproducible between runs and machines.
    chunk_files = sorted(glob.glob(path))
    # Work on a copy so the caller's list is not grown as a side effect.
    frames = list(dfs)
    frames.extend(pd.read_parquet(chunk_file) for chunk_file in chunk_files)
    if not frames:
        raise ValueError(f"No parquet files match {path!r}")
    return pd.concat(frames).reset_index(drop=True)

In [5]:
# Load both splits; the features below are computed over train and test together.
train = load_data([], data_dir + 'train_pqt_chunks/*')
test = load_data([], data_dir + 'test_pqt_chunks/*')
print(train.shape, test.shape)

# Stack the splits, then release the per-split frames to free memory.
df = pd.concat([train, test])
del train, test

# Order events by time within each session. reset_index() keeps the previous
# row labels as an extra "index" column.
df = df.sort_values(by=['session', 'ts']).reset_index()
print(df.shape)

(216716096, 4) (6928123, 4)
(223644219, 5)


In [None]:
## The below code generates 3 kinds of features
1. User features - these are session-based features such as the length of the session, the average hour and day of the session, the number of aids that are clicked, carted and ordered within the session, along with how long the session was stretched, its first day and its last day.
2. User-item features - these are based on session and aid; features include the number of times the same aid was clicked/carted/ordered, historical clicks, and a recency score.
3. Item features - these are item-based features; they include the number of unique sessions in which the aid was found, the average type, the most common day, and the number of times the aid was clicked, carted or ordered.

## Generating User Features

In [None]:
# Calendar features derived from the event timestamp, read as seconds since the epoch.
dts = pd.to_datetime(df['ts'], unit='s') 
# Day of the week as an integer (pandas convention: Monday is 0).
df['day'] = (dts.dt.weekday).astype('int8')
df['hour'] = (dts.dt.hour).astype('int8')
# Hour * 100 plus the minute expressed in hundredths of an hour (integer division).
df['hm'] = (dts.dt.hour*100 + dts.dt.minute*100//60).astype('int16')

In [None]:
# Timestamp of the preceding event in the same session (NaN on a session's first event).
df["prev_ts"] = (
    df.sort_values(['session', 'ts'])
      .groupby(['session'])['ts']
      .shift(1)
)

# A "break" is a gap of more than one day since the previous event ...
day_in_seconds = 24 * 60 * 60
df["session_break"] = df["ts"].sub(df["prev_ts"]).gt(day_in_seconds).astype("float64")
# ... and the first event of every session also counts as one.
df.loc[df["prev_ts"].isna(), "session_break"] = 1

# Time elapsed since the previous event of the session.
df["time_gap"] = df["ts"].sub(df["prev_ts"])

In [None]:
# How each raw column is summarised per session.
session_aggregations = {
    'session': 'count',
    'aid': 'nunique',
    'type': 'mean',
    'hour': 'mean',
    'day': 'nunique',
    'session_break': 'sum',
}
# Feature names given to the aggregated columns.
session_feature_names = {
    "session": "user_session_length",
    "aid": "unique_aid",
    "type": "user_type_score",
    'hour': 'user_hh_mean',
    'day': 'unique_days',
    'session_break': 'unique_sessions',
}

user_features = df.groupby('session').agg(session_aggregations)
# Rename before reset_index(): the aggregated "session" column would otherwise
# clash with the group key that reset_index() moves back into the columns.
user_features = user_features.rename(columns=session_feature_names).reset_index()
user_features.head()

In [None]:
# Mean time between consecutive events of a session, skipping rows flagged as
# session breaks (a session's first event or a gap longer than a day).
# TODO(review): `time_gap` is never merged into `user_features` in this cell;
# confirm whether it should be joined below or removed.
time_gap = df[df.session_break==0].groupby("session").agg({"time_gap":"mean"}).reset_index()

# Weekday of the first and last row of each session. Only the `day` column is
# selected before aggregating so the remaining columns of the full frame are
# not reduced for nothing.
first_day = df.groupby('session')['day'].first().reset_index()
first_day.columns=["session",'first_day']

last_day = df.groupby('session')['day'].last().reset_index()
last_day.columns=["session",'last_day']

# Mean hour of day per session, computed separately for clicks (0), carts (1)
# and orders (2). Sessions without events of a type are absent from that table.
click_hour = df[df.type==0].groupby("session").agg({"hour":"mean"}).reset_index()
click_hour.columns=["session",'avg_clk_hr']

cart_hour = df[df.type==1].groupby("session").agg({"hour":"mean"}).reset_index()
cart_hour.columns=["session",'avg_cart_hr']

order_hour = df[df.type==2].groupby("session").agg({"hour":"mean"}).reset_index()
order_hour.columns=["session",'avg_ord_hr']

print(first_day.shape,last_day.shape,click_hour.shape,cart_hour.shape,order_hour.shape)

# first_day has one row per session, so left joins keep every session.
interim = (first_day
           .merge(last_day,how="left",on="session")
           .merge(click_hour,how="left",on="session")
           .merge(cart_hour,how="left",on="session")
           .merge(order_hour,how="left",on="session")
)
# -1 marks "no event of this type in the session".
interim = interim.fillna(-1)
print(interim.shape)

del first_day,last_day,click_hour,cart_hour,order_hour,

# The trailing fillna(-1) applies to every column of the merged table.
user_features = user_features.merge(interim,how="left",on="session").fillna(-1)

del interim

In [None]:
def getclk_car_order_counts(df):
    """Count each session's events per type (clicks, carts, orders).

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with ``session``, ``aid`` and ``type`` columns, where
        ``type`` holds the codes 0 (click), 1 (cart) and 2 (order).

    Returns
    -------
    pandas.DataFrame
        One row per session with the columns ``session``, ``user_click_cnt``,
        ``user_cart_cnt`` and ``user_order_cnt``. A count is 0 when the session
        has no event of that type.
    """
    count_names = {0: "user_click_cnt", 1: "user_cart_cnt", 2: "user_order_cnt"}
    x = pd.pivot_table(df, values='aid', index='session', columns='type', aggfunc='count')
    # pivot_table only emits columns for type codes present in the data; add
    # the absent ones so all three count columns always exist for callers.
    for code in count_names:
        if code not in x.columns:
            x[code] = 0
    # Keep the columns in type-code order regardless of which ones were added.
    x = x.sort_index(axis=1).reset_index()
    x = x.rename(columns=count_names)
    return x.fillna(0)

# Per-session click/cart/order counts; the raw events are released afterwards.
user_type_counts = getclk_car_order_counts(df)
del df

print(user_features.shape, user_type_counts.shape)

# Inner join: only sessions present in both tables are kept.
user_features = pd.merge(user_features, user_type_counts, on="session", how="inner")
del user_type_counts

In [None]:
# Share of each event type among all events of the session.
for action in ("click", "cart", "order"):
    user_features[f"user_{action}_rate"] = (
        user_features[f"user_{action}_cnt"] / user_features["user_session_length"]
    )

# Events of each type divided by the number of session breaks (`unique_sessions`).
for action in ("click", "cart", "order"):
    user_features[f"user_{action}_rate_per_session"] = (
        user_features[f"user_{action}_cnt"] / user_features["unique_sessions"]
    )

user_features.head()

In [None]:
print(user_features.shape)
# Create the output folder on first run so the write cannot fail on a missing
# directory after the long feature computation.
os.makedirs(f"{data_dir}features", exist_ok=True)
user_features.to_parquet(f"{data_dir}features/user_feat.pqt")

In [None]:
# The user-level table has been written to disk; drop the reference before the
# next section reloads the raw events.
del user_features

## Generating User-Item Features

In [None]:
# Reload the raw events (the previous section deleted its copy).
train = load_data([], data_dir + 'train_pqt_chunks/*')
test = load_data([], data_dir + 'test_pqt_chunks/*')

# Stack both splits and release the per-split frames.
df = pd.concat([train, test])
del train, test

# Order events by time within each session. reset_index() keeps the previous
# row labels as an "index" column, which is dropped again when saving.
df = df.sort_values(by=['session', 'ts']).reset_index()

In [None]:
# (name used in the feature columns, code in the `type` column)
actions = (("click", 0), ("cart", 1), ("order", 2))

print("start")
# For every event: how many earlier events of the same type the session already
# had on the same aid. Rows of another type stay NaN until the final fill.
for action, code in actions:
    df[f"count_prev_{action}_action"] = (
        df[df.type == code].groupby(["session", "aid"]).cumcount()
    )

for phase, (action, code) in enumerate(actions, start=1):
    print(f"ph{phase}")
    rank_col = f"{action}_cg_rank"
    prev_rank_col = f"prev_{action}_cg_rank"
    # Position of the event among the session's events of this type, counted
    # from the end of the session.
    df[rank_col] = df[df.type == code].groupby(["session"]).cumcount(ascending=False)
    # Rank carried by the previous row with the same session and aid.
    df[prev_rank_col] = df.groupby(['session', 'aid'])[rank_col].shift()
    df[f"hist_{action}"] = df[prev_rank_col] - df[rank_col]
    # The two rank columns are only scaffolding for the difference.
    df.drop(columns=[rank_col, prev_rank_col], inplace=True)

print("ph4")
# Undefined values (wrong event type, or no previous row for the pair) become -1.
sentinel_columns = [f"count_prev_{action}_action" for action, _ in actions]
sentinel_columns += [f"hist_{action}" for action, _ in actions]
df.fillna(value={column: -1 for column in sentinel_columns}, inplace=True)

In [None]:
def add_action_num_reverse_chrono(df):
    """Append ``action_num_reverse_chrono`` to a polars frame.

    The new column is the cumulative row count computed within each
    ``session`` group and then reversed inside that group.
    """
    reverse_position = pl.col('session').cumcount().reverse().over('session')
    return df.select([
        pl.all(),
        reverse_position.alias('action_num_reverse_chrono'),
    ])

def add_session_length(df):
    """Append ``session_length``: the number of rows sharing each row's ``session``."""
    rows_in_session = pl.col('session').count().over('session')
    return df.select([
        pl.all(),
        rows_in_session.alias('session_length'),
    ])

def add_log_recency_score(df):
    """Append ``log_recency_score`` computed from each row's position in its session.

    Requires the ``session_length`` and ``action_num_reverse_chrono`` columns.
    The position is mapped linearly onto an exponent starting at 0.1 and the
    score is ``2**exponent - 1``. NaN values in the resulting frame are
    replaced by 1.
    """
    length = df['session_length']
    # Presumably 0 for the first event and length-1 for the last, assuming
    # action_num_reverse_chrono is 0 on the last event — TODO confirm.
    position = length - df['action_num_reverse_chrono'] - 1
    floor = 0.1
    # Slope of the interpolation; a one-row session divides by zero here.
    slope = (1 - floor) / (length - 1)
    linear_interpolation = floor + slope * position
    score = pl.Series(2**linear_interpolation - 1).alias('log_recency_score')
    # fill_nan is applied to the whole frame, not only to the new column.
    return df.with_columns(score).fill_nan(1)

def add_type_weighted_log_recency_score(df):
    """Append ``type_weighted_log_recency_score``.

    Each row's ``log_recency_score`` is multiplied by a weight looked up from
    its ``type`` code (0 -> 1, 1 -> 6, 2 -> 3).
    """
    type_weights = {0:1, 1:6, 2:3}
    # Row-by-row Python lookup of the weight for every type code.
    row_weights = df['type'].apply(lambda code: type_weights[code])
    weighted_score = pl.Series(row_weights * df['log_recency_score'])
    return df.with_columns(weighted_score.alias('type_weighted_log_recency_score'))

def apply(df, pipeline):
    """Pass ``df`` through every callable in ``pipeline``, in order.

    Each step receives the previous step's return value; the value returned by
    the last step is the result. An empty pipeline returns ``df`` unchanged.
    """
    result = df
    for step in pipeline:
        result = step(result)
    return result

# Steps run in order; each one relies on columns added by the steps before it.
pipeline = [add_action_num_reverse_chrono, add_session_length, add_log_recency_score, add_type_weighted_log_recency_score]
# `session_length` is only a helper for the recency score, so it is excluded
# on the polars side before converting back to pandas. The exclude() list
# previously named the misspelled "sesssion_length", which removed nothing and
# required a second drop in pandas.
df = apply(pl.from_pandas(df), pipeline).select(pl.exclude(["session_length"])).to_pandas()
df.head()

In [None]:
# Write under data_dir like the user features, creating the folder if needed.
# The "index" column is a leftover of reset_index() and is not a feature.
os.makedirs(f"{data_dir}features", exist_ok=True)
df.drop(columns=["index"]).to_parquet(f"{data_dir}features/user_item.pqt",index=False)

## Generating Item Features

In [None]:
# Reload the raw events for the item-level features.
train = load_data([], data_dir + 'train_pqt_chunks/*')
test = load_data([], data_dir + 'test_pqt_chunks/*')

# Stack both splits and release the per-split frames.
df = pd.concat([train, test])
del train, test

# Order events by time within each session; reset_index() keeps the previous
# row labels as an "index" column.
df = df.sort_values(by=['session', 'ts']).reset_index()
df.head()

In [None]:
# Calendar features derived from the event timestamp, read as seconds since the epoch.
dts = pd.to_datetime(df['ts'], unit='s') 
# Day of the week as an integer (pandas convention: Monday is 0).
df['day'] = (dts.dt.weekday).astype('int8')
df['hour'] = (dts.dt.hour).astype('int8')
# Hour * 100 plus the minute expressed in hundredths of an hour (integer division).
df['hm'] = (dts.dt.hour*100 + dts.dt.minute*100//60).astype('int16')
# Keep only the columns used below; this also drops the "index" column left by reset_index().
df = df[["session","aid","type","ts","day","hour","hm"]]

In [None]:
def first_mode(s):
    """Return the first entry of ``s.mode()``, or ``None`` when there is no mode.

    ``Series.mode`` returns its values sorted, so ties resolve to the smallest
    value. An empty result (e.g. an empty or all-missing series) yields ``None``.
    """
    modes = s.mode()
    return None if modes.empty else modes.iloc[0]

# How each raw column is summarised per aid.
item_aggregations = {
    'aid': 'count',
    'session': 'nunique',
    'type': 'mean',
    'day': first_mode,
    'hour': 'mean',
}
# Feature names given to the aggregated columns.
item_feature_names = {
    "aid": "item_overall_count",
    "session": "item_unique_session",
    "type": "item_type_score",
    "day": "item_common_day",
    "hour": "item_mean_hour",
}

item_features = df.groupby('aid').agg(item_aggregations)
# Rename before reset_index(): the aggregated "aid" column would otherwise
# clash with the group key that reset_index() moves back into the columns.
item_features = item_features.rename(columns=item_feature_names).reset_index()

In [None]:
def getitem_clk_car_order_counts(df):
    """Count each item's events per type (clicks, carts, orders).

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with ``session``, ``aid`` and ``type`` columns, where
        ``type`` holds the codes 0 (click), 1 (cart) and 2 (order).

    Returns
    -------
    pandas.DataFrame
        One row per aid with the columns ``aid``, ``item_click_cnt``,
        ``item_cart_cnt`` and ``item_order_cnt``. A count is 0 when the item
        has no event of that type.
    """
    count_names = {0: "item_click_cnt", 1: "item_cart_cnt", 2: "item_order_cnt"}
    x = pd.pivot_table(df, values='session', index='aid', columns='type', aggfunc='count')
    # pivot_table only emits columns for type codes present in the data; add
    # the absent ones so all three count columns always exist for callers.
    for code in count_names:
        if code not in x.columns:
            x[code] = 0
    # Keep the columns in type-code order regardless of which ones were added.
    x = x.sort_index(axis=1).reset_index()
    x = x.rename(columns=count_names)
    return x.fillna(0)

# Per-item click/cart/order counts, joined onto the aggregated item table.
item_type_counts = getitem_clk_car_order_counts(df)
# Inner join: only aids present in both tables are kept.
item_features = pd.merge(item_features, item_type_counts, on="aid", how="inner")
del item_type_counts

In [None]:
# Share of each event type among all events recorded for the item.
for action in ("click", "cart", "order"):
    item_features[f"item_{action}_rate"] = (
        item_features[f"item_{action}_cnt"] / item_features["item_overall_count"]
    )

item_features.head()

In [None]:
# Write under data_dir like the user features, creating the folder if needed.
os.makedirs(f"{data_dir}features", exist_ok=True)
item_features.to_parquet(f"{data_dir}features/item_feat.pqt")