In [1]:
import os
import sys
# Change directory to two levels up
os.chdir("../..")
print(f"Current working directory: {os.getcwd()}")

Current working directory: /Users/lucapagano/Developer/embedNbreakfast


---

# Folder variables definition

In [12]:
DATA_DIR='/Users/lucapagano/Developer/RecSys/ACM 2025/new_ubc_data/' # Product Properties
DATA_INPUT_DIR='/Users/lucapagano/Developer/RecSys/ACM 2025/new_ubc_data/input/' # Relevant Client
RESULT_DIR='gru_preprocessing/results'

In [9]:
import polars as pl
import numpy as np
from gru_preprocessing.util_functions import collapse, get_entropy_from_counts, add_nearby_skus, parse_embedding,fill_nulls

data_dir=DATA_DIR
data_input_dir= DATA_INPUT_DIR
relevant_clients=np.load(data_input_dir+"relevant_clients.npy")

buy_product= pl.read_parquet(data_input_dir+"product_buy.parquet")
add_to_cart= pl.read_parquet(data_input_dir+"add_to_cart.parquet")
remove_from_cart= pl.read_parquet(data_input_dir+"remove_from_cart.parquet")

product_properties= pl.read_parquet(data_dir+"product_properties.parquet")

In [10]:
buy_product = buy_product.with_columns(
    pl.lit("buy_product").alias("event_type")
)

add_to_cart = add_to_cart.with_columns(
    pl.lit("add_to_cart").alias("event_type")
)

remove_from_cart = remove_from_cart.with_columns(
    pl.lit("remove_from_cart").alias("event_type")
)

product_events = pl.concat([buy_product, add_to_cart, remove_from_cart])
product_events

client_id,timestamp,sku,event_type
i64,datetime[ns],i64,str
19433713,2022-06-23 00:12:15,649662,"""buy_product"""
11106698,2022-06-23 00:12:25,965816,"""buy_product"""
3334023,2022-06-23 00:15:25,419981,"""buy_product"""
3334023,2022-06-23 00:15:25,1161623,"""buy_product"""
2310948,2022-06-23 00:17:20,520725,"""buy_product"""
…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart"""
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart"""
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart"""
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart"""


## Top 150 skus and categories

In [11]:
tmp = buy_product.join(product_properties, on="sku", how="left").select(["sku", "category"])
top_skus = (
    buy_product
    .group_by("sku")
    .agg(pl.count())
    .sort("count", descending=True)
    .head(150)
)
top_cats = (
    tmp
    .group_by("category")
    .agg(pl.count())
    .sort("count", descending=True)
    .head(150)
)
top_skus=np.array(top_skus["sku"])
top_cats=np.array(top_cats["category"])
top_skus,top_cats

(Deprecated in version 0.20.5)
  .agg(pl.count())
(Deprecated in version 0.20.5)
  .agg(pl.count())


(array([ 867128,   52586, 1086081,  342897,  965816,  638314,  301108,
        1093832,  940412,  219232,  393893, 1107354,  832892,  697887,
        1068914, 1265323,  427852,  234831, 1487186, 1492640, 1243077,
        1361428, 1272215,  976261,  649662,  156481,  511214, 1333612,
         609067,  903811,  676739,  479155,  957599,  831416, 1498536,
         978606,  631815, 1040230,  609119,   88324,  724841,   12727,
         402362,  337267, 1586516,  212610, 1108725, 1133516,  555143,
         534158, 1540737,  725456, 1028047, 1051628,  558108, 1426204,
         352708,  327834, 1389878,  971897,  595942,  644287, 1039074,
         828437,    1350,  561077,  691917,   68738, 1202337, 1387357,
         628725,  640169,  692309, 1264060,  149543, 1535795, 1015344,
         786044,  489791, 1397283, 1152665,  180183, 1034465,  199120,
         541003, 1597810,  580542,  828058,  987703,  285186,  390669,
        1126259, 1336838, 1082943,  195952,  568797,  183540,  843260,
      

In [13]:
np.save(f"{RESULT_DIR}/top_skus.npy", top_skus)
np.save(f"{RESULT_DIR}/top_cats.npy", top_cats)

In [15]:
top_skus=np.load(f"{RESULT_DIR}/top_skus.npy")
top_cats=np.load(f"{RESULT_DIR}/top_cats.npy")

In [16]:
product_events=product_events.join(product_properties[["sku","category"]],on="sku",how="left")
product_events

client_id,timestamp,sku,event_type,category
i64,datetime[ns],i64,str,i64
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114
…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014


## Basic Counts

In [17]:
add_count=(
    product_events.filter(pl.col("event_type")=="add_to_cart")
    .group_by("sku")
    .agg(pl.len().alias("count_add"))
)
buy_count=(
    product_events.filter(pl.col("event_type")=="buy_product")
    .group_by("sku")
    .agg(pl.len().alias("count_buy"))
)

remove_count=(
    product_events.filter(pl.col("event_type")=="remove_from_cart")
    .group_by("sku")
    .agg(pl.len().alias("count_remove"))
)

add_user_count=(
    product_events.filter(pl.col("event_type")=="add_to_cart")
    .group_by(["sku", "client_id"])
    .agg(pl.len().alias("count_user_add"))
)
buy_user_count=(
    product_events.filter(pl.col("event_type")=="buy_product")
    .group_by(["sku", "client_id"])
    .agg(pl.len().alias("count_user_buy"))
)

remove_user_count=(
    product_events.filter(pl.col("event_type")=="remove_from_cart")
    .group_by(["sku", "client_id"])
    .agg(pl.len().alias("count_user_remove"))
)

tot_add_user_count=(
    product_events.filter(pl.col("event_type")=="add_to_cart")
    .group_by("client_id")
    .agg(pl.len().alias("tot_user_add"))
)
tot_buy_user_count=(
    product_events.filter(pl.col("event_type")=="buy_product")
    .group_by("client_id")
    .agg(pl.len().alias("tot_user_buy"))
)

tot_remove_user_count=(
    product_events.filter(pl.col("event_type")=="remove_from_cart")
    .group_by("client_id")
    .agg(pl.len().alias("tot_user_remove"))
)



product_events=(product_events
                .join(add_count,how="left",on="sku")
                .join(buy_count,how="left",on="sku")
                .join(remove_count,how="left",on="sku")
                .join(add_user_count,how="left",on=["sku", "client_id"])
                .join(buy_user_count,how="left",on=["sku", "client_id"])
                .join(remove_user_count,how="left",on=["sku", "client_id"])
                .join(tot_add_user_count,how="left",on="client_id")
                .join(tot_buy_user_count,how="left",on="client_id")
                .join(tot_remove_user_count,how="left",on="client_id")
               )
product_events

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,,1,1,
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,,1,,,1,
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,,1,,32,14,18
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,,,1,,32,14,18
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,,1,,1,1,
…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,,1,9,2,4
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,,1,5,7,2
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,,1,2,,1,25,6,16
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,,1,1,,1,7,,1


In [18]:
#category
add_cat_count=(
    product_events.filter(pl.col("event_type")=="add_to_cart")
    .group_by("category")
    .agg(pl.len().alias("count_cat_add"))
)
buy_cat_count=(
    product_events.filter(pl.col("event_type")=="buy_product")
    .group_by("category")
    .agg(pl.len().alias("count_cat_buy"))
)

remove_cat_count=(
    product_events.filter(pl.col("event_type")=="remove_from_cart")
    .group_by("category")
    .agg(pl.len().alias("count_cat_remove"))
)

add_cat_user_count=(
    product_events.filter(pl.col("event_type")=="add_to_cart")
    .group_by(["category", "client_id"])
    .agg(pl.len().alias("count_cat_user_add"))
)
buy_cat_user_count=(
    product_events.filter(pl.col("event_type")=="buy_product")
    .group_by(["category", "client_id"])
    .agg(pl.len().alias("count_cat_user_buy"))
)

remove_cat_user_count=(
    product_events.filter(pl.col("event_type")=="remove_from_cart")
    .group_by(["category", "client_id"])
    .agg(pl.len().alias("count_cat_user_remove"))
)





product_events=(product_events
                .join(add_cat_count,how="left",on="category")
                .join(buy_cat_count,how="left",on="category")
                .join(remove_cat_count,how="left",on="category")
                .join(add_cat_user_count,how="left",on=["category", "client_id"])
                .join(buy_cat_user_count,how="left",on=["category", "client_id"])
                .join(remove_cat_user_count,how="left",on=["category", "client_id"])
               )


In [19]:
product_events

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,,1,1,,7493,3013,1883,1,1,
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,,1,,,1,,18211,6683,6248,,1,
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,,1,,32,14,18,56052,21711,19208,,1,1
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,,,1,,32,14,18,23595,11265,7316,,1,
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,,1,,1,1,,9830,3573,3602,1,1,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,,1,9,2,4,48428,16901,15461,1,,1
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,,1,5,7,2,3526,1137,1321,1,,1
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,,1,2,,1,25,6,16,786,272,223,3,,1
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,,1,1,,1,7,,1,10364,3280,3477,1,,1


In [21]:
from gru_preprocessing.util_functions import collapse, get_entropy_from_counts, add_nearby_skus, parse_embedding,fill_nulls
product_events= fill_nulls(product_events)

In [22]:
#conversion rates
product_events = product_events.with_columns([
    (pl.col("count_cat_buy") / pl.col("count_cat_add")).alias("global_cat_cr"),
    (pl.col("count_cat_user_buy") / pl.col("count_cat_user_add")).alias("local_cat_cr")
])

product_events = product_events.with_columns([
    (pl.col("count_buy") / pl.col("count_add")).alias("global_cr"),
    (pl.col("tot_user_buy") / pl.col("tot_user_add")).alias("local_cr"),
    (pl.col("count_user_buy") / pl.col("count_user_add")).alias("cr")
])

columns=["local_cr","cr","global_cr"]

product_events = product_events.with_columns([
    pl.when(pl.col(c).is_infinite())
      .then(1.0)
      .otherwise(pl.col(c))
      .fill_nan(0.0)
      .alias(c) for c in columns
])

In [24]:
#17
product_events 

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,0,1,1,0,7493,3013,1883,1,1,0,0.402109,1.0,0.447162,1.0,1.0
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,0,1,0,0,1,0,18211,6683,6248,0,1,0,0.366976,inf,0.462474,1.0,1.0
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,32,14,18,56052,21711,19208,0,1,1,0.387337,inf,0.823529,0.4375,1.0
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,32,14,18,23595,11265,7316,0,1,0,0.477432,inf,1.2,0.4375,1.0
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,9830,3573,3602,1,1,0,0.363479,1.0,0.142857,1.0,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,0,1,9,2,4,48428,16901,15461,1,0,1,0.348992,0.0,0.297071,0.222222,0.0
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,0,1,5,7,2,3526,1137,1321,1,0,1,0.322462,0.0,0.258065,1.4,0.0
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,0,1,2,0,1,25,6,16,786,272,223,3,0,1,0.346056,0.0,0.0,0.24,0.0
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,0,1,1,0,1,7,0,1,10364,3280,3477,1,0,1,0.31648,0.0,0.0,0.0,0.0


In [25]:
#top sku/cat flag
product_events = product_events.with_columns([
    pl.col("sku").is_in(top_skus).cast(pl.Int8).alias("top_sku"),
    pl.col("category").is_in(top_cats).cast(pl.Int8).alias("top_category")
])
product_events

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,i8,i8
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,0,1,1,0,7493,3013,1883,1,1,0,0.402109,1.0,0.447162,1.0,1.0,1,1
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,0,1,0,0,1,0,18211,6683,6248,0,1,0,0.366976,inf,0.462474,1.0,1.0,1,1
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,32,14,18,56052,21711,19208,0,1,1,0.387337,inf,0.823529,0.4375,1.0,0,1
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,32,14,18,23595,11265,7316,0,1,0,0.477432,inf,1.2,0.4375,1.0,0,1
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,9830,3573,3602,1,1,0,0.363479,1.0,0.142857,1.0,1.0,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,0,1,9,2,4,48428,16901,15461,1,0,1,0.348992,0.0,0.297071,0.222222,0.0,0,1
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,0,1,5,7,2,3526,1137,1321,1,0,1,0.322462,0.0,0.258065,1.4,0.0,0,0
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,0,1,2,0,1,25,6,16,786,272,223,3,0,1,0.346056,0.0,0.0,0.24,0.0,0,0
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,0,1,1,0,1,7,0,1,10364,3280,3477,1,0,1,0.31648,0.0,0.0,0.0,0.0,0,1


#### Clustering names

In [26]:
from sklearn.cluster import KMeans
import numpy as np

names=product_properties[["sku","name"]]
names = names.with_columns(
    pl.col("name").map_elements(parse_embedding, return_dtype=pl.List(pl.Int32)).alias("embedding")
)


name_embeddings = np.array(names["embedding"].to_list())

k = 100 #found using elbow method but can be tuned too
kmeans = KMeans(n_clusters=k, random_state=42)
names_cluster_ids = kmeans.fit_predict(name_embeddings)
centroids_names = kmeans.cluster_centers_

names = names.with_columns([
    pl.Series("cluster_id",names_cluster_ids)
])

centroid_names_dict = {i: list(map(float, centroid)) for i, centroid in enumerate(centroids_names)}
names= names.with_columns([
    pl.col("cluster_id")
    .map_elements(lambda cid: centroid_names_dict.get(cid, []), return_dtype=pl.List(pl.Float32))
    .alias("centroid")
])

In [27]:
import pandas as pd
names.write_parquet(f"{RESULT_DIR}/names_clusters_map.parquet")

In [29]:
names_clusters=pl.read_parquet(f"{RESULT_DIR}/names_clusters_map.parquet")
names_clusters

sku,name,embedding,cluster_id,centroid
i64,str,list[i32],i32,list[f32]
101733,"""[131 245 189 142 164 164 138 2…","[131, 245, … 95]",24,"[165.968613, 154.623581, … 76.853607]"
184680,"""[219 48 162 96 67 72 96 …","[219, 48, … 72]",0,"[130.007111, 171.595444, … 91.597466]"
540546,"""[ 56 212 52 212 212 212 212 …","[56, 212, … 204]",49,"[60.182995, 183.732925, … 156.882355]"
1601877,"""[164 192 102 16 237 106 83 1…","[164, 192, … 26]",61,"[110.437096, 199.206009, … 75.275246]"
1022239,"""[167 191 141 57 130 76 45 …","[167, 191, … 76]",34,"[69.919189, 96.202217, … 114.052193]"
…,…,…,…,…
1351470,"""[ 35 35 254 34 28 218 183 …","[35, 35, … 125]",9,"[75.820915, 85.437355, … 64.627884]"
988917,"""[255 35 173 48 225 56 197 1…","[255, 35, … 56]",13,"[159.935028, 85.955086, … 138.057709]"
54657,"""[ 8 189 8 183 220 157 155 …","[8, 189, … 13]",17,"[69.125427, 140.620453, … 67.11866]"
540863,"""[156 156 190 18 128 128 156 1…","[156, 156, … 128]",50,"[166.239761, 187.127182, … 173.844498]"


In [30]:
product_events=product_events.join(names_clusters[["sku","cluster_id"]],on="sku",how="left")
product_events

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category,cluster_id
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,i8,i8,i32
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,0,1,1,0,7493,3013,1883,1,1,0,0.402109,1.0,0.447162,1.0,1.0,1,1,91
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,0,1,0,0,1,0,18211,6683,6248,0,1,0,0.366976,inf,0.462474,1.0,1.0,1,1,32
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,32,14,18,56052,21711,19208,0,1,1,0.387337,inf,0.823529,0.4375,1.0,0,1,31
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,32,14,18,23595,11265,7316,0,1,0,0.477432,inf,1.2,0.4375,1.0,0,1,42
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,9830,3573,3602,1,1,0,0.363479,1.0,0.142857,1.0,1.0,0,1,59
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,0,1,9,2,4,48428,16901,15461,1,0,1,0.348992,0.0,0.297071,0.222222,0.0,0,1,63
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,0,1,5,7,2,3526,1137,1321,1,0,1,0.322462,0.0,0.258065,1.4,0.0,0,0,72
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,0,1,2,0,1,25,6,16,786,272,223,3,0,1,0.346056,0.0,0.0,0.24,0.0,0,0,65
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,0,1,1,0,1,7,0,1,10364,3280,3477,1,0,1,0.31648,0.0,0.0,0.0,0.0,0,1,7


## Clusters counts

In [31]:
cluster_add_count=(product_events.filter(pl.col("event_type")=="add_to_cart")
    .group_by("cluster_id")
    .agg(pl.len().alias("cluster_add_count")))

cluster_buy_count=(product_events.filter(pl.col("event_type")=="buy_product")
    .group_by("cluster_id")
    .agg(pl.len().alias("cluster_buy_count")))

cluster_remove_count=(product_events.filter(pl.col("event_type")=="remove_from_cart")
    .group_by("cluster_id")
    .agg(pl.len().alias("cluster_remove_count")))


cluster_add_user_count=(product_events.filter(pl.col("event_type")=="add_to_cart")
    .group_by(["client_id","cluster_id"])
    .agg(pl.len().alias("cluster_add_user_count")))

cluster_buy_user_count=(product_events.filter(pl.col("event_type")=="buy_product")
    .group_by(["client_id","cluster_id"])
    .agg(pl.len().alias("cluster_buy_user_count")))

cluster_remove_user_count=(product_events.filter(pl.col("event_type")=="remove_from_cart")
    .group_by(["client_id","cluster_id"])
    .agg(pl.len().alias("cluster_remove_user_count")))
product_events=(product_events
                .join(cluster_add_count,on="cluster_id",how="left")
                .join(cluster_buy_count,on="cluster_id",how="left")
                .join(cluster_remove_count,on="cluster_id",how="left")
                .join(cluster_add_user_count,on=["client_id","cluster_id"],how="left")
                .join(cluster_buy_user_count,on=["client_id","cluster_id"],how="left")
                .join(cluster_remove_user_count,on=["client_id","cluster_id"],how="left")
               )
product_events

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category,cluster_id,cluster_add_count,cluster_buy_count,cluster_remove_count,cluster_add_user_count,cluster_buy_user_count,cluster_remove_user_count
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,i8,i8,i32,u32,u32,u32,u32,u32,u32
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,0,1,1,0,7493,3013,1883,1,1,0,0.402109,1.0,0.447162,1.0,1.0,1,1,91,35554,11072,11746,1,1,
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,0,1,0,0,1,0,18211,6683,6248,0,1,0,0.366976,inf,0.462474,1.0,1.0,1,1,32,36437,12403,11725,,1,
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,32,14,18,56052,21711,19208,0,1,1,0.387337,inf,0.823529,0.4375,1.0,0,1,31,75635,21839,26948,,1,1
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,32,14,18,23595,11265,7316,0,1,0,0.477432,inf,1.2,0.4375,1.0,0,1,42,50469,16397,16795,,2,
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,9830,3573,3602,1,1,0,0.363479,1.0,0.142857,1.0,1.0,0,1,59,45890,15408,15551,,1,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,0,1,9,2,4,48428,16901,15461,1,0,1,0.348992,0.0,0.297071,0.222222,0.0,0,1,63,57821,19401,18779,1,,1
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,0,1,5,7,2,3526,1137,1321,1,0,1,0.322462,0.0,0.258065,1.4,0.0,0,0,72,34255,10568,11712,1,,1
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,0,1,2,0,1,25,6,16,786,272,223,3,0,1,0.346056,0.0,0.0,0.24,0.0,0,0,65,44139,15377,14088,3,,1
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,0,1,1,0,1,7,0,1,10364,3280,3477,1,0,1,0.31648,0.0,0.0,0.0,0.0,0,1,7,41460,13486,13583,1,,1


In [32]:
product_events=fill_nulls(product_events)

### Entropies

In [33]:
product_events=(product_events
    .join(get_entropy_from_counts(product_events,"sku","sku_entropy"),on="sku",how="left")
    .join(get_entropy_from_counts(product_events,"cluster_id","cluster_entropy"),on="cluster_id",how="left")
     .join(get_entropy_from_counts(product_events,"category","cat_entropy"),on="category",how="left")
    .join(get_entropy_from_counts(product_events,"sku","user_sku_entropy",user_entropy=True),on="client_id",how="left")
    .join(get_entropy_from_counts(product_events,"cluster_id","user_cluster_entropy",user_entropy=True),on="client_id",how="left")
     .join(get_entropy_from_counts(product_events,"category","user_cat_entropy",user_entropy=True),on="client_id",how="left")
    )
product_events

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category,cluster_id,cluster_add_count,cluster_buy_count,cluster_remove_count,cluster_add_user_count,cluster_buy_user_count,cluster_remove_user_count,sku_entropy,cluster_entropy,cat_entropy,user_sku_entropy,user_cluster_entropy,user_cat_entropy
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,i8,i8,i32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,0,1,1,0,7493,3013,1883,1,1,0,0.402109,1.0,0.447162,1.0,1.0,1,1,91,35554,11072,11746,1,1,0,9.564421,14.467295,12.313158,0.000001,0.000001,0.000001
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,0,1,0,0,1,0,18211,6683,6248,0,1,0,0.366976,inf,0.462474,1.0,1.0,1,1,32,36437,12403,11725,0,1,0,10.467132,14.601585,12.959382,0.000001,0.000001,0.000001
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,32,14,18,56052,21711,19208,0,1,1,0.387337,inf,0.823529,0.4375,1.0,0,1,31,75635,21839,26948,0,1,1,4.043802,15.198576,13.344043,5.162349,4.677669,4.286085
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,32,14,18,23595,11265,7316,0,1,0,0.477432,inf,1.2,0.4375,1.0,0,1,42,50469,16397,16795,0,2,0,3.277613,14.824326,13.444943,5.162349,4.677669,4.286085
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,9830,3573,3602,1,1,0,0.363479,1.0,0.142857,1.0,1.0,0,1,59,45890,15408,15551,0,1,0,2.947703,14.593345,12.01325,1.0,1.0,0.000001
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,0,1,9,2,4,48428,16901,15461,1,0,1,0.348992,0.0,0.297071,0.222222,0.0,0,1,63,57821,19401,18779,1,0,1,7.34714,14.951241,14.535677,2.289246,2.155913,2.155913
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,0,1,5,7,2,3526,1137,1321,1,0,1,0.322462,0.0,0.258065,1.4,0.0,0,0,72,34255,10568,11712,1,0,1,4.158751,14.40685,10.891624,2.556657,2.556657,2.4138
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,0,1,2,0,1,25,6,16,786,272,223,3,0,1,0.346056,0.0,0.0,0.24,0.0,0,0,65,44139,15377,14088,3,0,1,0.000001,14.855344,8.929645,2.876464,2.722312,2.481221
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,0,1,1,0,1,7,0,1,10364,3280,3477,1,0,1,0.31648,0.0,0.0,0.0,0.0,0,1,7,41460,13486,13583,1,0,1,0.918296,14.747392,11.454443,0.811278,0.811278,0.811278


In [35]:
#adding normalized price
product_events=product_events.join(product_properties[["sku","price"]],on="sku",how="left").with_columns(((pl.col("price")+1)/100).alias("price"))

In [36]:
product_events

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category,cluster_id,cluster_add_count,cluster_buy_count,cluster_remove_count,cluster_add_user_count,cluster_buy_user_count,cluster_remove_user_count,sku_entropy,cluster_entropy,cat_entropy,user_sku_entropy,user_cluster_entropy,user_cat_entropy,price
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,i8,i8,i32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1022,457,216,1,1,0,1,1,0,7493,3013,1883,1,1,0,0.402109,1.0,0.447162,1.0,1.0,1,1,91,35554,11072,11746,1,1,0,9.564421,14.467295,12.313158,0.000001,0.000001,0.000001,0.66
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1892,875,472,0,1,0,0,1,0,18211,6683,6248,0,1,0,0.366976,inf,0.462474,1.0,1.0,1,1,32,36437,12403,11725,0,1,0,10.467132,14.601585,12.959382,0.000001,0.000001,0.000001,0.75
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,32,14,18,56052,21711,19208,0,1,1,0.387337,inf,0.823529,0.4375,1.0,0,1,31,75635,21839,26948,0,1,1,4.043802,15.198576,13.344043,5.162349,4.677669,4.286085,0.75
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,32,14,18,23595,11265,7316,0,1,0,0.477432,inf,1.2,0.4375,1.0,0,1,42,50469,16397,16795,0,2,0,3.277613,14.824326,13.444943,5.162349,4.677669,4.286085,0.49
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,9830,3573,3602,1,1,0,0.363479,1.0,0.142857,1.0,1.0,0,1,59,45890,15408,15551,0,1,0,2.947703,14.593345,12.01325,1.0,1.0,0.000001,0.68
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,239,71,76,1,0,1,9,2,4,48428,16901,15461,1,0,1,0.348992,0.0,0.297071,0.222222,0.0,0,1,63,57821,19401,18779,1,0,1,7.34714,14.951241,14.535677,2.289246,2.155913,2.155913,0.83
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,31,8,18,1,0,1,5,7,2,3526,1137,1321,1,0,1,0.322462,0.0,0.258065,1.4,0.0,0,0,72,34255,10568,11712,1,0,1,4.158751,14.40685,10.891624,2.556657,2.556657,2.4138,0.57
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,2,0,1,2,0,1,25,6,16,786,272,223,3,0,1,0.346056,0.0,0.0,0.24,0.0,0,0,65,44139,15377,14088,3,0,1,0.000001,14.855344,8.929645,2.876464,2.722312,2.481221,0.82
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,2,0,1,1,0,1,7,0,1,10364,3280,3477,1,0,1,0.31648,0.0,0.0,0.0,0.0,0,1,7,41460,13486,13583,1,0,1,0.918296,14.747392,11.454443,0.811278,0.811278,0.811278,0.08


In [40]:
product_events_rc = product_events

### Scaling product events features

In [41]:

import math
count_columns=['count_add','count_buy','count_remove','count_user_add','count_user_buy','count_user_remove','tot_user_add',
 'tot_user_buy', 'tot_user_remove','cluster_add_count','cluster_buy_count','cluster_remove_count','cluster_add_user_count','cluster_buy_user_count',
'cluster_remove_user_count', 'count_cat_add', 'count_cat_buy', 'count_cat_remove', 'count_cat_user_add','count_cat_user_buy',
 'count_cat_user_remove']
count_columns_log=[
 'count_user_add','count_user_buy','count_user_remove','tot_user_add','tot_user_buy','tot_user_remove','cluster_add_user_count',
 'cluster_buy_user_count','cluster_remove_user_count']
count_columns_max=['count_add','count_buy','count_remove','count_cat_add','count_cat_buy','count_cat_remove','cluster_add_count','cluster_buy_count',
 'cluster_remove_count']

for c in count_columns:
    max_val=product_events_rc[c].max()
    product_events_rc=product_events_rc.with_columns([
    (pl.col(c).fill_null(0.0) / (max_val + 1e-9)).alias(c)  
    ])

#entropies

entropies_columns=['sku_entropy','user_sku_entropy','cat_entropy','user_cat_entropy','cluster_entropy','user_cluster_entropy']
max_en_sku=math.log2(product_events["sku"].n_unique())
max_en_clie=math.log2(product_events["client_id"].n_unique())
max_en_cat=math.log2(product_events["category"].n_unique())
max_en_clu=math.log2(product_events["cluster_id"].n_unique())

for c in entropies_columns:
    if( c in ['sku_entropy','cluster_entropy','cat_entropy']):
        k=max_en_clie
    else:
        if(c=='user_sku_entropy'):
            k=max_en_sku
        elif(c=='user_cat_entropy'):
            k=max_en_cat
        else:
            k=max_en_clu
    product_events_rc=product_events_rc.with_columns([
        (pl.col(c).fill_null(0.0) /k).alias(c)  
        ])  
    


In [42]:
product_events_columns=['count_add','count_buy','count_remove',
 'count_user_add',
 'count_user_buy',
 'count_user_remove',
 'tot_user_add',
 'tot_user_buy',
 'tot_user_remove',
 'count_cat_add',
 'count_cat_buy',
 'count_cat_remove',
 'count_cat_user_add',
 'count_cat_user_buy',
 'count_cat_user_remove',
 'global_cat_cr',
 'local_cat_cr',
 'global_cr',
 'local_cr',
 'cr',
 'top_sku',
 'top_category',
 'cluster_add_count',
 'cluster_buy_count',
 'cluster_remove_count',
 'cluster_add_user_count',
 'cluster_buy_user_count',
 'cluster_remove_user_count',
 'sku_entropy',
 'cluster_entropy',
 'cat_entropy',
 'user_sku_entropy',
 'user_cluster_entropy',
 'user_cat_entropy']

In [43]:
product_events_rc

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category,cluster_id,cluster_add_count,cluster_buy_count,cluster_remove_count,cluster_add_user_count,cluster_buy_user_count,cluster_remove_user_count,sku_entropy,cluster_entropy,cat_entropy,user_sku_entropy,user_cluster_entropy,user_cat_entropy,price
i64,datetime[ns],i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,i8,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,0.203221,0.399127,0.117583,0.003497,0.003953,0.0,0.000823,0.001553,0.0,0.077366,0.093749,0.059459,0.00123,0.003175,0.0,0.402109,1.0,0.447162,1.0,1.0,1,1,91,0.406183,0.410652,0.397819,0.0016,0.003953,0.0,0.460813,0.697033,0.593247,7.2144e-8,2.1693e-7,1.1355e-7,0.66
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,0.376218,0.764192,0.256941,0.0,0.003953,0.0,0.0,0.001553,0.0,0.188031,0.207941,0.197291,0.0,0.003175,0.0,0.366976,inf,0.462474,1.0,1.0,1,1,32,0.416271,0.460018,0.397108,0.0,0.003953,0.0,0.504306,0.703503,0.624382,7.2144e-8,2.1693e-7,1.1355e-7,0.75
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,0.00338,0.012227,0.001089,0.0,0.003953,0.0,0.026337,0.021739,0.019956,0.578745,0.675534,0.606524,0.0,0.003175,0.001969,0.387337,inf,0.823529,0.4375,1.0,0,1,31,0.864084,0.809992,0.912687,0.0,0.003953,0.002717,0.19483,0.732266,0.642915,0.258408,0.704059,0.337672,0.75
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,0.000994,0.00524,0.0,0.0,0.003953,0.0,0.026337,0.021739,0.019956,0.243622,0.350509,0.231015,0.0,0.003175,0.0,0.477432,inf,1.2,0.4375,1.0,0,1,42,0.576578,0.608152,0.568821,0.0,0.007905,0.0,0.157915,0.714235,0.647777,0.258408,0.704059,0.337672,0.49
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,0.001392,0.000873,0.000544,0.0,0.003953,0.0,0.000823,0.001553,0.0,0.101496,0.111173,0.113739,0.00123,0.003175,0.0,0.363479,1.0,0.142857,1.0,1.0,0,1,59,0.524265,0.571471,0.526688,0.0,0.003953,0.0,0.14202,0.703106,0.578798,0.050056,0.150515,1.1355e-7,0.68
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11656025,2022-10-12 23:56:40,1423665,"""remove_from_cart""",4092,0.047524,0.062009,0.041372,0.003497,0.0,0.004329,0.007407,0.003106,0.004435,0.500026,0.525872,0.488206,0.00123,0.0,0.001969,0.348992,0.0,0.297071,0.222222,0.0,0,1,63,0.66057,0.719568,0.636016,0.0016,0.0,0.002717,0.353985,0.72035,0.700328,0.114591,0.324497,0.16985,0.83
2536512,2022-10-12 23:58:35,655021,"""remove_from_cart""",2419,0.006164,0.006987,0.009799,0.003497,0.0,0.004329,0.004115,0.01087,0.002217,0.036406,0.035378,0.041713,0.00123,0.0,0.001969,0.322462,0.0,0.258065,1.4,0.0,0,0,72,0.391343,0.391959,0.396667,0.0016,0.0,0.002717,0.200368,0.694121,0.524758,0.127977,0.384815,0.190167,0.57
6985434,2022-10-12 23:58:35,1391170,"""remove_from_cart""",582,0.000398,0.0,0.000544,0.006993,0.0,0.004329,0.020576,0.009317,0.017738,0.008116,0.008463,0.007042,0.00369,0.0,0.001969,0.346056,0.0,0.0,0.24,0.0,0,0,65,0.504261,0.570321,0.477139,0.0048,0.0,0.002717,6.9439e-8,0.71573,0.43023,0.143985,0.409749,0.195479,0.82
5657364,2022-10-12 23:58:35,1608171,"""remove_from_cart""",1014,0.000398,0.0,0.000544,0.003497,0.0,0.004329,0.005761,0.0,0.001109,0.10701,0.102057,0.109792,0.00123,0.0,0.001969,0.31648,0.0,0.0,0.0,0.0,0,1,7,0.473655,0.500185,0.460035,0.0016,0.0,0.002717,0.044243,0.710528,0.551874,0.04061,0.12211,0.063915,0.08


In [None]:
#last check to keep al values in [0,1]
for c in product_events_columns:
    product_events_rc= product_events_rc.with_columns([
    pl.when(pl.col(c).is_null())
      .then(0)
      .when(pl.col(c) > 1)
      .then(1)
      .otherwise(pl.col(c))
      .alias(c)
])

In [145]:
#adding "normalized" name embedding
max_val_names=(
    names_clusters.explode("embedding")
      .select(pl.col("embedding").max()).item()
)
names_clusters=names_clusters.with_columns([(pl.col("embedding")/max_val_names).alias("norm")])

## No scaling

In [122]:
names_clusters

sku,name,embedding,cluster_id,centroid
i64,str,list[i32],i32,list[f32]
101733,"""[131 245 189 142 164 164 138 2…","[131, 245, … 95]",24,"[169.778397, 162.961182, … 73.131523]"
184680,"""[219 48 162 96 67 72 96 …","[219, 48, … 72]",0,"[132.902405, 171.484818, … 92.399406]"
540546,"""[ 56 212 52 212 212 212 212 …","[56, 212, … 204]",49,"[59.435379, 183.447372, … 158.563034]"
1601877,"""[164 192 102 16 237 106 83 1…","[164, 192, … 26]",61,"[110.192261, 198.609497, … 73.90358]"
1022239,"""[167 191 141 57 130 76 45 …","[167, 191, … 76]",34,"[70.916702, 93.046402, … 115.120361]"
…,…,…,…,…
1351470,"""[ 35 35 254 34 28 218 183 …","[35, 35, … 125]",9,"[75.79863, 84.49025, … 64.059021]"
988917,"""[255 35 173 48 225 56 197 1…","[255, 35, … 56]",13,"[173.406296, 86.643761, … 106.386887]"
54657,"""[ 8 189 8 183 220 157 155 …","[8, 189, … 13]",17,"[69.144821, 151.732391, … 68.70739]"
540863,"""[156 156 190 18 128 128 156 1…","[156, 156, … 128]",50,"[165.984512, 188.794327, … 172.100449]"


In [47]:
product_events_rc=product_events_rc.join(names_clusters[["sku","embedding"]],on="sku",how="left")
product_events_rc

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category,cluster_id,cluster_add_count,cluster_buy_count,cluster_remove_count,cluster_add_user_count,cluster_buy_user_count,cluster_remove_user_count,sku_entropy,cluster_entropy,cat_entropy,user_sku_entropy,user_cluster_entropy,user_cat_entropy,price,embedding
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,i8,i8,i32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,list[i32]
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1025,457,217,1,1,0,1,1,0,7837,3122,2002,1,1,0,0.398367,1.0,0.445854,1.0,1.0,1,1,91,41202,12660,13833,1,1,0,9.569382,14.642244,12.367911,0.000001,0.000001,0.000001,0.66,"[175, 21, … 154]"
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1932,889,485,0,1,0,0,1,0,20372,7301,7076,0,1,0,0.358384,inf,0.460145,1.0,1.0,1,1,32,41226,13691,13507,0,1,0,10.498544,14.752243,13.073346,0.000001,0.000001,0.000001,0.75,"[229, 250, … 239]"
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,43,14,19,63592,24389,22060,0,1,1,0.383523,inf,0.823529,0.325581,1.0,0,1,31,86914,24813,31349,0,1,1,4.043802,15.365003,13.457867,5.479973,4.891433,4.388925,0.75,"[64, 64, … 64]"
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,43,14,19,26712,12656,8395,0,1,0,0.473795,inf,1.2,0.325581,1.0,0,1,42,61346,19412,21009,0,2,0,3.277613,15.015508,13.576828,5.479973,4.891433,4.388925,0.49,"[64, 173, … 206]"
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,11117,4018,4150,1,1,0,0.361428,1.0,0.142857,1.0,1.0,0,1,59,50988,16876,17746,0,1,0,2.947703,14.690793,12.144928,1.0,1.0,0.000001,0.68,"[93, 37, … 145]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
21088753,2022-10-25 23:59:15,1279918,"""remove_from_cart""",5083,9,2,5,3,0,2,27,5,10,1776,605,638,6,0,3,0.340653,0.0,0.222222,0.185185,0.0,0,0,13,40673,13274,13432,4,0,2,2.055037,14.74194,10.080589,3.978411,3.193023,2.528488,0.48,"[201, 102, … 125]"
21088753,2022-10-25 23:59:25,1618609,"""remove_from_cart""",5083,7,0,3,1,0,1,27,5,10,1776,605,638,6,0,3,0.340653,0.0,0.0,0.185185,0.0,0,0,97,40755,13413,13286,1,0,1,2.921928,14.720924,10.080589,3.978411,3.193023,2.528488,0.39,"[168, 102, … 69]"
18613684,2022-10-25 23:59:35,694766,"""remove_from_cart""",4174,60,22,24,1,0,1,4,0,2,10358,3459,3511,2,0,1,0.333945,0.0,0.366667,0.0,0.0,0,1,57,43685,13811,14541,1,0,1,5.229818,14.770542,12.220769,1.459148,1.459148,1.0,0.32,"[88, 1, … 88]"
18613684,2022-10-25 23:59:40,674854,"""remove_from_cart""",1187,40,7,15,2,0,1,4,0,2,4934,1780,1715,2,0,1,0.360762,0.0,0.175,0.0,0.0,0,0,50,36075,11702,11818,2,0,1,5.175734,14.52058,11.688044,1.459148,1.459148,1.0,0.41,"[149, 149, … 149]"


In [71]:
product_events_rc= pl.read_parquet("noscale/split14/product_events_sub.parquet")

In [76]:
product_events_rc.write_parquet("noscale/split14/product_events_sub.parquet")

In [49]:
product_events_rc=product_events.filter(pl.col("client_id").is_in(relevant_clients))

In [52]:
product_events_rc= fill_nulls(product_events_rc)

In [73]:
product_events_rc = product_events_rc.with_columns(
    [
        pl.when(pl.col(c).is_nan())
          .then(0)          # oppure .then(float("nan"))
          .otherwise(pl.col(c))
          .alias(c)
        for c in float_cols
    ]
)

In [54]:
float_cols = [
    col for col in product_events_rc.columns
    product_events_rc[col] = product_events_rc[col].fillna(0)
]


SyntaxError: invalid syntax (3225709363.py, line 3)

In [74]:
float_cols = [
    col for col in product_events_rc.columns
    product_events_rc[col] = product_events_rc[col].fillna(0)
]

nan_counts = product_events_rc.select(
    [pl.col(c).is_nan().sum().alias(f"nan_count_{c}") for c in float_cols]
)

print(nan_counts)

SyntaxError: invalid syntax (850054222.py, line 3)

In [75]:
product_events_rc

client_id,timestamp,sku,event_type,category,count_add,count_buy,count_remove,count_user_add,count_user_buy,count_user_remove,tot_user_add,tot_user_buy,tot_user_remove,count_cat_add,count_cat_buy,count_cat_remove,count_cat_user_add,count_cat_user_buy,count_cat_user_remove,global_cat_cr,local_cat_cr,global_cr,local_cr,cr,top_sku,top_category,cluster_id,cluster_add_count,cluster_buy_count,cluster_remove_count,cluster_add_user_count,cluster_buy_user_count,cluster_remove_user_count,sku_entropy,cluster_entropy,cat_entropy,user_sku_entropy,user_cluster_entropy,user_cat_entropy,price,embedding
i64,datetime[ns],i64,str,i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,i8,i8,i32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,list[i32]
19433713,2022-06-23 00:12:15,649662,"""buy_product""",3863,1025,457,217,1,1,0,1,1,0,7837,3122,2002,1,1,0,0.398367,1.0,0.445854,1.0,1.0,1,1,91,41202,12660,13833,1,1,0,9.569382,14.642244,12.367911,0.000001,0.000001,0.000001,0.66,"[175, 21, … 154]"
11106698,2022-06-23 00:12:25,965816,"""buy_product""",3679,1932,889,485,0,1,0,0,1,0,20372,7301,7076,0,1,0,0.358384,1.0,0.460145,1.0,1.0,1,1,32,41226,13691,13507,0,1,0,10.498544,14.752243,13.073346,0.000001,0.000001,0.000001,0.75,"[229, 250, … 239]"
3334023,2022-06-23 00:15:25,419981,"""buy_product""",1096,17,14,2,0,1,0,43,14,19,63592,24389,22060,0,1,1,0.383523,1.0,0.823529,0.325581,1.0,0,1,31,86914,24813,31349,0,1,1,4.043802,15.365003,13.457867,5.479973,4.891433,4.388925,0.75,"[64, 64, … 64]"
3334023,2022-06-23 00:15:25,1161623,"""buy_product""",2964,5,6,0,0,1,0,43,14,19,26712,12656,8395,0,1,0,0.473795,1.0,1.2,0.325581,1.0,0,1,42,61346,19412,21009,0,2,0,3.277613,15.015508,13.576828,5.479973,4.891433,4.388925,0.49,"[64, 173, … 206]"
2310948,2022-06-23 00:17:20,520725,"""buy_product""",2114,7,1,1,0,1,0,1,1,0,11117,4018,4150,1,1,0,0.361428,1.0,0.142857,1.0,1.0,0,1,59,50988,16876,17746,0,1,0,2.947703,14.690793,12.144928,1.0,1.0,0.000001,0.68,"[93, 37, … 145]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
21088753,2022-10-25 23:59:15,1279918,"""remove_from_cart""",5083,9,2,5,3,0,2,27,5,10,1776,605,638,6,0,3,0.340653,0.0,0.222222,0.185185,0.0,0,0,13,40673,13274,13432,4,0,2,2.055037,14.74194,10.080589,3.978411,3.193023,2.528488,0.48,"[201, 102, … 125]"
21088753,2022-10-25 23:59:25,1618609,"""remove_from_cart""",5083,7,0,3,1,0,1,27,5,10,1776,605,638,6,0,3,0.340653,0.0,0.0,0.185185,0.0,0,0,97,40755,13413,13286,1,0,1,2.921928,14.720924,10.080589,3.978411,3.193023,2.528488,0.39,"[168, 102, … 69]"
18613684,2022-10-25 23:59:35,694766,"""remove_from_cart""",4174,60,22,24,1,0,1,4,0,2,10358,3459,3511,2,0,1,0.333945,0.0,0.366667,0.0,0.0,0,1,57,43685,13811,14541,1,0,1,5.229818,14.770542,12.220769,1.459148,1.459148,1.0,0.32,"[88, 1, … 88]"
18613684,2022-10-25 23:59:40,674854,"""remove_from_cart""",1187,40,7,15,2,0,1,4,0,2,4934,1780,1715,2,0,1,0.360762,0.0,0.175,0.0,0.0,0,0,50,36075,11702,11818,2,0,1,5.175734,14.52058,11.688044,1.459148,1.459148,1.0,0.41,"[149, 149, … 149]"
