In [1]:
import polars as pl

from pathlib import Path
import os
import json

In [2]:
DATA_DIR = Path.cwd().parent / "data" / "avito_ml_cup"

# Просмотр данных

**clickstream.pq**
- *cookie* - id пользователя [int64]
- *item* - id объявления [int64]
- *event* - id тип события [int64]. Например “клик”, “показать телефон”, “написать в мессенджер”
- *event_date* - время, когда пользователь провзаимодействовал с айтемом datetime[ns]
- *surface*  - экран, с которого было взаимодействие. Например пользователь может взаимодействовать с айтемом на поисковой выдаче, во вкладке “Избранное” [int64]
- *platform* -  тип устройства, с которого пользователь совершил действие. Например - android, ios, desktop, браузер в телефоне [int64]
- *node* - id группы товара. Сущность , которую необходимо предсказать [int64]

In [3]:
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

print(df_clickstream.shape)
df_clickstream.head()

(68806152, 7)


cookie,item,event,event_date,platform,surface,node
i64,i64,i64,datetime[ns],i64,i64,u32
0,19915558,17,2025-02-05 02:30:59,3,2,115659
0,2680232,17,2025-01-24 21:16:57,3,2,115829
1,4247649,17,2025-01-29 23:00:58,2,2,7
1,4247649,17,2025-02-17 14:55:17,2,2,7
1,2171135,17,2025-01-17 19:23:29,2,2,214458


In [4]:
df_clickstream["event"].value_counts().sort("count", descending=True).head(10)

event,count
i64,u32
17,61089584
11,4314754
12,866124
10,827467
15,339072
5,298697
3,297855
19,191843
8,167883
4,131489


In [5]:
print(df_clickstream["surface"].n_unique())
df_clickstream["surface"].value_counts().sort(by="count", descending=True).head(10)

19


surface,count
i64,u32
2,33199960
11,20610292
5,3949883
3,3943848
15,2799442
8,1967748
17,1091996
14,933390
4,87965
10,71475


In [6]:
print(df_clickstream["platform"].n_unique())
df_clickstream["platform"].value_counts().sort(by="count", descending=True)

7


platform,count
i64,u32
2,40252188
3,24307843
0,2253252
5,1980699
1,10299
4,1669
6,202


In [7]:
df_clickstream["node"].n_unique()

408474

**cat_features.pq**
- *item* -  id объявления [int64]
- *location* - id локации айтема.  [int64]
- *category* - id категории товара. [int64]
- *clean_params* - параметры объявления. В примере [{"attr":859,"value":653982} …]  attr - это id атрибута, value - это id значения атрибута [str]
- *node* - id группы товара. Сущность , которую необходимо предсказать [int64]

In [4]:
df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features.pq')

df_cat_features.head()

item,location,category,clean_params,node
i64,i64,i64,str,u32
9,8385,57,"""[{""attr"":1157,""value"":664427},…",194747
17,2707,35,"""[{""attr"":2140,""value"":501466},…",352905
144,8383,8,"""[{""attr"":802,""value"":35791},{""…",17188
202,5397,57,"""[{""attr"":1157,""value"":490527},…",194766
236,2105,64,"""[{""attr"":112,""value"":420797},{…",153951


In [9]:
json_feature = json.loads(df_cat_features["clean_params"].sample(1)[0])

len(json_feature)

9

In [10]:
df_cat_features["location"].n_unique()

4823

In [11]:
print(df_cat_features["category"].n_unique())
df_cat_features["category"].value_counts().sort(by="count", descending=True).head(10)

53


category,count
i64,u32
35,3693923
51,2341955
19,1280756
24,1134356
40,1120838
7,981234
37,942007
46,918996
57,917895
52,809191


In [12]:
df_cat_features["node"].n_unique()

408474

**text_features.pq**
- *item*  - id объявления [int64]
- *title_projection* - вектор тайтла объявления array[int8, 64]. Чем выше скалярное произведение между 2-мя векторами, тем более похожи заголовки объявлений

In [3]:
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')

df_text_features.head()

item,title_projection
i64,"array[i8, 64]"
9,"[-128, 90, … -36]"
17,"[-128, 127, … -3]"
144,"[-128, 97, … 18]"
202,"[-128, 127, … 15]"
236,"[-128, 29, … -128]"


**events.pq**
- *event* - id типы события [int64]
- *is_contact* - 0 или 1. Является ли событие контактным [int64]

In [3]:
df_events = pl.read_parquet(f'{DATA_DIR}/events.pq')

df_events.head()

event,is_contact
i64,i64
0,1
11,0
4,1
16,0
15,1


In [8]:
df_events.sort("is_contact", descending=True).tail(10)

event,is_contact
i64,i64
13,1
10,1
14,1
19,1
11,0
16,0
3,0
12,0
8,0
17,0


Соединим clickstream и df_events, чтобы посмотреть количество всех интеракций по *типу события* и какие из них являются *контакными* 

In [5]:
df_clickstream["event"].value_counts().join(
    df_events,
    on="event"
).sort(by="count", descending=True).tail(10)

NameError: name 'df_clickstream' is not defined

In [17]:
df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')

df_test_users.head()

cookie
i64
52564
105000
57152
87303
37755


In [20]:
df_test_users.select("cookie").join(
    df_clickstream.select("cookie").unique(),
    on="cookie",
    how="anti"
)

cookie
i64


In [22]:
df_test_users["cookie"].shape, df_clickstream["cookie"].unique().shape

((92319,), (134294,))

-------

In [12]:
df_cat_features.group_by("node").len("item").sort("item", descending=True)

node,item
u32,u32
170538,463451
151453,307435
71514,185301
71511,146887
71546,136767
…,…
284557,1
414456,1
75733,1
413030,1


In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [14]:
df_combined = df_cat_features.join(
    df_text_features,
    on="item",
    how="inner"
)

In [16]:
def calculate_intra_node_similarity(df):
    """Вычисляет статистики схожести внутри каждой node"""
    
    intra_node_stats = []
    
    for i, node_id in tqdm(enumerate(df["node"].unique()), total=len(df)):
        node_data = df.filter(pl.col("node") == node_id)
        if node_data.shape[0] > 100:
            node_data = node_data.sample(100, seed=123)
        
        if len(node_data) < 2:
            continue
            
        # Извлекаем векторы title_projection
        vectors = np.array([row for row in node_data["title_projection"]])
        
        # Вычисляем косинусное сходство между всеми парами
        similarity_matrix = cosine_similarity(vectors)
        
        # Извлекаем только верхний треугольник (исключая диагональ)
        upper_triangle = similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)]
        
        stats = {
            "node": node_id,
            "items_count": len(node_data),
            "mean_similarity": np.mean(upper_triangle),
            "std_similarity": np.std(upper_triangle),
            "min_similarity": np.min(upper_triangle),
            "max_similarity": np.max(upper_triangle),
            "median_similarity": np.median(upper_triangle)
        }
        
        intra_node_stats.append(stats)

        if i == 2000:
            break
    
    return pl.DataFrame(intra_node_stats)

# Вычисляем статистики для каждой node
intra_node_similarity = calculate_intra_node_similarity(df_combined)
intra_node_similarity.sort("mean_similarity", descending=True).head(10)

  0%|          | 2000/22646691 [00:20<64:29:13, 97.54it/s] 


node,items_count,mean_similarity,std_similarity,min_similarity,max_similarity,median_similarity
i64,i64,f64,f64,f64,f64,f64
509,2,1.0,0.0,1.0,1.0,1.0
1649,2,1.0,0.0,1.0,1.0,1.0
1720,2,1.0,0.0,1.0,1.0,1.0
181,2,1.0,0.0,1.0,1.0,1.0
1257,2,1.0,0.0,1.0,1.0,1.0
1551,3,1.0,0.0,1.0,1.0,1.0
1700,4,1.0,0.0,1.0,1.0,1.0
1701,4,1.0,2.2204e-16,1.0,1.0,1.0
1706,4,1.0,0.0,1.0,1.0,1.0
1834,4,1.0,0.0,1.0,1.0,1.0


In [18]:
intra_node_similarity.sort("items_count", descending=True).head(10)

node,items_count,mean_similarity,std_similarity,min_similarity,max_similarity,median_similarity
i64,i64,f64,f64,f64,f64,f64
7,100,0.743431,0.112529,0.471041,1.0,0.730563
8,100,0.74457,0.122594,0.471041,1.0,0.730563
9,100,0.759509,0.121617,0.471041,1.0,0.741305
10,100,0.720131,0.126703,0.543007,1.0,0.707692
11,100,0.754346,0.125342,0.543007,1.0,0.763427
12,100,0.689366,0.131492,0.543007,1.0,0.655116
13,100,0.701344,0.119672,0.543007,1.0,0.675457
14,100,0.739596,0.12461,0.471041,1.0,0.722745
15,100,0.736214,0.121867,0.471041,1.0,0.719943
16,100,0.737138,0.115141,0.521468,1.0,0.726516


In [11]:
df_text_features_columns = df_text_features.with_columns([
    pl.col("title_projection").arr.get(i).alias(f"t_{i}") for i in range(64)
])

In [None]:
test_means = df_cat_features.join(
    df_text_features_columns,
    on="item",
    how="inner"
)

In [14]:
agg_exprs = [
    pl.mean(f"t_{i}").alias(f"emb_{i}") for i in range(64)
] + [
    pl.col("category").mode().alias("category_mode"),
    pl.col("location").mode().alias("location_mode"),
]

df_node = test_means.group_by("node").agg(agg_exprs)

In [25]:
import gc
import sys

In [27]:
sys.getrefcount(test_means)

5

In [28]:
del test_means

In [29]:
gc.collect()

2082

In [15]:
df_node

node,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16,emb_17,emb_18,emb_19,emb_20,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32,emb_33,emb_34,emb_35,emb_36,emb_37,emb_38,emb_39,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49,emb_50,emb_51,emb_52,emb_53,emb_54,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63,category_mode,location_mode
u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,list[i64],list[i64]
101749,-128.0,127.0,127.0,-2.0,21.0,-78.0,-128.0,127.0,127.0,37.0,-56.0,31.0,-84.0,-128.0,127.0,-128.0,32.0,30.0,-100.0,127.0,-84.0,37.0,-128.0,127.0,-4.0,-128.0,-85.0,-33.0,127.0,-62.0,40.0,-115.0,49.0,60.0,-58.0,-66.0,127.0,127.0,-113.0,127.0,31.0,-127.0,115.0,-116.0,-30.0,14.0,-128.0,54.0,53.0,-128.0,-69.0,61.0,127.0,-22.0,39.0,-94.0,7.0,-71.0,-5.0,51.0,72.0,111.0,-7.0,86.0,[49],[8383]
315783,-128.0,119.925926,35.592593,-55.148148,-56.740741,-78.592593,-14.148148,97.148148,28.444444,-9.407407,36.185185,64.666667,-36.555556,-105.444444,27.925926,27.777778,-14.0,-128.0,115.148148,-87.666667,-112.185185,-62.666667,32.185185,125.62963,50.62963,-29.259259,-109.740741,-77.703704,77.518519,-15.518519,56.518519,-58.444444,54.777778,24.222222,-82.111111,-29.0,48.037037,70.962963,-128.0,107.111111,-2.296296,3.62963,3.555556,-94.555556,4.666667,76.925926,-100.481481,-85.333333,108.222222,-19.481481,-24.592593,-60.851852,99.62963,-53.777778,99.037037,-122.814815,79.666667,-54.703704,-90.555556,45.407407,46.185185,125.296296,-66.185185,41.074074,[35],[2348]
214466,-126.837887,-58.27616,126.884021,-116.605412,18.674613,-91.68299,-37.285052,23.595747,104.764046,-88.464948,59.396392,49.606572,30.542526,-113.031572,64.836727,93.651675,14.285438,-124.66134,111.231057,-85.07951,-54.95232,-48.094845,11.893041,74.899227,84.955799,-9.296134,-123.89317,-15.53518,118.303608,-23.849485,-12.150515,70.765077,30.089304,-34.042139,-26.023196,-85.36933,96.678866,-27.256701,-86.269072,-5.795361,98.327835,12.793814,58.740593,0.753737,86.774098,101.486856,-99.162758,-46.301933,46.786082,22.263144,46.855284,10.631959,97.306057,-49.483634,43.561727,-78.965722,82.018814,-6.217655,-75.959923,15.698969,105.158892,69.468428,48.231572,51.922938,[37],[2269]
119967,-127.522936,89.422018,124.454128,64.266055,77.426606,-107.788991,88.09633,42.747706,54.798165,-31.853211,43.275229,-0.701835,7.045872,-121.09633,64.536697,-112.651376,57.986239,-114.545872,51.922018,-85.944954,-25.761468,6.821101,48.344037,124.711009,102.463303,5.082569,-117.761468,-4.321101,34.252294,13.09633,12.633028,-60.408257,-35.724771,-6.701835,-49.940367,25.550459,34.440367,88.582569,-120.5,51.885321,-4.889908,79.068807,112.729358,-28.261468,3.37156,91.761468,-49.738532,20.834862,45.422018,-101.06422,4.165138,42.431193,98.366972,-30.775229,88.747706,-107.477064,8.889908,-107.316514,-11.977064,-23.197248,118.440367,84.940367,-11.972477,36.669725,[19],[2348]
159038,-17.461538,-118.076923,124.846154,126.0,-15.846154,-106.692308,-28.307692,37.076923,-39.769231,11.076923,39.230769,114.769231,-34.076923,-85.538462,120.923077,-122.230769,-97.230769,-75.384615,101.230769,-114.923077,-127.076923,3.538462,68.153846,101.384615,-31.230769,40.0,-128.0,-9.692308,113.615385,-116.230769,-92.923077,-86.846154,54.846154,-97.153846,-82.307692,115.153846,-114.923077,-34.538462,-84.692308,47.923077,-91.230769,-36.846154,59.846154,-80.230769,61.076923,-26.846154,-21.384615,-106.923077,-1.153846,-19.153846,14.923077,4.307692,89.384615,-9.0,82.384615,-115.692308,61.692308,-30.923077,-112.538462,-12.384615,120.615385,105.846154,67.538462,-8.692308,[43],"[4972, 5865, … 4842]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
166710,-128.0,127.0,-31.0,-128.0,1.0,78.0,127.0,80.0,-2.0,127.0,77.0,-107.0,18.0,-128.0,127.0,0.0,127.0,30.0,100.0,109.0,-128.0,-46.0,-117.0,127.0,24.0,-115.0,-33.0,-105.0,127.0,-82.0,-83.0,-83.0,127.0,-52.0,-55.0,58.0,32.0,127.0,-108.0,106.0,62.0,71.0,46.0,15.0,92.0,87.0,-128.0,109.0,62.0,-102.0,30.0,-49.0,72.0,-65.0,127.0,-79.0,69.0,-44.0,46.0,77.0,117.0,127.0,88.0,-53.0,[59],[1174]
388741,-128.0,-128.0,127.0,-20.0,-126.0,-128.0,127.0,127.0,-95.0,-128.0,127.0,127.0,92.0,-47.0,34.0,35.0,46.0,-19.0,-73.0,29.0,-57.0,-18.0,27.0,-58.0,-128.0,8.0,-128.0,-128.0,12.0,-128.0,-20.0,55.0,120.0,79.0,-126.0,42.0,23.0,90.0,-128.0,81.0,127.0,-8.0,17.0,50.0,66.0,-26.0,-128.0,-48.0,-5.0,-97.0,46.0,9.0,107.0,-117.0,65.0,-115.0,118.0,-58.0,-1.0,-21.0,116.0,127.0,29.0,-52.0,[51],[1310]
409457,-6.0,-128.0,127.0,-128.0,-25.0,-80.0,-37.0,127.0,127.0,-128.0,70.0,127.0,76.0,-73.0,-114.0,67.0,127.0,-62.0,20.0,-128.0,-14.0,-128.0,-125.0,127.0,127.0,-120.0,-85.0,-5.0,127.0,-128.0,46.0,-10.0,102.0,-128.0,0.0,-85.0,-65.0,127.0,-54.0,-6.0,43.0,-55.0,106.0,25.0,43.0,104.0,-128.0,-40.0,-42.0,-40.0,58.0,-128.0,106.0,-92.0,127.0,-103.0,108.0,-128.0,-127.0,-128.0,118.0,127.0,-118.0,22.0,[51],[2348]
261471,-128.0,-59.2,120.4,91.0,-42.6,-128.0,99.2,119.4,-37.4,-41.4,9.2,37.2,6.8,-4.0,-69.2,79.0,-57.0,-80.2,111.8,-25.4,-100.8,-30.0,16.8,127.0,40.6,18.4,-79.0,26.0,10.4,20.8,-33.2,-103.6,127.0,-35.0,-33.2,93.0,4.8,85.6,-39.4,-108.0,71.6,-54.4,50.0,-62.8,62.8,84.6,-60.0,-11.8,-14.8,7.2,-43.2,50.2,16.8,-32.6,46.4,-116.6,33.4,-63.4,-52.0,82.2,108.8,120.4,28.6,-22.8,[35],[2348]


In [10]:
test_means.group_by("node").agg({
    "title_projection": lambda x: np.mean(np.stack(x), axis=0),
    "category": lambda x: x.mode().iloc[0] if not x.mode().empty else -1,
    "location": lambda x: x.mode().iloc[0] if not x.mode().empty else -1,
})

TypeError: specifying aggregations as a dictionary is not supported

Try unpacking the dictionary to take advantage of the keyword syntax of the `agg` method.