In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import polars as pl
import scipy.stats as stats
import scipy.sparse as sps
import gc


from ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors, 
    min_max_impression_time_history
)
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_USER_COL
)
from ebrec.evaluation.metrics_protocols import *
from polimi.utils._catboost import add_other_rec_features
from polimi.utils._catboost import add_trendiness_feature

from catboost import CatBoostClassifier

# Memory Utils

In [2]:
from os import system, getpid, walk
from psutil import Process
from colorama import Fore, Style, init
from IPython.display import display, HTML

def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    print(style + color + text + Style.RESET_ALL)
    
def GetMemUsage():   
    pid = getpid()
    py = Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return f"RAM memory GB usage = {memory_use :.4}"

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)

[1m[31m
RAM memory GB usage = 0.323[0m


In [3]:
def reduce_polars_df_memory_size(df, set_categorical=True):

    start_mem = df.estimated_size('mb')
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type in [pl.Int16, pl.Int32, pl.Int64]:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df = df.with_columns(pl.col(col).cast(pl.Int8))
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df = df.with_columns(pl.col(col).cast(pl.Int16))
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
        elif col_type in [pl.UInt16, pl.UInt32, pl.UInt64]:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt8))
            elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt16))
            elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt32))
        elif col_type == pl.Float64:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df = df.with_columns(pl.col(col).cast(pl.Float32))

    gc.collect()
    end_mem = df.estimated_size('mb')
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

# Data import

In [4]:
articles = pl.read_parquet("C:\\Users\\franc\\Desktop\\CHALLENGE\\dataset\\ebnerd_small\\articles.parquet")

behaviors_train = pl.read_parquet('C:\\Users\\franc\\Desktop\\CHALLENGE\\dataset\\ebnerd_small\\train\\behaviors.parquet')
history_train = pl.read_parquet('C:\\Users\\franc\\Desktop\\CHALLENGE\\dataset\\ebnerd_small\\train\history.parquet')

behaviors_val = pl.read_parquet("C:\\Users\\franc\\Desktop\\CHALLENGE\\dataset\\ebnerd_small\\validation\\behaviors.parquet")
history_val = pl.read_parquet("C:\\Users\\franc\\Desktop\\CHALLENGE\\dataset\\ebnerd_small\\validation\\history.parquet")

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)

[1m[31m
RAM memory GB usage = 0.6142[0m


# Articles Preprocessing

In [5]:
articles = articles.with_columns(
    pl.col('image_ids').list.len().alias('num_images'),
    pl.col('title').str.split(by=' ').list.len().alias('title_len'),
    pl.col('subtitle').str.split(by=' ').list.len().alias('subtitle_len'),
    pl.col('body').str.split(by=' ').list.len().alias('body_len'),
    # very important for tf-idf, otherwise multiple tokens for topics with spaces are built
    pl.col('topics').list.eval(pl.element().str.split(by=' ').list.join('_')) 
)
articles = reduce_polars_df_memory_size(articles)

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)
articles.head(2)

Memory usage of dataframe is 64.11 MB
Memory usage after optimization is: 63.96 MB
Decreased by 0.2%
[1m[31m
RAM memory GB usage = 0.7293[0m


article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,u32,u8,u8,u16
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig_kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative""",1,5,22,200
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film_og_tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive""",1,5,19,116


In [6]:
unique_entities = articles.select('entity_groups').explode('entity_groups')['entity_groups'].unique().to_list()
unique_entities = [e for e in unique_entities if e is not None]
unique_entities

['PROD', 'ORG', 'PER', 'LOC', 'MISC', 'EVENT']

In [7]:
unique_categories = articles.select('category')['category'].unique().to_list()
unique_categories = [c for c in unique_categories if c is not None]
len(unique_categories)

25

Mapping sentiment labels to -1, 0, 1 to save memory in the history dataframe.
Taking only article_default type and mapping it to 0/1 since after only this is needed (and it saves memory) for the history at the moment

In [8]:
articles = articles.with_columns(
    pl.when(pl.col('sentiment_label') == 'Negative').then(-1) \
        .otherwise(
            pl.when(pl.col('sentiment_label') == 'Positive').then(1).otherwise(0)
        ).cast(pl.Int8).alias('sentiment_label_int'),
    (pl.col('article_type') == 'article_default').cast(pl.UInt8).alias('is_article_default')
)

## Topics TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
articles = articles.with_columns(
    pl.Series(
        vectorizer.fit_transform(
            articles.with_columns(pl.col('topics').list.join(separator=' '))['topics'].to_list()
        ).toarray().astype(np.float32)
    ).alias('topics_idf')
)

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)
articles.head(2)

[1m[31m
RAM memory GB usage = 0.7397[0m


article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len,sentiment_label_int,is_article_default,topics_idf
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,u32,u8,u8,u16,i8,u8,list[f32]
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig_kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative""",1,5,22,200,-1,1,"[0.0, 0.0, … 0.0]"
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film_og_tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive""",1,5,19,116,1,1,"[0.0, 0.0, … 0.536059]"


In [10]:
# without split + join with _ of the single topics this returns 94
len(vectorizer.get_feature_names_out())

79

## Precalculating similarities

In [11]:
# from sklearn.feature_extraction.text import CountVectorizer

# count_vectorizer = CountVectorizer()
# topics_sparse = count_vectorizer.fit_transform(
#     articles.with_columns(pl.col('topics').list.join(separator=' '))['topics'].to_list()
# )
# topics_sparse

In [12]:
# len(count_vectorizer.get_feature_names_out())

In [13]:
# %%time

# topics_sparse.dot(topics_sparse.T)

In [14]:
# %%time


# intersections = topics_sparse.dot(topics_sparse.T.tocsc())
# row_sums = intersections.diagonal()
# unions = row_sums[:,None] + row_sums - intersections
# similarities = intersections / unions

# gc.collect()
# PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)
# articles.head(2)

# History preprocessing

In [15]:
def add_topics_tf_idf_columns(df, topics_col, vectorizer):
    return df.with_columns(
        pl.Series(
            vectorizer.transform(
                df.with_columns(pl.col(topics_col).list.join(separator=' '))[topics_col].to_list()
            ).toarray().astype(np.float32)
        ).alias(f'{topics_col}_tf_idf')
    )

In [16]:
%%time
import tqdm

STRONG_THR = 0.8

history_train = pl.concat(
    rows.with_columns(pl.col('article_id_fixed').list.len().alias('NumArticlesHistory')) \
        .explode(['article_id_fixed', 'impression_time_fixed', 'read_time_fixed', 'scroll_percentage_fixed']) \
        .sort(by=['user_id', 'impression_time_fixed']) \
        .with_columns(
            pl.col('impression_time_fixed').dt.weekday().alias('weekday'),
            pl.col('impression_time_fixed').dt.hour().alias('hour'),
        ).join(articles.select(['article_id', 'category', 'is_article_default', 'sentiment_label_int', 
                                'sentiment_score', 'entity_groups', 'topics']), 
               left_on='article_id_fixed', right_on='article_id', how='left') \
        .with_columns(
            (pl.col('sentiment_label_int') == 0).alias('is_neutral'),
            (pl.col('sentiment_label_int') == 1).alias('is_positive'),
            (pl.col('sentiment_label_int') == -1).alias('is_negative'),
            ((pl.col('sentiment_label_int') == 0) & (pl.col('sentiment_score') > STRONG_THR)).alias('strong_neutral'),
            ((pl.col('sentiment_label_int') == 1) & (pl.col('sentiment_score') > STRONG_THR)).alias('strong_positive'),
            ((pl.col('sentiment_label_int') == -1) & (pl.col('sentiment_score') > STRONG_THR)).alias('strong_negative'),
            pl.col('entity_groups').list.unique(),
        ).group_by('user_id').agg(
            pl.col('article_id_fixed'),
            pl.col('impression_time_fixed'),
            pl.col('category'),
            pl.col('NumArticlesHistory').first(),
            pl.col('read_time_fixed').median().alias('MedianReadTime'),
            pl.col('read_time_fixed').max().alias('MaxReadTime'),
            pl.col('read_time_fixed').sum().alias('TotalReadTime'),
            pl.col('scroll_percentage_fixed').median().alias('MedianScrollPercentage'),
            pl.col('scroll_percentage_fixed').max().alias('MaxScrollPercentage'),
            (pl.col('is_neutral').sum() / pl.col('NumArticlesHistory').first()).alias('NeutralPct'),
            (pl.col('is_positive').sum() / pl.col('NumArticlesHistory').first()).alias('PositivePct'),
            (pl.col('is_negative').sum() / pl.col('NumArticlesHistory').first()).alias('NegativePct'),
            (pl.col('strong_neutral').sum() / pl.col('NumArticlesHistory').first()).alias('PctStrongNeutral'),
            (pl.col('strong_positive').sum() / pl.col('NumArticlesHistory').first()).alias('PctStrongPositive'),
            (pl.col('strong_negative').sum() / pl.col('NumArticlesHistory').first()).alias('PctStrongNegative'),
            (1 - (pl.col('is_article_default').sum() / pl.col('NumArticlesHistory').first())).alias('PctNotDefaultArticles'),
            pl.col('category').mode().alias('MostFrequentCategory'),
            pl.col('weekday').mode().alias('MostFrequentWeekday'),
            pl.col('hour').mode().alias('MostFrequentHour'),
            pl.col('entity_groups').flatten(),
            pl.col('topics').flatten().alias('topics_flatten')
        ).pipe(add_topics_tf_idf_columns, topics_col='topics_flatten', vectorizer=vectorizer) \
        .drop('topics_flatten').with_columns(
            pl.col('MostFrequentCategory').list.first(),
            pl.col('MostFrequentWeekday').list.first(),
            pl.col('MostFrequentHour').list.first(),
        ).with_columns(
            (pl.col('entity_groups').list.count_matches(entity) / pl.col('NumArticlesHistory')).alias(f'{entity}Pct')
            for entity in unique_entities
        ).drop('entity_groups')
    for rows in tqdm.tqdm(history_train.iter_slices(1000), total=history_train.shape[0] // 1000)
)

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)
history_train = reduce_polars_df_memory_size(history_train)
history_train.head(2)

16it [00:08,  1.94it/s]                        


[1m[31m
RAM memory GB usage = 0.5151[0m
Memory usage of dataframe is 39.39 MB
Memory usage after optimization is: 38.61 MB
Decreased by 2.0%
CPU times: total: 7.02 s
Wall time: 8.62 s


user_id,article_id_fixed,impression_time_fixed,category,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,NeutralPct,PositivePct,NegativePct,PctStrongNeutral,PctStrongPositive,PctStrongNegative,PctNotDefaultArticles,MostFrequentCategory,MostFrequentWeekday,MostFrequentHour,topics_flatten_tf_idf,PRODPct,ORGPct,PERPct,LOCPct,MISCPct,EVENTPct
u32,list[i32],list[datetime[μs]],list[i16],u16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i16,i8,i8,list[f32],f32,f32,f32,f32,f32,f32
10623,"[9739035, 9739065, … 9770533]","[2023-04-27 14:29:44, 2023-04-27 14:29:47, … 2023-05-17 17:36:24]","[118, 414, … 140]",143,3.0,1563.0,9623.0,42.5,100.0,0.230769,0.111888,0.657343,0.097902,0.083916,0.608392,0.06993,118,3,18,"[0.016041, 0.062928, … 0.178427]",0.237762,0.888112,0.874126,0.804196,0.223776,0.202797
12016,"[9738216, 9735383, … 9769622]","[2023-04-27 07:39:48, 2023-04-27 07:41:30, … 2023-05-17 23:56:30]","[140, 118, … 140]",264,156.0,1775.0,63672.0,100.0,100.0,0.257576,0.075758,0.666667,0.106061,0.034091,0.602273,0.07197,140,5,20,"[0.009749, 0.045896, … 0.075912]",0.318182,0.852273,0.787879,0.776515,0.117424,0.087121


In [17]:
history_train.shape

(15143, 27)

# Creating Training Dataframe

In [18]:
%%time

NPRATIO = 2

# dropping duplicates that can occur because of sampling strategy
train_ds = behaviors_train.pipe(sampling_strategy_wu2019, npratio=NPRATIO, shuffle=False, 
                                with_replacement=True, seed=123) \
    .pipe(create_binary_labels_column, shuffle=True, seed=123) \
    .with_columns(pl.col('gender').fill_null(2)) \
    .select(['impression_id', 'article_ids_inview', 'impression_time', 'labels', 
             'device_type', 'read_time', 'scroll_percentage', 'user_id', 'is_sso_user', 'gender',
             'age', 'is_subscriber', 'session_id']) \
    .explode(['article_ids_inview', 'labels']) \
    .rename({'article_ids_inview': 'article', 'labels': 'target'}) \
    .with_columns(pl.col('article').cast(pl.Int32)) \
    .pipe(add_trendiness_feature, articles=articles, period='3d') \
    .unique(['impression_id', 'article']) \
    .with_columns(
        pl.col('impression_time').dt.weekday().alias('weekday'),
        pl.col('impression_time').dt.hour().alias('hour'),
        pl.col('article').cast(pl.Int32),
    ).join(articles.select(['article_id', 'premium', 'published_time', 'category',
                            'sentiment_score', 'sentiment_label', 'entity_groups',
                            'num_images', 'title_len', 'subtitle_len', 'body_len']),
           left_on='article', right_on='article_id', how='left') \
    .with_columns(
        (pl.col('impression_time') - pl.col('published_time')).dt.total_days().alias('article_delay_days'),
        (pl.col('impression_time') - pl.col('published_time')).dt.total_hours().alias('article_delay_hours')
    ).with_columns(
        pl.col('entity_groups').list.contains(entity).alias(f'Entity_{entity}_Present')
        for entity in unique_entities
    ).drop('entity_groups')

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)

train_ds = reduce_polars_df_memory_size(train_ds)
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)
train_ds.head()

[1m[31m
RAM memory GB usage = 0.735[0m
Memory usage of dataframe is 67.63 MB
Memory usage after optimization is: 56.96 MB
Decreased by 15.8%
[1m[31m
RAM memory GB usage = 0.7419[0m
CPU times: total: 33.1 s
Wall time: 9.31 s


impression_id,article,impression_time,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,session_id,trendiness_score,weekday,hour,premium,published_time,category,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len,article_delay_days,article_delay_hours,Entity_PROD_Present,Entity_ORG_Present,Entity_PER_Present,Entity_LOC_Present,Entity_MISC_Present,Entity_EVENT_Present
u32,i32,datetime[μs],i8,i8,f32,f32,u32,bool,i8,i8,bool,u32,i16,i8,i8,bool,datetime[μs],i16,f32,str,u32,u8,u8,u16,i16,i32,bool,bool,bool,bool,bool,bool
150528,9778682,2023-05-24 07:33:25,0,2,25.0,,143471,False,2,,False,1240,206,3,7,False,2023-05-24 06:32:42,498,0.9546,"""Negative""",1,5,20,267,0,1,True,True,True,False,False,True
153068,9778669,2023-05-24 07:09:04,1,1,78.0,100.0,151570,False,2,,False,1976,199,3,7,False,2023-05-24 06:09:26,118,0.9481,"""Negative""",1,5,11,150,0,0,True,True,True,False,False,False
153070,7213923,2023-05-24 07:13:14,0,1,26.0,100.0,151570,False,2,,False,1976,84,3,7,True,2018-07-28 04:41:38,565,0.8109,"""Neutral""",5,10,42,1048,1761,42266,False,False,False,False,False,False
153075,9778386,2023-05-24 07:13:58,0,1,26.0,100.0,151570,False,2,,False,1976,333,3,7,False,2023-05-24 03:17:40,142,0.5723,"""Negative""",3,8,14,401,0,3,True,True,True,False,False,True
153078,9778226,2023-05-24 07:13:46,1,1,7.0,100.0,151570,False,2,,False,1976,279,3,7,False,2023-05-24 05:34:01,414,0.9722,"""Positive""",1,5,13,230,0,1,True,True,True,False,False,False


### PROVA POI CANCELLA

In [19]:
from polimi.utils._catboost import add_other_rec_features

In [25]:
algorithms = ["ItemKNNCFRecommender", "RP3betaRecommender", "PureSVDRecommender"]
new_ds = add_other_rec_features(ds=train_ds, algorithms=algorithms,is_testset=True,history_train=history_train, behaviors_train=behaviors_train,history_val=history_val,behaviors_val=behaviors_val, evaluate=False)
new_ds

Training ItemKNNCFRecommender ...
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 8786 (100.0%), 1706.90 column/sec. Elapsed time 5.15 sec
Training RP3betaRecommender ...
RP3betaRecommender: Similarity column 8786 (100.0%), 3817.40 column/sec. Elapsed time 2.30 sec
Training PureSVDRecommender ...
PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 0.88 sec


impression_id,article,impression_time,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,session_id,trendiness_score,weekday,hour,premium,published_time,category,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len,article_delay_days,article_delay_hours,Entity_PROD_Present,Entity_ORG_Present,Entity_PER_Present,Entity_LOC_Present,Entity_MISC_Present,Entity_EVENT_Present,item_index,user_index,ItemKNNCFRecommender,RP3betaRecommender,PureSVDRecommender
u32,i32,datetime[μs],i8,i8,f32,f32,u32,bool,i8,i8,bool,u32,i16,i8,i8,bool,datetime[μs],i16,f32,str,u32,u8,u8,u16,i16,i32,bool,bool,bool,bool,bool,bool,u32,u32,f32,f32,f32
385198819,9762352,2023-05-24 18:58:45,0,2,17.0,100.0,10068,false,2,,false,1969198,206,3,18,false,2023-05-12 15:26:56,498,0.6981,"""Neutral""",2,4,15,102,12,291,true,false,true,true,false,true,8242,0,0.910534,0.00881,0.038083
13130056,9099237,2023-05-21 18:03:17,0,2,10.0,,10200,false,2,,false,869773,302,7,18,true,2022-01-26 19:03:27,140,0.9951,"""Negative""",4,8,27,842,479,11518,true,true,true,true,false,true,2060,1,0.126396,0.0,0.003439
458793473,9142581,2023-05-20 17:45:42,0,2,15.0,100.0,10200,false,2,,false,869770,229,6,17,true,2022-03-09 08:02:34,414,0.6342,"""Positive""",8,5,27,961,437,10497,true,true,true,false,false,false,2111,1,0.093124,0.0,0.00236
280158285,9220931,2023-05-18 08:19:54,0,2,9.0,100.0,10200,false,2,,false,869759,33,4,8,true,2022-04-21 07:02:13,565,0.824,"""Neutral""",9,9,25,840,392,9409,false,false,false,false,false,false,2261,1,0.095409,0.0,0.003917
382570352,9268227,2023-05-22 18:40:35,0,2,10.0,,10200,false,2,,false,869774,126,1,18,true,2022-05-31 05:01:41,140,0.9946,"""Negative""",5,6,25,1102,356,8557,false,false,false,false,false,false,2353,1,0.068082,0.0,0.002141
382570356,9268227,2023-05-22 18:41:07,0,2,50.0,,10200,false,2,,false,869774,126,1,18,true,2022-05-31 05:01:41,140,0.9946,"""Negative""",5,6,25,1102,356,8557,false,false,false,false,false,false,2353,1,0.068082,0.0,0.002141
124243623,9440508,2023-05-18 21:23:30,1,2,16.0,,10200,false,2,,false,869762,33,4,21,true,2022-10-14 05:13:03,457,0.9076,"""Neutral""",2,12,18,880,216,5200,false,true,false,true,false,false,2852,1,0.209815,0.008292,0.016469
243047100,9514481,2023-05-21 14:31:03,0,2,11.0,,10200,false,2,,false,869772,34,7,14,true,2022-11-16 11:15:32,414,0.9501,"""Neutral""",7,9,30,371,186,4467,true,true,true,false,false,false,3176,1,0.072407,0.0,0.008561
243047107,9514481,2023-05-21 14:31:35,0,2,4.0,,10200,false,2,,false,869772,34,7,14,true,2022-11-16 11:15:32,414,0.9501,"""Neutral""",7,9,30,371,186,4467,true,true,true,false,false,false,3176,1,0.072407,0.0,0.008561
243047101,9514481,2023-05-21 14:31:48,0,2,13.0,,10200,false,2,,false,869772,34,7,14,true,2022-11-16 11:15:32,414,0.9501,"""Neutral""",7,9,30,371,186,4467,true,true,true,false,false,false,3176,1,0.072407,0.0,0.008561


## Session Features

In [22]:
# behaviors_train.select(['session_id', 'impression_time', 'article_ids_clicked']) \
#     .group_by('session_id').map_groups(
#         lambda group: group.explode('article_ids_clicked').sort('impression_time') \
#             .select(
#                 pl.col('session_id').first(),
#                 pl.col('article_ids_clicked').first().alias('article_id'),
#                 (pl.col('impression_time').last() - pl.col('impression_time').first()).dt.total_minutes().alias('induced_session_len'),
#                 (pl.col('impression_time').count() - 1).alias('num_clicked_articles_following')
#             )
#     ).group_by('article_id').agg(
#         pl.col('num_clicked_articles_following').mean().alias('mean_num_clicked_articles_following'),
#         pl.col('induced_session_len').mean().alias('mean_induced_session_len'),
#     )

## Popularities

# Adding history features

In [23]:
# .pipe(batch_cosine_similarity, col1='topics_idf', col2='topics_flatten_tf_idf', target_col='topics_cosine') \

## add recmodels features

# Training