This notebook shows how to extract the necessary data from the EB-NeRD dataset and create datasets with the correct structure for the Prompt4NR framework and our extensions.
The data files themselves can easily be accessed from the README file!

Load EB-NeRD files

In [66]:
# Requires pandas and pyarrow (needed for reading parquet files): %pip install pandas pyarrow
import pandas as pd

# Root folder of the EB-NeRD download. Every path below is derived from it, so
# pointing the notebook at another copy of the dataset is a one-line change.
DATA_DIR = 'data/ebnerd_large'

# Article metadata (shared by all splits).
articles_df = pd.read_parquet(f'{DATA_DIR}/articles.parquet')

# Training split: per-user click histories and impression logs.
history_df = pd.read_parquet(f'{DATA_DIR}/train/history.parquet')
behaviors_df = pd.read_parquet(f'{DATA_DIR}/train/behaviors.parquet')

# Validation split: same two tables (also used below to carve out a test set).
val_history_df = pd.read_parquet(f'{DATA_DIR}/validation/history.parquet')
val_behaviors_df = pd.read_parquet(f'{DATA_DIR}/validation/behaviors.parquet')

Turning articles.parquet into the news.txt file (a pickled dictionary) for all articles

In [3]:
import pickle
def restructure_article(row):
    """Convert one article record into a single-entry ``{article_id: info}`` dict.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'article_id', 'category', 'subcategory', 'title', 'subtitle',
            'sentiment_label' and 'topics'.

    Returns:
        A dict with exactly one key, ``str(row['article_id'])``, whose value
        holds the fields 'cate', 'subcate', 'title', 'abstract', 'sentiment'
        and 'topics'. Category and subcategory are stringified; the article
        subtitle is stored under 'abstract'.
    """
    article_key = str(row['article_id'])
    article_info = {
        'cate': str(row['category']),
        'subcate': str(row['subcategory']),
        'title': row['title'],
        # The subtitle is stored under the 'abstract' key.
        'abstract': row['subtitle'],
        'sentiment': row['sentiment_label'],
        'topics': row['topics'],
    }
    return {article_key: article_info}

# Collect every article's entry into one lookup keyed by the article id string.
new_dataset_dict = {}
for _, article_row in articles_df.iterrows():
    for article_key, article_info in restructure_article(article_row).items():
        new_dataset_dict[article_key] = article_info

# Despite the .txt extension, this file is a binary pickle of the dictionary.
with open('news.txt', 'wb') as news_file:
    pickle.dump(new_dataset_dict, news_file, protocol=pickle.HIGHEST_PROTOCOL)

Note: at this point we also subsetted the news.txt dictionary to include only the articles actually used in the train, val and test files, but this is not strictly necessary.

Stratified sampling based on gender and age

In [None]:
def stratified_sample_df(df, gender_col, age_col, num_samples, random_state=None):
    """Sample rows so that the gender proportions of ``df`` are preserved.

    ``num_samples`` is split across gender values according to their relative
    frequency in ``df`` (rounded per gender, so the total can differ slightly
    from ``num_samples``). Within each gender, rows are drawn without
    replacement, weighted by the relative frequency of their age value inside
    that gender. Rows with a missing gender are never sampled; rows with a
    missing age get no weight and are therefore never drawn.

    NOTE(review): weighting each row by its age frequency favours common ages
    more strongly than a plain proportional draw would - confirm this is the
    intended sampling scheme.

    Args:
        df: Source DataFrame.
        gender_col: Name of the gender column.
        age_col: Name of the age column.
        num_samples: Approximate total number of rows to draw.
        random_state: Optional seed (or random state object) forwarded to
            ``DataFrame.sample`` to make the draw reproducible. Defaults to
            ``None`` (unseeded).

    Returns:
        DataFrame with the sampled rows (original index preserved). If nothing
        can be sampled, an empty DataFrame with the columns of ``df``.
    """
    gender_distribution = df[gender_col].value_counts(normalize=True)
    samples_per_gender = (gender_distribution * num_samples).round().astype(int)

    # Collect the per-gender samples and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    sampled_parts = []
    for gender_value, n_samples in samples_per_gender.items():
        if n_samples <= 0:
            continue
        gender_df = df[df[gender_col] == gender_value]
        age_distribution = gender_df[age_col].value_counts(normalize=True)
        weights = gender_df[age_col].map(age_distribution)

        # Only rows with a positive weight can be drawn without replacement, so
        # cap the request there; asking for more makes sample() raise.
        n_drawable = int((weights > 0).sum())
        n_draw = min(n_samples, n_drawable)
        if n_draw == 0:
            continue

        sampled_parts.append(
            gender_df.sample(
                n=n_draw,
                weights=weights,
                replace=False,
                random_state=random_state,
            )
        )

    if not sampled_parts:
        # Keep the input schema so downstream code still sees the columns.
        return df.iloc[:0].copy()
    return pd.concat(sampled_parts)

# Adjust the source dataframe and the number of samples to control how much
# data ends up in the train and validation sets, e.g.:
# NOTE(review): no seed is passed here, so every run draws a different sample.
train_stratified_sample = stratified_sample_df(behaviors_df, 'gender', 'age', 7849)
val_stratified_sample = stratified_sample_df(val_behaviors_df, 'gender', 'age', 7849)

Turning history and behaviors parquet into train.txt, val.txt and test.txt files

In [69]:
def get_users_click_history(user_id, current_impression_time, cur_history_df):
    """Return the articles a user interacted with before a given impression.

    Args:
        user_id: Value matched against the 'user_id' column.
        current_impression_time: Timestamp of the current impression; only
            interactions strictly earlier than this are kept.
        cur_history_df: History DataFrame with the columns 'user_id',
            'article_id_fixed' and 'impression_time_fixed'. The latter two are
            read as parallel sequences per row (paired with zip, so any
            surplus elements in the longer one are ignored).

    Returns:
        List of article ids as strings, in the order they appear in the
        history rows. Empty if the user has no earlier interactions.
    """
    user_history = cur_history_df[cur_history_df['user_id'] == user_id]

    past_interactions = []
    # Walk the two list-valued columns in lockstep; interactions at or after
    # the current impression are skipped silently.
    for article_ids, impression_times in zip(
        user_history['article_id_fixed'], user_history['impression_time_fixed']
    ):
        past_interactions.extend(
            str(art_id)
            for art_id, imp_time in zip(article_ids, impression_times)
            if imp_time < current_impression_time
        )

    return past_interactions

In [70]:
# Four parallel lists: impression ids, user ids (as strings), impression times
# (as strings) and [click_history, positive_samples, negative_samples] triples.
new_train_data = [[], [], [], []]

for _, impression in train_stratified_sample.iterrows():
    new_train_data[0].append(impression['impression_id'])
    new_train_data[1].append(str(impression['user_id']))
    new_train_data[2].append(str(impression['impression_time']))

    clicked_articles = set(impression['article_ids_clicked'])
    inview_articles = set(impression['article_ids_inview'])

    # Positives are the clicked articles; negatives are shown but not clicked.
    positive_samples = list(map(str, clicked_articles))
    negative_samples = list(map(str, inview_articles - clicked_articles))

    # Training impressions take their click history from the training split.
    click_history = get_users_click_history(
        impression['user_id'], impression['impression_time'], history_df
    )
    new_train_data[3].append([click_history, positive_samples, negative_samples])

with open('train.txt', 'wb') as train_file:
    pickle.dump(new_train_data, train_file)

In [78]:
# Four parallel lists: impression ids, user ids (as strings), impression times
# (as strings) and [click_history, positive_samples, negative_samples] triples.
new_val_data = [[], [], [], []]

for _, impression in val_stratified_sample.iterrows():
    new_val_data[0].append(impression['impression_id'])
    new_val_data[1].append(str(impression['user_id']))
    new_val_data[2].append(str(impression['impression_time']))

    clicked_articles = set(impression['article_ids_clicked'])
    inview_articles = set(impression['article_ids_inview'])

    # Positives are the clicked articles; negatives are shown but not clicked.
    positive_samples = list(map(str, clicked_articles))
    negative_samples = list(map(str, inview_articles - clicked_articles))

    # Validation impressions take their click history from the validation split.
    click_history = get_users_click_history(
        impression['user_id'], impression['impression_time'], val_history_df
    )
    new_val_data[3].append([click_history, positive_samples, negative_samples])

with open('val.txt', 'wb') as val_file:
    pickle.dump(new_val_data, val_file)

In [None]:
# Build a test subset from the validation data, excluding every impression
# already used for val.txt so the two sets do not overlap.
already_in_val = val_behaviors_df['impression_id'].isin(new_val_data[0])
subset_val_df = val_behaviors_df[~already_in_val]
# Fixed seed for a reproducible draw. The test-set size is hard-coded;
# sample() raises if fewer rows than that remain.
test_df = subset_val_df.sample(n=73152, random_state=42)

In [93]:
# Four parallel lists: impression ids, user ids (as strings), impression times
# (as strings) and [click_history, positive_samples, negative_samples] triples.
new_test_data = [[], [], [], []]

for _, impression in test_df.iterrows():
    new_test_data[0].append(impression['impression_id'])
    new_test_data[1].append(str(impression['user_id']))
    new_test_data[2].append(str(impression['impression_time']))

    clicked_articles = set(impression['article_ids_clicked'])
    inview_articles = set(impression['article_ids_inview'])

    # Positives are the clicked articles; negatives are shown but not clicked.
    positive_samples = list(map(str, clicked_articles))
    negative_samples = list(map(str, inview_articles - clicked_articles))

    # Test impressions come from the validation data, so use its history table.
    click_history = get_users_click_history(
        impression['user_id'], impression['impression_time'], val_history_df
    )
    new_test_data[3].append([click_history, positive_samples, negative_samples])

with open('test.txt', 'wb') as test_file:
    pickle.dump(new_test_data, test_file)

Function to create dictionary mapping users to histories (for clustering data)

In [None]:
# Build fresh lists instead of extending new_train_data[1] in place: aliasing
# that list would append the val/test users to the training split in memory
# and add them again every time this cell is re-run.
all_users = [*new_train_data[1], *new_val_data[1], *new_test_data[1]]

# One click history per impression, in the same train -> val -> test order as
# all_users (element 0 of each [history, positives, negatives] triple).
all_hists = [
    sample[0]
    for split in (new_train_data, new_val_data, new_test_data)
    for sample in split[3]
]
    
def create_user_history(user_ids, article_id_lists):
    """Merge per-impression histories into one set of article ids per user.

    Args:
        user_ids: Sequence of user ids; a user may appear several times.
        article_id_lists: Sequence of iterables of article ids, paired
            position-by-position with ``user_ids`` (pairing stops at the
            shorter of the two sequences).

    Returns:
        Dict mapping each user id to the set of all article ids seen for it.
    """
    history_by_user = {}
    for uid, seen_ids in zip(user_ids, article_id_lists):
        # setdefault creates the user's set on first sight, then reuses it.
        history_by_user.setdefault(uid, set()).update(seen_ids)
    return history_by_user

# Map every user (across train, val and test) to the set of article ids in their histories.
users_arts = create_user_history(all_users, all_hists)

Create user-article information from user-history dictionary for clustering

In [2]:
def create_user_article_info(user_history, articles_df):
    """Collect per-user article attributes for clustering.

    Articles are looked up directly by article id, so two articles that share
    a title keep their own attributes, and no notebook-level dictionary is
    needed. Article ids that do not occur in ``articles_df`` are skipped.

    Args:
        user_history: Dict mapping a user id to an iterable of article ids
            (compared as strings against ``articles_df['article_id']``).
        articles_df: DataFrame with the columns 'article_id', 'topics',
            'sentiment_score', 'total_pageviews' and 'published_time'.

    Returns:
        Dict mapping each user id to a dict of parallel lists: 'topics',
        'sentiment', 'page_views', 'time_published' and 'article_ids'.
    """
    attribute_columns = ['topics', 'sentiment_score', 'total_pageviews', 'published_time']
    articles_by_id = {
        str(article_id): record
        for article_id, record in zip(
            articles_df['article_id'],
            articles_df[attribute_columns].to_dict('records'),
        )
    }

    user_info = {}
    total_users = len(user_history)
    last_reported_percent = None
    for user_counter, (user_id, article_ids) in enumerate(user_history.items(), 1):
        info = {
            'topics': [],
            'sentiment': [],
            'page_views': [],
            'time_published': [],
            'article_ids': []
        }

        for article_id in article_ids:
            article_data = articles_by_id.get(str(article_id))
            if article_data is None:
                # Unknown article: nothing to record for it.
                continue
            info['topics'].append(article_data['topics'])
            info['sentiment'].append(article_data['sentiment_score'])
            info['page_views'].append(article_data['total_pageviews'])
            info['time_published'].append(article_data['published_time'])
            info['article_ids'].append(article_id)

        user_info[user_id] = info

        # Report progress once per whole percent instead of once per user, so
        # large runs do not flood the cell output.
        whole_percent = user_counter * 100 // total_users
        if whole_percent != last_reported_percent:
            last_reported_percent = whole_percent
            user_progress = (user_counter / total_users) * 100
            print(f"Overall progress: {user_progress:.2f}%")

    return user_info

# Gather the article attributes for every user in the user -> history mapping.
user_article_info = create_user_article_info(users_arts, articles_df)

In [None]:
# Record, per user, which article id each position in the attribute lists belongs to.
def transform_data(user_data):
    """Replace each user's 'article_ids' list with a position -> id 'index' dict.

    Args:
        user_data: Dict mapping a user id to a dict of attribute lists that
            may contain an 'article_ids' list.

    Returns:
        New dict with one shallow-copied record per user: 'article_ids' is
        removed and 'index' maps each list position to the article id found
        there (empty when 'article_ids' was absent). The input records are
        left unmodified.
    """
    def _with_index(user_record):
        # Shallow copy so popping the key does not touch the caller's record.
        record = user_record.copy()
        ids = record.pop('article_ids', [])
        record['index'] = dict(enumerate(ids))
        return record

    return {user_id: _with_index(record) for user_id, record in user_data.items()}
# Convert the per-user info to its indexed form and save it for clustering.
# Despite the .txt extension, the output file is a binary pickle.
newewst_dict = transform_data(user_article_info)
with open('final_cluster_data.txt', 'wb') as handle:
    pickle.dump(newewst_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)