# Make article embeddings using Hugging Face

In [1]:
import os
from pathlib import Path

import numpy as np
import polars as pl
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from ebrec.utils._nlp import generate_embeddings_with_transformers
from ebrec.utils._python import batch_items_generator
from ebrec.utils._polars import concat_str_columns

In [2]:
# Hugging Face model identifier; it is passed to `from_pretrained` below and is
# also used (with "/" replaced by "_") to name the output folder and file.
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"

Path for loading data:

In [7]:
# Root directory of the EB-NeRD data. Set the EBNERD_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset (or empty)
# the original cluster location below is used, so existing runs are unaffected.
DEFAULT_DATA_PATH = "/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data"
DATA_PATH = Path(os.environ.get("EBNERD_DATA_PATH") or DEFAULT_DATA_PATH).expanduser()

# Embeddings go into a per-model sub-folder of "artifacts"; the "/" in the
# model identifier is replaced so that it forms a single directory name.
DUMP_DIR = DATA_PATH.joinpath("artifacts", TRANSFORMER_MODEL_NAME.replace("/", "_"))
DUMP_DIR.mkdir(parents=True, exist_ok=True)
print(f"Embeddings will be stored at: {DUMP_DIR}")

Embeddings will be stored at: /dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data/artifacts/FacebookAI_xlm-roberta-large


In [8]:
# NOTE: this reads the *small* split. Remember to change the path below if you
# want to embed the articles of the large dataset instead.
articles_file = DATA_PATH / "ebnerd_small" / "articles.parquet"
df_articles = pl.read_parquet(articles_file)

# Preview the first rows to check that the file was loaded as expected.
df_articles.head(5)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""
3012771,"""Morten Bruun f…","""FODBOLD: Morte…",2023-06-29 06:20:39,False,"""Kemien mellem …",2006-05-01 14:28:40,[3177953],"""article_defaul…","""https://ekstra…",[],[],"[""Erhverv"", ""Kendt"", … ""Ansættelsesforhold""]",142,"[196, 199]","""sport""",,,,0.8241,"""Negative"""
3023463,"""Luderne flytte…","""I landets tynd…",2023-06-29 06:20:43,False,"""Det frække erh…",2007-03-24 08:27:59,[3184029],"""article_defaul…","""https://ekstra…",[],[],"[""Livsstil"", ""Erotik""]",118,[133],"""nyheder""",,,,0.7053,"""Neutral"""
3032577,"""Cybersex: Hvor…","""En flirtende s…",2023-06-29 06:20:46,False,"""De fleste af o…",2007-01-18 10:30:37,[3030463],"""article_defaul…","""https://ekstra…",[],[],"[""Livsstil"", ""Partnerskab""]",565,[],"""sex_og_samliv""",,,,0.9307,"""Neutral"""


Set `DEMO = True` to embed only the first 10 articles as a quick demo. With `DEMO = False` (the current setting) all articles are embedded.

In [9]:
# Quick-test switch: when True only the first 10 articles are kept;
# when False (the current setting) every article is processed.
DEMO = False
if DEMO:
    df_articles = df_articles.head(10)

In [10]:
# Text columns that are joined into the single string that gets embedded.
concat_columns = [
    "title",
    "subtitle",
    "body",
]

Load the pretrained Transformer model and its matching tokenizer (the batch size used to iterate over the articles is set in the embedding cell further down):

In [11]:
# Tokenizer and model are both resolved from the same identifier so that they
# are guaranteed to belong together.
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)



Make the column with text data you want to embed:

In [12]:
# `concat_str_columns` returns the frame together with the name of the text
# column to embed (shown in the preview below as "title-subtitle-body").
# NOTE(review): `df_articles` is rebound here, so re-running this cell alone
# operates on the already-extended frame — re-run from the load cell instead.
df_articles, col_name = concat_str_columns(df_articles, concat_columns)

# Preview the combined text column for the first rows.
df_articles.head(5).select(col_name)

title-subtitle-body
str
"""Natascha var i…"
"""Kun Star Wars …"
"""Morten Bruun f…"
"""Luderne flytte…"
"""Cybersex: Hvor…"


Embed text:

In [13]:
BATCH_SIZE = 32
# Integer ceil-division: number of batches needed to cover every article.
n_batches = (df_articles.height + BATCH_SIZE - 1) // BATCH_SIZE

# Split the texts into chunks (presumably of at most BATCH_SIZE items each —
# see `batch_items_generator`).
chunked_text_list = batch_items_generator(df_articles[col_name].to_list(), BATCH_SIZE)

# Outer progress bar. Each tick is one *batch* of texts, even though the unit
# label says "text". The helper's own progress bar is disabled below so that
# only this bar is shown.
progress = tqdm(chunked_text_list, desc="Encoding", total=n_batches, unit="text")

# Pass each batch of texts to the transformer and collect the per-batch results.
batch_embeddings = [
    generate_embeddings_with_transformers(
        model=model,
        tokenizer=tokenizer,
        text_list=batch_texts,
        batch_size=BATCH_SIZE,
        disable_tqdm=True,
    )
    for batch_texts in progress
]

# Stack the per-batch results row-wise into a single tensor.
embeddings = torch.vstack(batch_embeddings)

Encoding:   0%|          | 0/649 [00:00<?, ?text/s]

Encoding: 100%|██████████| 649/649 [09:41<00:00,  1.12text/s]


Make the dataframe

In [14]:
# The column name records both the embedded text columns and the model used.
embeddings_name = f"{col_name}-{TRANSFORMER_MODEL_NAME}"

# Move the tensor to host memory before converting it to a numpy array.
embeddings_np = embeddings.to("cpu").numpy()
series_emb = pl.Series(name=embeddings_name, values=embeddings_np)

# Pair every article id with its embedding. This relies on the embeddings being
# in the same row order as `df_articles`.
df_emb = df_articles.select("article_id").with_columns(series_emb)

Dump the embeddings:

In [15]:
# "/" from the model identifier is not allowed inside a file name, so it is
# replaced before building the output path.
safe_name = embeddings_name.replace("/", "_")
file_path = DUMP_DIR / f"{safe_name}.parquet"

df_emb.write_parquet(file_path)
print(f"Embeddings saved to: {file_path}")

Embeddings saved to: /dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data/artifacts/FacebookAI_xlm-roberta-large/title-subtitle-body-FacebookAI_xlm-roberta-large.parquet


# DONE 🚀