# Make article embeddings using Hugging Face

In [1]:
import os
from pathlib import Path

import numpy as np
import polars as pl
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from ebrec.utils._nlp import generate_embeddings_with_transformers
from ebrec.utils._python import batch_items_generator
from ebrec.utils._polars import concat_str_columns

In [2]:
# Identifier of the pretrained model passed to `from_pretrained` to embed the articles.
# It is also used further down to name the output directory and the embedding column.
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"

Paths for loading the data and for storing the generated embeddings:

In [3]:
# Root directory of the EB-NeRD data. The location can be overridden with the
# EBNERD_DATA_PATH environment variable so the notebook can run on another
# machine without editing this cell; the default is the original cluster path.
DEFAULT_DATA_PATH = "/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data"
DATA_PATH = Path(os.environ.get("EBNERD_DATA_PATH", DEFAULT_DATA_PATH)).expanduser()

# Embeddings go into a per-model sub-directory of "artifacts". The "/" in the
# model name is replaced so the name maps to a single directory level.
DUMP_DIR = DATA_PATH.joinpath("artifacts", TRANSFORMER_MODEL_NAME.replace("/", "_"))
DUMP_DIR.mkdir(parents=True, exist_ok=True)
print(f"Embeddings will be stored at: {DUMP_DIR}")

Embeddings will be stored at: /dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data/artifacts/FacebookAI_xlm-roberta-large


In [None]:
# Name of the dataset directory (under DATA_PATH) whose articles are embedded.
# To embed a different split (e.g. the large one), change this single constant
# instead of editing the path string.
# NOTE(review): the output location (DUMP_DIR and the parquet file name) does not
# include the dataset name, so embeddings from different splits are written to
# the same file — confirm this is intended.
DATASET_NAME = "ebnerd_small"

df_articles = pl.read_parquet(DATA_PATH.joinpath(DATASET_NAME, "articles.parquet"))
df_articles.head(5)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3000022,"""Hanks beskyldt…","""Tom Hanks har …",2023-06-29 06:20:32,False,"""Tom Hanks skul…",2006-09-20 09:24:18,[3518381],"""article_defaul…","""https://ekstra…","[""David Gardner""]","[""PER""]","[""Kriminalitet"", ""Kendt"", … ""Litteratur""]",414,[432],"""underholdning""",,,,0.9911,"""Negative"""
3000063,"""Bostrups aske …","""Studieværten b…",2023-06-29 06:20:32,False,"""Strålende sens…",2006-09-24 07:45:30,"[3170935, 3170939]","""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Personlig begivenhed""]",118,[133],"""nyheder""",,,,0.5155,"""Neutral"""
3000613,"""Jesper Olsen r…","""Den tidligere …",2023-06-29 06:20:33,False,"""Jesper Olsen, …",2006-05-09 11:29:00,[3164998],"""article_defaul…","""https://ekstra…","[""Frankrig"", ""Jesper Olsen"", … ""Jesper Olsen""]","[""LOC"", ""PER"", … ""PER""]","[""Kendt"", ""Sport"", … ""Sygdom og behandling""]",142,"[196, 271]","""sport""",,,,0.9876,"""Negative"""
3000700,"""Madonna topløs…","""47-årige Madon…",2023-06-29 06:20:33,False,"""Skal du have s…",2006-05-04 11:03:12,[3172046],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Livsstil"", ""Underholdning""]",414,[432],"""underholdning""",,,,0.8786,"""Neutral"""
3000840,"""Otto Brandenbu…","""Sangeren og sk…",2023-06-29 06:20:33,False,"""'Og lidt for S…",2007-03-01 18:34:00,[3914446],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Musik og lyd""]",118,[133],"""nyheder""",,,,0.9468,"""Negative"""


Set `DEMO = True` to embed only the first 10 articles as a quick test; leave it at `False` to embed all articles.

In [5]:
# Demo switch: when True, only the first 10 articles are kept for a quick run;
# when False, every loaded article is embedded.
DEMO = False
if DEMO:
    df_articles = df_articles.head(10)

In [6]:
# Text columns of df_articles that are passed to concat_str_columns to build the text to embed.
concat_columns = ["title", "subtitle", "body"]

Load the pretrained Transformer model and its tokenizer (the batch size used to iterate over the articles is set in the embedding cell further down):

In [7]:
# Load the tokenizer and the pretrained model for the configured checkpoint.
# Both use the same model name so that the tokenization matches the model.
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)



Make the column with text data you want to embed:

In [8]:
# concat_str_columns returns the dataframe together with the name of the text
# column to embed (presumably the concatenation of `concat_columns` — the
# preview output shows a column named "title-subtitle-body").
df_articles, col_name = concat_str_columns(df_articles, concat_columns)

# Preview the first rows of the text column.
df_articles.head(5).select(col_name)

title-subtitle-body
str
"""Hanks beskyldt…"
"""Bostrups aske …"
"""Jesper Olsen r…"
"""Madonna topløs…"
"""Otto Brandenbu…"


Embed text:

In [9]:
# Number of article texts handed to the embedding helper per call.
BATCH_SIZE = 64
# Number of chunks the progress bar should expect (last chunk may be smaller).
n_batches = int(np.ceil(df_articles.height / BATCH_SIZE))

# Split the article texts into chunks of BATCH_SIZE
# (presumably yielded lazily and in order — verify against batch_items_generator).
chunked_text_list = batch_items_generator(df_articles[col_name].to_list(), BATCH_SIZE)

# Generator expression: each chunk is embedded only when it is consumed below.
embeddings = (
    generate_embeddings_with_transformers(  # passes one chunk of texts to the transformer
        model=model,
        tokenizer=tokenizer,
        text_list=text_list,
        batch_size=BATCH_SIZE,
        # The outer tqdm reports progress; presumably this silences the helper's own bar.
        disable_tqdm=True,
    )
    # One tqdm iteration is one chunk of up to BATCH_SIZE texts, so the unit is
    # "batch" — labelling it "text" made the reported rate look 64x slower per text.
    for text_list in tqdm(
        chunked_text_list, desc="Encoding", total=n_batches, unit="batch"
    )
)
# Consume the generator and stack the per-chunk results into a single tensor.
embeddings = torch.vstack(list(embeddings))

Encoding:   0%|          | 0/1962 [00:00<?, ?text/s]

Encoding:   0%|          | 3/1962 [00:06<1:01:27,  1.88s/text]

KeyboardInterrupt: 

Make the dataframe

In [None]:
# Column name for the embeddings: "<text column name>-<model name>".
embeddings_name = f"{col_name}-{TRANSFORMER_MODEL_NAME}"

# Move the tensor to the CPU and convert it to NumPy before wrapping it in a
# polars Series.
series_emb = pl.Series(
    name=embeddings_name,
    values=embeddings.to("cpu").numpy(),
)

# Attach the embeddings to the article ids. This relies on the embeddings being
# in the same row order as df_articles (presumably preserved by the batching —
# verify).
df_emb = df_articles.select("article_id").with_columns(series_emb)

Dump the embeddings:

In [None]:
# The embedding name contains the model name, whose "/" would otherwise be
# interpreted as a directory separator in the file name.
file_path = DUMP_DIR / f"{embeddings_name.replace('/', '_')}.parquet"
df_emb.write_parquet(file_path)
print(f"Embeddings saved to: {file_path}")

Embeddings saved to: /dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data/artifacts/FacebookAI_xlm-roberta-large/title-subtitle-body-FacebookAI_xlm-roberta-large.parquet


# DONE 🚀