# Emotions Embeddings
### This notebook builds a new embedding using the https://huggingface.co/NikolajMunch/danish-emotion-classification model.

### For each article, the code extracts the title and subtitle and concatenates them; the resulting 'full title' is then processed by the model.

### The model output is a 6-dimensional vector containing the scores the model assigns to the following emotions: [disgust, fear, joy, surprise, sadness, anger]

In [None]:
import polars as pl
import tqdm
from scipy.special import softmax

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
# Single checkpoint id shared by the tokenizer and the classifier, so both
# are always loaded from the same Hugging Face repository.
checkpoint = "NikolajMunch/danish-emotion-classification"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
def compute_emotion_score(text):
    """Return the classifier's softmax-normalised scores for ``text``.

    Args:
        text: The string to classify.

    Returns:
        A list of floats, one per output class of the module-level ``model``,
        in the model's own label order. The values sum to 1 because they are
        the softmax of the raw logits.
    """
    # Function-scope import: torch is not imported at the top of the notebook,
    # but ``return_tensors='pt'`` already requires it to be installed.
    import torch

    # truncation=True clips the input to the maximum length the tokenizer
    # reports for the model; an over-long text would otherwise typically fail
    # inside the forward pass.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**encoded_input)

    # First element of the model output (the logits when no labels are
    # passed), then the only item of the size-1 batch.
    scores = output[0][0].detach().numpy()
    return softmax(scores).tolist()

In [None]:
# Load the article table from the Kaggle input directory.
articles_path = '/kaggle/input/recommender-systems-challenge-2024/ebnerd_large/articles.parquet'
articles = pl.read_parquet(articles_path)

In [None]:
# Build one text per article: "<title> <subtitle>", keeping only the id and
# the concatenated text.
# concat_str propagates nulls by default, so a missing title or subtitle would
# turn the whole full_title into null and that null would later be handed to
# the tokenizer. Nulls are therefore replaced with empty strings first.
full_title_articles = (
    articles
    .select(['article_id', 'title', 'subtitle'])
    .with_columns(
        pl.concat_str(
            [
                pl.col('title').fill_null(''),
                pl.col('subtitle').fill_null(''),
            ],
            separator=" ",
        ).alias('full_title')
    )
    .select(['article_id', 'full_title'])
)

In [None]:
# Score the articles slice by slice so tqdm can report progress, then stitch
# the scored slices back into a single frame.
batch_size = 100

# Ceiling division: a trailing partial slice still counts as one iteration.
# Floor division would under-report the total whenever the row count is not a
# multiple of batch_size.
n_batches = (full_title_articles.shape[0] + batch_size - 1) // batch_size

articles_emotions = pl.concat(
    rows.with_columns(
        pl.struct(['full_title'])
        .map_elements(
            lambda x: compute_emotion_score(x['full_title']),
            return_dtype=pl.List(pl.Float64),
        )
        # Kept from the original as a defensive cast to the declared dtype.
        .cast(pl.List(pl.Float64))
        .alias('emotion_scores')
    )
    for rows in tqdm.tqdm(
        full_title_articles.iter_slices(batch_size),
        total=n_batches,
    )
)

In [None]:
# Drop the raw text column before saving the scores to the working directory.
emotion_table = articles_emotions.drop('full_title')
emotion_table.write_parquet('/kaggle/working/articles_emotion.parquet')

In [None]:
# Read the saved file back. Note that this rebinds `articles` to the emotion
# table written above.
saved_path = '/kaggle/working/articles_emotion.parquet'
articles = pl.read_parquet(saved_path)