# Run HuggingFace Sentiment Analysis on Forum Posts

This notebook uses the [Meta Kaggle](https://www.kaggle.com/kaggle/meta-kaggle) dataset to compute a positive/negative sentiment label and confidence score for every Kaggle forum post with the [HuggingFace Sentiment Analysis Pipeline](https://huggingface.co/transformers/main_classes/pipelines.html), and saves the results to a CSV file for later analysis.


In [1]:
!pip install beautifulsoup4 -q -q

In [2]:
from jt_mk_utils import *
from transformers import pipeline
from bs4 import BeautifulSoup
import re
import numpy as np
from tqdm.notebook import tqdm

In [3]:
# Build the library's default sentiment-analysis pipeline on the PyTorch
# backend ("pt") and place it on device 0.
classifier = pipeline(task="sentiment-analysis", framework="pt", device=0)
# Displayed value: the tokenizer's maximum sequence length in tokens.
# Forum posts longer than this are truncated when the pipeline is run.
classifier.tokenizer.model_max_length

In [4]:
# Matches one *innermost* BBCode quote: "[quote" ... "[/quote]" with no other
# "[quote" opener in between. re.S lets a quote span several lines.
_BBCODE_QUOTE_RE = re.compile(r"\[quote(?:(?!\[quote).)*?\[/quote\]", flags=re.S)


def get_html_text(s: str) -> str:
    """Return the author's own plain text from an HTML forum message.

    Removes ``style``, ``script``, ``iframe``, ``blockquote`` and ``code``
    elements, extracts the remaining text, replaces every BBCode
    ``[quote...]...[/quote]`` section with a single space, and strips
    leading/trailing whitespace.

    Args:
        s: Raw message markup (HTML, possibly containing BBCode quotes).

    Returns:
        The cleaned message text; may be an empty string.
    """
    soup = BeautifulSoup(s, "html.parser")
    # Drop non-prose elements and HTML-quoted text written by others.
    for data in soup(["style", "script", "iframe", "blockquote", "code"]):
        data.decompose()
    txt = soup.get_text()
    # Remove BBCode quotes innermost-first until none are left. A single greedy
    # "[quote.*[/quote]" match would span from the first opener to the last
    # closer and delete the author's own text between two separate quotes.
    while True:
        txt, n_removed = _BBCODE_QUOTE_RE.subn(" ", txt)
        if n_removed == 0:
            break
    return txt.strip()

In [5]:
# Load the forum messages table. read_forum_messages is not defined in this
# notebook; presumably it comes from the jt_mk_utils star import.
# NOTE(review): index_col=0 looks like the pandas.read_csv option (first
# column becomes the index) - confirm against the helper.
forum_messages = read_forum_messages(index_col=0)
# Displayed value: the shape of the loaded table.
forum_messages.shape

# Parse HTML

In [6]:
%%time
text = forum_messages.Message.fillna("").apply(get_html_text)

In [7]:
# V2: order the posts from shortest to longest (by character count) so that
# each batch holds posts of similar length; batches with a lower maximum
# length are quicker.
chars_per_post = text.str.len()
text = text.iloc[chars_per_post.argsort()]
text.shape

# Run Pipeline

In [8]:
batch_size = 100
# One batch number per row position: rows 0..99 -> 0, rows 100..199 -> 1, ...
batch_ids = np.floor_divide(np.arange(text.shape[0]), batch_size)
batch_ids

In [9]:
# Classify one batch of posts at a time and keep a result frame per batch.
# Each frame reuses the batch's index labels so the rows can be matched back
# to their forum messages. truncation=True cuts over-long posts down to the
# tokenizer's maximum length.
# NOTE(review): recent transformers releases process a list input one item at
# a time unless batch_size is passed to the call - confirm against the
# installed version whether these groups are really evaluated as batches.
dfs = [
    pd.DataFrame(classifier(list(batch), truncation=True), index=batch.index)
    for _, batch in tqdm(text.groupby(batch_ids))
]

In [10]:
# Stack the per-batch result frames into a single frame. Rows keep the index
# labels of the posts they were computed from.
df = pd.concat(dfs)
# Displayed value: (rows, columns) of the combined results.
df.shape

In [11]:
# Number of posts assigned to each sentiment label.
df["label"].value_counts()

In [12]:
# Per-label summary of the score column: count, total, and min / mean / max.
df.groupby("label")["score"].agg(["count", "sum", "min", "mean", "max"])

# Save

In [13]:
# The rows were produced in shortest-post-first order; put them back in index
# order before writing the results to disk.
(
    df
    .sort_index()
    .to_csv("forum-messages-sentiment-analysis.csv")
)