In [None]:
# Standard library: used below to make the project package importable.
import os
import sys

# Put the grandparent directory of sys.path[0] on the import path so that the
# `src` package resolves (presumably the repository root — confirm).
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))

# NOTE(review): star import; the `datap` / `cachep` helpers used in later cells
# presumably come from here — consider importing them by name.
from src.paths import *

import pandas as pd

from datasets import Dataset
from pandas import DataFrame

from tqdm.auto import tqdm

# Registers the tqdm-backed `progress_map` / `progress_apply` methods on pandas
# objects; the paragraph-splitting cells rely on `progress_map`.
tqdm.pandas()

## Load data

### Data

In [None]:
# Load the posts dataset from disk into a DataFrame.
posts = Dataset.load_from_disk(dataset_path=datap("posts")).to_pandas()
# Drop the timezone from `postedAt` so it compares against the naive cutoff below.
posts["postedAt"] = posts["postedAt"].dt.tz_localize(None)
# Keep posts from 2010-01-01 onwards whose body is longer than 10 characters.
# `.copy()` yields an independent frame: later cells add columns to `posts`,
# and assigning into a filtered selection can raise SettingWithCopyWarning.
posts = posts.loc[
    (posts["postedAt"] >= pd.to_datetime("2010-01-01")) & (posts.body.str.len() > 10)
].copy()

posts.head()

### Models

In [None]:
from transformers import pipeline
import fasttext
import nltk

# Download the NLTK "punkt" resource.
# NOTE(review): nltk is not called directly in the visible cells — presumably
# the helpers imported from src.text_split depend on it; confirm.
nltk.download("punkt")

def predict_emotion(text, clf):
    """Return the top predicted emotion for ``text`` and its probability.

    Args:
        text: Input string handed to ``clf.predict``.
        clf: Classifier whose ``predict(text)`` returns a ``(labels, probs)`` pair.

    Returns:
        ``(emotion, probability)`` where ``emotion`` is the part of the first
        label after its last ``"__"`` separator and ``probability`` is the
        first entry of the returned probabilities.
    """
    labels, probabilities = clf.predict(text)
    best_label = labels[0]
    # Drop the label prefix: keep only what follows the final "__".
    emotion = best_label.split("__")[-1]
    return emotion, probabilities[0]

In [None]:
# Imported here because only this cell needs it to probe for a GPU.
import torch

# Use the first CUDA device when one is available; otherwise fall back to CPU
# (device=-1) so the notebook also runs on machines without a GPU.
sentiment_device = 0 if torch.cuda.is_available() else -1

# Sentiment-analysis pipeline backed by the BERTweet sentiment model.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="finiteautomata/bertweet-base-sentiment-analysis",
    device=sentiment_device,
)

In [None]:
# Load the fastText model file that `simple_emotion_row` passes to `predict_emotion`.
emotion_clf = fasttext.load_model(datap("fasttext_empathetic_dialogues.mdl"))

## Prepare data and cache the results

In [None]:
from src.text_split import (
    extract_paragraphs,
    split_long_paragraphs,
    collapse_paragraphs_iteratively,
)

### Split posts by paragraphs

In [None]:
max_n_words = 128  # word budget handed to both splitting helpers below

# Extract paragraphs from each post body, then split them against the word
# budget; each `progress_map` pass shows its own progress bar.
posts["paragraphs"] = (
    posts["body"]
    .progress_map(extract_paragraphs)
    .progress_map(
        lambda paragraphs: split_long_paragraphs(paragraphs, max_n_words=max_n_words)
    )
)

# Collapse the paragraphs of each post under the same word budget
# (presumably merging short neighbours — confirm against src.text_split).
posts["paragraphs_split"] = posts["paragraphs"].progress_map(
    lambda paragraphs: collapse_paragraphs_iteratively(
        paragraphs, max_n_words=max_n_words
    )
)


In [None]:
# One row per text chunk: narrow to the needed columns first, then explode the
# per-post chunk collection and expose it under the name "text".
posts_split_df = (
    posts[["postedAt", "postId", "paragraphs_split"]]
    .explode("paragraphs_split")
    .reset_index(drop=True)
    .rename(columns={"paragraphs_split": "text"})
)
posts_split_df.shape

In [None]:
# Preview the first rows of the exploded post chunks.
posts_split_df.head()

In [None]:
# Convert the chunk table to a `Dataset` and cache it on disk so the splitting
# step does not have to be re-run.
posts_split_ds = Dataset.from_pandas(posts_split_df)
posts_split_ds.save_to_disk(cachep("posts_split_ds"))

### Split comments by paragraphs

In [None]:
# Load the comments dataset from disk into a DataFrame.
comments = Dataset.load_from_disk(dataset_path=datap("comments")).to_pandas()
# Keep comments whose body is longer than 10 characters. `.copy()` yields an
# independent frame: later cells add columns to `comments`, and assigning into
# a filtered selection can raise SettingWithCopyWarning.
comments = comments.loc[comments.body.str.len() > 10].copy()

In [None]:
max_n_words = 128  # word budget handed to both splitting helpers below

# Extract paragraphs from each comment body, then split them against the word
# budget; each `progress_map` pass shows its own progress bar.
comments["paragraphs"] = (
    comments["body"]
    .progress_map(extract_paragraphs)
    .progress_map(
        lambda paragraphs: split_long_paragraphs(paragraphs, max_n_words=max_n_words)
    )
)

# Collapse the paragraphs of each comment under the same word budget
# (presumably merging short neighbours — confirm against src.text_split).
comments["paragraphs_split"] = comments["paragraphs"].progress_map(
    lambda paragraphs: collapse_paragraphs_iteratively(
        paragraphs, max_n_words=max_n_words
    )
)


In [None]:
# Build one row per comment chunk: for every comment, create a small frame that
# repeats its `postedAt` / `postId` for each chunk text, then concatenate all
# of them with a fresh integer index.
# XXX: this is explode + rename
# NOTE(review): this reads `r.paragraphs_split.text.values`, i.e. it expects
# each `paragraphs_split` cell to expose a `.text` attribute, whereas the posts
# cell calls `.explode` directly on the equivalent column. The two cells assume
# different return types from `collapse_paragraphs_iteratively` — confirm which
# one is current before replacing this loop with explode + rename.
comments_split_df = pd.concat(
    [
        DataFrame(
            {
                "postedAt": r.postedAt,
                "postId": r.postId,
                "text": r.paragraphs_split.text.values,
            }
        )
        for _, r in comments.iterrows()
    ],
    ignore_index=True,
)
comments_split_df.shape


In [None]:
# Convert the comment chunk table to a `Dataset` and cache it on disk so the
# splitting step does not have to be re-run.
comments_split_ds = Dataset.from_pandas(comments_split_df)
comments_split_ds.save_to_disk(cachep("comments_split_ds"))


## Load data from cache

In [None]:
# Restore the cached post chunks. `Dataset.load_from_disk` is the counterpart
# of `save_to_disk`; the `Dataset` class has no `from_disk` constructor, so the
# previous call raised AttributeError.
posts_split_ds = Dataset.load_from_disk(cachep("posts_split_ds"))
# Keep a DataFrame view of the same chunks for pandas-based work.
posts_split_df = posts_split_ds.to_pandas()

In [None]:
# Restore the cached comment chunks written by the "Split comments" section.
comments_split_ds = Dataset.load_from_disk(cachep("comments_split_ds"))

## Analyze data

In [None]:
from pandas.api.types import CategoricalDtype

# Fixed category set used when turning classifier labels into categoricals
# (presumably the full label set of the sentiment model — confirm).
LABELS = CategoricalDtype(["POS", "NEG", "NEU"])

In [None]:
def simple_sentiment(text):
    """Classify one text and return its sentiment label.

    The text is passed to ``sentiment_classifier`` with truncation enabled and
    the label of the first prediction is coerced through the ``LABELS``
    categories, so a label outside that set comes back as a missing value.
    """
    predictions = sentiment_classifier(text, truncation=True)
    top_label = predictions[0]["label"]
    as_categorical = pd.Categorical([top_label], categories=LABELS.categories)
    return as_categorical[0]


def simple_sentiment_row(row):
    """Add a ``"sentiment"`` entry to ``row`` based on its ``"text"`` field.

    The row is modified in place and also returned, which is the shape a
    row-wise ``map`` callback needs.
    """
    text = row["text"]
    row["sentiment"] = simple_sentiment(text)
    return row


def simple_sentiment_batch(batch):
    """Add a ``"sentiment"`` column to a batch of texts.

    All entries of ``batch["text"]`` are sent to ``sentiment_classifier`` in a
    single call (truncation enabled); the predicted labels are stored as a
    categorical Series restricted to the ``LABELS`` categories. The batch is
    modified in place and returned.
    """
    predictions = sentiment_classifier(batch["text"], truncation=True)
    labels = [prediction["label"] for prediction in predictions]
    categorical_labels = pd.Categorical(labels, categories=LABELS.categories)
    batch["sentiment"] = pd.Series(categorical_labels)
    return batch


def simple_emotion_row(row):
    """Add ``"emotion"`` and ``"emotion_prob"`` entries to ``row``.

    The row's ``"text"`` is flattened to a single line before being passed to
    ``predict_emotion`` with ``emotion_clf``. Newlines are replaced by a space
    rather than removed, so the last word of one line is not glued to the
    first word of the next (which would create tokens that never appeared in
    the text).

    Args:
        row: Mapping with a ``"text"`` string; modified in place.

    Returns:
        The same ``row`` with the predicted emotion and its probability added.
    """
    single_line_text = row["text"].replace("\n", " ")
    em, prob = predict_emotion(single_line_text, emotion_clf)
    row["emotion"] = em
    row["emotion_prob"] = prob
    return row

### Post sentiments

In [None]:
# Classify every post chunk in batches; `simple_sentiment_batch` adds the
# "sentiment" column.
result = posts_split_ds.map(
    simple_sentiment_batch,
    batched=True,
    # batch_size=2,
)

# Cache the labelled post chunks on disk.
result.save_to_disk(cachep("posts_split_sentiment_ds"))

### Post emotions

In [None]:
# Predict an emotion for every post chunk, one row at a time;
# `simple_emotion_row` adds the "emotion" and "emotion_prob" fields.
emotions = posts_split_ds.map(simple_emotion_row)
# Cache the result on disk.
emotions.save_to_disk(cachep("posts_split_emotions_ds"))

In [None]:
from collections import Counter

# Five most frequent predicted emotions across post chunks, as
# (emotion, count) pairs.
top_emotions = Counter(emotions["emotion"]).most_common(5)
top_emotions

### Comment sentiments

In [None]:
# Classify every comment chunk in batches; `simple_sentiment_batch` adds the
# "sentiment" column.
comments_sentiment = comments_split_ds.map(
    simple_sentiment_batch,
    batched=True,
    # batch_size=2,
)

# Cache the labelled comment chunks on disk.
comments_sentiment.save_to_disk(cachep("comments_sentiment_ds"))

### Comment emotions

In [None]:
# Predict an emotion for every comment chunk, one row at a time;
# `simple_emotion_row` adds the "emotion" and "emotion_prob" fields.
comments_emotions = comments_split_ds.map(simple_emotion_row)
# Cache the result on disk.
comments_emotions.save_to_disk(cachep("comments_emotions_ds"))