In [1]:
import os
import sys

# Append the grandparent directory of sys.path[0] to the module search path.
# NOTE(review): presumably this is what makes the `src` package (imported in
# the next cell) resolvable when the notebook runs from a nested folder — confirm.
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))

In [2]:
import altair as alt

# Select Altair's "json" data transformer before any chart is built.
# NOTE(review): per the Altair docs this stores chart data in side-car JSON
# files instead of embedding it in the chart spec — confirm for the installed version.
alt.data_transformers.enable("json")

import pandas as pd

from datasets import Dataset
from pandas import DataFrame

# NOTE(review): star import; the helpers datap(), cachep() and slidesp() used
# in later cells are presumably defined here — confirm and consider importing
# them by name.
from src.paths import *

from tqdm.auto import tqdm

# Register tqdm with pandas; none of the visible cells call the resulting
# progress_* methods.
tqdm.pandas()

In [3]:
# Lower bound of the analysis window: rows posted before this timestamp are
# filtered out by the cells below.
SINCE = pd.Timestamp("2022-01-01")

# Tags that get their own per-cause-area emotion charts.
CAUSE_AREAS = [
    "AI alignment",
    "Global health & development",
    "Community",
]

In [17]:
# Path returned by slidesp() with no argument; the charts below are saved via
# slidesp("<name>.svg").
sp = slidesp()
# Create the directory (and any missing parents) from Python instead of
# shelling out to `mkdir -p`: this is portable and safe for paths with spaces.
os.makedirs(sp, exist_ok=True)

## Load data

### Posts

In [50]:
# Load the posts dataset, strip the timezone from `postedAt` so it can be
# compared with the naive SINCE timestamp, and keep only posts from SINCE on.
posts = (
    Dataset.load_from_disk(dataset_path=datap("posts"))
    .to_pandas()
    .assign(postedAt=lambda frame: frame["postedAt"].dt.tz_localize(None))
    .loc[lambda frame: frame["postedAt"] >= SINCE]
)

In [5]:
# Cached "posts_split_ds" dataset, materialised as a pandas DataFrame.
# NOTE(review): posts_split_df is not referenced again in the visible cells.
posts_split_df = Dataset.load_from_disk(cachep("posts_split_ds")).to_pandas()
# Kept as a Dataset; the plotting cells call .to_pandas() on it when needed.
posts_sentiment = Dataset.load_from_disk(cachep("posts_split_sentiment_ds"))

In [6]:
# Kept as a Dataset; later cells convert it with .to_pandas() or read its
# "emotion" column directly.
posts_emotions = Dataset.load_from_disk(cachep("posts_split_emotions_ds"))

### Comments

In [7]:
# Load the comments dataset, strip the timezone from `postedAt` and keep only
# comments posted on or after SINCE.
comments = (
    Dataset.load_from_disk(dataset_path=datap("comments"))
    .to_pandas()
    .assign(postedAt=lambda frame: frame["postedAt"].dt.tz_localize(None))
    .loc[lambda frame: frame["postedAt"] >= SINCE]
)

In [8]:
# Cached comment emotions as a DataFrame with a timezone-naive `postedAt`.
# No SINCE filter is applied here; the cells that need it filter themselves.
comments_emotions = (
    Dataset.load_from_disk(cachep("comments_emotions_ds"))
    .to_pandas()
    .assign(postedAt=lambda frame: frame["postedAt"].dt.tz_localize(None))
)

In [9]:
# Kept as a Dataset; the cells that use it call .to_pandas() and strip the
# timezone from `postedAt` themselves.
comments_sentiment = Dataset.load_from_disk(cachep("comments_sentiment_ds"))

## Plots

### Posts per month

In [20]:
# Number of posts in each calendar month (month-end bins on `postedAt`).
monthly_bins = pd.Grouper(key="postedAt", freq="M")
df = posts.groupby(monthly_bins).size().reset_index(name="count")

# Bar chart: one bar per month.
month_axis = alt.X("yearmonth(postedAt):T", title="Month")
count_axis = alt.Y("count", title="Number of posts")
chart = alt.Chart(df).mark_bar().encode(x=month_axis, y=count_axis)

chart.save(slidesp("posts_per_month.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


### Post sentiments

In [21]:
# Rows per (month, sentiment label) for everything posted since SINCE.
df = (
    posts_sentiment.to_pandas()
    .loc[lambda frame: frame["postedAt"] >= SINCE]
    .groupby([pd.Grouper(key="postedAt", freq="M"), "sentiment"])
    .size()
    .reset_index(name="count")
)

# One bar-chart facet (column) per sentiment label, coloured by the same label.
encodings = dict(
    x=alt.X("yearmonth(postedAt):T", title="Month"),
    y=alt.Y("count", title="Number of posts"),
    color=alt.Color("sentiment", title="Sentiment"),
    column=alt.Column("sentiment:N", title="Sentiment"),
)
chart = alt.Chart(df).mark_bar().encode(**encodings)
chart.save(slidesp("posts_sentiment_per_month.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


In [23]:
# Share of each sentiment label per month, drawn as a stacked area chart.
df = posts_sentiment.to_pandas()
df = df.loc[df.postedAt >= SINCE]
# .assign() returns a new frame, so no column is written onto a filtered slice
# (the original `df["month"] = ...` triggers SettingWithCopyWarning).
df = df.assign(month=pd.to_datetime(df["postedAt"]).dt.to_period("M"))

grouped = df.groupby(["month", "sentiment"]).agg(count=("text", "count"))
# Turn raw counts into within-month proportions (each month sums to 1).
grouped["count"] = grouped["count"] / grouped.groupby("month")["count"].transform("sum")
grouped = grouped.reset_index()
# Altair cannot serialise pandas Periods; convert back to timestamps.
grouped["month"] = grouped["month"].dt.to_timestamp()

chart = (
    alt.Chart(grouped)
    .mark_area()
    .encode(
        # The legend shows sentiment labels, so title it accordingly
        # (the original said "Emotion").
        color=alt.Color("sentiment:N", title="Sentiment"),
        x=alt.X("yearmonth(month):O", title="Month"),
        y=alt.Y("count:Q", title="Proportion of posts", scale=alt.Scale(domain=(0, 1))),
    )
    .properties(width=600, height=200)
)
chart.save(slidesp("posts_sentiments_per_month_cols_proportion.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


### Post emotions

In [24]:
# Total number of rows per emotion label, most frequent first.
df = (
    posts_emotions.to_pandas()
    .groupby("emotion")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

# Bars follow the frequency order of `df` rather than Altair's default sort.
emotion_axis = alt.X("emotion", title="Emotion", sort=df["emotion"].values)
count_axis = alt.Y("count", title="Number of posts")
chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=emotion_axis, y=count_axis)
    .properties(width=600, height=120)
)
chart.save(slidesp("posts_emotions_bar.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


In [25]:
from collections import Counter

# Labels of the five most frequent post emotions, most common first.
top_posts_emotions = [
    label for label, _count in Counter(posts_emotions["emotion"]).most_common(5)
]

In [28]:
# Monthly share of each of the top post emotions, as a stacked area chart.
df = posts_emotions.to_pandas()
df = df.loc[(df["postedAt"] >= SINCE) & df["emotion"].isin(top_posts_emotions)]
# .assign() returns a new frame, so no column is written onto a filtered slice
# (the original `df["month"] = ...` triggers SettingWithCopyWarning).
df = df.assign(month=pd.to_datetime(df["postedAt"]).dt.to_period("M"))

grouped = df.groupby(["month", "emotion"]).agg(count=("text", "count"))
# `fraction` is the within-month share; `count` keeps the raw number for the tooltip.
grouped["fraction"] = grouped["count"] / grouped.groupby("month")["count"].transform("sum")
grouped = grouped.reset_index()
# Altair cannot serialise pandas Periods; convert back to timestamps.
grouped["month"] = grouped["month"].dt.to_timestamp()

chart = (
    alt.Chart(grouped)
    .mark_area()
    .encode(
        color=alt.Color("emotion:N", title="Emotion"),
        x=alt.X("yearmonth(month):O", title="Month"),
        y=alt.Y(
            "fraction:Q", title="Proportion of posts", scale=alt.Scale(domain=(0, 1))
        ),
        tooltip=["emotion", "yearmonth(month)", "fraction", "count"],
    )
    .properties(width=600, height=180)
)
chart.save(slidesp("posts_emotions_per_month_cols_proportion.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


### Post emotions per selected cause areas

In [29]:
# Join post emotions with the posts table (inner join on postId) so that each
# emotion row carries the post's date and tags, then keep only rows whose post
# has at least one of the CAUSE_AREAS tags.
df = posts_emotions.to_pandas().set_index("postId")[["emotion"]]
df = df.join(posts.set_index("postId"), how="inner")
df = df[["postedAt", "emotion", "tags"]]
df = df.loc[df["tags"].apply(lambda x: any(item in x for item in CAUSE_AREAS))]

# One boolean column per cause area, one row per postId.
# - explode() yields one row per (postId, tag) without building the wide
#   intermediate frame that apply(pd.Series).stack() creates.
# - groupby(level=0).any() replaces the deprecated `.any(level=0)`, which was
#   removed in pandas 2.0.
df_encoded = (
    pd.get_dummies(df["tags"].explode())[CAUSE_AREAS]
    .groupby(level=0)
    .any()
)
df = df.drop(["tags"], axis=1).merge(df_encoded, on="postId")

# Five most frequent emotions among the cause-area posts; shared by every
# per-tag chart in the next cell.
df_top_emotions = [label for label, _count in Counter(df["emotion"]).most_common(5)]

  df_encoded = pd.get_dummies(df["tags"].apply(pd.Series).stack())[CAUSE_AREAS].any(level=0)


In [30]:
# One line chart per cause area: monthly share of each top emotion among the
# posts carrying that tag.
for tag in CAUSE_AREAS:
    # Rows flagged for this tag, restricted to the top emotions.
    x = df.loc[df[tag].astype(bool) & df["emotion"].isin(df_top_emotions)]
    # .assign() builds a new frame instead of writing onto a slice of `df`
    # (the original `x["month"] = ...` triggers SettingWithCopyWarning).
    x = x.assign(month=pd.to_datetime(x["postedAt"]).dt.to_period("M"))

    grouped = x.groupby(["month", "emotion"]).agg(count=("emotion", "count"))
    # Replace raw counts by within-month proportions.
    grouped["count"] = grouped["count"] / grouped.groupby("month")["count"].transform("sum")
    grouped = grouped.reset_index()
    # Altair cannot serialise pandas Periods; convert back to timestamps.
    grouped["month"] = grouped["month"].dt.to_timestamp()

    chart = (
        alt.Chart(grouped, title=tag)
        .mark_line()
        .encode(
            color=alt.Color("emotion:N", title="Emotion"),
            x=alt.X("yearmonth(month):O", title="Month"),
            y=alt.Y("count:Q", title="Proportion of posts"),
        )
    ).properties(width=600, height=150)

    # File-name-safe version of the tag, e.g. "Global_health_and_development".
    tag_slug = tag.replace(" ", "_").replace("&", "and")
    chart.save(
        slidesp("posts_emotions_per_month_cols_proportion_{}.svg".format(tag_slug))
    )
    chart.display()

  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():


### Comments per month

In [31]:
# Bar chart of the number of comments in each calendar month.
monthly_bins = pd.Grouper(key="postedAt", freq="M")
df = comments.groupby(monthly_bins).size().reset_index(name="count")

month_axis = alt.X("yearmonth(postedAt):T", title="Month")
count_axis = alt.Y("count", title="Number of comments")
chart = alt.Chart(df).mark_bar().encode(x=month_axis, y=count_axis)

chart.save(slidesp("comments_per_month.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


### Comment emotions

In [32]:
# Total number of comments per emotion label, most frequent first.
df = (
    comments_emotions.groupby("emotion")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

# Bars follow the frequency order of `df` rather than Altair's default sort.
emotion_axis = alt.X("emotion", title="Emotion", sort=df["emotion"].values)
count_axis = alt.Y("count", title="Number of comments")
chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=emotion_axis, y=count_axis)
    .properties(width=600, height=120)
)
chart.save(slidesp("comments_emotions_bar.svg"))
chart


  for col_name, dtype in df.dtypes.iteritems():


### Comment sentiments

In [33]:
# Share of each sentiment label per month among comments, as a stacked area chart.
df = comments_sentiment.to_pandas()
# Strip the timezone so `postedAt` can be compared with the naive SINCE timestamp.
df["postedAt"] = df["postedAt"].dt.tz_localize(None)
df = df.loc[df.postedAt >= SINCE]
# .assign() returns a new frame, so no column is written onto a filtered slice
# (the original `df["month"] = ...` triggers SettingWithCopyWarning).
df = df.assign(month=pd.to_datetime(df["postedAt"]).dt.to_period("M"))

grouped = df.groupby(["month", "sentiment"]).agg(count=("text", "count"))
# Turn raw counts into within-month proportions (each month sums to 1).
grouped["count"] = grouped["count"] / grouped.groupby("month")["count"].transform("sum")
grouped = grouped.reset_index()
# Altair cannot serialise pandas Periods; convert back to timestamps.
grouped["month"] = grouped["month"].dt.to_timestamp()

chart = (
    alt.Chart(grouped)
    .mark_area()
    .encode(
        # The legend shows sentiment labels, so title it accordingly
        # (the original said "Emotion").
        color=alt.Color("sentiment:N", title="Sentiment"),
        x=alt.X("yearmonth(month):O", title="Month"),
        y=alt.Y(
            "count:Q", title="Proportion of comments", scale=alt.Scale(domain=(0, 1))
        ),
    )
    .properties(width=600, height=200)
)
chart.save(slidesp("comments_sentiments_per_month_cols_proportion.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


### Comment emotions per month

In [34]:
# Labels of the five most frequent comment emotions, most common first.
top_comments_emotions = [
    label for label, _count in Counter(comments_emotions["emotion"]).most_common(5)
]

In [36]:
# Monthly share of each of the top comment emotions, as a stacked area chart.
df = comments_emotions.loc[
    (comments_emotions["postedAt"] >= SINCE)
    & comments_emotions["emotion"].isin(top_comments_emotions)
]
# .assign() returns a new frame, so nothing is written onto a slice of the
# shared `comments_emotions` frame (the original `df["month"] = ...` triggers
# SettingWithCopyWarning).
df = df.assign(month=pd.to_datetime(df["postedAt"]).dt.to_period("M"))

grouped = df.groupby(["month", "emotion"]).agg(count=("text", "count"))
# `fraction` is the within-month share; `count` keeps the raw number for the tooltip.
grouped["fraction"] = grouped["count"] / grouped.groupby("month")["count"].transform("sum")
grouped = grouped.reset_index()
# Altair cannot serialise pandas Periods; convert back to timestamps.
grouped["month"] = grouped["month"].dt.to_timestamp()

chart = (
    alt.Chart(grouped)
    .mark_area()
    .encode(
        color=alt.Color("emotion:N", title="Emotion"),
        x=alt.X("yearmonth(month):O", title="Month"),
        y=alt.Y(
            "fraction:Q", title="Proportion of comments", scale=alt.Scale(domain=(0, 1))
        ),
        tooltip=["emotion", "yearmonth(month)", "fraction", "count"],
    )
    .properties(width=600, height=180)
)
chart.save(slidesp("comments_emotions_per_month_cols_proportion.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


### Comment emotions per selected cause areas

In [39]:
# Join comment emotions with the posts table (inner join on postId) so that
# each comment row carries its post's date and tags, then keep only rows whose
# post has at least one of the CAUSE_AREAS tags.
df = comments_emotions.set_index("postId")[["emotion"]]
df = df.join(posts.set_index("postId"), how="inner")
df = df[["postedAt", "emotion", "tags"]]
df = df.loc[df["tags"].apply(lambda x: any(item in x for item in CAUSE_AREAS))]

# One boolean column per cause area, one row per postId.
# - explode() yields one row per (postId, tag) without building the wide
#   intermediate frame that apply(pd.Series).stack() creates.
# - groupby(level=0).any() replaces the deprecated `.any(level=0)`, which was
#   removed in pandas 2.0.
df_encoded = (
    pd.get_dummies(df["tags"].explode())[CAUSE_AREAS]
    .groupby(level=0)
    .any()
)
df = df.drop(["tags"], axis=1).merge(df_encoded, on="postId")

# Five most frequent emotions among the cause-area comments; shared by every
# per-tag chart in the next cell.
df_top_emotions = [label for label, _count in Counter(df["emotion"]).most_common(5)]

  df_encoded = pd.get_dummies(df["tags"].apply(pd.Series).stack())[CAUSE_AREAS].any(level=0)


In [40]:
# One line chart per cause area: monthly share of each top emotion among the
# comments on posts carrying that tag.
for tag in CAUSE_AREAS:
    # Rows flagged for this tag, restricted to the top emotions.
    x = df.loc[df[tag].astype(bool) & df["emotion"].isin(df_top_emotions)]
    # .assign() builds a new frame instead of writing onto a slice of `df`
    # (the original `x["month"] = ...` triggers SettingWithCopyWarning).
    x = x.assign(month=pd.to_datetime(x["postedAt"]).dt.to_period("M"))

    grouped = x.groupby(["month", "emotion"]).agg(count=("emotion", "count"))
    # Replace raw counts by within-month proportions.
    grouped["count"] = grouped["count"] / grouped.groupby("month")["count"].transform("sum")
    grouped = grouped.reset_index()
    # Altair cannot serialise pandas Periods; convert back to timestamps.
    grouped["month"] = grouped["month"].dt.to_timestamp()

    chart = (
        alt.Chart(grouped, title=tag)
        .mark_line()
        .encode(
            color=alt.Color("emotion:N", title="Emotion"),
            x=alt.X("yearmonth(month):O", title="Month"),
            y=alt.Y("count:Q", title="Proportion of comments"),
        )
    ).properties(width=600, height=150)

    # File-name-safe version of the tag, e.g. "Global_health_and_development".
    tag_slug = tag.replace(" ", "_").replace("&", "and")
    chart.save(
        slidesp("comments_emotions_per_month_cols_proportion_{}.svg".format(tag_slug))
    )
    chart.display()

  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():


### Matching comment sentiments with post emotions

In [44]:
# Most frequent emotion label(s) per post, for posts since SINCE.
# pd.Series.mode can return several labels on a tie; explode() then gives one
# row per (postId, emotion).
post2emotions = (
    posts_emotions.to_pandas()
    .loc[lambda frame: frame["postedAt"] >= SINCE]
    .groupby("postId")
    .agg(emotion=("emotion", pd.Series.mode))
    .explode("emotion")
)

In [45]:
# Comment sentiments indexed by the post they belong to, with a timezone-naive
# `postedAt`, ready to be joined against post2emotions.
post2commentsentiment = (
    comments_sentiment.to_pandas()
    .set_index("postId")
    .assign(postedAt=lambda frame: frame["postedAt"].dt.tz_localize(None))
)


In [46]:
# Join each post's dominant emotion(s) with the sentiment of its comments
# (inner join on the postId index), then average the comment sentiment per
# (month of the comment, post emotion).
df = post2emotions.join(post2commentsentiment, how="inner")
df = df.explode("emotion")
# Encode the labels numerically so they can be averaged.
df["sentiment"] = df["sentiment"].map({"NEG": -1, "NEU": 0, "POS": 1})
df = (
    df.groupby([pd.Grouper(key="postedAt", freq="M"), "emotion"])
    # Average only the sentiment column: a bare `.mean()` relies on pandas
    # silently dropping non-numeric columns, which is deprecated and raises
    # TypeError in pandas 2.0.
    ["sentiment"]
    .mean()
    # The index levels are already named postedAt/emotion, so no `names=` is
    # needed (the original passed three names for a two-level index).
    .reset_index()
)


  .mean()


In [48]:
# Heatmap: months on x, post emotion on y, cell colour = mean comment sentiment.
sentiment_colour = alt.Color(
    "sentiment",
    title="Comment sentiment",
    scale=alt.Scale(scheme="redyellowgreen"),
)
chart = (
    alt.Chart(df)
    .mark_rect()
    .encode(
        x=alt.X("yearmonth(postedAt):O", title="Month"),
        y=alt.Y("emotion:N", title="Post emotion"),
        color=sentiment_colour,
    )
    .properties(width=800, height=400)
)
chart.save(slidesp("comments_sentiment_by_post_emotion_heatmap.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():


In [49]:
# Box plot of the monthly mean comment sentiment per post emotion, with the
# emotions ordered from most positive to most negative average sentiment.
emotion_order = (
    df["sentiment"]
    .groupby(df["emotion"])
    .mean()
    .sort_values(ascending=False)
    # A plain list serialises cleanly into the chart spec.
    .index.tolist()
)

chart = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(
        # Use the channel class that matches each channel; the original passed
        # alt.Y to `x` and alt.Color to `y`.
        x=alt.X("emotion:N", title="Post emotion", sort=emotion_order),
        y=alt.Y("sentiment", title="Comment sentiment"),
    )
)
chart.save(slidesp("comments_sentiment_by_post_emotion_boxplot.svg"))
chart

  for col_name, dtype in df.dtypes.iteritems():
