In [None]:
import os
import json

import sqlite3
import numpy as np
import pandas as pd

from bertopic import BERTopic

from utils.analysis import (
    load_and_prep_data
)

import matplotlib.pyplot as plt
import seaborn as sns
from  matplotlib.ticker import FuncFormatter

from tqdm import tqdm
tqdm.pandas()

## Load Data

- `text_embedding_path`: path to newline-delimited JSON file of document embeddings.
- `data_path`: path to processed CSV file containing analytic sample.
- `sql_path`: path to SQL database to grab additional columns.
- `topic_mapper_path`: path to JSON file mapping observation IDs to topic groups.
- `score_mapper_path`: path to JSON file mapping observation IDs to updated scores.

In [None]:
# Shared base folders for everything that lives under ../data.
data_dir = os.path.join("..", "data")
topic_data_dir = os.path.join(data_dir, "topic_data")

# processed CSV file containing the analytic sample:
data_path = os.path.join(topic_data_dir, "topic_data.csv")

# SQL database used to grab additional columns:
sql_path = os.path.join(data_dir, "sqlite", "idw_reddit.db")

# JSON file mapping observation IDs to topic groups:
topic_mapper_path = os.path.join(topic_data_dir, "labels", "topic_mapper.json")

# JSON file mapping observation IDs to updated scores:
score_mapper_path = os.path.join(data_dir, "updated_scores", "score_mapper.json")

# load & prep topic data (processed CSV + SQL database + the two JSON mappers):
prep_inputs = {
    "data": data_path,
    "sql_db": sql_path,
    "topic_group_file": topic_mapper_path,
    "score_file": score_mapper_path,
}
df = load_and_prep_data(**prep_inputs)

# fetch coded data:
coded_data_path = os.path.join(
    "..", "data", "coding", "analysis_sample", "idw_reddit_posts.csv"
)
coded_data = pd.read_csv(coded_data_path)

# fetch all SQL records for computing aggregate stats:
sql_tables = ("comments", "posts")

conn = sqlite3.connect(sql_path)
try:
    # Table names come from the fixed tuple above (never from external input),
    # so interpolating them into the query text is safe here.
    table_frames = [
        pd.read_sql(
            f"SELECT id, full_id, unique_id, author, date FROM {table}",
            con=conn
        ).assign(table=table)  # remember which table each record came from
        for table in sql_tables
    ]
finally:
    # release the database handle even when one of the queries fails
    conn.close()

# Concatenate once instead of growing a frame inside the loop (and without an
# empty seed frame). Row labels are kept as returned per table, so they repeat
# across the two tables.
sql_df = pd.concat(table_frames)
del table_frames

# parse timestamps and derive a sortable "YYYY-MM" month key:
sql_df["date"] = pd.to_datetime(sql_df["date"])
sql_df["month_year"] = sql_df["date"].dt.strftime("%Y-%m")

In [None]:
# Preview the first 10 rows of the prepared topic data.
df.head(10)

In [None]:
# Preview the first 10 rows of the coded data read from idw_reddit_posts.csv.
coded_data.head(10)

In [None]:
# Preview the first 10 rows of the combined comments + posts records.
sql_df.head(10)

**Load BERTopic model:**

In [None]:
# Name of the saved BERTopic model and its location under ../data/topic_data.
NEW_MODEL = "updated_bertopic_model"
NEW_MODEL_PATH = os.path.join("..", "data", "topic_data", NEW_MODEL)
# Load the saved model from that path.
topic_model = BERTopic.load(NEW_MODEL_PATH)

In [None]:
# Display the topic overview table returned by the loaded model.
topic_model.get_topic_info()

In [None]:
# Build the Covid-19 topic table: topic ID, top words, document count, and the
# topic's share of all documents assigned to a topic.
all_topics = topic_model.get_topic_info()

# Denominator for the percentages. BERTopic reports outlier documents as topic
# -1; exclude them by ID rather than by row position so the total stays correct
# even when the model has no outlier row.
TOTAL_N = all_topics.loc[all_topics["Topic"] != -1, "Count"].sum()

# IDs of the topics treated as Covid-19 related:
covid_topics = [1, 6, 29, 63, 74, 96, 115, 167, 168, 178, 194, 200, 215, 224, 234, 236, 242]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `all_topics`.
topic_df = (
    all_topics.loc[all_topics["Topic"].isin(covid_topics)]
    .copy()
    .sort_values("Topic", ascending=True)
    .reset_index(drop=True)
)
topic_df["Percent"] = ((topic_df["Count"] / TOTAL_N) * 100).round(2)

# each representation is a list of words -> one comma-separated string
topic_df["Representation"] = topic_df["Representation"].map(", ".join)

# NOTE(review): the header says "Top 20 Words"; confirm this matches the number
# of words the model actually stores per topic.
topic_df = topic_df[["Topic", "Representation", "Count", "Percent"]].rename(
    columns={
        "Topic": "Topic ID",
        "Representation": "Top 20 Words"
    }
)

In [None]:
# Display the Covid-19 topic table built above.
topic_df

##### Save table:

In [None]:
# Write the topic IDs and their top words to the manuscript tables folder.
covid_table_path = os.path.join("..", "manuscript", "tables", "covid_topics.xlsx")

covid_table = topic_df[["Topic ID", "Top 20 Words"]]
covid_table.to_excel(covid_table_path, sheet_name="covid_topics", index=False)

## Subreddit Overview

### Figure Aesthetics

In [None]:
# Text styles passed as `fontdict=` to the axis/tick/title setters below.
axis_font = dict(font="Reddit Mono", size=8)
label_font = dict(font="Roboto", size=9, weight="bold")
qual_labels = dict(font="Roboto", size=9, weight="normal")
subplot_title = dict(font="Roboto", size=12)
# Passed as `prop=` to `ax.legend`, hence the `family` key instead of `font`.
legend_labels = dict(family="Roboto", size=9, weight="normal")

### Subreddit Stats & Covid-19 Topic Stats

#### Overall Counts (Joint Plot)

In [None]:
# Monthly number of records across both SQL tables (comments + posts).
monthly_freqs = sql_df.groupby("month_year").size().reset_index(name="count")

# IDs of the topics treated as Covid-19 related, and the monthly number of
# rows in `df` whose `new_topic` is one of them.
covid_topics = [1, 6, 29, 63, 74, 96, 115, 167, 168, 178, 194, 200, 215, 224, 234, 236, 242]
covid_freqs = df.loc[df["new_topic"].isin(covid_topics)].groupby("month_year").size().reset_index(name="count")

sns.set_style("white")
colors = sns.color_palette().as_hex()

# Two stacked panels: all activity (top) and Covid-19 topic activity (bottom).
fig, axes = plt.subplots(2,1, figsize=(12,10))
sns.lineplot(monthly_freqs, x="month_year", y="count", ax=axes[0], c="black", lw=2)
sns.lineplot(covid_freqs, x="month_year", y="count", ax=axes[1], c="black", lw=2)
sns.despine(offset=5, left=True)

# axis 0:
axes[0].set_title("All Comments and Posts", fontdict=subplot_title)
# fixed y ticks from 0 to 30000 in steps of 2000
axes[0].set_yticks(list(range(0,30000+2000,2000)))

# axis 1:
axes[1].set_title("Comments and Posts in Covid-19 Related Topics", fontdict=subplot_title)

# Shared cosmetics for both panels.
for ax in axes:
    # Re-apply the current tick labels so the shared font settings (and the
    # 90-degree rotation on x) are attached to them.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontdict=axis_font)
    ax.set_yticklabels(ax.get_yticklabels(), fontdict=axis_font)
    ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
    ax.set_xlabel("Date", fontdict=label_font, labelpad=10)
    ax.set_ylabel("Count", fontdict=label_font, labelpad=10)
    # NOTE(review): presumably the month strings are plotted as categories, so
    # 0 is the first plotted month and "2022-12" the last one shown — confirm.
    ax.set_xlim(0, "2022-12")

plt.subplots_adjust(hspace=0.4)
plt.show()

#### COVID-19 Topics Only:

In [None]:
# IDs of the topics treated as Covid-19 related, and the monthly number of
# rows in `df` whose `new_topic` is one of them.
covid_topics = [1, 6, 29, 63, 74, 96, 115, 167, 168, 178, 194, 200, 215, 224, 234, 236, 242]
covid_freqs = df.loc[df["new_topic"].isin(covid_topics)].groupby("month_year").size().reset_index(name="count")

sns.set_style("white")
colors = sns.color_palette().as_hex()

# Single-panel version of the Covid-19 topic counts, saved for the manuscript.
fig, ax = plt.subplots(1,1, figsize=(10,4))
sns.lineplot(covid_freqs, x="month_year", y="count", ax=ax, c="black", lw=2)
sns.despine(offset=5, left=True)

# Re-apply the current tick labels so the shared font settings (and the
# 90-degree rotation on x) are attached to them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontdict=axis_font)
ax.set_yticklabels(ax.get_yticklabels(), fontdict=axis_font)
ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.set_xlabel("Date", fontdict=label_font, labelpad=10)
ax.set_ylabel("Count", fontdict=label_font, labelpad=10)
# NOTE(review): presumably the month strings are plotted as categories, so 0 is
# the first plotted month and "2022-12" the last one shown — confirm.
ax.set_xlim(0, "2022-12")

# NOTE(review): with a single axes the hspace adjustment looks redundant, and
# tight_layout() runs right after it — confirm before removing.
plt.subplots_adjust(hspace=0.4)
plt.tight_layout()
# Save the figure as a 300-dpi JPEG under ../manuscript/revisions/figures.
plt.savefig(
    os.path.join(
        "..",
        "manuscript",
        "revisions",
        "figures",
        "covid_topic_counts.jpg"
    ),
    dpi=300,
    bbox_inches="tight"
)

plt.show()

#### Users

In [None]:
# unique users per month (records authored by "[deleted]" are left out):
user_df = (
    sql_df.loc[sql_df["author"] != "[deleted]"]
    .groupby("month_year")
    .agg({"author": set})
    .assign(n_unique_users=lambda monthly: monthly["author"].map(len))
    .reset_index()
)

# comments & posts per month whose author shows as "[deleted]":
deleted_users = (
    sql_df.loc[sql_df["author"] == "[deleted]"]
    .groupby("month_year")
    .size()
    .reset_index(name="count")
    .sort_values("month_year")
)

In [None]:
# Preview the monthly unique-user table.
user_df.head()

In [None]:
# Preview the monthly counts of records authored by "[deleted]".
deleted_users.head()

In [None]:
sns.set_style("white")
colors = sns.color_palette().as_hex()

# One panel with two series: unique users per month (solid black) and the
# number of records authored by "[deleted]" per month (dashed gray, drawn behind).
fig, ax = plt.subplots(1,1, figsize=(11,5))
sns.lineplot(
    user_df, 
    x="month_year", 
    y="n_unique_users", 
    ax=ax, 
    c="black",
    lw=2, 
    label="Number of Unique Users"
)
sns.lineplot(
    deleted_users, 
    x="month_year", 
    y="count", 
    ax=ax, 
    c="gray", 
    ls="--", 
    lw=2, 
    label="Number of Comments & Posts from Deleted Users",
    zorder=0
)
sns.despine(offset=5, left=True)
# fixed y ticks from 0 to 4200 in steps of 200
ax.set_yticks(list(range(0,4200+200,200)))
# Re-apply the current tick labels so the shared font settings (and the
# 90-degree rotation on x) are attached to them.
ax.set_yticklabels(ax.get_yticklabels(), fontdict=axis_font)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontdict=axis_font)
ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.xaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.set_xlabel("Date", fontdict=label_font, labelpad=10)
ax.set_ylabel("Count", fontdict=label_font, labelpad=10)
# NOTE(review): presumably the month strings are plotted as categories, so 0 is
# the first plotted month and "2022-12" the last one shown — confirm.
ax.set_xlim(0, "2022-12")

# `prop` takes font-properties keys, which is why legend_labels uses `family`.
ax.legend(
    prop=legend_labels, 
    fancybox=True, 
    edgecolor="black",
    ncol=1,
    shadow=True
)

plt.tight_layout()
# NOTE(review): with a single axes the hspace adjustment looks redundant — confirm.
plt.subplots_adjust(hspace=0.4)
# Save the figure as a 300-dpi JPEG under ../manuscript/revisions/figures.
plt.savefig(
    os.path.join(
        "..", 
        "manuscript",
        "revisions",
        "figures",
        "users.jpg"
    ),
    dpi=300,
    bbox_inches="tight"
)

plt.show()

In [None]:
# Average number of unique users per month over all months.
user_df["n_unique_users"].mean()

In [None]:
# Average number of unique users per month from 2020-01 onward
# ("YYYY-MM" strings compare chronologically).
user_df.loc[user_df["month_year"] >= "2020-01", "n_unique_users"].mean()

In [None]:
# Average monthly number of records authored by "[deleted]" over all months.
deleted_users["count"].mean()

In [None]:
# Average monthly number of records authored by "[deleted]" from 2020-01 onward
# ("YYYY-MM" strings compare chronologically).
deleted_users.loc[deleted_users["month_year"] >= "2020-01", "count"].mean()

## Coded Data

In [None]:
# Display the coded data frame.
coded_data

In [None]:
# Distinct authors per sample source, ignoring the "[deleted]" placeholder.
author_sets = (
    coded_data.groupby("sample_source")
    .agg({"author": set})
    .reset_index()
)
author_sets["author"] = author_sets["author"].map(lambda authors: authors - {"[deleted]"})
author_sets["n_authors"] = author_sets["author"].map(len)

In [None]:
# Display the per-sample-source author sets and their sizes.
author_sets

In [None]:
# For each sample source, print how many authors appear exactly once.
for sample in ["highest", "lowest", "random"]:
    counts = (
        coded_data.loc[coded_data["sample_source"] == sample, "author"]
        .value_counts()
        .to_dict()
    )
    # Drop the "[deleted]" placeholder. Previously the filtered dict was stored
    # under a misspelled name and the unfiltered one was iterated, so the
    # filter never took effect.
    counts = {author: n_posts for author, n_posts in counts.items() if author != "[deleted]"}

    # NOTE(review): the old accumulator was called `gt1`, which suggests
    # "more than one", but the condition has always been `== 1` — confirm
    # which of the two is intended.
    single_post_authors = [author for author, n_posts in counts.items() if n_posts == 1]
    print(f"{sample}: {len(single_post_authors)}")

In [None]:
# For each sample source, print how many authors appear exactly once.
# NOTE(review): this cell repeats the computation of the cell directly above.
# The old accumulator name `gt1` hints that one of the two was meant to count
# authors with more than one observation (`> 1`) — confirm and adjust or delete.
for sample in ["highest", "lowest", "random"]:
    counts = (
        coded_data.loc[coded_data["sample_source"] == sample, "author"]
        .value_counts()
        .to_dict()
    )
    # Drop the "[deleted]" placeholder. Previously the filtered dict was stored
    # under a misspelled name and the unfiltered one was iterated, so the
    # filter never took effect.
    counts = {author: n_posts for author, n_posts in counts.items() if author != "[deleted]"}

    single_post_authors = [author for author, n_posts in counts.items() if n_posts == 1]
    print(f"{sample}: {len(single_post_authors)}")

##### Summary Statistics

In [None]:
# Neutral = 1 when an observation is flagged as neither stance, else 0.
is_neutral = (coded_data["Anti-Contrarianism"] == 0) & (coded_data["Contrarianism"] == 0)
coded_data["Neutral"] = is_neutral.astype("int64")

# Swap the raw sample-source codes for display labels and fix their order.
source_labels = {"highest": "Most Upvoted", "lowest": "Most Downvoted", "random": "Random"}
coded_data["sample_source"] = pd.Categorical(
    coded_data["sample_source"].replace(source_labels),
    list(source_labels.values())
)

In [None]:
# Summary table: per sample source, the count (and row percentage) of each
# stance column plus the row total.
value_cols = ["Anti-Contrarianism", "Contrarianism", "Neutral"]

stance_counts = coded_data.pivot_table(
    index="sample_source",
    values=value_cols,
    aggfunc="sum",
    fill_value=0,
    observed=False  # keep every sample-source category, even if it has no rows
)

# row totals across the stance columns
stance_totals = stance_counts.sum(axis=1)

# Build the "count\n(percent%)" strings on an object-typed copy, one whole
# column at a time. Writing strings cell by cell into integer columns relies on
# an implicit dtype upcast that newer pandas versions deprecate.
pivot_df = stance_counts.astype(object)
for col in value_cols:
    percents = ((stance_counts[col] / stance_totals) * 100).round(1)
    pivot_df[col] = [
        f"{count}\n({per}%)"
        for count, per in zip(stance_counts[col], percents)
    ]

pivot_df["Total"] = stance_totals
pivot_df.index.names = ["Sample Source"]

In [None]:
# Display the formatted sample-source summary table.
pivot_df

In [None]:
# Write the summary table (index included) to the manuscript tables folder.
stats_table_path = os.path.join("..", "manuscript", "tables", "sample_source_stats.xlsx")
pivot_df.to_excel(stats_table_path, index=True, sheet_name="statistics")

### Temporal Trends in Contrarian & Anti-Contrarian Observations

In [None]:
fig, ax = plt.subplots(1,1, figsize=(10,5))
colors = sns.color_palette().as_hex()

# Months present in the full SQL record set, restricted to 2020-01 onward.
# Filtering by comparison ("YYYY-MM" strings sort chronologically) gives the
# same months as slicing from the position of "2020-01", without raising when
# that exact month is absent.
full_date_set = sorted(sql_df["month_year"].unique())
months_to_cover = [month for month in full_date_set if month >= "2020-01"]


def _monthly_stance_counts(flag_col, months):
    """Count coded observations flagged in ``flag_col`` per month.

    Parameters
    ----------
    flag_col : str
        Column of ``coded_data`` holding the 0/1 stance flag; also used as the
        value of the returned ``type`` column.
    months : list of str
        Months that must be present in the result; any month without flagged
        observations is added with a count of 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``month_year``, ``count`` and ``type``.
    """
    counts = (
        coded_data.loc[coded_data[flag_col] == 1]
        .groupby("month_year")
        .size()
        .reset_index(name="count")
    )

    # fill in dates with no counts (one concat instead of one per missing month):
    observed_months = set(counts["month_year"])
    missing_months = [month for month in months if month not in observed_months]
    if missing_months:
        counts = pd.concat(
            [
                counts,
                pd.DataFrame({"month_year": missing_months, "count": 0})
            ],
            ignore_index=True
        )

    counts["type"] = flag_col
    return counts


contrarian_data = _monthly_stance_counts("Contrarianism", months_to_cover)
anti_cont_data = _monthly_stance_counts("Anti-Contrarianism", months_to_cover)

# ignore_index avoids repeated row labels in the frame that is handed to seaborn
data = pd.concat([contrarian_data, anti_cont_data], ignore_index=True)
data.sort_values("month_year", inplace=True)

# One line per stance type (distinct color and dash style) over the months.
sns.lineplot(
    data, 
    x="month_year", 
    y="count", 
    hue="type",
    ax=ax,
    style="type",
    lw=2
)

sns.despine(offset=5, left=True)
# Re-apply the current tick labels so the shared font settings (and the
# 90-degree rotation on x) are attached to them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontdict=axis_font)

# fixed y ticks from 0 to 105 in steps of 5
ax.set_yticks(list(range(0,105+5,5)))
ax.set_yticklabels(ax.get_yticklabels(), fontdict=axis_font)
ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.set_xlabel("Date", fontdict=label_font, labelpad=10)
ax.set_ylabel("Count", fontdict=label_font, labelpad=10)
# NOTE(review): presumably the month strings are plotted as categories, so 0 is
# the first plotted month and "2022-12" the last one shown — confirm.
ax.set_xlim(0, "2022-12")

# Shade the area under each stance's line.
# NOTE(review): no color is passed, so the fill colors come from matplotlib's
# defaults; presumably they line up with the line colors — verify visually.
for t in data["type"].unique():
    subset = data.loc[data['type'] == t]
    ax.fill_between(subset['month_year'], subset['count'], alpha=0.3)

# `prop` takes font-properties keys, which is why legend_labels uses `family`.
ax.legend(prop=legend_labels, fancybox=False, edgecolor="black")
plt.tight_layout()

# Save the figure as a 300-dpi JPEG under ../manuscript/revisions/figures.
plt.savefig(
    os.path.join(
        "..",
        "manuscript",
        "revisions",
        "figures",
        "contrarianism_anti-contrarianism_trends.jpg"
    ),
    dpi=300,
    bbox_inches="tight"
)

plt.show()

### Category Counts

In [None]:
# Display the coded data frame (now including the Neutral column and relabeled sample sources).
coded_data

In [None]:
# List the column names; the category columns are selected by position below.
coded_data.columns

#### Entire Sample

In [None]:
# sanity check: the coded sample is expected to hold exactly 1000 observations
assert len(coded_data) == 1000

N = len(coded_data)
cont_values = ["Anti-Contrarianism", "Contrarianism", "Neutral"]
# category columns: everything from column position 13 onward, minus the stance columns
categories = [col for col in coded_data.columns[13:] if col not in cont_values]

# observations flagged per category, largest first, with the share of all N observations
category_counts = (
    coded_data[categories]
    .melt()
    .groupby("variable")["value"]
    .sum()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)
category_counts["percent_of_obs"] = category_counts["count"].div(N).mul(100).round(2)

In [None]:
# Horizontal bar chart: share of all observations flagged with each category.
fig, ax = plt.subplots(1, 1, figsize=(8,5))
sns.set_style("white")
colors = sns.color_palette().as_hex()

sns.barplot(
    data=category_counts, 
    x="percent_of_obs", 
    y="variable", 
    ax=ax,
    alpha=0.85
)

sns.despine(offset=0, left=False)
ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.xaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
# re-apply the category labels so the label font settings are attached to them
ax.set_yticklabels(ax.get_yticklabels(), fontdict=qual_labels)
ax.set_ylabel("")
ax.set_xlabel("Percent", fontdict=label_font, labelpad=5)

# outline every bar in the first palette color
for bar in ax.patches:
    bar.set_edgecolor(colors[0])

# Fix the x tick positions (0-55 in steps of 5) first and label them
# afterwards. The x tick labels are set only here: an earlier relabeling of the
# automatic ticks was overwritten by this step anyway.
xtick_vals = [float(i) for i in range(0,55+5,5)]
ax.set_xticks(xtick_vals)
ax.set_xticklabels(xtick_vals, fontdict=axis_font)

plt.tight_layout()
plt.show()

#### Sub-sample Counts

In [None]:
cont_values = ["Anti-Contrarianism", "Contrarianism", "Neutral"]
# category columns: everything from column position 13 onward, minus the stance columns
categories = [col for col in coded_data.columns[13:] if col not in cont_values]

# One frame per stance: category counts within that stance's observations and
# the share of those observations, largest count first.
data_frames = []

for sub_type in cont_values:
    label = sub_type.lower().replace("-", "_")
    count_col = f"{label}_count"
    percent_col = f"{label}_percent_of_obs"

    stance_rows = coded_data.loc[coded_data[sub_type]==1]
    N = len(stance_rows)

    cats = (
        stance_rows[categories]
        .melt()
        .groupby("variable")["value"]
        .sum()
        .reset_index(name=count_col)
        .sort_values(count_col, ascending=False)
        .reset_index(drop=True)
    )
    cats[percent_col] = cats[count_col].div(N).mul(100).round(2)
    data_frames.append(cats)

# Put the Anti-Contrarianism and Contrarianism frames side by side, one row per
# category, ordered alphabetically by category name.
anti_cats, contrarian_cats = data_frames[0], data_frames[1]
subsample_cats = (
    anti_cats.merge(contrarian_cats, how="left", on="variable")
    .sort_values("variable", ascending=True)
    .reset_index(drop=True)
)

# Final order: combined count across both stances, largest first. (An earlier
# variant, since disabled, ranked rows by the larger of the two percentages.)
subsample_cats["max_sort"] = (
    subsample_cats["anti_contrarianism_count"] + subsample_cats["contrarianism_count"]
)
subsample_cats = (
    subsample_cats.sort_values("max_sort", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Display the per-category comparison of the two stance sub-samples.
subsample_cats

In [None]:
# Dumbbell chart: for every category, the percentage of Anti-Contrarianism
# observations and of Contrarianism observations, joined by a line.
fig, ax = plt.subplots(1,1, figsize=(9,6))

sns.set_style("white")

scatter_size = 75
line_width = 1.25
# NOTE(review): `colors` is not assigned in this cell; it relies on the value
# left behind by an earlier cell.
# y uses the reversed row index, so the first row of subsample_cats is drawn at the top.
ax.scatter(
    x=subsample_cats["anti_contrarianism_percent_of_obs"],
    y=subsample_cats.index[::-1],
    c=colors[0],
    s=scatter_size,
    label="Anti-Contrarianism"
)

ax.scatter(
    x=subsample_cats["contrarianism_percent_of_obs"],
    y=subsample_cats.index[::-1],
    c=colors[4],
    s=scatter_size,
    label="Contrarianism"
)

sns.despine(offset=0, left=False)
# tick positions follow the same reversed index, so labels line up with the points
ax.set_yticks(subsample_cats.index[::-1])
ax.set_yticklabels(subsample_cats["variable"], fontdict=qual_labels)

ax.set_xlabel("Percentage of Observations", fontdict=label_font, labelpad=5)
# fixed x ticks from 0 to 65 in steps of 5, shown as floats
xtick_vals = list(range(0,65+5,5))
xtick_vals = [float(i) for i in xtick_vals]
ax.set_xticks(xtick_vals)
ax.set_xticklabels(xtick_vals, fontdict=axis_font)
ax.set_xlim(0,65)

ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.xaxis.grid(True, linestyle='--', color='gray', alpha=0.5)

# dumbbell lines:
# one horizontal segment per category between its two percentages, drawn behind the markers
for idx, p1, p2 in zip(
    subsample_cats.index[::-1], 
    subsample_cats["anti_contrarianism_percent_of_obs"],
    subsample_cats["contrarianism_percent_of_obs"]
):
    ax.plot([p1, p2], [idx, idx], color="black", linestyle="-", lw=line_width, zorder=-1)

# two-column legend centered just above the axes
ax.legend(
    prop=legend_labels, 
    fancybox=True, 
    edgecolor="black",
    ncol=2,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.06),
    shadow=True
)

plt.tight_layout()

# Save the figure as a 300-dpi JPEG under ../manuscript/revisions/figures.
plt.savefig(
    os.path.join(
        "..",
        "manuscript",
        "revisions",
        "figures",
        "coded_categories_comparison.jpg"
    ),
    dpi=300,
    bbox_inches="tight"
)

plt.show()

### Co-Occurrences

##### Vaccine category:

In [None]:
# Co-occurrence of every other category with the vaccine category, computed
# over the observations flagged with the vaccine category.
vax = "Vaccine Safety, Efficacy & Hesitancy"
categories = coded_data.columns[13:].tolist()
cont_values = ["Anti-Contrarianism", "Contrarianism", "Neutral"]
categories = [c for c in categories if c not in cont_values]
cat_df = coded_data.loc[coded_data[vax] == 1][categories]

# category-by-category co-occurrence counts, reshaped to one row per ordered pair
pair_counts = cat_df.T.dot(cat_df).unstack().reset_index()
pair_counts.columns = ["var1", "var2", "count"]

# Drop self-pairs. .copy() gives an independent frame, so the column added
# below does not write into a slice of the full pair table.
pair_counts = pair_counts.loc[pair_counts["var1"] != pair_counts["var2"]].copy()
pair_counts["sorted_labels"] = pair_counts.apply(
   lambda x: tuple(sorted([x['var1'], x['var2']])), axis=1
)

# keep one row per unordered pair, most frequent first
co_occur_df = (
    pair_counts[["sorted_labels", "count"]]
    .drop_duplicates("sorted_labels")
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

co_occur_df["text_labels"] = co_occur_df["sorted_labels"].apply(
   lambda row: f"{row[0]} — {row[1]}"
)

# 1 when the pair involves the vaccine category, else 0 (a plain int rather
# than a zero-dimensional array per cell)
co_occur_df["vaccine"] = co_occur_df["sorted_labels"].apply(
   lambda row: int(vax in row)
)

# number of observations flagged with the vaccine category
N = len(cat_df)
co_occur_df = co_occur_df.loc[co_occur_df["vaccine"]==1].copy()
co_occur_df["percent_of_obs"] = ((co_occur_df["count"] / N) * 100).round(2)
# split the sorted pair back into two columns (var1 is the alphabetically first label)
co_occur_df[["var1", "var2"]] = co_occur_df["sorted_labels"].tolist()

In [None]:
# Display the vaccine co-occurrence table.
co_occur_df

In [None]:
# Horizontal bar chart of the ten categories that most often co-occur with the
# vaccine category.
fig, ax = plt.subplots(1, 1, figsize=(8,5))
sns.set_style("white")
colors = sns.color_palette().as_hex()

# Label each bar with the partner category. `var1` is only the alphabetically
# first label of the pair, so it would be the vaccine category itself for any
# partner that sorts after it; pick whichever side is not the vaccine category.
top_pairs = co_occur_df.head(10).copy()
top_pairs["partner"] = np.where(
    top_pairs["var1"] == vax, top_pairs["var2"], top_pairs["var1"]
)

sns.barplot(
    data=top_pairs, 
    x="percent_of_obs", 
    y="partner", 
    ax=ax,
    color="black",
    alpha=0.75
)

sns.despine(offset=0, left=False)
ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.xaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
# re-apply the category labels so the label font settings are attached to them
ax.set_yticklabels(ax.get_yticklabels(), fontdict=qual_labels)
ax.set_ylabel("")
ax.set_xlabel("Percent", fontdict=label_font, labelpad=5)

for bar in ax.patches:
    bar.set_edgecolor("black")

# Fix the x tick positions (0-30 in steps of 5) first and label them
# afterwards. The x tick labels are set only here: an earlier relabeling of the
# automatic ticks and a custom tick formatter were both overwritten by this step.
xtick_vals = [float(i) for i in range(0,30+5,5)]
ax.set_xticks(xtick_vals)
ax.set_xticklabels(xtick_vals, fontdict=axis_font)

plt.tight_layout()

# Save the figure as a 300-dpi JPEG under ../manuscript/revisions/figures.
plt.savefig(
    os.path.join(
        "..",
        "manuscript",
        "revisions",
        "figures",
        "vax_co_occurences.jpg"
    ),
    dpi=300,
    bbox_inches="tight"
)

plt.show()