In [16]:
import json
import os
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
import numpy as np
import networkx as nx
import pydot
from networkx.drawing.nx_pydot import graphviz_layout

import utils.processing_utils as pu

In [2]:
# Global switch: when True, the plotting cells additionally write their
# figure to a .jpg under figures/ before showing it.
save_figs = False

***
# Load Sim Data

In [3]:
# Directory that holds one sub-directory per simulation run; every run
# directory is read for a comment.json and a user.json file.
db_json_path = "/../abyss/home/oasis/oasis-rutschmanna/data/db_json/"
directory = os.fsencode(db_json_path)

# Maps run name -> list of comment records. Each comment whose user_id is
# found in the run's user.json gets an extra "seed_user_id" key holding that
# user's "user_name".
data = {}

for subdir in os.listdir(directory):
    name = os.fsdecode(subdir)

    print("Reading: ", name)
    with open(f"{db_json_path}{name}/comment.json") as f:
        content = json.load(f)

    with open(f"{db_json_path}{name}/user.json") as f:
        users = json.load(f)

    # Build the user_id -> user_name lookup once per run instead of scanning
    # the full user list for every comment. Later entries overwrite earlier
    # ones, which matches the "last match wins" result of a full scan.
    # NOTE(review): assumes every user record carries both keys -- confirm.
    user_names = {user["user_id"]: user["user_name"] for user in users}

    for comment in content:
        # Comments without a matching user are left without "seed_user_id".
        if comment["user_id"] in user_names:
            comment["seed_user_id"] = user_names[comment["user_id"]]

    data[name] = content

Reading:  reddit-sim_qwen-16_18-12
Reading:  reddit-sim_qwen-16_22-28
Reading:  reddit-sim_qwen-16_17-04
Reading:  reddit-sim_qwen-16_23-06
Reading:  reddit-sim_qwen-16_13-47
Reading:  reddit-sim_qwen-16_02-35
Reading:  reddit-sim_qwen-16_17-34
Reading:  reddit-sim_qwen-16_00-16
Reading:  reddit-sim_qwen-16_12-24
Reading:  reddit-sim_qwen-16_19-24
Reading:  reddit-sim_qwen-16_14-21
Reading:  reddit-sim_qwen-16_20-40
Reading:  reddit-sim_qwen-16_01-00
Reading:  reddit-sim_qwen-16_21-15
Reading:  reddit-sim_qwen-16_18-51
Reading:  reddit-sim_qwen-16_21-49
Reading:  reddit-sim_qwen-16_23-41
Reading:  reddit-sim_qwen-16_14-42
Reading:  reddit-sim_qwen-16_20-07
Reading:  reddit-sim_qwen-16_16-33
Reading:  reddit-sim_qwen-16_14-08
Reading:  reddit-sim_qwen-16_15-18
Reading:  reddit-sim_qwen-16_15-57
Reading:  reddit-sim_qwen-16_01-31
Reading:  reddit-sim_qwen-16_12-59
Reading:  reddit-sim_qwen-16_03-07
Reading:  reddit-sim_qwen-16_13-35
Reading:  reddit-sim_qwen-16_02-01
Reading:  reddit-sim

In [4]:
# Per-run structural metrics, keyed by run name.
structure_data = {}

for run_name, comments in data.items():
    # pu.structure_analysis yields six values, unpacked in this order.
    # NOTE(review): the meaning of the second argument (110) is not visible
    # here -- presumably the number of agents per run; confirm in pu.
    (volume, width, depth,
     scale, active, comment_lengths) = pu.structure_analysis(comments, 110)

    structure_data[run_name] = dict(
        volume=volume,
        width=width,
        depth=depth,
        scale=scale,
        active=active,
        comment_lengths=comment_lengths,
    )

***
# Structural Analysis

In [18]:
# Print the mean "scale" and the mean "active" value across all SIM runs
mean_scale = np.mean([structure_data[run]["scale"] for run in data])
mean_active = np.mean([structure_data[run]["active"] for run in data])

print(mean_scale)
print(mean_active)

81.9
0.7445454545454545


In [19]:
# Load the pre-survey, keep one row per ParticipantID, then keep only the
# rows flagged on_reddit == 1
seed_data = pd.read_csv(
    "../../oswald-et-al_2025/pre_survey_anon.csv"
).drop_duplicates("ParticipantID")
seed_data = seed_data.loc[seed_data["on_reddit"] == 1]
print(len(seed_data))

# Load the discussion rows and discard those without a ParticipantID
discussions_data = pd.read_csv(
    "../../oswald-et-al_2025/discussions_anon.csv",
    index_col=0
).dropna(subset="ParticipantID")
print(len(discussions_data))

# Attach to every discussion row the number of rows its participant
# contributed during the experiment
rows_per_participant = discussions_data.groupby(
    "ParticipantID"
)["ParticipantID"].transform("count").map(int)
discussions_data = discussions_data.assign(
    comment_count=rows_per_participant
)

# Collapse to one (ParticipantID, comment_count) row per participant and
# left-join it onto the survey; participants without any row get count 0
discussions_data_merge = discussions_data[[
    "ParticipantID", "comment_count"
]].drop_duplicates()

seed_data = seed_data.merge(
    discussions_data_merge, how="left", on="ParticipantID"
).fillna({"comment_count": 0})

# Print total user # and share of actives
print(len(seed_data))
is_active = seed_data["comment_count"] != 0
print(len(seed_data[is_active]) / len(seed_data))

520
5792
520
0.6365384615384615


In [None]:
# Load Oswald user data and compare Comment Length
sample_data = pd.read_csv(
    "../../oswald-et-al_2025/sample_anon.csv"
)
print(sample_data.columns)

# Columns kept for the comparison. "comment_mean_lenght" is the (misspelled)
# name used in the CSV; it is renamed right after the selection.
kept_columns = [
    "ParticipantID",
    "subreddit",
    "polinterest",
    "time_online",
    "social_media",
    "comments_online",
    "comment_count",
    "comment_mean_lenght",
    "comment_mean_score",
    "comment_mean_tox"
]

sample_data = (
    sample_data[kept_columns]
    .rename(columns={"comment_mean_lenght": "comment_mean_length"})
    .fillna(0)  # missing values in any kept column become 0
)
print(len(sample_data))

in_sub_1 = sample_data["subreddit"] == "DiscussPolitics1"
sample_sub_1_data = sample_data[in_sub_1]

In [None]:
# Plot simulation descriptives: one histogram per structural metric,
# each pooled over all simulation runs
fig, axs = plt.subplots(1, 4, figsize=(15, 5))
x_ticks = list(structure_data.keys())

for ax, metric in zip(axs, ("volume", "width", "depth", "scale")):
    ax.hist([run[metric] for run in structure_data.values()])
    ax.set_xlabel(metric)

if save_figs:
    plt.savefig("figures/sim_descriptives.jpg")
plt.show()

In [None]:
# Plot dist of volume across sim and seed threads
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
x_ticks = list(structure_data.keys())

# Left panel: volume of every simulation run
sim_volumes = [run["volume"] for run in structure_data.values()]
axs[0].hist(sim_volumes)
axs[0].set_xlabel("Volume")

# Right panel: number of rows per (subreddit, post_title) seed thread
temp = discussions_data.groupby(
    ["subreddit", "post_title"]
).count()["ParticipantID"]

# Total number of rows over all seed threads
print(temp.sum())

axs[1].hist(temp, color="y", label="Seed")
axs[1].set_xlabel("Volume")

# plt.legend()
if save_figs:
    plt.savefig("figures/seed-sim_vol.jpg")
plt.show()

In [None]:
# Plot dist of scale across sim and seed threads
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: "scale" of every simulation run
axs[0].hist([i["scale"] for i in structure_data.values()], label="SIM")
axs[0].set_xlabel("Scale")

# Right panel: number of distinct participants per seed thread.
# The column is selected *before* nunique(): the only parameter of
# DataFrameGroupBy.nunique is the `dropna` flag, so passing a column name
# there does not restrict the computation to that column.
temp = discussions_data.groupby([
    "subreddit", "post_title"
])["ParticipantID"].nunique()

axs[1].hist(temp, color="y", label="Seed")
axs[1].set_xlabel("Scale")

# plt.legend()
if save_figs:
    plt.savefig("figures/seed-sim_dist_scale.jpg")
plt.show()

In [None]:
# Plot dist of comment lengths: first simulation run vs. seed threads
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: only the first run in structure_data is plotted
first_run = structure_data[list(structure_data.keys())[0]]
axs[0].hist(first_run["comment_lengths"], label="Sim")

axs[0].set_xlabel("Comment Length")

# Right panel: comment lengths of the seed threads with this post title,
# one dataset per (subreddit, post_title) group
temp = discussions_data.loc[
    discussions_data["post_title"] == "Fur clothing should be banned."
]

temp = temp.groupby([
    "subreddit", "post_title"
])["length_comment_char"]

temp = [group.values.tolist() for _, group in temp]

# One colour entry per dataset: matplotlib rejects a colour list whose
# length differs from the number of datasets, so derive it from the data
# instead of hard-coding the number of threads.
axs[1].hist(temp, color=["y"] * len(temp), bins=10, label="Seed")
axs[1].set_xlabel("Comment Length")
axs[1].set_xlim(0, 1050)

# plt.legend()
if save_figs:
    plt.savefig("figures/seed-sim_dist_comment_lengths.jpg")
plt.show()

In [None]:
# Visualize the thread structure in tree form (first loaded run only)
first_run_comments = data[list(data)[0]]

# One edge per comment, parent -> comment. A parent id of -1 is replaced by
# node 0, so all such comments hang off one common node.
thread_tree = [
    (
        0 if c["parent_comment_id"] == -1 else c["parent_comment_id"],
        c["comment_id"],
    )
    for c in first_run_comments
]

# Each comment node is labelled with the id of the user who wrote it
node_labels = {c["comment_id"]: c["user_id"] for c in first_run_comments}

G = nx.Graph()
G.add_edges_from(thread_tree)
pos = graphviz_layout(G, prog="dot")

In [None]:
# Draw the thread graph on a wide canvas using the precomputed layout
fig, axs = plt.subplots(1, 1, figsize=(40, 10))

draw_options = dict(
    with_labels=True,
    labels=node_labels,
    node_size=300,
    node_color='lightblue',
    font_weight='bold',
    font_size=8,
)
nx.draw(G, pos, **draw_options)

if save_figs:
    plt.savefig("figures/sim_thread-tree.jpg")
plt.show()

In [None]:
# Get share of inactive agents


***
# Merge Sim and Seed Agent Data

In [None]:
# Merge LLM Agents with their respective Seed Users
# Step 1: restrict the seed users to subreddit "DiscussPolitics1"
in_sub_1 = seed_data["subreddit"] == "DiscussPolitics1"
seed_sub_1_data = seed_data[in_sub_1]

# Step 2: drop every column whose name matches "comprehen", then drop rows
# that still contain a missing value in any remaining column. This happens
# before the column selection below, so all remaining columns count.
comprehension_columns = list(seed_sub_1_data.filter(regex="comprehen"))
seed_sub_1_data = seed_sub_1_data.drop(columns=comprehension_columns).dropna()

# Step 3: keep the participant id, the self-report variables and the count
analysis_columns = [
    "ParticipantID",
    "polinterest",
    "time_online",
    "social_media",
    "comments_online",
    "comment_count",
]
seed_sub_1_data = seed_sub_1_data[analysis_columns]

print(len(seed_sub_1_data))

In [None]:
# For every simulation run add two columns to seed_sub_1_data:
#   <run>               - list of the run's comments whose "seed_user_id"
#                         equals the row's ParticipantID
#   <run>_comment_count - length of that list
for k, run_comments in data.items():
    # Group the run's comments by seed user in a single pass instead of
    # rescanning every comment for every participant.
    posts_by_seed_user = {}
    for comment in run_comments:
        # .get(): comments without a "seed_user_id" key are grouped under
        # None and therefore never attributed to a participant, instead of
        # aborting the whole cell with a KeyError.
        posts_by_seed_user.setdefault(
            comment.get("seed_user_id"), []
        ).append(comment)

    # Participants without any comment in this run map to an empty list
    llm_posts = {
        participant: posts_by_seed_user.get(participant, [])
        for participant in seed_sub_1_data["ParticipantID"]
    }

    seed_sub_1_data[k] = seed_sub_1_data["ParticipantID"].map(llm_posts)
    seed_sub_1_data[f"{k}_comment_count"] = seed_sub_1_data[k].map(len)

***
# Interaction Analysis

In [None]:
# Parse the comment timestamps, then show the distinct time spans (latest
# minus earliest comment) over all (subreddit, post_title) threads
discussions_data["created_comment"] = pd.to_datetime(
    discussions_data["created_comment"]
)

thread_timestamps = discussions_data.groupby(
    ["subreddit", "post_title"]
)["created_comment"]

thread_timestamps.transform(lambda ts: ts.max() - ts.min()).unique()

In [None]:
# Correlation between self-reported vars and comment # for llm agents.
# Only pairs with a Spearman p-value of at most 0.05 are printed.
count_columns = list(seed_sub_1_data.filter(regex="_count").columns)
self_report_columns = [
    "polinterest", "time_online", "social_media", "comments_online"
]

for count_col in count_columns:
    for report_col in self_report_columns:
        corr = scipy.stats.spearmanr(
            seed_sub_1_data[report_col], seed_sub_1_data[count_col]
        )
        if corr.pvalue <= 0.05:
            print(count_col, report_col, corr)

In [None]:
# For each self-reported variable: correlate it (Spearman) with every
# "*_count" column, collect the statistics and p-values, and print the
# result of pu.fisher on them. This replaces four copy-pasted cells that
# differed only in the variable name; output lines and order are unchanged.
# NOTE(review): pu.fisher presumably combines the per-run results (Fisher's
# method) -- confirm in utils.processing_utils.
count_columns = list(seed_sub_1_data.filter(regex="_count").columns)

for self_report in ["polinterest", "time_online",
                    "social_media", "comments_online"]:
    temp = {"stat" : [], "p" : []}
    for count_col in count_columns:
        corr = scipy.stats.spearmanr(seed_sub_1_data[self_report],
                                     seed_sub_1_data[count_col])
        temp["stat"].append(corr.statistic)
        temp["p"].append(corr.pvalue)

    print(self_report, pu.fisher(temp))

In [None]:
# Comparison of self-reported vars in whole sample vs. Sub 1
# NOTE: this changes the global colour cycle, so it also affects every
# figure created after this cell.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color="y")
fig, axs = plt.subplots(2, 4, figsize=(14, 5))

self_report_columns = [
    "polinterest", "time_online", "social_media", "comments_online"
]

# Top row: all seed users; bottom row: Sub 1 users (x-labels on bottom only)
for col_idx, column in enumerate(self_report_columns):
    axs[0, col_idx].hist(seed_data[column])
    axs[1, col_idx].hist(seed_sub_1_data[column])
    axs[1, col_idx].set_xlabel(column)

axs[0, 0].set_ylabel("All User Data")
axs[1, 0].set_ylabel("Sub 1 User Data")

if save_figs:
    plt.savefig("figures/whole-sub1_self-reports.jpg")
plt.show()

In [None]:
# Whole sample correlation analysis: Spearman rank correlation between
# self-reported "polinterest" and the comment count of all seed users
scipy.stats.spearmanr(seed_data["polinterest"], 
                      seed_data["comment_count"])

In [None]:
# Whole sample correlation analysis: Spearman rank correlation between
# self-reported "time_online" and the comment count of all seed users
scipy.stats.spearmanr(seed_data["time_online"],
                      seed_data["comment_count"])

In [None]:
# Whole sample correlation analysis: Spearman rank correlation between
# self-reported "social_media" and the comment count of all seed users
scipy.stats.spearmanr(seed_data["social_media"],
                      seed_data["comment_count"])

In [None]:
# Whole sample correlation analysis: Spearman rank correlation between
# self-reported "comments_online" and the comment count of all seed users
scipy.stats.spearmanr(seed_data["comments_online"],
                      seed_data["comment_count"])

***
# Content Analysis

In [23]:
# Read the API key from a local file instead of hard-coding it.
# strip(): a trailing newline or stray whitespace in the file would
# otherwise become part of the key.
# Do not end this cell with a bare `api_key` expression -- that would store
# the secret in the notebook's saved output.
with open("perspective_secret.txt", "r") as f:
    api_key = f.read().strip()

'<REDACTED: API key removed from saved output; rotate this key>'

In [None]:
# Pass the comments of the first loaded simulation run, together with the
# API key, to pu.query_perspective (presumably a Perspective API client --
# confirm in utils.processing_utils). The result is bound to `j`, which the
# following cell writes to disk.
j = pu.query_perspective(data[list(data.keys())[0]], api_key)

In [None]:
# Persist the query result held in `j` as indented JSON
with open("perspective_rating.json", "w") as f:
    json.dump(j, f, indent=4)

In [None]:
# Load the ratings stored in perspective_rating.json back into memory
with open("perspective_rating.json", "r") as f:
    responses = json.load(f)

***
# Junk yard

In [20]:
import sqlite3

def activation_function():
    """Placeholder: currently always returns a new, empty list."""
    return list()

In [21]:
# Call the placeholder defined in the previous cell and display its result
activation_function()

[]