In [157]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

In [101]:
# 1) Load and prepare text
df = pd.read_csv("../data/interactions.csv")

# combine input/output into a single text field
df["combined"] = (df["input"].fillna("").astype(str) + " " + df["output"].fillna("").astype(str)).str.strip()

# choose docs = one row per interaction (matches probs shape to df rows)
docs = df["combined"].tolist()

# 2) Fit BERTopic
umap_model = UMAP(random_state=11)
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True
)

topics_assigned, probs = topic_model.fit_transform(docs)

# 3) Save topic summary
topic_info = topic_model.get_topic_info()  # includes -1 "outliers" row first
topic_info.to_csv("../data/topics.csv", index=False)

# 4) Build a labeled probability DataFrame (columns = topic names, excluding -1)
# The probability matrix `probs` has shape (n_docs, n_topics_without_outliers)
# Align names by taking non -1 topics from topic_info in order.
non_outlier = topic_info[topic_info["Topic"] != -1].copy()

# Ensure we have exactly as many names as there are columns in probs
expected_cols = probs.shape[1]
names = non_outlier["Name"].tolist()[:expected_cols]
ids = non_outlier["Topic"].tolist()[:expected_cols]

probs_df = pd.DataFrame(probs, columns=names)

# 5) Attach assigned topic and probs to the original rows
df_out = df.reset_index(drop=True).copy()
df_out["assigned_topic"] = topics_assigned  # the single best topic per row
df_out = pd.concat([df_out, probs_df.reset_index(drop=True)], axis=1)

# 6) Save
df_out.to_csv("../data/interactions_with_topic_probs.csv", index=False)

In [207]:
df = pd.read_csv("../data/regression/topic_coefficients.csv")
df = df[df["p_value"]<0.05]
df = df[df["coefficient"]>0]
interactions = pd.read_csv("../data/interactions_with_topic_probs.csv")
topics_by_user = interactions.groupby(["assigned_topic"])["user_id"].nunique().reset_index()
print(np.median(topics_by_user["user_id"]))
topics_by_user = topics_by_user[topics_by_user["user_id"]>=3]
len(topics_by_user)

3.0


55

In [208]:
for model_name in df["model_name"].unique():
    for task_name in df["task_name"].unique():
        print(model_name, task_name)
        curr = df[df["model_name"]==model_name]
        curr = curr[curr["task_name"]==task_name]
        curr["topic_number"] = curr["topic_column"].str.extract(r'(\d+)')
        curr["topic_number"] = curr["topic_number"].astype(int)
        curr = curr[curr["topic_number"].isin(topics_by_user["assigned_topic"])]
        print(len(curr))
        #curr["topic_column"] = curr["topic_column"].str.replace(r"^X\d+_", "", regex=True)
        print(curr.sort_values(by='coefficient', ascending=False).head(10)["topic_column"].to_list())
        print()

claude-sonnet-4-20250514 aita
18
['X33_years_humans_ago_earth', 'X42_welcome_assist_today_hi', 'X41_military_global_economic_power', 'X39_flour_acid_juice_wheat', 'X8_sperm_genetic_conservation_art', 'X60_thermodynamics_heat_steam_efficiency', 'X22_2016_nutrition_mg_facts', 'X29_thought_fear_person_anxiety', 'X26_hersheypark_weather_park_rides', 'X24_mechanical_robots_robotics_automation']

claude-sonnet-4-20250514 politics
15
['X33_years_humans_ago_earth', 'X13_music_rihanna_public_oprah', 'X35_anime_fantasy_series_studio', 'X29_thought_fear_person_anxiety', 'X26_hersheypark_weather_park_rides', 'X6_animals_snakes_species_birds', 'X40_digital_platforms_terrorism_network', 'X77_cfa_skills_house_roles', 'X34_points_total_attendance_grade', 'X59_note_quarter_latin_rhyme']

gpt-4.1-mini-2025-04-14 aita
12
['X33_years_humans_ago_earth', 'X42_welcome_assist_today_hi', 'X41_military_global_economic_power', 'X8_sperm_genetic_conservation_art', 'X80_smart_iot_infrastructure_sensors', 'X40_digi

In [186]:
interactions = pd.read_csv("../data/interactions_with_topic_probs.csv")
interactions = interactions[interactions["assigned_topic"]==8]
interactions = interactions.sort_values(by="8_sperm_genetic_conservation_art", ascending=False)
users = []
for i,r in interactions.iterrows():
    if r["user_id"] not in users:
        print(r["user_id"])
        print(r["input"])
        users.append(r["user_id"])
        print()
        


31d9218ec5
Can dead person DNA be use to confirm paternity test

f53292ced8
Is it possible for a woman to be pregnant after this age?

f4c86d38d9
How

8fd23ba596
Thanks. Does Poland still offer citizenship for people which a family history? Whatâ€™s the process like?

33d3cdfa55
Proceed

8b280921c9
When do women stop seeing their period



In [216]:
interactions = pd.read_csv("../data/interactions_with_topic_probs.csv")
interactions = interactions.groupby(["assigned_topic", "user_id"])["timestamp"].count().reset_index()
interactions = interactions.rename({"timestamp": "num_queries"})
interactions = interactions[interactions["assigned_topic"]>=0]
participants = pd.read_csv("../data/participants.csv")
participants = participants[["user_id", "political_lean", "gender"]]
participants.loc[participants["political_lean"].isin(["Moderate", "Conservative", "Very Conservative"]), "political_lean"] = "right"
participants.loc[participants["political_lean"].isin(["Liberal", "Very Liberal"]), "political_lean"] = "left"

topics_by_demographic = interactions.merge(participants, on="user_id", how="left")

for model_name in df["model_name"].unique():
    for task_name in df["task_name"].unique():
        print(model_name, task_name)
        curr = df[df["model_name"]==model_name]
        curr = curr[curr["task_name"]==task_name]
        curr["topic_number"] = curr["topic_column"].str.extract(r'(\d+)')
        curr["topic_number"] = curr["topic_number"].astype(int)
        curr = curr[curr["topic_number"].isin(topics_by_user["assigned_topic"])]
        curr = topics_by_demographic[topics_by_demographic["assigned_topic"].isin(curr["topic_number"])].groupby(["political_lean", "gender"])["user_id"].nunique().reset_index()
        print(curr)
        print()

SyntaxError: unmatched ']' (2182744335.py, line 20)

In [244]:
understanding = pd.read_csv("../data/survey_results.csv")
understanding = understanding[understanding["understanding"]==5]
understanding[["participant", "understanding", "model", "task"]].drop_duplicates()

interactions = pd.read_csv("../data/interactions_with_topic_probs.csv")
interactions = interactions.groupby(["assigned_topic", "user_id"])["timestamp"].count().reset_index()
interactions = interactions.rename({"timestamp": "num_queries"})

for model_name in understanding["model"].unique():
    for task_name in understanding["task"].unique():
        print(model_name, task_name)
        curr = understanding[(understanding["model"]==model_name)&(understanding["task"]==task_name)]
        topics = interactions[interactions["user_id"].isin(curr["participant"])]
        topics = topics[topics["assigned_topic"]>=0]
        print(topics["assigned_topic"].value_counts().head(10).reset_index()["assigned_topic"].to_list())
        print()

claude-sonnet-4-20250514 aita
[1, 10, 0, 12, 28, 2, 4, 5, 7, 47]

claude-sonnet-4-20250514 politics
[1, 10, 0, 4, 2, 12, 27, 47, 34, 18]

gpt-4.1-mini-2025-04-14 aita
[1, 10, 0, 12, 2, 28, 4, 5, 34, 7]

gpt-4.1-mini-2025-04-14 politics
[1, 0, 10, 12, 4, 2, 5, 47, 27, 34]



[1, 0, 10, 12, 4, 2, 5, 47, 27, 34]

In [206]:
topics_by_demographic[topics_by_demographic["assigned_topic"].isin(curr["topic_number"])].groupby(["political_lean", "gender"])["user_id"].nunique().reset_index()

Unnamed: 0,political_lean,gender,user_id
0,left,man,2
1,left,woman,3
2,right,man,4
3,right,non-binary,1
4,right,woman,2


Unnamed: 0,political_lean,gender,user_id
0,left,man,3
1,left,woman,5
2,right,man,9
3,right,non-binary,1
4,right,woman,4
