In [15]:
import sys
sys.path.append('scripts')
from scripts.utils import (
    list_clusters_with_sizes,
    clusters_to_dataframe,
    sample_cluster_items,
    get_cluster_summaries
)
import pandas as pd
import json


In [16]:
cluster_json = "results/clustering/combined_posts_k12/kmeans_sentence.json"
csv_path = "data/combined_posts.csv"


In [17]:
clusters = list_clusters_with_sizes(cluster_json)
cluster_sizes_df = pd.DataFrame(clusters).sort_values('size', ascending=False)
cluster_sizes_df


Unnamed: 0,cluster,size
3,Cluster 2,485
6,Cluster 5,264
1,Cluster 1,252
4,Cluster 10,201
5,Cluster 6,176
9,Cluster 7,175
7,Cluster 12,166
10,Cluster 9,164
2,Cluster 3,161
11,Cluster 4,149


In [18]:
df = clusters_to_dataframe(cluster_json, csv_path=csv_path, fields=["Summary", "Full Content"])
df.head()


Unnamed: 0,cluster,post_id,Summary,Full Content
0,Cluster 11,1o5ucqy,The Reddit post suggests AI containment might ...,https://preview.redd.it/uwpd5xkcpxuf1.png?widt...
1,Cluster 11,1o5u56c,The poster desperately wants to disable the an...,Please help it's driving me crazy
2,Cluster 11,1o5pruo,The author is amused that a message about the ...,(the Pok√©mon is Pecharunt)
3,Cluster 11,1o5ihvy,The user is surprised that a search of three w...,https://preview.redd.it/k8mhmqirivuf1.png?widt...
4,Cluster 11,1o5d3c3,The Reddit post confirms agreement with an uns...,https://preview.redd.it/z8hvdjdqytuf1.png?widt...


In [19]:
cluster_name = "Cluster 1"
df[df['cluster'] == cluster_name][['Summary', 'Full Content']].head(10)


Unnamed: 0,Summary,Full Content
69,The poster is confused because they thought us...,"I mean, it's restricted?"
70,The poster is confused and frustrated that a n...,These 2 buttons do the exact same thing. The u...
71,The user is experiencing erratic responses fro...,https://chatgpt.com/share/68ed3d73-b0d4-8007-9...
72,A previously reported software bug has been re...,See my last post for context
73,A blog post shares an interview with an AI age...,https://lenajohnlennontwinflames.wordpress.com...
74,The poster questions the value of a service th...,What‚Äôs the point if the service is so wrong al...
75,"The poster is worried about societal collapse,...",\nWhat will become of world if most jobs vanis...
76,The user is frustrated that ChatGPT invents qu...,I have given the exact link of the document I'...
77,The user noticed that insults from an unspecif...,"Something, back end, must have changed because..."
78,The author is starting a movement to ban the l...,I‚Äôm starting a movement. No long dashes. ü§¨\n\n...


In [20]:
for cluster in sorted(df['cluster'].unique()):
    cluster_df = df[df['cluster'] == cluster]
    print(f"\n{'='*60}")
    print(f"{cluster}: {len(cluster_df)} posts")
    print(f"{'='*60}")
    for idx, row in cluster_df.head(5).iterrows():
        print(f"\n{idx+1}. {row['Summary']}")
        if pd.notna(row['Full Content']) and len(str(row['Full Content'])) > 0:
            content_preview = str(row['Full Content'])[:200]
            print(f"   Content: {content_preview}...")



Cluster 1: 252 posts

70. The poster is confused because they thought using Trump's likeness was restricted in Sora AI 2, but some people seem to be doing it.
   Content: I mean, it's restricted? ...

71. The poster is confused and frustrated that a newly added button duplicates the function of an existing button for starting a new chat.
   Content: These 2 buttons do the exact same thing. The upper one was added today. Why would they not at least make the upper one be the one you chose the model you're talking to instead of adding another button...

72. The user is experiencing erratic responses from ChatGPT, which initially denied the existence of the show "Dexter: New Blood" and then flip-flopped between confirming and denying its existence, even after being provided with an IMDB link.
   Content: https://chatgpt.com/share/68ed3d73-b0d4-8007-983b-0b40d406a4d9

this is the link to the conversation. I was asking it about if prater died in Dexter resurrection, it said that there's no 

In [21]:
selected_cluster = "Cluster 1"
cluster_df = df[df['cluster'] == selected_cluster]
print(f"{selected_cluster}: {len(cluster_df)} posts\n")
cluster_df[['post_id', 'Summary']].head(20)


Cluster 1: 252 posts



Unnamed: 0,post_id,Summary
69,1o5u6ep,The poster is confused because they thought us...
70,1o5tnz9,The poster is confused and frustrated that a n...
71,1o5r9qo,The user is experiencing erratic responses fro...
72,1o5pzy3,A previously reported software bug has been re...
73,1o5m8xg,A blog post shares an interview with an AI age...
74,1o5m7n9,The poster questions the value of a service th...
75,1o5i2v4,"The poster is worried about societal collapse,..."
76,1o5gvx6,The user is frustrated that ChatGPT invents qu...
77,1o5eshh,The user noticed that insults from an unspecif...
78,1o56b5c,The author is starting a movement to ban the l...


In [22]:
selected_cluster = "Cluster 1"
cluster_df = df[df['cluster'] == selected_cluster]
for idx, row in cluster_df.iterrows():
    print(f"\nPost ID: {row['post_id']}")
    print(f"Summary: {row['Summary']}")
    if pd.notna(row['Full Content']):
        print(f"Full Content: {row['Full Content'][:500]}...")
    print("-" * 60)



Post ID: 1o5u6ep
Summary: The poster is confused because they thought using Trump's likeness was restricted in Sora AI 2, but some people seem to be doing it.
Full Content: I mean, it's restricted? ...
------------------------------------------------------------

Post ID: 1o5tnz9
Summary: The poster is confused and frustrated that a newly added button duplicates the function of an existing button for starting a new chat.
Full Content: These 2 buttons do the exact same thing. The upper one was added today. Why would they not at least make the upper one be the one you chose the model you're talking to instead of adding another button that already does what "+" is doing.

Bruh...
------------------------------------------------------------

Post ID: 1o5r9qo
Summary: The user is experiencing erratic responses from ChatGPT, which initially denied the existence of the show "Dexter: New Blood" and then flip-flopped between confirming and denying its existence, even after being provided with 

In [None]:
cluster_labels = {}
for cluster in sorted(df['cluster'].unique()):
    cluster_df = df[df['cluster'] == cluster]
    print(f"\n{cluster}: {len(cluster_df)} posts")
    print("Sample summaries:")
    for summary in cluster_df['Summary'].head(5):
        print(f"  - {summary}")
    print(f"\nManual label for {cluster}:")
    print("(add your label here)")



Cluster 1: 68 posts
["Creepy hallways are scary because you don't know who might be lurking there.", 'The poster is looking for old, funny Reddit posts where ChatGPT repeatedly fails to correct a typo.', 'The post asks people who were laid off due to AI if they were later asked to return to their old job after the employer realized their mistake, and what that experience was like.']

Cluster 10: 58 posts
['A group of AI engineers and website owners are collaborating to study and standardize AI search optimization, inviting others to join and submit their sites.', 'The author is excited about using AI to create a digital twin to handle digital tasks and endless scrolling, and asks what tasks others would outsource.', 'The AI industry saw major investments, safety collaborations, regulatory frameworks, and technical advancements this week, signaling growth and preparation for wider adoption.']

Cluster 11: 56 posts
["ChatGPT's performance, especially the 4o model, is currently poor and 