In [1]:
# Box 1 (final): read each full line, then split with Python’s split()
import pandas as pd

CSV_PATH = "prompts.csv"

# 1) Read raw lines
with open(CSV_PATH, 'r', encoding='utf-8', newline='') as f:
    lines = [line.rstrip('\r\n') for line in f if line.strip()]

# 2) Drop header if it's literally "prompt"
if lines and lines[0].strip().lower() == 'prompt':
    lines = lines[1:]

# 3) Build df exactly as before
df = pd.DataFrame({'prompt': lines})

# 4) Split on first comma using Python split()
def split_prompt(s):
    parts = s.split(',', 1)
    primary = parts[0].strip()
    if len(parts) > 1:
        secondaries = [p.strip() for p in parts[1].split(',')]
    else:
        secondaries = []
    return pd.Series([primary, secondaries], index=['primary','secondary_list'])

df[['primary','secondary_list']] = df['prompt'].apply(split_prompt)

# 5) Explode into df_secondary
df_secondary = (
    df
    .explode('secondary_list')
    .rename(columns={'secondary_list': 'secondary'})
    .dropna(subset=['secondary'])
)

print(f"Loaded {len(df)} prompts, with {len(df_secondary)} secondary rows.")
print(df.head())
print(df_secondary.head())

Loaded 1000 prompts, with 5697 secondary rows.
                                              prompt  \
0  Portrait of Elon Musk by Gottfried Helnwein an...   
1                       kanye west as ezio auditore    
2  sonic the hedgehog, in the style of sega genes...   
3  a photo of sunflower monster with real human m...   
4  dance first. think later. it's the natural ord...   

                                             primary  \
0  Portrait of Elon Musk by Gottfried Helnwein an...   
1                        kanye west as ezio auditore   
2                                 sonic the hedgehog   
3  a photo of sunflower monster with real human m...   
4  dance first. think later. it's the natural ord...   

                                      secondary_list  
0                                                 []  
1                                                 []  
2  [in the style of sega genesis, cartoon, illust...  
3                                         [1 0 0 mm]  
4  [

In [2]:
# Task 2: Vectorize & cluster
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

def cluster_and_describe(texts, n_clusters=10, top_n_terms=5):
    vec = TfidfVectorizer(max_features=1000, stop_words='english')
    X = vec.fit_transform(texts)
    km = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    centers = km.cluster_centers_
    terms = np.array(vec.get_feature_names_out())
    descriptions = {}
    for i, center in enumerate(centers):
        top_terms = terms[center.argsort()[-top_n_terms:][::-1]]
        descriptions[i] = " / ".join(top_terms)
    return km.labels_, descriptions

# cluster primaries
primaries = df['primary'].tolist()
primary_labels, primary_desc = cluster_and_describe(primaries, n_clusters=10)
df['primary_cluster'] = primary_labels

# cluster secondaries
secondaries = df_secondary['secondary'].tolist()
sec_labels, sec_desc = cluster_and_describe(secondaries, n_clusters=50)
df_secondary['secondary_cluster'] = sec_labels

print("Primary cluster descriptions:", primary_desc)
print("Secondary cluster descriptions:", sec_desc)

Primary cluster descriptions: {0: 'epic / intricate / portrait / photo / forms', 1: 'portrait / painting / art / photo / man', 2: 'russia / energy / illustration / forms / forest', 3: 'walter / giving / white / middle / portrait', 4: 'detailed / highly / painting / portrait / working', 5: 'rutkowski / greg / portrait / film / human', 6: 'beautiful / woman / portrait / young / face', 7: 'fat / french / eating / frank / fox', 8: 'photography / dream / winning / award / focused', 9: 'energy / dmt / dream / hyper / artstation'}
Secondary cluster descriptions: {0: 'plasma / floating / flare / flame / finnstark', 1: 'painting / digital / aivazovsky / time / realistic', 2: 'artgerm / art / feng / fenghua / ferdinand', 3: 'detailed / highly / portrait / face / beautiful', 4: 'smooth / realistic / high / beautiful / photorealistic', 5: 'intricate / details / detailed / environment / insanely', 6: 'sharp / details / lightning / edges / flame', 7: 'gold / black / accents / sparkling / blood', 8: 

In [3]:
# Task 3 (fixed): Frequent‐itemset mining (size ≤2) on clusters

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import pandas as pd

# build cluster_transactions without grouping KeyError
cluster_transactions = []
for idx, row in df.iterrows():
    # one primary per prompt
    prim = f"P{row['primary_cluster']}"
    # lookup secondary clusters for this prompt index (may be none)
    sec_clusters = df_secondary.loc[df_secondary.index == idx, 'secondary_cluster'].unique()
    secs = [f"S{c}" for c in sec_clusters]
    cluster_transactions.append([prim] + secs)

# now encode and mine
te = TransactionEncoder()
te_ary = te.fit(cluster_transactions).transform(cluster_transactions)
cluster_df = pd.DataFrame(te_ary, columns=te.columns_)

freq_itemsets_2 = apriori(cluster_df, min_support=0.01, use_colnames=True, max_len=2)
print(freq_itemsets_2)

     support   itemsets
0      0.897       (P1)
1      0.025       (P4)
2      0.053       (P6)
3      0.082       (S1)
4      0.025      (S10)
..       ...        ...
267    0.017  (S5, S44)
268    0.010  (S9, S44)
269    0.010  (S5, S47)
270    0.021   (S5, S8)
271    0.036   (S5, S9)

[272 rows x 2 columns]


In [4]:
# Task 4(a): Rebuild vec_primary, km_primary, construct G, define random_walk, and run on a new prompt

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import association_rules
import networkx as nx

# Re-create the primary vectorizer & k-means from Task 2
vec_primary = TfidfVectorizer(max_features=1000, stop_words='english')
X_primary   = vec_primary.fit_transform(df['primary'].tolist())
km_primary  = KMeans(n_clusters=len(primary_desc), random_state=42).fit(X_primary)

# Build directed graph G from association rules on freq_itemsets_2
rules = association_rules(freq_itemsets_2, metric="confidence", min_threshold=0.1)
G = nx.DiGraph()
for _, row in rules.iterrows():
    if len(row['antecedents']) == 1 and len(row['consequents']) == 1:
        a = next(iter(row['antecedents']))
        b = next(iter(row['consequents']))
        G.add_edge(a, b, weight=row['confidence'])

# Define the random_walk function (only visits new satellite clusters)
def random_walk(graph, start, k=5):
    path = []
    current = start
    visited = {start}
    for _ in range(k):
        neigh = [
            (n, d['weight'])
            for n, d in graph[current].items()
            if n.startswith('S') and n not in visited
        ]
        if not neigh:
            break
        nodes, weights = zip(*neigh)
        probs = [w / sum(weights) for w in weights]
        next_node = np.random.choice(nodes, p=probs)
        path.append(next_node)
        visited.add(next_node)
        current = next_node
    return path

# Classify a new incoming primary prompt and run the walk
new_prompt = "Describe the optimal solar panel configuration"
X_new      = vec_primary.transform([new_prompt])
label      = km_primary.predict(X_new)[0]    # e.g. 2
start      = f"P{label}"                     # "P2"
chain      = random_walk(G, start, k=4)

print(f"New prompt classified into P{label}")
print("Satellite cluster chain:", chain)

New prompt classified into P1
Satellite cluster chain: [np.str_('S4'), np.str_('S23'), np.str_('S13'), np.str_('S22')]


In [5]:
# Task 4(b): Build support matrix & IPF for size‑2 itemsets, handling a new incoming primary prompt

import numpy as np
# silence divide‑by‑zero / invalid warnings during IPF iterations
np.seterr(divide='ignore', invalid='ignore')

from ipfn import ipfn   # correct import for the ipfn package

# build 2-item support matrix
items = list(te.columns_)
n = len(items)
support = np.zeros((n, n))
for i, xi in enumerate(items):
    for j, xj in enumerate(items):
        if i < j:
            mask = cluster_df[xi] & cluster_df[xj]
            support[i, j] = mask.mean()
            support[j, i] = support[i, j]

# prepare marginals and dimensions
marginals = [
    support.sum(axis=1),  # row sums (for axis‑0 marginals)
    support.sum(axis=0)   # col sums (for axis‑1 marginals)
]
dimensions = [[1], [0]]    # match row marginals to axis 1, col to axis 0

# run IPF
IPF = ipfn.ipfn(support, marginals, dimensions, convergence_rate=1e-6)
est = IPF.iteration()

# get top size‑2 itemsets containing a given primary cluster
def top_kitemsets(primary, k=2, top_n=5):
    idx = items.index(primary)
    from itertools import combinations
    scores = []
    for combo in combinations(range(n), k):
        if idx in combo:
            prob = est[combo[0], combo[1]]
            scores.append((tuple(items[i] for i in combo), prob))
    return sorted(scores, key=lambda x: -x[1])[:top_n]

# — classify a new incoming primary prompt (just like in 4a) —
new_prompt = "Describe the optimal solar panel configuration"
X_new      = vec_primary.transform([new_prompt])
label      = km_primary.predict(X_new)[0]
primary    = f"P{label}"

# — find the top 2‑itemsets that include this primary cluster —
top_itemsets = top_kitemsets(primary, k=5, top_n=1)
print(f"New prompt classified into {primary}")
print("Top 2‑itemsets including this cluster:", top_itemsets)

New prompt classified into P1
Top 2‑itemsets including this cluster: [(('P1', 'S4', 'S40', 'S41', 'S42'), np.float64(0.6389999999999998))]
