In [5]:
import pandas as pd

file = r'/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Main_MLS_w_Features_2025-12-18-1053.csv' # first set of 12K Data from Farbod

df = pd.read_csv(file)
df = df.rename(columns = {'PARSED_OUTPUT':'PARSED'})

  df = pd.read_csv(file)


In [4]:
def extract_features_with_lda(df, n1=300, n2=20, n3=20,
                               dedupe_threshold=3, dedupe_top_n=2000,
                               n_topics=10, lda_features=500):
    """
    Extract features with LDA topic modeling - optimized for performance

    Parameters:
    -----------
    n_topics : int
        Number of latent topics to discover (e.g., 10 = Modern, Traditional, Luxury, etc.)
    lda_features : int
        Number of top features to use for LDA (recommend 300-1000)
    """
    from collections import Counter, defaultdict
    from itertools import combinations
    import ast
    import numpy as np
    from rapidfuzz import process, distance
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer
    import pandas as pd

    print("Parsing...")
    prop_feats = []
    for s in df['PARSED'].values:
        try:
            d = ast.literal_eval(s) if pd.notna(s) else {}
            feats = set()
            for img in d.values():
                feats.update(img.get('prominent_features', []))
            prop_feats.append(frozenset(feats))
        except:
            prop_feats.append(frozenset())

    # Count ALL features first
    print("Counting all features...")
    all_singles = Counter()
    for feats in prop_feats:
        all_singles.update(feats)

    # Dedupe top N features
    if dedupe_top_n and dedupe_threshold > 0:
        top_for_dedupe = [f for f, _ in all_singles.most_common(dedupe_top_n)]
        print(f"Deduplicating top {len(top_for_dedupe)} features...")

        canonical_map = {}
        canonical_list = []

        for feat in sorted(top_for_dedupe):
            if not canonical_list:
                canonical_map[feat] = feat
                canonical_list.append(feat)
                continue

            match = process.extractOne(
                feat.lower(),
                [c.lower() for c in canonical_list],
                scorer=distance.Levenshtein.distance,
                score_cutoff=dedupe_threshold
            )

            if match:
                canonical_map[feat] = canonical_list[match[2]]
            else:
                canonical_map[feat] = feat
                canonical_list.append(feat)

        for feat in all_singles:
            if feat not in canonical_map:
                canonical_map[feat] = feat
    else:
        canonical_map = {f: f for f in all_singles}

    # Apply canonicalization
    prop_feats = [frozenset(canonical_map[f] for f in feats) for feats in prop_feats]

    # Count singles
    print("Counting singles...")
    singles = Counter()
    for feats in prop_feats:
        singles.update(feats)
    top_singles = [f for f, _ in singles.most_common(n1)]

    # ========== LDA TOPIC MODELING ==========
    print(f"\n{'='*60}")
    print("RUNNING LDA TOPIC MODELING")
    print(f"{'='*60}")

    # Get top features for LDA
    lda_top_features = [f for f, _ in singles.most_common(lda_features)]
    lda_feat_idx = {f: i for i, f in enumerate(lda_top_features)}

    # Create document-term matrix
    print(f"Building document-term matrix with {len(lda_top_features)} features...")
    doc_term_matrix = np.zeros((len(df), len(lda_top_features)), dtype=np.int8)

    for row, feats in enumerate(prop_feats):
        for feat in feats:
            if feat in lda_feat_idx:
                doc_term_matrix[row, lda_feat_idx[feat]] = 1

    # Fit LDA
    print(f"Fitting LDA with {n_topics} topics...")
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=20,
        learning_method='online',
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    # Get topic distributions for each property
    doc_topic_dist = lda_model.fit_transform(doc_term_matrix)

    # Add topic columns to dataframe (OPTIMIZED - no fragmentation, no duplicates)
    print("Adding topic distributions to dataframe...")

    # Drop any existing topic columns first to avoid duplicates
    existing_topic_cols = [f'topic_{i+1}' for i in range(n_topics)] + ['dominant_topic']
    df = df.drop(columns=[col for col in existing_topic_cols if col in df.columns], errors='ignore')

    topic_cols = {}
    for topic_idx in range(n_topics):
        topic_cols[f'topic_{topic_idx+1}'] = doc_topic_dist[:, topic_idx]

    topic_cols['dominant_topic'] = np.argmax(doc_topic_dist, axis=1) + 1

    df = pd.concat([df, pd.DataFrame(topic_cols, index=df.index)], axis=1)

    # Analyze topics
    print(f"\n{'='*60}")
    print("TOP FEATURES BY TOPIC")
    print(f"{'='*60}")

    topic_features = []
    n_top_words = 15

    for topic_idx, topic in enumerate(lda_model.components_):
        top_feature_indices = topic.argsort()[-n_top_words:][::-1]
        top_features = [lda_top_features[i] for i in top_feature_indices]
        top_weights = [topic[i] for i in top_feature_indices]

        print(f"\nTopic {topic_idx + 1}:")
        for feat, weight in zip(top_features, top_weights):
            print(f"  {feat:40s} {weight:.4f}")

        topic_features.append({
            'topic': topic_idx + 1,
            'top_features': top_features,
            'weights': top_weights
        })

    # Create topic summary dataframe (FIXED - ensure single values, not Series)
    topic_summary_data = []
    for t in topic_features:
        topic_num = t['topic']
        topic_summary_data.append({
            'topic': topic_num,
            'top_5_features': ', '.join(t['top_features'][:5]),
            'num_properties': int((df['dominant_topic'] == topic_num).sum()),
            'avg_topic_strength': float(df[f"topic_{topic_num}"].mean())
        })

    topic_summary = pd.DataFrame(topic_summary_data)

    print(f"\n{'='*60}")
    print("TOPIC SUMMARY")
    print(f"{'='*60}")
    print(topic_summary.to_string(index=False))

    # ========== CONTINUE WITH REGULAR FEATURE EXTRACTION ==========

    # Create singles columns (OPTIMIZED - no fragmentation, no duplicates)
    print("\nCreating single feature columns...")
    feat_idx = {f: i for i, f in enumerate(top_singles)}
    singles_data = np.zeros((len(df), len(top_singles)), dtype=np.int8)

    for row, feats in enumerate(prop_feats):
        for feat in feats:
            if feat in feat_idx:
                singles_data[row, feat_idx[feat]] = 1

    # Build all single columns at once
    single_cols = {}
    for i, feat in enumerate(top_singles):
        col_name = feat.replace(' ', '_')[:50]
        # Ensure unique column names
        if col_name in df.columns:
            col_name = f"{col_name}_{i}"
        single_cols[col_name] = singles_data[:, i]

    df = pd.concat([df, pd.DataFrame(single_cols, index=df.index)], axis=1)

    # Pairs (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting pairs...")
    top_set = set(top_singles)
    filtered_feats = [[f for f in feats if f in top_set] for feats in prop_feats]

    pairs = Counter()
    for feats in filtered_feats:
        if len(feats) >= 2:
            pairs.update(combinations(sorted(feats), 2))

    top_pairs = [p for p, _ in pairs.most_common(n2)]

    # Drop existing pair columns
    existing_pair_cols = [f'pair_{i}' for i in range(1, n2+1)]
    df = df.drop(columns=[col for col in existing_pair_cols if col in df.columns], errors='ignore')

    pair_definitions = []
    pair_cols = {}

    for i, (f1, f2) in enumerate(top_pairs, 1):
        i1, i2 = feat_idx[f1], feat_idx[f2]
        pair_cols[f"pair_{i}"] = (singles_data[:, i1] & singles_data[:, i2]).astype(np.int8)
        pair_definitions.append({
            'column_name': f'pair_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'count': pairs[(f1, f2)]
        })

    df = pd.concat([df, pd.DataFrame(pair_cols, index=df.index)], axis=1)

    # Triples (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting triples...")
    triples = Counter()
    for feats in filtered_feats:
        if len(feats) >= 3:
            triples.update(combinations(sorted(feats), 3))

    top_triples = [t for t, _ in triples.most_common(n3)]

    # Drop existing triple columns
    existing_triple_cols = [f'trip_{i}' for i in range(1, n3+1)]
    df = df.drop(columns=[col for col in existing_triple_cols if col in df.columns], errors='ignore')

    triple_definitions = []
    triple_cols = {}

    for i, (f1, f2, f3) in enumerate(top_triples, 1):
        i1, i2, i3 = feat_idx[f1], feat_idx[f2], feat_idx[f3]
        triple_cols[f"trip_{i}"] = (singles_data[:, i1] & singles_data[:, i2] & singles_data[:, i3]).astype(np.int8)
        triple_definitions.append({
            'column_name': f'trip_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'feature_3': f3,
            'count': triples[(f1, f2, f3)]
        })

    df = pd.concat([df, pd.DataFrame(triple_cols, index=df.index)], axis=1)

    # Conditions (OPTIMIZED - no fragmentation, no duplicates)
    print("Adding conditions...")
    parsed = [ast.literal_eval(s) if pd.notna(s) else {} for s in df['PARSED'].values]

    # Drop existing condition columns
    condition_col_names = []
    for pre in ['gran', 'high']:
        for suf in ['_in', '_ex', '']:
            condition_col_names.append(f'{pre}_c{suf}')
    df = df.drop(columns=[col for col in condition_col_names if col in df.columns], errors='ignore')

    condition_cols = {}
    for pre, key in [('gran', 'granular_condition_num'), ('high', 'high_condition_num')]:
        for suf, typ in [('_in', 'Indoor'), ('_ex', 'Exterior'), ('', None)]:
            col_name = f'{pre}_c{suf}'
            condition_cols[col_name] = [
                np.mean([img[key] for img in d.values()
                        if key in img and (not typ or img.get('image_type') == typ)])
                if d else np.nan
                for d in parsed
            ]

    df = pd.concat([df, pd.DataFrame(condition_cols, index=df.index)], axis=1)

    print(f"\n✓ Done: {df.shape}")
    print(f"✓ Added {n_topics} topic columns: topic_1 to topic_{n_topics}")
    print(f"✓ Added dominant_topic column")
    print(f"✓ Added {len(single_cols)} single features")
    print(f"✓ Added {len(pair_cols)} pair features")
    print(f"✓ Added {len(triple_cols)} triple features")
    print(f"✓ Added {len(condition_cols)} condition features")
    print(f"✓ No fragmentation warnings!")
    print(f"✓ No duplicate columns!")

    pairs_df = pd.DataFrame(pair_definitions)
    triples_df = pd.DataFrame(triple_definitions)

    return df, pairs_df, triples_df, topic_summary, lda_model


# ========== USAGE ==========

# Run with LDA (10 topics, using top 500 features)
df, pairs, triples, topics, lda_model = extract_features_with_lda(
    df,
    n1=300,           # top single features
    n2=20,            # top pairs
    n3=20,            # top triples
    dedupe_threshold=3,
    dedupe_top_n=2000,
    n_topics=10,      # number of latent topics
    lda_features=500  # features to use for LDA
)

# Save results
pairs.to_csv('pair_definitions.csv', index=False)
triples.to_csv('triple_definitions.csv', index=False)
topics.to_csv('topic_summary.csv', index=False)

# Explore topics
print("\nProperties in Topic 1:")
print(df[df['dominant_topic'] == 1][['topic_1', 'topic_2', 'topic_3']].head())

# Find properties with high Topic 3 score
print("\nTop properties for Topic 3:")
print(df.nlargest(10, 'topic_3')[['dominant_topic', 'topic_3']].head())

Parsing...
Counting all features...
Deduplicating top 2000 features...
Counting singles...

RUNNING LDA TOPIC MODELING
Building document-term matrix with 500 features...
Fitting LDA with 10 topics...
iteration: 1 of max_iter: 20
iteration: 2 of max_iter: 20
iteration: 3 of max_iter: 20
iteration: 4 of max_iter: 20
iteration: 5 of max_iter: 20
iteration: 6 of max_iter: 20
iteration: 7 of max_iter: 20
iteration: 8 of max_iter: 20
iteration: 9 of max_iter: 20
iteration: 10 of max_iter: 20
iteration: 11 of max_iter: 20
iteration: 12 of max_iter: 20
iteration: 13 of max_iter: 20
iteration: 14 of max_iter: 20
iteration: 15 of max_iter: 20
iteration: 16 of max_iter: 20
iteration: 17 of max_iter: 20
iteration: 18 of max_iter: 20
iteration: 19 of max_iter: 20
iteration: 20 of max_iter: 20
Adding topic distributions to dataframe...

TOP FEATURES BY TOPIC

Topic 1:
  standard bedroom size                    552.5699
  Neutral finishes                         384.0098
  Carpeted floor             

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



✓ Done: (11358, 1538)
✓ Added 10 topic columns: topic_1 to topic_10
✓ Added dominant_topic column
✓ Added 300 single features
✓ Added 20 pair features
✓ Added 20 triple features
✓ Added 6 condition features
✓ No duplicate columns!

Properties in Topic 1:
     topic_1   topic_2   topic_3
3   0.100000  0.100000  0.100000
8   0.100000  0.100000  0.100000
10  0.959082  0.004547  0.004546
12  0.100000  0.100000  0.100000
41  0.100000  0.100000  0.100000

Top properties for Topic 3:
       dominant_topic   topic_3
10398               3  0.819978
1278                3  0.775000
86                  3  0.774999
4338                3  0.774999
5844                3  0.774997


In [8]:
def extract_features_with_gsdmm(df, n1=300, n2=20, n3=20,
                                 dedupe_threshold=3, dedupe_top_n=2000,
                                 n_topics=10, gsdmm_features=500,
                                 alpha=0.1, beta=0.1, n_iterations=30):
    """
    Extract features with GSDMM topic modeling using gensim - optimized for performance
    GSDMM (Movie Group Process) assigns each document to ONE dominant topic

    Parameters:
    -----------
    n_topics : int
        Number of clusters/topics to discover (e.g., 10 = Modern, Traditional, Luxury, etc.)
    gsdmm_features : int
        Number of top features to use for GSDMM (recommend 300-1000)
    alpha : float
        Dirichlet parameter for document-cluster distribution (default 0.1)
    beta : float
        Dirichlet parameter for cluster-word distribution (default 0.1)
    n_iterations : int
        Number of Gibbs sampling iterations (default 30)

    Requires:
    ---------
    pip install gsdmm gensim
    """
    from collections import Counter, defaultdict
    from itertools import combinations
    import ast
    import numpy as np
    from rapidfuzz import process, distance
    import pandas as pd
    from gsdmm import MovieGroupProcess
    from gensim import corpora
    from gensim.models import CoherenceModel

    print("Parsing...")
    prop_feats = []
    for s in df['PARSED'].values:
        try:
            d = ast.literal_eval(s) if pd.notna(s) else {}
            feats = set()
            for img in d.values():
                feats.update(img.get('prominent_features', []))
            prop_feats.append(frozenset(feats))
        except:
            prop_feats.append(frozenset())

    # Count ALL features first
    print("Counting all features...")
    all_singles = Counter()
    for feats in prop_feats:
        all_singles.update(feats)

    # Dedupe top N features
    if dedupe_top_n and dedupe_threshold > 0:
        top_for_dedupe = [f for f, _ in all_singles.most_common(dedupe_top_n)]
        print(f"Deduplicating top {len(top_for_dedupe)} features...")

        canonical_map = {}
        canonical_list = []

        for feat in sorted(top_for_dedupe):
            if not canonical_list:
                canonical_map[feat] = feat
                canonical_list.append(feat)
                continue

            match = process.extractOne(
                feat.lower(),
                [c.lower() for c in canonical_list],
                scorer=distance.Levenshtein.distance,
                score_cutoff=dedupe_threshold
            )

            if match:
                canonical_map[feat] = canonical_list[match[2]]
            else:
                canonical_map[feat] = feat
                canonical_list.append(feat)

        for feat in all_singles:
            if feat not in canonical_map:
                canonical_map[feat] = feat
    else:
        canonical_map = {f: f for f in all_singles}

    # Apply canonicalization
    prop_feats = [frozenset(canonical_map[f] for f in feats) for feats in prop_feats]

    # Count singles
    print("Counting singles...")
    singles = Counter()
    for feats in prop_feats:
        singles.update(feats)
    top_singles = [f for f, _ in singles.most_common(n1)]

    # ========== GSDMM TOPIC MODELING ==========
    print(f"\n{'='*60}")
    print("RUNNING GSDMM TOPIC MODELING (Movie Group Process)")
    print(f"{'='*60}")

    # Get top features for GSDMM
    gsdmm_top_features = [f for f, _ in singles.most_common(gsdmm_features)]
    gsdmm_feat_set = set(gsdmm_top_features)

    # Create document representations (list of word strings)
    print(f"Building document representations with {len(gsdmm_top_features)} features...")
    docs = []
    for feats in prop_feats:
        doc = [feat for feat in feats if feat in gsdmm_feat_set]
        docs.append(doc)

    # Create gensim dictionary
    print("Creating gensim dictionary...")
    dictionary = corpora.Dictionary(docs)

    # Fit GSDMM (Movie Group Process)
    print(f"Fitting GSDMM with {n_topics} topics...")
    print(f"Parameters: alpha={alpha}, beta={beta}, iterations={n_iterations}")

    mgp = MovieGroupProcess(K=n_topics, alpha=alpha, beta=beta, n_iters=n_iterations)

    # Fit the model
    y = mgp.fit(docs, len(dictionary))

    print(f"GSDMM converged. Final number of active clusters: {len(set(y))}")

    # Get topic assignments for each document
    doc_topic_assignment = np.array(y)

    # Create topic distribution matrix (one-hot encoding for GSDMM)
    # GSDMM is a hard clustering method - each doc belongs to ONE topic
    doc_topic_dist = np.zeros((len(docs), n_topics))
    for i, topic in enumerate(y):
        doc_topic_dist[i, topic] = 1.0

    # Add topic columns to dataframe (OPTIMIZED - no fragmentation, no duplicates)
    print("Adding topic distributions to dataframe...")

    # Drop any existing topic columns first to avoid duplicates
    existing_topic_cols = [f'topic_{i+1}' for i in range(n_topics)] + ['dominant_topic']
    df = df.drop(columns=[col for col in existing_topic_cols if col in df.columns], errors='ignore')

    topic_cols = {}
    # For GSDMM, topic probability is binary (1 or 0)
    for topic_idx in range(n_topics):
        topic_cols[f'topic_{topic_idx+1}'] = doc_topic_dist[:, topic_idx]

    # Dominant topic is the assigned cluster
    topic_cols['dominant_topic'] = doc_topic_assignment + 1

    df = pd.concat([df, pd.DataFrame(topic_cols, index=df.index)], axis=1)

    # Analyze topics - get top words per cluster
    print(f"\n{'='*60}")
    print("TOP FEATURES BY TOPIC")
    print(f"{'='*60}")

    topic_features = []
    n_top_words = 15

    for topic_idx in range(n_topics):
        # Get top words for this cluster from GSDMM model
        top_words = mgp.top_words(topic_idx, n_top_words)

        # Get word counts
        topic_word_counts = []
        total_words_in_topic = sum(mgp.cluster_word_count[topic_idx].values())

        for word in top_words:
            count = mgp.cluster_word_count[topic_idx][word]
            weight = count / total_words_in_topic if total_words_in_topic > 0 else 0
            topic_word_counts.append((word, weight))

        print(f"\nTopic {topic_idx + 1} ({mgp.cluster_doc_count[topic_idx]} properties):")
        for word, weight in topic_word_counts:
            print(f"  {word:40s} {weight:.4f}")

        topic_features.append({
            'topic': topic_idx + 1,
            'top_features': [w for w, _ in topic_word_counts],
            'weights': [w for _, w in topic_word_counts]
        })

    # Create topic summary dataframe
    topic_summary_data = []
    for t in topic_features:
        topic_num = t['topic']
        topic_idx = topic_num - 1
        num_props = int((df['dominant_topic'] == topic_num).sum())

        topic_summary_data.append({
            'topic': topic_num,
            'top_5_features': ', '.join(t['top_features'][:5]),
            'num_properties': num_props,
            'cluster_size': mgp.cluster_doc_count[topic_idx],
            'avg_topic_strength': float(df[f"topic_{topic_num}"].mean())
        })

    topic_summary = pd.DataFrame(topic_summary_data)

    print(f"\n{'='*60}")
    print("TOPIC SUMMARY")
    print(f"{'='*60}")
    print(topic_summary.to_string(index=False))

    # Calculate coherence score using gensim
    try:
        print("\nCalculating topic coherence...")
        # Get topic words for coherence calculation
        topic_words = [[word for word in mgp.top_words(k, 10)] for k in range(n_topics)]

        # Calculate C_V coherence
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=docs,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Topic Coherence (C_V): {coherence_score:.4f}")
    except Exception as e:
        print(f"Could not calculate coherence: {e}")
        coherence_score = None

    # ========== CONTINUE WITH REGULAR FEATURE EXTRACTION ==========

    # Create singles columns (OPTIMIZED - no fragmentation, no duplicates)
    print("\nCreating single feature columns...")
    feat_idx = {f: i for i, f in enumerate(top_singles)}
    singles_data = np.zeros((len(df), len(top_singles)), dtype=np.int8)

    for row, feats in enumerate(prop_feats):
        for feat in feats:
            if feat in feat_idx:
                singles_data[row, feat_idx[feat]] = 1

    # Build all single columns at once
    single_cols = {}
    for i, feat in enumerate(top_singles):
        col_name = feat.replace(' ', '_')[:50]
        # Ensure unique column names
        if col_name in df.columns:
            col_name = f"{col_name}_{i}"
        single_cols[col_name] = singles_data[:, i]

    df = pd.concat([df, pd.DataFrame(single_cols, index=df.index)], axis=1)

    # Pairs (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting pairs...")
    top_set = set(top_singles)
    filtered_feats = [[f for f in feats if f in top_set] for feats in prop_feats]

    pairs = Counter()
    for feats in filtered_feats:
        if len(feats) >= 2:
            pairs.update(combinations(sorted(feats), 2))

    top_pairs = [p for p, _ in pairs.most_common(n2)]

    # Drop existing pair columns
    existing_pair_cols = [f'pair_{i}' for i in range(1, n2+1)]
    df = df.drop(columns=[col for col in existing_pair_cols if col in df.columns], errors='ignore')

    pair_definitions = []
    pair_cols = {}

    for i, (f1, f2) in enumerate(top_pairs, 1):
        i1, i2 = feat_idx[f1], feat_idx[f2]
        pair_cols[f"pair_{i}"] = (singles_data[:, i1] & singles_data[:, i2]).astype(np.int8)
        pair_definitions.append({
            'column_name': f'pair_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'count': pairs[(f1, f2)]
        })

    df = pd.concat([df, pd.DataFrame(pair_cols, index=df.index)], axis=1)

    # Triples (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting triples...")
    triples = Counter()
    for feats in filtered_feats:
        if len(feats) >= 3:
            triples.update(combinations(sorted(feats), 3))

    top_triples = [t for t, _ in triples.most_common(n3)]

    # Drop existing triple columns
    existing_triple_cols = [f'trip_{i}' for i in range(1, n3+1)]
    df = df.drop(columns=[col for col in existing_triple_cols if col in df.columns], errors='ignore')

    triple_definitions = []
    triple_cols = {}

    for i, (f1, f2, f3) in enumerate(top_triples, 1):
        i1, i2, i3 = feat_idx[f1], feat_idx[f2], feat_idx[f3]
        triple_cols[f"trip_{i}"] = (singles_data[:, i1] & singles_data[:, i2] & singles_data[:, i3]).astype(np.int8)
        triple_definitions.append({
            'column_name': f'trip_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'feature_3': f3,
            'count': triples[(f1, f2, f3)]
        })

    df = pd.concat([df, pd.DataFrame(triple_cols, index=df.index)], axis=1)

    # Conditions (OPTIMIZED - no fragmentation, no duplicates)
    print("Adding conditions...")
    parsed = [ast.literal_eval(s) if pd.notna(s) else {} for s in df['PARSED'].values]

    # Drop existing condition columns
    condition_col_names = []
    for pre in ['gran', 'high']:
        for suf in ['_in', '_ex', '']:
            condition_col_names.append(f'{pre}_c{suf}')
    df = df.drop(columns=[col for col in condition_col_names if col in df.columns], errors='ignore')

    condition_cols = {}
    for pre, key in [('gran', 'granular_condition_num'), ('high', 'high_condition_num')]:
        for suf, typ in [('_in', 'Indoor'), ('_ex', 'Exterior'), ('', None)]:
            col_name = f'{pre}_c{suf}'
            condition_cols[col_name] = [
                np.mean([img[key] for img in d.values()
                        if key in img and (not typ or img.get('image_type') == typ)])
                if d else np.nan
                for d in parsed
            ]

    df = pd.concat([df, pd.DataFrame(condition_cols, index=df.index)], axis=1)

    print(f"\n✓ Done: {df.shape}")
    print(f"✓ Added {n_topics} topic columns: topic_1 to topic_{n_topics}")
    print(f"✓ Added dominant_topic column (GSDMM hard clustering)")
    print(f"✓ Added {len(single_cols)} single features")
    print(f"✓ Added {len(pair_cols)} pair features")
    print(f"✓ Added {len(triple_cols)} triple features")
    print(f"✓ Added {len(condition_cols)} condition features")
    if coherence_score:
        print(f"✓ Topic coherence (C_V): {coherence_score:.4f}")
    print(f"✓ No fragmentation warnings!")
    print(f"✓ No duplicate columns!")

    pairs_df = pd.DataFrame(pair_definitions)
    triples_df = pd.DataFrame(triple_definitions)

    return df, pairs_df, triples_df, topic_summary, mgp, coherence_score


# ========== USAGE ==========

# Run with GSDMM (10 topics, using top 500 features)
df, pairs, triples, topics, gsdmm_model, coherence = extract_features_with_gsdmm(
    df,
    n1=300,           # top single features
    n2=20,            # top pairs
    n3=20,            # top triples
    dedupe_threshold=3,
    dedupe_top_n=2000,
    n_topics=10,      # number of clusters
    gsdmm_features=500,  # features to use for GSDMM
    alpha=0.1,        # Dirichlet prior for doc-topic
    beta=0.1,         # Dirichlet prior for topic-word
    n_iterations=30   # Gibbs sampling iterations
)

# Save results
pairs.to_csv('pair_definitions.csv', index=False)
triples.to_csv('triple_definitions.csv', index=False)
topics.to_csv('topic_summary_gsdmm.csv', index=False)

# Explore topics (GSDMM assigns each property to ONE topic)
print("\nProperties in Topic 1:")
print(df[df['dominant_topic'] == 1][['dominant_topic']].head())

# Count properties per topic
print("\nProperties per topic:")
print(df['dominant_topic'].value_counts().sort_index())

# See top features for a specific cluster
print("\nTop features for Topic 3:")
print(gsdmm_model.top_words(2, 10))  # Note: 0-indexed, so topic 3 is index

ModuleNotFoundError: No module named 'gsdmm'

In [9]:
%pip install gsdmm

[31mERROR: Could not find a version that satisfies the requirement gsdmm (from versions: none)[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: No matching distribution found for gsdmm[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [12]:
def extract_features_with_gsdmm(df, n1=300, n2=20, n3=20,
                                 dedupe_threshold=3, dedupe_top_n=2000,
                                 n_topics=10, gsdmm_features=500,
                                 alpha=0.1, beta=0.1, n_iterations=30):
    """
    Extract features with GSDMM topic modeling - pure implementation
    GSDMM (Movie Group Process) assigns each document to ONE dominant topic

    Parameters:
    -----------
    n_topics : int
        Number of clusters/topics to discover (e.g., 10 = Modern, Traditional, Luxury, etc.)
    gsdmm_features : int
        Number of top features to use for GSDMM (recommend 300-1000)
    alpha : float
        Dirichlet parameter for document-cluster distribution (default 0.1)
    beta : float
        Dirichlet parameter for cluster-word distribution (default 0.1)
    n_iterations : int
        Number of Gibbs sampling iterations (default 30)

    No special dependencies - pure numpy/python implementation
    """
    from collections import Counter, defaultdict
    from itertools import combinations
    import ast
    import numpy as np
    from rapidfuzz import process, distance
    import pandas as pd

    # ========== GSDMM Implementation (Movie Group Process) ==========
    class MovieGroupProcess:
        """
        GSDMM - Gibbs Sampling Dirichlet Multinomial Mixture
        Pure numpy/python implementation - no external dependencies
        """
        def __init__(self, K=10, alpha=0.1, beta=0.1, n_iters=30):
            self.K = K  # number of clusters
            self.alpha = alpha  # doc-cluster prior
            self.beta = beta  # word-cluster prior
            self.n_iters = n_iters

            self.doc_cluster = []  # cluster assignment for each doc
            self.cluster_doc_count = np.zeros(K)  # number of docs in each cluster
            self.cluster_word_count = [defaultdict(int) for _ in range(K)]  # word counts per cluster
            self.cluster_word_total = np.zeros(K)  # total words per cluster

        def fit(self, docs, vocab_size):
            """
            docs: list of lists of words (strings)
            vocab_size: number of unique words in vocabulary
            """
            n_docs = len(docs)

            # Random initialization
            print("  Initializing clusters randomly...")
            self.doc_cluster = np.random.randint(0, self.K, n_docs)

            # Populate initial counts
            for doc_id, doc in enumerate(docs):
                cluster = self.doc_cluster[doc_id]
                self.cluster_doc_count[cluster] += 1
                for word in doc:
                    self.cluster_word_count[cluster][word] += 1
                    self.cluster_word_total[cluster] += 1

            # Gibbs sampling
            print(f"  Running {self.n_iters} Gibbs sampling iterations...")
            for iteration in range(self.n_iters):
                if iteration % 5 == 0:
                    active_clusters = len([c for c in range(self.K) if self.cluster_doc_count[c] > 0])
                    print(f"    Iteration {iteration}/{self.n_iters} - Active clusters: {active_clusters}")

                for doc_id, doc in enumerate(docs):
                    if len(doc) == 0:
                        continue

                    # Remove doc from current cluster
                    old_cluster = self.doc_cluster[doc_id]
                    self.cluster_doc_count[old_cluster] -= 1
                    for word in doc:
                        self.cluster_word_count[old_cluster][word] -= 1
                        self.cluster_word_total[old_cluster] -= 1

                    # Calculate probability for each cluster
                    probs = np.zeros(self.K)
                    for k in range(self.K):
                        # P(z|d) ∝ P(z) * P(d|z)
                        # P(z) = (n_z + alpha) / (D - 1 + K*alpha)
                        p_z = (self.cluster_doc_count[k] + self.alpha) / (n_docs - 1 + self.K * self.alpha)

                        # P(d|z) = product of P(w|z) for all words in d
                        # P(w|z) = (n_z_w + beta) / (n_z + V*beta)
                        p_d_given_z = 1.0
                        for word in doc:
                            n_z_w = self.cluster_word_count[k][word]
                            n_z = self.cluster_word_total[k]
                            p_w_given_z = (n_z_w + self.beta) / (n_z + vocab_size * self.beta)
                            p_d_given_z *= p_w_given_z

                        probs[k] = p_z * p_d_given_z

                    # Normalize and sample new cluster
                    if probs.sum() == 0:
                        probs = np.ones(self.K) / self.K
                    else:
                        probs /= probs.sum()

                    new_cluster = np.random.choice(self.K, p=probs)

                    # Add doc to new cluster
                    self.doc_cluster[doc_id] = new_cluster
                    self.cluster_doc_count[new_cluster] += 1
                    for word in doc:
                        self.cluster_word_count[new_cluster][word] += 1
                        self.cluster_word_total[new_cluster] += 1

            # Final summary
            active_clusters = len([c for c in range(self.K) if self.cluster_doc_count[c] > 0])
            print(f"  ✓ Converged. Active clusters: {active_clusters}/{self.K}")

            return self.doc_cluster

        def top_words(self, cluster_id, n_words=10):
            """Get top n words for a cluster"""
            word_counts = self.cluster_word_count[cluster_id]
            sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
            return [word for word, count in sorted_words[:n_words]]

        def choose_best_label(self, cluster_id):
            """Choose best descriptive label for a cluster"""
            if self.cluster_doc_count[cluster_id] == 0:
                return "Empty Cluster"

            top_words = self.top_words(cluster_id, 3)
            return " + ".join(top_words) if top_words else "Empty"

    print("Parsing...")
    prop_feats = []
    for s in df['PARSED'].values:
        try:
            d = ast.literal_eval(s) if pd.notna(s) else {}
            feats = set()
            for img in d.values():
                feats.update(img.get('prominent_features', []))
            prop_feats.append(frozenset(feats))
        except:
            prop_feats.append(frozenset())

    # Count ALL features first
    print("Counting all features...")
    all_singles = Counter()
    for feats in prop_feats:
        all_singles.update(feats)

    # Dedupe top N features
    if dedupe_top_n and dedupe_threshold > 0:
        top_for_dedupe = [f for f, _ in all_singles.most_common(dedupe_top_n)]
        print(f"Deduplicating top {len(top_for_dedupe)} features...")

        canonical_map = {}
        canonical_list = []

        for feat in sorted(top_for_dedupe):
            if not canonical_list:
                canonical_map[feat] = feat
                canonical_list.append(feat)
                continue

            match = process.extractOne(
                feat.lower(),
                [c.lower() for c in canonical_list],
                scorer=distance.Levenshtein.distance,
                score_cutoff=dedupe_threshold
            )

            if match:
                canonical_map[feat] = canonical_list[match[2]]
            else:
                canonical_map[feat] = feat
                canonical_list.append(feat)

        for feat in all_singles:
            if feat not in canonical_map:
                canonical_map[feat] = feat
    else:
        canonical_map = {f: f for f in all_singles}

    # Apply canonicalization
    prop_feats = [frozenset(canonical_map[f] for f in feats) for feats in prop_feats]

    # Count singles
    print("Counting singles...")
    singles = Counter()
    for feats in prop_feats:
        singles.update(feats)
    top_singles = [f for f, _ in singles.most_common(n1)]

    # ========== GSDMM TOPIC MODELING ==========
    print(f"\n{'='*60}")
    print("RUNNING GSDMM TOPIC MODELING (Movie Group Process)")
    print(f"{'='*60}")

    # Get top features for GSDMM
    gsdmm_top_features = [f for f, _ in singles.most_common(gsdmm_features)]
    gsdmm_feat_set = set(gsdmm_top_features)
    vocab_size = len(gsdmm_top_features)

    # Create document representations (list of word strings)
    print(f"Building document representations with {vocab_size} features...")
    docs = []
    for feats in prop_feats:
        doc = [feat for feat in feats if feat in gsdmm_feat_set]
        docs.append(doc)

    # Fit GSDMM (Movie Group Process)
    print(f"Fitting GSDMM with {n_topics} topics...")
    print(f"Parameters: alpha={alpha}, beta={beta}, iterations={n_iterations}")

    mgp = MovieGroupProcess(K=n_topics, alpha=alpha, beta=beta, n_iters=n_iterations)
    doc_topic_assignment = mgp.fit(docs, vocab_size)

    # Create topic distribution matrix (one-hot encoding for GSDMM)
    # GSDMM is a hard clustering method - each doc belongs to ONE topic
    doc_topic_dist = np.zeros((len(docs), n_topics))
    for i, topic in enumerate(doc_topic_assignment):
        doc_topic_dist[i, topic] = 1.0

    # Add topic columns to dataframe (OPTIMIZED - no fragmentation, no duplicates)
    print("\nAdding topic distributions to dataframe...")

    # Drop any existing topic columns first to avoid duplicates
    existing_topic_cols = [f'topic_{i+1}' for i in range(n_topics)] + ['dominant_topic']
    df = df.drop(columns=[col for col in existing_topic_cols if col in df.columns], errors='ignore')

    topic_cols = {}
    # For GSDMM, topic probability is binary (1 or 0)
    for topic_idx in range(n_topics):
        topic_cols[f'topic_{topic_idx+1}'] = doc_topic_dist[:, topic_idx]

    # Dominant topic is the assigned cluster
    topic_cols['dominant_topic'] = doc_topic_assignment + 1

    df = pd.concat([df, pd.DataFrame(topic_cols, index=df.index)], axis=1)

    # Analyze topics - get top words per cluster
    print(f"\n{'='*60}")
    print("TOP FEATURES BY TOPIC")
    print(f"{'='*60}")

    topic_features = []
    n_top_words = 15

    for topic_idx in range(n_topics):
        # Get top words for this cluster
        top_words = mgp.top_words(topic_idx, n_top_words)

        # Get word counts and weights
        topic_word_data = []
        total_words_in_topic = mgp.cluster_word_total[topic_idx]

        for word in top_words:
            count = mgp.cluster_word_count[topic_idx][word]
            weight = count / total_words_in_topic if total_words_in_topic > 0 else 0
            topic_word_data.append((word, weight))

        num_docs = int(mgp.cluster_doc_count[topic_idx])
        print(f"\nTopic {topic_idx + 1} ({num_docs} properties):")
        for word, weight in topic_word_data:
            print(f"  {word:40s} {weight:.4f}")

        topic_features.append({
            'topic': topic_idx + 1,
            'top_features': [w for w, _ in topic_word_data],
            'weights': [w for _, w in topic_word_data]
        })

    # Create topic summary dataframe
    topic_summary_data = []
    for t in topic_features:
        topic_num = t['topic']
        topic_idx = topic_num - 1
        num_props = int((df['dominant_topic'] == topic_num).sum())

        topic_summary_data.append({
            'topic': topic_num,
            'top_5_features': ', '.join(t['top_features'][:5]),
            'num_properties': num_props,
            'cluster_doc_count': int(mgp.cluster_doc_count[topic_idx]),
            'avg_topic_strength': float(df[f"topic_{topic_num}"].mean()),
            'cluster_label': mgp.choose_best_label(topic_idx)
        })

    topic_summary = pd.DataFrame(topic_summary_data)

    print(f"\n{'='*60}")
    print("TOPIC SUMMARY")
    print(f"{'='*60}")
    print(topic_summary.to_string(index=False))

    # ========== CONTINUE WITH REGULAR FEATURE EXTRACTION ==========

    # Create singles columns (OPTIMIZED - no fragmentation, no duplicates)
    print("\nCreating single feature columns...")
    feat_idx = {f: i for i, f in enumerate(top_singles)}
    singles_data = np.zeros((len(df), len(top_singles)), dtype=np.int8)

    for row, feats in enumerate(prop_feats):
        for feat in feats:
            if feat in feat_idx:
                singles_data[row, feat_idx[feat]] = 1

    # Build all single columns at once
    single_cols = {}
    for i, feat in enumerate(top_singles):
        col_name = feat.replace(' ', '_')[:50]
        # Ensure unique column names
        if col_name in df.columns:
            col_name = f"{col_name}_{i}"
        single_cols[col_name] = singles_data[:, i]

    df = pd.concat([df, pd.DataFrame(single_cols, index=df.index)], axis=1)

    # Pairs (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting pairs...")
    top_set = set(top_singles)
    filtered_feats = [[f for f in feats if f in top_set] for feats in prop_feats]

    pairs = Counter()
    for feats in filtered_feats:
        if len(feats) >= 2:
            pairs.update(combinations(sorted(feats), 2))

    top_pairs = [p for p, _ in pairs.most_common(n2)]

    # Drop existing pair columns
    existing_pair_cols = [f'pair_{i}' for i in range(1, n2+1)]
    df = df.drop(columns=[col for col in existing_pair_cols if col in df.columns], errors='ignore')

    pair_definitions = []
    pair_cols = {}

    for i, (f1, f2) in enumerate(top_pairs, 1):
        i1, i2 = feat_idx[f1], feat_idx[f2]
        pair_cols[f"pair_{i}"] = (singles_data[:, i1] & singles_data[:, i2]).astype(np.int8)
        pair_definitions.append({
            'column_name': f'pair_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'count': pairs[(f1, f2)]
        })

    df = pd.concat([df, pd.DataFrame(pair_cols, index=df.index)], axis=1)

    # Triples (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting triples...")
    triples = Counter()
    for feats in filtered_feats:
        if len(feats) >= 3:
            triples.update(combinations(sorted(feats), 3))

    top_triples = [t for t, _ in triples.most_common(n3)]

    # Drop existing triple columns
    existing_triple_cols = [f'trip_{i}' for i in range(1, n3+1)]
    df = df.drop(columns=[col for col in existing_triple_cols if col in df.columns], errors='ignore')

    triple_definitions = []
    triple_cols = {}

    for i, (f1, f2, f3) in enumerate(top_triples, 1):
        i1, i2, i3 = feat_idx[f1], feat_idx[f2], feat_idx[f3]
        triple_cols[f"trip_{i}"] = (singles_data[:, i1] & singles_data[:, i2] & singles_data[:, i3]).astype(np.int8)
        triple_definitions.append({
            'column_name': f'trip_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'feature_3': f3,
            'count': triples[(f1, f2, f3)]
        })

    df = pd.concat([df, pd.DataFrame(triple_cols, index=df.index)], axis=1)

    # Conditions (OPTIMIZED - no fragmentation, no duplicates)
    print("Adding conditions...")
    parsed = [ast.literal_eval(s) if pd.notna(s) else {} for s in df['PARSED'].values]

    # Drop existing condition columns
    condition_col_names = []
    for pre in ['gran', 'high']:
        for suf in ['_in', '_ex', '']:
            condition_col_names.append(f'{pre}_c{suf}')
    df = df.drop(columns=[col for col in condition_col_names if col in df.columns], errors='ignore')

    condition_cols = {}
    for pre, key in [('gran', 'granular_condition_num'), ('high', 'high_condition_num')]:
        for suf, typ in [('_in', 'Indoor'), ('_ex', 'Exterior'), ('', None)]:
            col_name = f'{pre}_c{suf}'
            condition_cols[col_name] = [
                np.mean([img[key] for img in d.values()
                        if key in img and (not typ or img.get('image_type') == typ)])
                if d else np.nan
                for d in parsed
            ]

    df = pd.concat([df, pd.DataFrame(condition_cols, index=df.index)], axis=1)

    print(f"\n✓ Done: {df.shape}")
    print(f"✓ Added {n_topics} topic columns: topic_1 to topic_{n_topics}")
    print(f"✓ Added dominant_topic column (GSDMM hard clustering - each property assigned to ONE topic)")
    print(f"✓ Added {len(single_cols)} single features")
    print(f"✓ Added {len(pair_cols)} pair features")
    print(f"✓ Added {len(triple_cols)} triple features")
    print(f"✓ Added {len(condition_cols)} condition features")
    print(f"✓ No fragmentation warnings!")
    print(f"✓ No duplicate columns!")

    pairs_df = pd.DataFrame(pair_definitions)
    triples_df = pd.DataFrame(triple_definitions)

    return df, pairs_df, triples_df, topic_summary, mgp


# ========== USAGE ==========

# Run with GSDMM (10 topics, using top 500 features)
df, pairs, triples, topics, gsdmm_model = extract_features_with_gsdmm(
    df,
    n1=300,           # top single features
    n2=20,            # top pairs
    n3=20,            # top triples
    dedupe_threshold=3,
    dedupe_top_n=2000,
    n_topics=10,      # number of clusters
    gsdmm_features=500,  # features to use for GSDMM
    alpha=0.1,        # Dirichlet prior for doc-topic (lower = fewer topics)
    beta=0.1,         # Dirichlet prior for topic-word (lower = focused topics)
    n_iterations=30   # Gibbs sampling iterations (more = better convergence)
)

# Save results
pairs.to_csv('pair_definitions.csv', index=False)
triples.to_csv('triple_definitions.csv', index=False)
topics.to_csv('topic_summary_gsdmm.csv', index=False)

# Explore topics (GSDMM assigns each property to ONE topic)
print("\nProperties in Topic 1:")
print(df[df['dominant_topic'] == 1][['dominant_topic']].head(10))

# Count properties per topic
print("\nProperties per topic:")
topic_counts = df['dominant_topic'].value_counts().sort_index()
for topic_num, count in topic_counts.items():
    label = gsdmm_model.choose_best_label(int(topic_num) - 1)
    print(f"  Topic {topic_num}: {count} properties - [{label}]")

# See top features for ALL clusters
print("\n" + "="*60)
print("TOP 10 FEATURES FOR EACH TOPIC")
print("="*60)
for topic_idx in range(n_topics):
    topic_num = topic_idx + 1
    num_props = int((df['dominant_topic'] == topic_num).sum())
    label = gsdmm_model.choose_best_label(topic_idx)

    print(f"\nTopic {topic_num} - {num_props} properties - [{label}]:")
    for i, word in enumerate(gsdmm_model.top_words(topic_idx, 10), 1):
        count = gsdmm_model.cluster_word_count[topic_idx][word]
        print(f"  {i:2d}. {word:40s} (count: {count})")

Parsing...
Counting all features...
Deduplicating top 2000 features...
Counting singles...

RUNNING GSDMM TOPIC MODELING (Movie Group Process)
Building document representations with 500 features...
Fitting GSDMM with 10 topics...
Parameters: alpha=0.1, beta=0.1, iterations=30
  Initializing clusters randomly...
  Running 30 Gibbs sampling iterations...
    Iteration 0/30 - Active clusters: 10
    Iteration 5/30 - Active clusters: 10
    Iteration 10/30 - Active clusters: 10
    Iteration 15/30 - Active clusters: 10
    Iteration 20/30 - Active clusters: 10
    Iteration 25/30 - Active clusters: 10
  ✓ Converged. Active clusters: 10/10

Adding topic distributions to dataframe...

TOP FEATURES BY TOPIC

Topic 1 (836 properties):
  Hardwood flooring                        0.0326
  Ceiling fan                              0.0239
  Neutral paint                            0.0225
  stainless steel appliances               0.0222
  Neutral finishes                         0.0199
  granite cou

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



✓ Done: (11358, 1538)
✓ Added 10 topic columns: topic_1 to topic_10
✓ Added dominant_topic column (GSDMM hard clustering - each property assigned to ONE topic)
✓ Added 300 single features
✓ Added 20 pair features
✓ Added 20 triple features
✓ Added 6 condition features
✓ No duplicate columns!

Properties in Topic 1:
     dominant_topic
7                 1
13                1
35                1
42                1
58                1
78                1
80                1
87                1
95                1
101               1

Properties per topic:
  Topic 1: 836 properties - [Hardwood flooring + Ceiling fan + Neutral paint]
  Topic 2: 1246 properties - [Neutral paint + Ceiling fan + Carpeted floor]
  Topic 3: 1097 properties - [Neutral paint + Hardwood flooring + Ceiling fan]
  Topic 4: 961 properties - [Carpeted floor + Neutral paint + Ceiling fan]
  Topic 5: 591 properties - [Hardwood flooring + Ceiling fan + tile floor]
  Topic 6: 1156 properties - [Hardwood flooring + Ceilin

NameError: name 'n_topics' is not defined