In [1]:
import pandas as pd

# First set of ~12K rows of data received from Farbod.
# NOTE(review): this is an absolute, user-specific path — consider deriving it
# from a configurable data directory so the notebook runs on other machines.
file = r'/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Main_MLS_w_Features_2025-12-18-1053.csv'

# Read the CSV and expose the 'PARSED_OUTPUT' column under the shorter name
# 'PARSED', which the feature-extraction code reads.
df = pd.read_csv(file).rename(columns={'PARSED_OUTPUT': 'PARSED'})

  df = pd.read_csv(file)


In [4]:
def extract_features_with_lda(df, n1=300, n2=20, n3=20,
                               dedupe_threshold=3, dedupe_top_n=2000,
                               n_topics=10, lda_features=500):
    """
    Extract features with LDA topic modeling - optimized for performance

    Reads the 'PARSED' column of ``df``. Each non-null cell is evaluated with
    ``ast.literal_eval`` and is expected to be a dict whose values are
    per-image dicts. The keys read from each image dict are
    'prominent_features', 'granular_condition_num', 'high_condition_num' and
    'image_type'. Cells that are null, cannot be parsed, or are not dicts are
    treated as having no images.

    The input frame is not modified; a new frame with the extra columns is
    returned.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input frame; must contain a 'PARSED' column.
    n1 : int
        Number of most common (canonicalized) features turned into 0/1 columns.
    n2 : int
        Number of most common feature pairs turned into 'pair_<i>' columns.
    n3 : int
        Number of most common feature triples turned into 'trip_<i>' columns.
    dedupe_threshold : int
        Maximum case-insensitive Levenshtein distance at which a feature is
        merged into an already-accepted canonical feature. Values <= 0
        disable deduplication.
    dedupe_top_n : int
        Only the this-many most common features take part in deduplication.
        A falsy value disables deduplication.
    n_topics : int
        Number of latent topics to discover (e.g., 10 = Modern, Traditional, Luxury, etc.)
    lda_features : int
        Number of top features to use for LDA (recommend 300-1000)

    Returns:
    --------
    tuple
        ``(df, pairs_df, triples_df, topic_summary, lda_model)`` where ``df``
        is the input plus topic, single, pair, triple and condition columns,
        ``pairs_df`` / ``triples_df`` describe the pair / triple columns,
        ``topic_summary`` has one row per topic, and ``lda_model`` is the
        fitted ``LatentDirichletAllocation`` instance.
    """
    from collections import Counter
    from itertools import combinations
    import ast
    import numpy as np
    from rapidfuzz import process, distance
    from sklearn.decomposition import LatentDirichletAllocation
    import pandas as pd

    # Parse every PARSED cell exactly once; the result feeds both the feature
    # extraction below and the condition averages at the end.
    print("Parsing...")
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
    parsed = []
    n_unparseable = 0
    for s in df['PARSED'].values:
        if pd.isna(s):
            parsed.append({})
            continue
        try:
            d = ast.literal_eval(s)
        except parse_errors:
            d = None
        if not isinstance(d, dict):
            # Malformed cell: count it and carry on with "no images".
            n_unparseable += 1
            d = {}
        parsed.append(d)
    if n_unparseable:
        print(f"  Warning: {n_unparseable} PARSED values could not be parsed and were treated as empty")

    prop_feats = []
    for d in parsed:
        feats = set()
        try:
            for img in d.values():
                feats.update(img.get('prominent_features', []))
        except (AttributeError, TypeError):
            # An image entry was not a dict, or its feature list was not an
            # iterable of hashable items: the row contributes no features.
            feats = set()
        prop_feats.append(frozenset(feats))

    # Count ALL features first
    print("Counting all features...")
    all_singles = Counter()
    for feats in prop_feats:
        all_singles.update(feats)

    # Dedupe top N features
    if dedupe_top_n and dedupe_threshold > 0:
        top_for_dedupe = [f for f, _ in all_singles.most_common(dedupe_top_n)]
        print(f"Deduplicating top {len(top_for_dedupe)} features...")

        canonical_map = {}
        canonical_list = []
        # Lower-cased mirror of canonical_list, kept in step with it so the
        # list is not rebuilt for every candidate.
        canonical_lower = []

        for feat in sorted(top_for_dedupe):
            match = None
            if canonical_lower:
                match = process.extractOne(
                    feat.lower(),
                    canonical_lower,
                    scorer=distance.Levenshtein.distance,
                    score_cutoff=dedupe_threshold
                )

            if match:
                # match[2] is the index of the matched choice.
                canonical_map[feat] = canonical_list[match[2]]
            else:
                canonical_map[feat] = feat
                canonical_list.append(feat)
                canonical_lower.append(feat.lower())

        # Features outside the top N map to themselves.
        for feat in all_singles:
            if feat not in canonical_map:
                canonical_map[feat] = feat
    else:
        canonical_map = {f: f for f in all_singles}

    # Apply canonicalization
    prop_feats = [frozenset(canonical_map[f] for f in feats) for feats in prop_feats]

    # Count singles
    print("Counting singles...")
    singles = Counter()
    for feats in prop_feats:
        singles.update(feats)
    top_singles = [f for f, _ in singles.most_common(n1)]

    # ========== LDA TOPIC MODELING ==========
    print(f"\n{'='*60}")
    print("RUNNING LDA TOPIC MODELING")
    print(f"{'='*60}")

    # Get top features for LDA
    lda_top_features = [f for f, _ in singles.most_common(lda_features)]
    lda_feat_idx = {f: i for i, f in enumerate(lda_top_features)}

    # Create document-term matrix (binary presence, one row per property)
    print(f"Building document-term matrix with {len(lda_top_features)} features...")
    doc_term_matrix = np.zeros((len(df), len(lda_top_features)), dtype=np.int8)

    for row, feats in enumerate(prop_feats):
        for feat in feats:
            if feat in lda_feat_idx:
                doc_term_matrix[row, lda_feat_idx[feat]] = 1

    # Fit LDA
    print(f"Fitting LDA with {n_topics} topics...")
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=20,
        learning_method='online',
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    # Get topic distributions for each property
    doc_topic_dist = lda_model.fit_transform(doc_term_matrix)

    # Add topic columns to dataframe (OPTIMIZED - no fragmentation, no duplicates)
    print("Adding topic distributions to dataframe...")

    # Drop any existing topic columns first to avoid duplicates
    existing_topic_cols = [f'topic_{i+1}' for i in range(n_topics)] + ['dominant_topic']
    df = df.drop(columns=[col for col in existing_topic_cols if col in df.columns], errors='ignore')

    topic_cols = {}
    for topic_idx in range(n_topics):
        topic_cols[f'topic_{topic_idx+1}'] = doc_topic_dist[:, topic_idx]

    # NOTE(review): np.argmax returns the first index on ties, so a row whose
    # topic weights are all equal is labelled dominant_topic == 1. Presumably
    # that happens for rows with no features in the LDA vocabulary - confirm,
    # and consider masking such rows before interpreting topic 1.
    topic_cols['dominant_topic'] = np.argmax(doc_topic_dist, axis=1) + 1

    df = pd.concat([df, pd.DataFrame(topic_cols, index=df.index)], axis=1)

    # Analyze topics
    print(f"\n{'='*60}")
    print("TOP FEATURES BY TOPIC")
    print(f"{'='*60}")

    topic_features = []
    n_top_words = 15

    for topic_idx, topic in enumerate(lda_model.components_):
        # Indices of the highest-weighted features, largest first.
        top_feature_indices = topic.argsort()[-n_top_words:][::-1]
        top_features = [lda_top_features[i] for i in top_feature_indices]
        top_weights = [topic[i] for i in top_feature_indices]

        print(f"\nTopic {topic_idx + 1}:")
        for feat, weight in zip(top_features, top_weights):
            print(f"  {feat:40s} {weight:.4f}")

        topic_features.append({
            'topic': topic_idx + 1,
            'top_features': top_features,
            'weights': top_weights
        })

    # Create topic summary dataframe (FIXED - ensure single values, not Series)
    topic_summary_data = []
    for t in topic_features:
        topic_num = t['topic']
        topic_summary_data.append({
            'topic': topic_num,
            'top_5_features': ', '.join(t['top_features'][:5]),
            'num_properties': int((df['dominant_topic'] == topic_num).sum()),
            'avg_topic_strength': float(df[f"topic_{topic_num}"].mean())
        })

    topic_summary = pd.DataFrame(topic_summary_data)

    print(f"\n{'='*60}")
    print("TOPIC SUMMARY")
    print(f"{'='*60}")
    print(topic_summary.to_string(index=False))

    # ========== CONTINUE WITH REGULAR FEATURE EXTRACTION ==========

    # Create singles columns (OPTIMIZED - no fragmentation, no duplicates)
    print("\nCreating single feature columns...")
    feat_idx = {f: i for i, f in enumerate(top_singles)}
    singles_data = np.zeros((len(df), len(top_singles)), dtype=np.int8)

    for row, feats in enumerate(prop_feats):
        for feat in feats:
            if feat in feat_idx:
                singles_data[row, feat_idx[feat]] = 1

    # Build all single columns at once
    single_cols = {}
    # Names already used by the frame or by an earlier feature in this loop.
    # Without the second part, two features that normalise to the same name
    # (e.g. 'a b' and 'a_b', or a shared 50-char prefix) overwrite each other.
    taken_names = set(df.columns)
    for i, feat in enumerate(top_singles):
        col_name = feat.replace(' ', '_')[:50]
        # Ensure unique column names
        if col_name in taken_names:
            col_name = f"{col_name}_{i}"
            while col_name in taken_names:
                col_name = f"{col_name}_dup"
        taken_names.add(col_name)
        single_cols[col_name] = singles_data[:, i]

    df = pd.concat([df, pd.DataFrame(single_cols, index=df.index)], axis=1)

    # Pairs (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting pairs...")
    top_set = set(top_singles)
    filtered_feats = [[f for f in feats if f in top_set] for feats in prop_feats]

    pairs = Counter()
    for feats in filtered_feats:
        if len(feats) >= 2:
            # Sorting makes each pair key order-independent.
            pairs.update(combinations(sorted(feats), 2))

    top_pairs = [p for p, _ in pairs.most_common(n2)]

    # Drop existing pair columns
    existing_pair_cols = [f'pair_{i}' for i in range(1, n2+1)]
    df = df.drop(columns=[col for col in existing_pair_cols if col in df.columns], errors='ignore')

    pair_definitions = []
    pair_cols = {}

    for i, (f1, f2) in enumerate(top_pairs, 1):
        i1, i2 = feat_idx[f1], feat_idx[f2]
        pair_cols[f"pair_{i}"] = (singles_data[:, i1] & singles_data[:, i2]).astype(np.int8)
        pair_definitions.append({
            'column_name': f'pair_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'count': pairs[(f1, f2)]
        })

    df = pd.concat([df, pd.DataFrame(pair_cols, index=df.index)], axis=1)

    # Triples (OPTIMIZED - no fragmentation, no duplicates)
    print("Counting triples...")
    triples = Counter()
    for feats in filtered_feats:
        if len(feats) >= 3:
            triples.update(combinations(sorted(feats), 3))

    top_triples = [t for t, _ in triples.most_common(n3)]

    # Drop existing triple columns
    existing_triple_cols = [f'trip_{i}' for i in range(1, n3+1)]
    df = df.drop(columns=[col for col in existing_triple_cols if col in df.columns], errors='ignore')

    triple_definitions = []
    triple_cols = {}

    for i, (f1, f2, f3) in enumerate(top_triples, 1):
        i1, i2, i3 = feat_idx[f1], feat_idx[f2], feat_idx[f3]
        triple_cols[f"trip_{i}"] = (singles_data[:, i1] & singles_data[:, i2] & singles_data[:, i3]).astype(np.int8)
        triple_definitions.append({
            'column_name': f'trip_{i}',
            'feature_1': f1,
            'feature_2': f2,
            'feature_3': f3,
            'count': triples[(f1, f2, f3)]
        })

    df = pd.concat([df, pd.DataFrame(triple_cols, index=df.index)], axis=1)

    # Conditions (OPTIMIZED - no fragmentation, no duplicates)
    print("Adding conditions...")

    # Drop existing condition columns
    condition_col_names = []
    for pre in ['gran', 'high']:
        for suf in ['_in', '_ex', '']:
            condition_col_names.append(f'{pre}_c{suf}')
    df = df.drop(columns=[col for col in condition_col_names if col in df.columns], errors='ignore')

    condition_cols = {}
    for pre, key in [('gran', 'granular_condition_num'), ('high', 'high_condition_num')]:
        for suf, typ in [('_in', 'Indoor'), ('_ex', 'Exterior'), ('', None)]:
            col_name = f'{pre}_c{suf}'
            column = []
            for d in parsed:
                # Numeric scores of the images that match the requested type
                # (typ=None means every image). Non-dict images and
                # non-numeric or missing scores are skipped.
                values = [
                    img[key] for img in d.values()
                    if isinstance(img, dict)
                    and isinstance(img.get(key), (int, float))
                    and (not typ or img.get('image_type') == typ)
                ]
                # Averaging an empty list would emit a "Mean of empty slice"
                # RuntimeWarning; the resulting value (NaN) is kept as-is.
                column.append(np.mean(values) if values else np.nan)
            condition_cols[col_name] = column

    df = pd.concat([df, pd.DataFrame(condition_cols, index=df.index)], axis=1)

    print(f"\n✓ Done: {df.shape}")
    print(f"✓ Added {n_topics} topic columns: topic_1 to topic_{n_topics}")
    print(f"✓ Added dominant_topic column")
    print(f"✓ Added {len(single_cols)} single features")
    print(f"✓ Added {len(pair_cols)} pair features")
    print(f"✓ Added {len(triple_cols)} triple features")
    print(f"✓ Added {len(condition_cols)} condition features")
    print(f"✓ No fragmentation warnings!")
    print(f"✓ No duplicate columns!")

    pairs_df = pd.DataFrame(pair_definitions)
    triples_df = pd.DataFrame(triple_definitions)

    return df, pairs_df, triples_df, topic_summary, lda_model


# ========== USAGE ==========

# Build the feature frame: 300 single-feature flags, 20 pair flags, 20 triple
# flags, and 10 LDA topics fitted on the 500 most common features.
df, pairs, triples, topics, lda_model = extract_features_with_lda(
    df,
    n1=300, n2=20, n3=20,
    dedupe_threshold=3, dedupe_top_n=2000,
    n_topics=10, lda_features=500,
)

# Write the pair / triple definitions and the topic summary next to the notebook.
pairs.to_csv('pair_definitions.csv', index=False)
triples.to_csv('triple_definitions.csv', index=False)
topics.to_csv('topic_summary.csv', index=False)

# First few rows whose dominant topic is 1, with their first three topic weights.
print("\nProperties in Topic 1:")
print(df.loc[df['dominant_topic'] == 1, ['topic_1', 'topic_2', 'topic_3']].head())

# Rows with the largest topic_3 weight (top 10 selected, first 5 printed).
print("\nTop properties for Topic 3:")
print(df[['dominant_topic', 'topic_3']].nlargest(10, 'topic_3').head())

Parsing...
Counting all features...
Deduplicating top 2000 features...
Counting singles...

RUNNING LDA TOPIC MODELING
Building document-term matrix with 500 features...
Fitting LDA with 10 topics...
iteration: 1 of max_iter: 20
iteration: 2 of max_iter: 20
iteration: 3 of max_iter: 20
iteration: 4 of max_iter: 20
iteration: 5 of max_iter: 20
iteration: 6 of max_iter: 20
iteration: 7 of max_iter: 20
iteration: 8 of max_iter: 20
iteration: 9 of max_iter: 20
iteration: 10 of max_iter: 20
iteration: 11 of max_iter: 20
iteration: 12 of max_iter: 20
iteration: 13 of max_iter: 20
iteration: 14 of max_iter: 20
iteration: 15 of max_iter: 20
iteration: 16 of max_iter: 20
iteration: 17 of max_iter: 20
iteration: 18 of max_iter: 20
iteration: 19 of max_iter: 20
iteration: 20 of max_iter: 20
Adding topic distributions to dataframe...

TOP FEATURES BY TOPIC

Topic 1:
  standard bedroom size                    552.5699
  Neutral finishes                         384.0098
  Carpeted floor             

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



✓ Done: (11358, 1538)
✓ Added 10 topic columns: topic_1 to topic_10
✓ Added dominant_topic column
✓ Added 300 single features
✓ Added 20 pair features
✓ Added 20 triple features
✓ Added 6 condition features
✓ No duplicate columns!

Properties in Topic 1:
     topic_1   topic_2   topic_3
3   0.100000  0.100000  0.100000
8   0.100000  0.100000  0.100000
10  0.959082  0.004547  0.004546
12  0.100000  0.100000  0.100000
41  0.100000  0.100000  0.100000

Top properties for Topic 3:
       dominant_topic   topic_3
10398               3  0.819978
1278                3  0.775000
86                  3  0.774999
4338                3  0.774999
5844                3  0.774997
