In [2]:
import re
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# ============================================================
# CONFIGURATION
# ============================================================

CSV_FILE_PATH = '/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/input_census_neighborhood_image_data_12K.csv'
PROPERTY_ID_COL = 'PROPERTY_ID'
COMMENTS_COL = 'PUBLIC_LISTING_COMMENTS'
PARSED_OUTPUT_COL = 'PARSED_OUTPUT'  # Column with image analysis data

# LDA Parameters
N_TOPICS = 5
N_TOP_WORDS = 15
MAX_FEATURES = 150
MIN_N = 2  # minimum n-gram size
MAX_N = 3  # maximum n-gram size

# Common stopwords (including subjective adjectives)
STOPWORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'will',
    'with', 'w/', 'been', 'all', 'this', 'been', 'you', 'into', 'offers',
    'offered', 'includes', 'some', 'tons', 'nice', 'ample', 'yrs',
    # Subjective/descriptive adjectives
    'adorable', 'charm', 'character', 'beautiful', 'stunning', 'unique',
    'modern', 'decorative', 'tastefully', 'tasteful', 'convenient', 'spacious',
    'plus', 'lovely', 'charming', 'gorgeous', 'amazing', 'wonderful', 'perfect',
    'excellent', 'fantastic', 'incredible', 'magnificent', 'spectacular',
    'delightful', 'elegant', 'cozy', 'inviting', 'warm', 'bright', 'airy',
    'luxurious', 'premium', 'quality', 'great', 'good', 'better', 'best',
    'original', 'covered', 'welcomes', 'showcasing', 'spanning', 'framed',
    'abound', 'throughout'
}


# ============================================================
# FUNCTIONS
# ============================================================

def extract_prominent_features(parsed_output):
    """
    Extract all prominent_features from the parsed output dictionary.

    Args:
        parsed_output: String representation of dict or dict object

    Returns:
        List of all prominent features across all images
    """
    if pd.isna(parsed_output):
        return []

    try:
        # Convert string to dict if needed
        if isinstance(parsed_output, str):
            data = ast.literal_eval(parsed_output)
        else:
            data = parsed_output

        # Extract all prominent features from all images
        all_features = []
        for idx, image_data in data.items():
            if 'prominent_features' in image_data:
                all_features.extend(image_data['prominent_features'])

        return all_features

    except Exception as e:
        print(f"Error parsing prominent features: {e}")
        return []


def clean_text(text):
    """Clean and tokenize text, removing stopwords and numbers."""
    text = text.lower()
    text = re.sub(r'[^\w\s-]', ' ', text)
    tokens = text.split()
    # Filter out stopwords, short tokens, and tokens containing digits
    filtered = [t for t in tokens
                if t not in STOPWORDS
                and len(t) > 1
                and not any(char.isdigit() for char in t)]
    return ' '.join(filtered)


def extract_ngrams_from_text(text, min_n=2, max_n=3):
    """Extract n-grams from text."""
    if pd.isna(text) or text == '':
        return ''

    cleaned = clean_text(text)
    tokens = cleaned.split()

    ngrams = []
    for n in range(min_n, max_n + 1):
        for i in range(len(tokens) - n + 1):
            ngram = '_'.join(tokens[i:i + n])
            ngrams.append(ngram)

    return ' '.join(ngrams)


def convert_feature_to_tuple(feature):
    """
    Convert a prominent feature phrase to an n-gram tuple.
    Example: 'ceiling fan' -> 'ceiling_fan'
    """
    # Clean and join words with underscore
    words = feature.lower().strip().split()
    # Remove stopwords from the feature
    words = [w for w in words if w not in STOPWORDS]
    return '_'.join(words) if words else ''


def combine_features_and_text(row, comments_col, parsed_col, min_n=2, max_n=3):
    """
    Combine prominent features from images with n-grams from text comments.

    Args:
        row: DataFrame row
        comments_col: Column name for text comments
        parsed_col: Column name for parsed image output

    Returns:
        Combined string of features and n-grams
    """
    combined_tokens = []

    # 1. Extract and convert prominent_features from images
    prominent_features = extract_prominent_features(row[parsed_col])
    for feature in prominent_features:
        feature_tuple = convert_feature_to_tuple(feature)
        if feature_tuple:  # Only add non-empty tuples
            combined_tokens.append(feature_tuple)

    # 2. Extract n-grams from text comments
    comments_ngrams = extract_ngrams_from_text(row[comments_col], min_n, max_n)
    if comments_ngrams:
        combined_tokens.append(comments_ngrams)

    return ' '.join(combined_tokens)


def load_and_prepare_data(csv_path, id_col, comments_col, parsed_col):
    """Load data from CSV and prepare combined feature documents."""
    try:
        df = pd.read_csv(csv_path)
        print(f"Loaded {len(df)} records from CSV")

        # Filter out rows with missing data
        df = df[df[comments_col].notna() | df[parsed_col].notna()]
        print(f"Found {len(df)} records with comments or image data")

        # Combine features and text for each property
        print("\nCombining image features and text comments...")
        df['combined_document'] = df.apply(
            lambda row: combine_features_and_text(row, comments_col, parsed_col, MIN_N, MAX_N),
            axis=1
        )

        # Filter out empty documents
        df = df[df['combined_document'].str.strip() != '']
        print(f"Created {len(df)} combined documents for LDA")

        return df[id_col].tolist(), df['combined_document'].tolist()

    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_path}")
        return None, None
    except KeyError as e:
        print(f"Error: Column {e} not found in CSV")
        return None, None


def train_lda(documents, property_ids=None, n_topics=3, n_top_words=10, max_features=100):
    """Train LDA model on combined documents (image features + text n-grams)."""

    # Create document-term matrix
    # Note: documents already contain tuples, so we don't need additional processing
    vectorizer = CountVectorizer(max_features=max_features, min_df=1)
    doc_term_matrix = vectorizer.fit_transform(documents)

    print(f"\nVocabulary size: {len(vectorizer.get_feature_names_out())}")
    print(f"Document-term matrix shape: {doc_term_matrix.shape}")

    # Train LDA
    print(f"\nTraining LDA with {n_topics} topics...")
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=42,
        max_iter=20,
        learning_method='batch'
    )
    lda.fit(doc_term_matrix)

    # Display topics
    feature_names = vectorizer.get_feature_names_out()
    print("\n" + "=" * 60)
    print("DISCOVERED TOPICS (Image Features + Text)")
    print("=" * 60)

    for topic_idx, topic in enumerate(lda.components_):
        top_indices = topic.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_indices]
        top_weights = [topic[i] for i in top_indices]

        print(f"\nTopic {topic_idx + 1}:")
        print("-" * 40)
        for word, weight in zip(top_words, top_weights):
            print(f"  {word:<40} {weight:.3f}")

    # Show document-topic distribution
    doc_topics = lda.transform(doc_term_matrix)

    # Calculate average topic distribution
    avg_topic_dist = doc_topics.mean(axis=0)

    print("\n" + "=" * 60)
    print("AVERAGE TOPIC DISTRIBUTION ACROSS ALL DOCUMENTS")
    print("=" * 60)
    for topic_idx, avg_prob in enumerate(avg_topic_dist):
        print(f"  Topic {topic_idx + 1}: {avg_prob:.1%}")

    print("\n" + "=" * 60)
    print("DOCUMENT-TOPIC DISTRIBUTION (Sample)")
    print("=" * 60)

    # Show first 10 documents
    num_to_show = min(10, len(doc_topics))
    for i in range(num_to_show):
        prop_id = property_ids[i] if property_ids else f"Doc {i + 1}"
        print(f"\nProperty {prop_id}:")
        for topic_idx, prob in enumerate(doc_topics[i]):
            if prob > 0.1:  # Only show topics with >10% probability
                print(f"  Topic {topic_idx + 1}: {prob:.3f}")

    if len(doc_topics) > num_to_show:
        print(f"\n... ({len(doc_topics) - num_to_show} more documents)")

    return lda, vectorizer, doc_topics


# ============================================================
# MAIN EXECUTION
# ============================================================

if __name__ == "__main__":
    print("=" * 60)
    print("COMBINED IMAGE + TEXT LDA TOPIC MODELING")
    print("=" * 60)

    # Load and prepare data
    property_ids, documents = load_and_prepare_data(
        CSV_FILE_PATH,
        PROPERTY_ID_COL,
        COMMENTS_COL,
        PARSED_OUTPUT_COL
    )

    if documents is None or len(documents) == 0:
        print("\nNo documents found. Exiting.")
    else:
        print(f"\n{'=' * 60}")
        print(f"TRAINING LDA ON {len(documents)} COMBINED DOCUMENTS")
        print(f"Text n-gram range: {MIN_N}-{MAX_N}")
        print(f"Number of topics: {N_TOPICS}")
        print("=" * 60)

        lda_model, vectorizer, doc_topics = train_lda(
            documents,
            property_ids=property_ids,
            n_topics=N_TOPICS,
            n_top_words=N_TOP_WORDS,
            max_features=MAX_FEATURES
        )

        print("\n" + "=" * 60)
        print("ANALYSIS COMPLETE")
        print("=" * 60)
        print("\nData streams combined:")
        print("  • Image prominent_features (converted to tuples)")
        print("  • Text comments (parsed into n-grams)")
        print("\nTo improve results:")
        print("  • Adjust N_TOPICS based on your corpus size")
        print("  • Increase MAX_FEATURES for larger vocabulary")
        print("  • Tune MIN_N and MAX_N for different n-gram sizes")
        print("  • Add more domain-specific stopwords if needed")

COMBINED IMAGE + TEXT LDA TOPIC MODELING
Loaded 15062 records from CSV
Found 14662 records with comments or image data

Combining image features and text comments...
Created 14662 combined documents for LDA

TRAINING LDA ON 14662 COMBINED DOCUMENTS
Text n-gram range: 2-3
Number of topics: 5

Vocabulary size: 150
Document-term matrix shape: (14662, 150)

Training LDA with 5 topics...

DISCOVERED TOPICS (Image Features + Text)

Topic 1:
----------------------------------------
  neutral_paint                            8594.595
  carpeted_floor                           7778.804
  ceiling_fan                              4797.855
  natural_light                            4073.137
  wood                                     2786.189
  neutral_finishes                         2723.284
  tub                                      2341.950
  look_flooring                            2326.186
  built                                    2063.260
  laminate_flooring                        2055.188
