In [1]:
# Cell 1: Load libraries and data (with sampling)
print("1. Loading libraries and data...")
import pandas as pd
from datetime import datetime

file_path = r"C:/Users/HP/Downloads/LDA/Complaints_original.csv"
complaint_df = pd.read_csv(file_path).sample(n=20000, random_state=42)

print("✔ Data loaded successfully (20,000 samples)")
print(f"New shape: {complaint_df.shape}")
print("\nFirst 5 rows:")
complaint_df.head()

1. Loading libraries and data...
✔ Data loaded successfully (20,000 samples)
New shape: (20000, 1)

First 5 rows:


Unnamed: 0,Consumer complaint narrative
1288629,There are unknown hard inquiries on my credit ...
783591,I recently reviewed a copy of my credit report...
392978,I am a little confused. Between my wife and I ...
713857,I understand the importance of removing any in...
271522,i am filing a complaint because this creditor ...


In [2]:
# Cell 2: Configure stop words
print("2. Setting up stop words...")
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stop_words = set(ENGLISH_STOP_WORDS)
custom_stopwords = [
    "gimme", "lemme", "cause", "'cuz", "imma", "gonna", "wanna",
    "gotta", "hafta", "woulda", "coulda", "shoulda", "howdy", "day",
    "company", "bank", "hour", "express"
]

stop_words.update(custom_stopwords)

# Add Product/Sub-product unique values if present
if 'Sub-product' in complaint_df.columns:
    stop_words.update(complaint_df['Sub-product'].dropna().unique().tolist())
if 'Product' in complaint_df.columns:
    stop_words.update(complaint_df['Product'].dropna().unique().tolist())

print(f"✔ Stop words configured (Total: {len(stop_words)})")

2. Setting up stop words...
✔ Stop words configured (Total: 336)


In [3]:
# Cell 3: Initialize NLP processor
print("3. Loading NLP model...")
import spacy
nlp = spacy.load("en_core_web_sm")
pos_filter = {"AUX", "PART", "PRON", "SYM", "X"}

print("✔ spaCy model ready")

3. Loading NLP model...
✔ spaCy model ready


In [5]:
# Cell 4: Text cleaning function
print("4. Defining text cleaner...")

# First initialize all required components
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define POS filter (keep these parts of speech)
pos_filter = {'PUNCT', 'SYM', 'SPACE', 'X', 'CCONJ', 'DET', 'NUM'}

# Get stop words
stop_words = set(STOP_WORDS)

def clean_and_tokenize(text):
    doc = nlp(str(text))
    tokens = []
    for token in doc:
        if token.pos_ not in pos_filter and token.text.lower() not in stop_words:
            term = token.lemma_.strip() if token.lemma_ != '-PRON-' else token.text
            term = term.replace('X', '').replace('/', '').strip()
            if len(term) > 1:
                tokens.append(term.lower())
    return tokens

print("✔ Cleaning function defined")

4. Defining text cleaner...
✔ Cleaning function defined


In [None]:
# Cell 5: Process documents
print("5. Processing documents...")
complaint_df = complaint_df.dropna(subset=['Consumer complaint narrative'])
print(f"Working with {len(complaint_df)} documents")

print("Tokenizing (this may take a while)...")
tokenized_docs = [clean_and_tokenize(doc) for doc in complaint_df['Consumer complaint narrative']]

print("✔ Tokenization complete")
print(f"Sample tokens: {tokenized_docs[0][:10]}...")

5. Processing documents...
Working with 20000 documents
Tokenizing (this may take a while)...


In [11]:
# Cell 6: Create dictionary
print("6. Building dictionary...")
from gensim.corpora import Dictionary
dictionary = Dictionary(tokenized_docs)
print(f"Initial dictionary size: {len(dictionary)}")

# Optional filtering (uncomment if needed)
# dictionary.filter_extremes(no_below=10, no_above=0.3)
# print(f"Filtered dictionary size: {len(dictionary)}")

print("Creating corpus...")
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
print(f"✔ Corpus created ({len(corpus)} documents)")

6. Building dictionary...
Initial dictionary size: 8434
Creating corpus...
✔ Corpus created (2894 documents)


In [12]:
# Cell 7: Train LDA model
print("7. Training LDA model...")
from gensim.models import LdaMulticore
num_topics = 10

print(f"Training with {num_topics} topics (50 passes)...")
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=50,
    workers=2
)
print("✔ Model trained successfully")

7. Training LDA model...
Training with 10 topics (50 passes)...
✔ Model trained successfully


In [13]:
# Cell 8: Show topics
print("8. Extracting topics...")
topics = lda_model.show_topics(num_topics=num_topics, num_words=3, formatted=False)

print("\nDiscovered topics:")
for topic_id, words in topics:
    print(f"Topic {topic_id}: {' '.join([w for w,_ in words])}")

topic_labels = {
    topic_id: " ".join([word for word, _ in word_probs])
    for topic_id, word_probs in topics
}

8. Extracting topics...

Discovered topics:
Topic 0: payment late report
Topic 1: report account information
Topic 2: debt collection provide
Topic 3: report account consumer
Topic 4: account check money
Topic 5: consumer 15 information
Topic 6: receive mortgage send
Topic 7: payment loan pay
Topic 8: xxxx xxxxxxxx date
Topic 9: account identity report


In [16]:
# Cell 9: Assign topics (Fixed Version)
print("9. Assigning topics to documents...")

# Get document topics from the model
doc_topics = lda_model[corpus]

# Create topic labels dictionary (from Cell 8)
topic_labels = {
    topic_id: " ".join([word for word, _ in lda_model.show_topic(topic_id, topn=3)])
    for topic_id in range(lda_model.num_topics)
}

# Assign dominant topic to each document
complaint_df = complaint_df.copy()  # Ensure we're working on a copy
complaint_df['Topic_Label'] = [
    topic_labels[max(doc, key=lambda x: x[1])[0]] if doc else "No Topic"
    for doc in doc_topics
]

print("✔ Topic assignment complete")
print(f"Topic Label column exists: {'Topic_Label' in complaint_df.columns}")

9. Assigning topics to documents...
✔ Topic assignment complete
Topic Label column exists: True


In [17]:
# Cell 10: View results (Guaranteed to Work)
print("10. Sample results:")

# Verify the column exists first
if 'Topic_Label' not in complaint_df.columns:
    raise ValueError("'Topic_Label' column missing - check Cell 9")

# Safe display
sample = complaint_df.sample(3)
for _, row in sample.iterrows():
    print(f"\nComplaint (Topic: {row['Topic_Label']}):")
    print(str(row['Consumer complaint narrative'])[:200] + "...")

10. Sample results:

Complaint (Topic: payment late report):
15 USC 1681 a ( d ) ( 2 ) ( A ) ( i ) clearly states that transactions between the consumer and the Person/Corporation making the report is NOT included on the consumer reports. Yet XXXX, Experian & X...

Complaint (Topic: payment loan pay):
I tried to transfer my school loans from Navient to XXXX in XX/XX/XXXX because I was making payments but was unable to get out of just paying interest so it felt like I was n't getting anywhere with t...

Complaint (Topic: account check money):
When I purchase my original item ( mini red light treatment ) with my credit wise Wells Fargo card for {$650.00}. I decided to upgrade my purchase with a promo special that the merchant offered. I did...


In [18]:
# Save topics to text file (add this after Cell 10)
output_file = "C:/Users/HP/Downloads/LDA/FINAL_RESULT.txt"

print("\nSaving topics to text file...")
with open(output_file, 'w', encoding='utf-8') as f:
    # Write one topic per line
    for topic in complaint_df['Topic_Label']:
        f.write(f"{topic}\n")  # No extra spaces or formatting

print(f"✔ Saved {len(complaint_df)} topics to {output_file}")
print("File format verification:")
print("First 3 lines:")
with open(output_file, 'r') as f:
    for _ in range(3):
        print(f.readline(), end='')


Saving topics to text file...
✔ Saved 2894 topics to C:/Users/HP/Downloads/LDA/FINAL_RESULT.txt
File format verification:
First 3 lines:
report account information
receive mortgage send
debt collection provide


In [24]:
# Cell 12: Calculate and Save Metrics (Exact Format)
print("\n12. Calculating metrics in specified format...")

from gensim.models import CoherenceModel
import numpy as np
from scipy.stats import entropy

def calculate_exact_metrics(model, corpus, dictionary, tokenized_docs, num_topics):
    """Calculate metrics in the exact requested format"""
    # Basic counts
    num_docs = len(tokenized_docs)
    dict_size = len(dictionary)
    
    # Get topics in correct format for coherence calculations
    topics = model.show_topics(num_topics=num_topics, formatted=False)
    topic_words = [[word for word, _ in topic[1]] for topic in topics]
    
    # Coherence Scores
    cv = CoherenceModel(topics=topic_words,
                      texts=tokenized_docs,
                      dictionary=dictionary,
                      coherence='c_v').get_coherence()
    
    umass = CoherenceModel(topics=topic_words,
                         texts=tokenized_docs,
                         dictionary=dictionary,
                         coherence='u_mass').get_coherence()
    
    npmi = CoherenceModel(topics=topic_words,
                        texts=tokenized_docs,
                        dictionary=dictionary,
                        coherence='c_npmi').get_coherence()
    
    uci = CoherenceModel(topics=topic_words,
                       texts=tokenized_docs,
                       dictionary=dictionary,
                       coherence='c_uci').get_coherence()

    # Topic Diversity
    unique_words = len(set(word for topic in topic_words for word in topic))
    diversity = unique_words / (num_topics * 20)
    
    # Jaccard Distance
    jaccard_dists = []
    for i in range(num_topics):
        for j in range(i+1, num_topics):
            set1 = set(topic_words[i])
            set2 = set(topic_words[j])
            intersection = len(set1 & set2)
            union = len(set1 | set2)
            jaccard_dists.append(1 - (intersection / union) if union else 0)
    avg_jaccard = np.mean(jaccard_dists) if jaccard_dists else 0

    # Topic Distribution Stats
    topic_counts = np.zeros(num_topics)
    for doc in model[corpus]:
        if doc:
            topic_id, _ = max(doc, key=lambda x: x[1])
            topic_counts[topic_id] += 1
    
    avg_size = np.mean(topic_counts)
    std_size = np.std(topic_counts)
    size_entropy = entropy(topic_counts[topic_counts > 0])
    max_ratio = np.max(topic_counts) / num_docs if num_docs else 0

    # Confidence and Entropy
    confidences = []
    entropies = []
    for doc in model[corpus]:
        if doc:
            probs = np.array([prob for _, prob in doc])
            confidences.append(np.max(probs))
            entropies.append(entropy(probs))
    
    avg_conf = np.mean(confidences) if confidences else 0
    avg_ent = np.mean(entropies) if entropies else 0

    # Calculate coverage safely
    coverage = len(corpus)/num_docs if num_docs else 0
    
    return f"""Topics evaluated: {num_docs}
Text documents: {num_docs}
Dictionary size: {dict_size}

COHERENCE SCORES:
      C_V: {cv:.4f}
   U_MASS: {umass:.4f}
   C_NPMI: {npmi:.4f}
    C_UCI: {uci:.4f}

DIVERSITY SCORES:
  topic_diversity: {diversity:.4f}
  avg_pairwise_jaccard_distance: {avg_jaccard:.4f}

QUALITY SCORES:
  num_topics: {num_topics}
  coverage: {coverage:.4f}
  avg_topic_size: {avg_size:.4f}
  std_topic_size: {std_size:.4f}
  topic_size_entropy: {size_entropy:.4f}
  largest_topic_ratio: {max_ratio:.4f}
  avg_confidence: {avg_conf:.4f}
  avg_prob_entropy: {avg_ent:.4f}"""

# Calculate and save metrics
metrics_report = calculate_exact_metrics(lda_model, corpus, dictionary, tokenized_docs, num_topics)

# Save to file with UTF-8 encoding
with open("lda_metrics_exact.txt", "w", encoding='utf-8') as f:
    f.write(metrics_report)

# Print to notebook
print("✔ Metrics saved to 'lda_metrics_exact.txt'")
print("\n" + "="*40)
print(metrics_report)
print("="*40)


12. Calculating metrics in specified format...
✔ Metrics saved to 'lda_metrics_exact.txt'

Topics evaluated: 2894
Text documents: 2894
Dictionary size: 8434

COHERENCE SCORES:
      C_V: 0.6071
   U_MASS: -1.2850
   C_NPMI: 0.0767
    C_UCI: 0.3938

DIVERSITY SCORES:
  topic_diversity: 0.3050
  avg_pairwise_jaccard_distance: 0.8948

QUALITY SCORES:
  num_topics: 10
  coverage: 1.0000
  avg_topic_size: 289.4000
  std_topic_size: 166.2992
  topic_size_entropy: 2.1495
  largest_topic_ratio: 0.2087
  avg_confidence: 0.6436
  avg_prob_entropy: 0.8001
