# In [None]:
# Import necessary libraries
import pandas as pd
import spacy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings
warnings.filterwarnings('ignore')

# Enable pyLDAvis for notebook
# NOTE(review): this presumably requires an IPython/Jupyter session; confirm
# the call is harmless when the file is run as a plain script.
pyLDAvis.enable_notebook()

# Set plotting style
# Global matplotlib/seaborn defaults; every figure created below inherits them.
# NOTE(review): the 'seaborn-v0_8-*' style names likely depend on the installed
# matplotlib version - confirm against the target environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load dataset
# Reads the complaints CSV into `df`, then prints a hand-rolled summary in the
# spirit of DataFrame.info() followed by the first rows with all columns shown.
file_path = r"C:\\Users\\sahel\\Downloads\\archive (5)\\rows.csv"
print("Loading dataset...")
df = pd.read_csv(file_path, low_memory=False)

row_count = len(df)
column_count = len(df.columns)

print("\nDataset Overview:")
print("<class 'pandas.core.frame.DataFrame'>")
print(f"RangeIndex: {row_count} entries, 0 to {row_count - 1}")
print(f"Data columns (total {column_count} columns):")
for i, col in enumerate(df.columns):
    column = df[col]
    non_null = column.notna().sum()
    dtype = column.dtype
    print(f" {i:<2} {col:<35} {non_null} non-null  {dtype}")

dtype_counts = df.dtypes.value_counts().to_dict()
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"dtypes: {dtype_counts}")
print(f"memory usage: {memory_mb:.2f} MB")

print("\nFirst few rows:")
# Lift pandas' column/width limits so head() is printed without truncation.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
print(df.head())

# Check for complaint narrative column
# Use the canonical column name when present. Otherwise fall back to columns
# whose name mentions "narrative" or "complaint", trying "narrative" matches
# first so that an unrelated column such as a complaint ID is not picked
# ahead of the actual free-text field.
complaint_col = 'Consumer complaint narrative'
if complaint_col not in df.columns:
    narrative_cols = sorted(
        (col for col in df.columns
         if 'narrative' in col.lower() or 'complaint' in col.lower()),
        # Stable sort: "narrative" matches (key False) come before the rest,
        # original column order is kept within each group.
        key=lambda col: 'narrative' not in col.lower(),
    )
    if not narrative_cols:
        # Fail here with a clear message instead of an opaque KeyError below.
        raise KeyError(
            f"No complaint narrative column found: expected '{complaint_col}' "
            f"or a column whose name contains 'narrative' or 'complaint'. "
            f"Available columns: {list(df.columns)}"
        )
    complaint_col = narrative_cols[0]

# Handle missing values
print(f"\nMissing values in {complaint_col}: {df[complaint_col].isna().sum()}")
df_clean = df.dropna(subset=[complaint_col]).copy()
print(f"Dataset size after removing missing narratives: {len(df_clean)}")
if df_clean.empty:
    # Every later step (vectorizers, clustering, topic models) needs documents.
    raise ValueError(f"Column '{complaint_col}' contains no non-missing narratives.")

# Sample data for faster processing
# Fixed random_state keeps the sample reproducible between runs.
sample_size = min(5000, len(df_clean))
df_sample = df_clean.sample(sample_size, random_state=42).copy()
print(f"\nWorking with {sample_size} samples")

# Load spaCy model
# `nlp` is the pipeline preprocess_text() calls for tokenisation, lemmas and
# stop-word flags.
print("\nLoading spaCy model...")
nlp = spacy.load("en_core_web_sm")

# Preprocessing function
def preprocess_text(text):
    """Turn one raw narrative into a space-joined string of lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. A token is kept only if it is not a stop word, is purely
    alphabetic and its surface form is longer than two characters; kept
    tokens are replaced by their lemma.

    Args:
        text: Raw narrative. Any value that is not a ``str`` (``None``, NaN,
            numbers, containers) is treated as missing.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # None and NaN are not str instances, so one type check covers missing
    # values too, and avoids pd.isnull() returning an array (whose truth value
    # is ambiguous) when handed a list-like.
    if not isinstance(text, str):
        return ""
    doc = nlp(text.lower())
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha and len(token.text) > 2
    ]
    return " ".join(tokens)

# Preprocess texts
# Clean every sampled narrative, then drop rows whose cleaned text is 10
# characters or shorter.
print("\nPreprocessing texts...")
df_sample['cleaned_text'] = df_sample[complaint_col].apply(preprocess_text)
has_enough_text = df_sample['cleaned_text'].str.len() > 10
df_sample = df_sample[has_enough_text].copy()
print(f"Samples after cleaning: {len(df_sample)}")

# Both vectorizers share the same vocabulary limits.
vectorizer_options = {'max_features': 1000, 'min_df': 2, 'max_df': 0.8}

# TF-IDF Vectorization
print("\nApplying TF-IDF vectorization...")
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample['cleaned_text'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Count Vectorization
print("\nApplying Count vectorization...")
count_vectorizer = CountVectorizer(**vectorizer_options)
count_matrix = count_vectorizer.fit_transform(df_sample['cleaned_text'])
print(f"Count matrix shape: {count_matrix.shape}")

# spaCy Embeddings
# Document vectors from the medium English model are optional: when the model
# is not installed the rest of the script falls back to TF-IDF features
# (see the `spacy_available` checks further down).
print("\nGenerating spaCy embeddings...")
spacy_available = False
try:
    # Only the model load is guarded; spaCy raises OSError when the package
    # cannot be found. Errors while embedding are real bugs and must surface
    # instead of being reported as "model not available".
    nlp_md = spacy.load("en_core_web_md")
except OSError:
    print("spaCy medium model not available. Skipping spaCy embeddings.")
else:
    def get_spacy_embedding(text):
        """Return the document vector spaCy's medium model assigns to *text*."""
        doc = nlp_md(text)
        return doc.vector

    df_sample['spacy_embedding'] = df_sample['cleaned_text'].apply(get_spacy_embedding)
    # Stack the per-row vectors into one 2-D array (one row per document).
    spacy_embeddings = np.vstack(df_sample['spacy_embedding'].values)
    print(f"spaCy embeddings shape: {spacy_embeddings.shape}")
    spacy_available = True

# Dimensionality Reduction with PCA
def _plot_embedding_scatter(points_2d, title, cmap):
    """Scatter-plot 2-D projected documents, coloured by their row position.

    Returns the (figure, axes, scatter) triple so the caller can keep the
    handles.
    """
    figure, axes_ = plt.subplots(figsize=(10, 6))
    xs, ys = points_2d[:, 0], points_2d[:, 1]
    points = axes_.scatter(xs, ys, alpha=0.6, s=30, c=range(len(points_2d)), cmap=cmap)
    axes_.set_title(title, fontsize=14, fontweight='bold')
    axes_.set_xlabel("Principal Component 1", fontsize=11)
    axes_.set_ylabel("Principal Component 2", fontsize=11)
    plt.colorbar(points, ax=axes_, label='Sample Index')
    plt.tight_layout()
    plt.show()
    return figure, axes_, points


print("\nApplying PCA for visualization...")
pca_tfidf = PCA(n_components=2)
# PCA needs a dense array, hence the toarray() on the sparse TF-IDF matrix.
tfidf_2d = pca_tfidf.fit_transform(tfidf_matrix.toarray())

# Visualization - TF-IDF
fig, ax, scatter = _plot_embedding_scatter(tfidf_2d, "TF-IDF Embeddings (PCA)", 'viridis')

# Visualization - spaCy (only when the embeddings were generated)
if spacy_available:
    pca_spacy = PCA(n_components=2)
    spacy_2d = pca_spacy.fit_transform(spacy_embeddings)
    fig, ax, scatter = _plot_embedding_scatter(spacy_2d, "spaCy Embeddings (PCA)", 'plasma')

# Cosine Similarity Analysis
# Pairwise similarity over the first (up to) 100 TF-IDF rows; the diagonal is
# zeroed so a document is never reported as most similar to itself.
print("\nComputing cosine similarity...")
sample_subset = min(100, len(df_sample))
tfidf_cosine_sim = cosine_similarity(tfidf_matrix[:sample_subset])
np.fill_diagonal(tfidf_cosine_sim, 0)
flat_best = tfidf_cosine_sim.argmax()
most_similar_idx = np.unravel_index(flat_best, tfidf_cosine_sim.shape)

print(f"\nMost similar complaints (indices: {most_similar_idx}):")
narratives = df_sample[complaint_col]
for ordinal, row_position in enumerate(most_similar_idx, start=1):
    # Matrix rows follow df_sample's positional order, hence iloc.
    excerpt = narratives.iloc[row_position][:300]
    print(f"\nComplaint {ordinal}:\n{excerpt}...")
best_score = tfidf_cosine_sim[most_similar_idx]
print(f"\nSimilarity score: {best_score:.4f}")

# K-Means Clustering
# Cluster on spaCy vectors when available, otherwise on dense TF-IDF rows.
print("\nPerforming K-Means clustering...")
num_clusters = 5
if spacy_available:
    embeddings_for_clustering = spacy_embeddings
else:
    embeddings_for_clustering = tfidf_matrix.toarray()
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df_sample['cluster'] = kmeans.fit_predict(embeddings_for_clustering)

# Distribution of clusters
print("\nCluster distribution:")
cluster_distribution = df_sample['cluster'].value_counts().sort_index()
print(cluster_distribution)

# Extract top words per cluster
# Each cluster is summarised by the mean TF-IDF weight of its members,
# regardless of which embedding was used for the clustering itself.
print("\nTop words per cluster:")
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_labels = df_sample['cluster'].values
cluster_tfidf = np.vstack([
    np.asarray(tfidf_matrix[cluster_labels == i].mean(axis=0)).ravel()
    for i in range(num_clusters)
])

top_n = 10
cluster_keywords = {}
for i, mean_weights in enumerate(cluster_tfidf):
    # Highest-weighted terms first.
    top_words_idx = np.argsort(mean_weights)[::-1][:top_n]
    top_words = [feature_names[j] for j in top_words_idx]
    cluster_keywords[i] = top_words
    member_count = int((cluster_labels == i).sum())
    print(f"\nCluster {i} ({member_count} complaints):")
    print(f"  {', '.join(top_words)}")

# Visualize clusters with PCA
# Re-uses the 2-D projection of whichever embedding was clustered.
fig, ax = plt.subplots(figsize=(12, 7))
if spacy_available:
    embedding_type, embeddings_2d = "spaCy", spacy_2d
else:
    embedding_type, embeddings_2d = "TF-IDF", tfidf_2d

for i in range(num_clusters):
    in_cluster = df_sample['cluster'] == i
    cluster_points = embeddings_2d[in_cluster]
    xs = cluster_points[:, 0]
    ys = cluster_points[:, 1]
    ax.scatter(xs, ys, label=f'Cluster {i}', alpha=0.6, s=40)

ax.set_title(f"K-Means Clustering ({embedding_type} + PCA)", fontsize=14, fontweight='bold')
ax.set_xlabel("Principal Component 1", fontsize=11)
ax.set_ylabel("Principal Component 2", fontsize=11)
ax.legend(loc='best', fontsize=9)
plt.tight_layout()
plt.show()

# Word clouds for clusters
# The grid is derived from num_clusters (three clouds per row) instead of a
# fixed 2x3 layout, so changing the cluster count neither overflows the axes
# array nor leaves stray empty panels. With 5 clusters the layout and figure
# size are the same as the previous hard-coded 2x3 / (15, 10).
print("\nGenerating word clouds for clusters...")
wordcloud_cols = 3
wordcloud_rows = max(1, -(-num_clusters // wordcloud_cols))  # ceiling division
fig, axes = plt.subplots(
    wordcloud_rows,
    wordcloud_cols,
    figsize=(15, 5 * wordcloud_rows),
    squeeze=False,  # always a 2-D array, even for a single row
)
axes = axes.flatten()

for i in range(num_clusters):
    cluster_texts = ' '.join(df_sample[df_sample['cluster'] == i]['cleaned_text'].values)
    axes[i].set_title(f'Cluster {i}', fontsize=12, fontweight='bold')
    axes[i].axis('off')
    if not cluster_texts.strip():
        # WordCloud.generate() raises ValueError when given no words; leave
        # the titled panel blank rather than aborting the whole figure.
        continue
    wordcloud = WordCloud(width=400, height=300, background_color='white',
                          colormap='viridis', max_words=50).generate(cluster_texts)
    axes[i].imshow(wordcloud, interpolation='bilinear')

# Hide whatever panels the grid has beyond the number of clusters.
for unused_ax in axes[num_clusters:]:
    unused_ax.axis('off')
plt.tight_layout()
plt.show()

# Preliminary Topic Modeling
# Fit a 5-topic scikit-learn LDA on the CountVectorizer term counts. The
# fit_transform() result is kept in `lda_topics` and plotted per document in
# the heatmap below.
print("\nPerforming preliminary Latent Dirichlet Allocation (LDA)...")
n_topics_lda = 5
lda_model = LatentDirichletAllocation(n_components=n_topics_lda, random_state=42, max_iter=20)
lda_topics = lda_model.fit_transform(count_matrix)

def display_topics(model, feature_names, num_words=10):
    """Print and return the highest-weighted words of every topic in *model*.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight row per
            topic, one column per vocabulary entry).
        feature_names: Vocabulary, indexable by column position.
        num_words: How many top words to report per topic.

    Returns:
        A list with one entry per topic, each a list of words ordered from the
        highest weight downwards.
    """
    topics = []
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked_columns = weights.argsort()[::-1][:num_words]
        top_words = [feature_names[column] for column in ranked_columns]
        topics.append(top_words)
        print(f"\nTopic {topic_number}: {', '.join(top_words)}")
    return topics

def _plot_topic_heatmap(doc_topic_weights, n_topics, cmap, cbar_label, title):
    """Draw a topics-by-documents heatmap for the first (up to) 50 documents.

    Returns (figure, axes, number_of_documents_shown).
    """
    figure, axes_ = plt.subplots(figsize=(12, 6))
    shown_docs = min(50, len(doc_topic_weights))
    # Transpose so topics run down the y-axis and documents along the x-axis.
    sns.heatmap(doc_topic_weights[:shown_docs].T, cmap=cmap, ax=axes_,
                cbar_kws={'label': cbar_label})
    axes_.set_xlabel("Document Index", fontsize=11)
    axes_.set_ylabel("Topic", fontsize=11)
    axes_.set_title(title, fontsize=14, fontweight='bold')
    axes_.set_yticklabels([f'Topic {k+1}' for k in range(n_topics)], rotation=0)
    plt.tight_layout()
    plt.show()
    return figure, axes_, shown_docs


# --- LDA: top words and per-document topic distribution ---
print("\nPreliminary LDA Topics:")
lda_topic_words = display_topics(lda_model, count_vectorizer.get_feature_names_out())

print("\nGenerating LDA topic distribution heatmap...")
fig, ax, sample_docs = _plot_topic_heatmap(
    lda_topics, n_topics_lda, 'YlOrRd', 'Topic Probability',
    "LDA Topic Distribution (First 50 Documents)",
)

# --- NMF: fitted on TF-IDF weights rather than raw counts ---
print("\nPerforming preliminary Non-negative Matrix Factorization (NMF)...")
n_topics_nmf = 5
nmf_model = NMF(n_components=n_topics_nmf, random_state=42, max_iter=200)
nmf_topics = nmf_model.fit_transform(tfidf_matrix)

print("\nPreliminary NMF Topics:")
nmf_topic_words = display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())

print("\nGenerating NMF topic distribution heatmap...")
fig, ax, sample_docs = _plot_topic_heatmap(
    nmf_topics, n_topics_nmf, 'BuPu', 'Topic Weight',
    "NMF Topic Distribution (First 50 Documents)",
)

# Coherence Score Analysis for Optimal K
# For every candidate k, fit a scikit-learn LDA on the count matrix and record
# its perplexity plus the C_v and UMass coherence of its top-10 topic words.
print("\nFinding optimal number of topics using coherence approaches...")
topic_range = range(2, 11)
perplexity_scores = []
coherence_cv_scores = []
coherence_umass_scores = []

# gensim works on token lists; the dictionary maps tokens to integer ids.
texts_for_coherence = df_sample['cleaned_text'].str.split().tolist()
dictionary_coherence = corpora.Dictionary(texts_for_coherence)
corpus_coherence = [dictionary_coherence.doc2bow(text) for text in texts_for_coherence]

print("\nCalculating Perplexity, C_v Coherence, and UMass Coherence...")
# The vocabulary does not depend on k, so look it up once.
feature_names = count_vectorizer.get_feature_names_out()
for n_topics in topic_range:
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
    lda.fit(count_matrix)
    perplexity_scores.append(lda.perplexity(count_matrix))

    # Ten highest-weighted words per topic, best first.
    lda_topics_words = [
        [feature_names[i] for i in topic.argsort()[-10:][::-1]]
        for topic in lda.components_
    ]

    coherence_inputs = {
        'topics': lda_topics_words,
        'texts': texts_for_coherence,
        'dictionary': dictionary_coherence,
    }
    cm_cv = CoherenceModel(coherence='c_v', **coherence_inputs)
    coherence_cv_scores.append(cm_cv.get_coherence())

    cm_umass = CoherenceModel(coherence='u_mass', **coherence_inputs)
    coherence_umass_scores.append(cm_umass.get_coherence())

    print(f"Topics: {n_topics}, Perplexity: {perplexity_scores[-1]:.2f}, C_v: {coherence_cv_scores[-1]:.4f}, UMass: {coherence_umass_scores[-1]:.4f}")

# 2x2 dashboard: one panel per metric plus a combined coherence panel.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

single_metric_panels = (
    (axes[0, 0], perplexity_scores, 'blue', "Perplexity Score", "LDA Perplexity vs. k"),
    (axes[0, 1], coherence_cv_scores, 'green', "C_v Coherence Score", "LDA C_v Coherence vs. k"),
    (axes[1, 0], coherence_umass_scores, 'red', "UMass Coherence Score", "LDA UMass Coherence vs. k"),
)
for panel, scores, line_color, y_label, panel_title in single_metric_panels:
    panel.plot(topic_range, scores, marker='o', linewidth=2, markersize=8, color=line_color)
    panel.set_xlabel("Number of Topics (k)")
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, fontweight='bold')
    panel.grid(True, alpha=0.3)

# Both coherence curves on shared axes for a side-by-side comparison.
combined_panel = axes[1, 1]
combined_panel.plot(topic_range, coherence_cv_scores, marker='o', label='C_v', linewidth=2)
combined_panel.plot(topic_range, coherence_umass_scores, marker='s', label='UMass', linewidth=2)
combined_panel.set_xlabel("Number of Topics (k)")
combined_panel.set_ylabel("Coherence Score")
combined_panel.set_title("Combined Coherence Metrics", fontweight='bold')
combined_panel.legend()
combined_panel.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Pick k at the maximum of each coherence curve; the C_v choice is the one
# carried forward into the final models.
best_cv_position = np.argmax(coherence_cv_scores)
best_umass_position = np.argmax(coherence_umass_scores)
optimal_topics_cv = topic_range[best_cv_position]
optimal_topics_umass = topic_range[best_umass_position]
optimal_topics = optimal_topics_cv

# Final Phase Reporting and Model Training
report_lines = (
    "\nFINAL PHASE REPORT: IDENTIFIED TOPICS OVERVIEW",
    "Methodology: Dual Coherence Score Evaluation (C_v + UMass)",
    f"Optimal number of topics (k) identified by C_v Coherence: {optimal_topics}",
    f"Max C_v Coherence Score achieved: {max(coherence_cv_scores):.4f}",
    f"UMass optimal k: {optimal_topics_umass} (score: {max(coherence_umass_scores):.4f})",
    f"Recommendation: Proceeding with k={optimal_topics} for final interpretation.",
)
for report_line in report_lines:
    print(report_line)

# Train the final gensim LDA with the selected k. The dictionary is rebuilt
# here and pruned to tokens that occur in at least 2 documents and in no more
# than 80% of them.
print("\nTraining FINAL Gensim LDA model with optimal topics...")
texts = df_sample['cleaned_text'].str.split().tolist()
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=2, no_above=0.8)
corpus = list(map(dictionary.doc2bow, texts))

final_lda_options = {
    'num_topics': optimal_topics,
    'random_state': 42,
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
}
lda_gensim_final = LdaModel(corpus=corpus, id2word=dictionary, **final_lda_options)

print("\nFINAL IDENTIFIED TOPICS OVERVIEW:")
final_topics = lda_gensim_final.print_topics(num_words=15)
for idx, topic in final_topics:
    # gensim topic ids are zero-based; report them one-based.
    print(f"FINAL TOPIC {idx+1}: {topic}")

# Build the pyLDAvis view of the final model and always persist it as HTML.
# sort_topics=False is passed so pyLDAvis does not reorder the topics.
print("\nPreparing final interactive visualization...")
lda_vis_final = gensimvis.prepare(lda_gensim_final, corpus, dictionary, sort_topics=False)
html_file = 'final_lda_visualization.html'
pyLDAvis.save_html(lda_vis_final, html_file)
print(f"Visualization saved to {html_file}")

# Inline display is best-effort: the saved HTML file is the real deliverable,
# so a missing IPython or a rendering problem must not abort the run. Catch
# Exception (not a bare except) so Ctrl-C / SystemExit still propagate, and
# say why the inline view was skipped instead of failing silently.
try:
    from IPython.display import display
    display(lda_vis_final)
except Exception as exc:
    print(f"Inline display skipped ({exc}); open {html_file} in a browser instead.")

# Refit NMF on the TF-IDF matrix with the k chosen by C_v coherence so its
# topics can be compared with the final LDA topics printed above.
print("\nFINAL NMF comparison with optimal topics...")
nmf_final = NMF(n_components=optimal_topics, random_state=42, max_iter=200)
nmf_topics_final = nmf_final.fit_transform(tfidf_matrix)
display_topics(nmf_final, tfidf_vectorizer.get_feature_names_out())

# Comprehensive Summary of All Approaches Used
# The dense-embedding step is optional, so its summary line must reflect what
# actually ran instead of unconditionally claiming success.
if spacy_available:
    embeddings_summary = (
        "Dense Embeddings: Generated word vectors using spaCy's pre-trained medium language model."
    )
else:
    embeddings_summary = (
        "Dense Embeddings: Skipped (spaCy medium model not available); "
        "K-Means clustering used TF-IDF vectors instead."
    )

summary_steps = [
    "Data Preprocessing: Cleaned text using spaCy (lemmatization, stop-word removal, and alpha filtering).",
    "Text Vectorization: Implemented both TF-IDF and Count Vectorization.",
    embeddings_summary,
    "Dimensionality Reduction: Applied PCA (Principal Component Analysis) to visualize clusters in 2D.",
    "Similarity Analysis: Conducted Cosine Similarity to identify related document narratives.",
    "Unsupervised Clustering: Implemented K-Means clustering and generated Word Clouds for cluster interpretation.",
    "Topic Modeling (Preliminary): Applied LDA (scikit-learn) and NMF for initial topic discovery.",
    "Model Validation (Coherence): Performed a multi-metric sweep (Perplexity, C_v Coherence, and UMass Coherence).",
    "Optimization: Mathematically identified the optimal topic count (k) using the C_v metric.",
    "Final Modeling & Visualization: Trained an optimized Gensim LDA model and generated an interactive pyLDAvis report.",
]

print("\nCOMPREHENSIVE ANALYSIS SUMMARY")
print("The following analytical approaches were successfully implemented:")
for step_number, step_text in enumerate(summary_steps, start=1):
    print(f"{step_number}. {step_text}")


