In [None]:
# Import necessary libraries
import pandas as pd
import spacy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings
warnings.filterwarnings('ignore')

# Switch pyLDAvis into notebook rendering mode
pyLDAvis.enable_notebook()

# Global look-and-feel for every matplotlib/seaborn figure below
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Read the complaints CSV into a DataFrame
file_path = r"C:\Users\sahel\Downloads\archive\rows.csv"
print("Loading dataset...")
df = pd.read_csv(file_path, low_memory=False)

# Print a DataFrame.info()-style overview: one line per column with its
# non-null count and dtype, followed by dtype totals and memory footprint.
n_rows = len(df)
n_columns = len(df.columns)
print("\nDataset Overview:")
print("<class 'pandas.core.frame.DataFrame'>")
print(f"RangeIndex: {n_rows} entries, 0 to {n_rows - 1}")
print(f"Data columns (total {n_columns} columns):")
for position, column_name in enumerate(df.columns):
    column = df[column_name]
    filled = column.notna().sum()
    print(f" {position:<2} {column_name:<35} {filled} non-null  {column.dtype}")
dtype_counts = df.dtypes.value_counts().to_dict()
print(f"dtypes: {dtype_counts}")
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"memory usage: {memory_mb:.2f} MB")

# Lift pandas' display limits so head() shows every column untruncated
print("\nFirst few rows:")
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
print(df.head())

# Locate the column holding the free-text complaint narrative.
# The canonical name is tried first. If it is absent, columns whose name
# mentions "narrative" are preferred over ones that merely mention
# "complaint", so that a non-text column with "complaint" in its name is
# not selected ahead of the actual narrative.
complaint_col = 'Consumer complaint narrative'
if complaint_col not in df.columns:
    narrative_cols = [col for col in df.columns if 'narrative' in str(col).lower()]
    narrative_cols += [
        col for col in df.columns
        if 'complaint' in str(col).lower() and col not in narrative_cols
    ]
    if not narrative_cols:
        # Fail here with an actionable message instead of a bare KeyError
        # on the first df[complaint_col] access below.
        raise KeyError(
            f"No complaint narrative column found; expected '{complaint_col}' "
            f"or a column containing 'narrative'/'complaint'. "
            f"Available columns: {list(df.columns)}"
        )
    complaint_col = narrative_cols[0]

# Handle missing values: rows without a narrative cannot be analysed
print(f"\nMissing values in {complaint_col}: {df[complaint_col].isna().sum()}")
df_clean = df.dropna(subset=[complaint_col]).copy()
print(f"Dataset size after removing missing narratives: {len(df_clean)}")

# Work on a reproducible random subset (at most 5000 rows) to keep the
# spaCy and topic-model steps below fast.
sample_size = min(len(df_clean), 5000)
df_sample = df_clean.sample(n=sample_size, random_state=42).copy()
print(f"\nWorking with {sample_size} samples")

# Small English pipeline used by preprocess_text for lemmas and stop words
print("\nLoading spaCy model...")
nlp = spacy.load("en_core_web_sm")

# Preprocessing function
def preprocess_text(text: object) -> str:
    """Turn one raw complaint narrative into a space-joined string of lemmas.

    The text is lower-cased and run through the module-level ``nlp`` spaCy
    pipeline. Only purely alphabetic, non-stop-word tokens longer than two
    characters are kept, and each is replaced by its lemma.

    Args:
        text: The raw narrative. Anything that is not a ``str`` (``None``,
            ``NaN``, numbers, lists, ...) is treated as missing.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # The type check alone covers None/NaN (neither is a str). Testing it
    # first also avoids evaluating pd.isnull() on list/array values, whose
    # element-wise result has no single truth value and would raise.
    if not isinstance(text, str):
        return ""
    doc = nlp(text.lower())
    tokens = [
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop and len(token.text) > 2
    ]
    return " ".join(tokens)

# Clean every narrative, then drop rows whose cleaned text is 10 characters
# or shorter (too little content to vectorize meaningfully).
print("\nPreprocessing texts...")
cleaned_narratives = df_sample[complaint_col].apply(preprocess_text)
df_sample['cleaned_text'] = cleaned_narratives
has_enough_text = df_sample['cleaned_text'].str.len() > 10
df_sample = df_sample.loc[has_enough_text].copy()
print(f"Samples after cleaning: {len(df_sample)}")

# Both vectorizers share the same vocabulary limits: at most 1000 terms,
# each appearing in at least 2 documents and in no more than 80% of them.
vectorizer_settings = {'max_features': 1000, 'min_df': 2, 'max_df': 0.8}

# TF-IDF weights (used for PCA, similarity, cluster keywords and NMF)
print("\nApplying TF-IDF vectorization...")
tfidf_vectorizer = TfidfVectorizer(**vectorizer_settings)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample['cleaned_text'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Raw term counts (used for LDA)
print("\nApplying Count vectorization...")
count_vectorizer = CountVectorizer(**vectorizer_settings)
count_matrix = count_vectorizer.fit_transform(df_sample['cleaned_text'])
print(f"Count matrix shape: {count_matrix.shape}")

# spaCy Embeddings
# Document vectors from the medium English model are optional: when the
# model package is not installed the rest of the script falls back to
# TF-IDF features (see the `spacy_available` flag).
print("\nGenerating spaCy embeddings...")
try:
    nlp_md = spacy.load("en_core_web_md")
except OSError:
    # spacy.load signals a missing model package with OSError. Only that
    # case is treated as "not available"; any other failure is a real
    # error and must not be hidden behind this message.
    print("spaCy medium model not available. Skipping spaCy embeddings.")
    spacy_available = False
else:
    def get_spacy_embedding(text):
        """Return the spaCy document vector for one cleaned text."""
        doc = nlp_md(text)
        return doc.vector

    df_sample['spacy_embedding'] = df_sample['cleaned_text'].apply(get_spacy_embedding)
    # Stack the per-row vectors into one 2-D array (one row per document)
    spacy_embeddings = np.vstack(df_sample['spacy_embedding'].values)
    print(f"spaCy embeddings shape: {spacy_embeddings.shape}")
    spacy_available = True

# Dimensionality Reduction with PCA
def _plot_pca_scatter(points_2d, title, cmap):
    """Scatter-plot 2-D projected points, coloured by their row position."""
    figure, axis = plt.subplots(figsize=(10, 6))
    point_collection = axis.scatter(
        points_2d[:, 0],
        points_2d[:, 1],
        alpha=0.6,
        s=30,
        c=range(len(points_2d)),
        cmap=cmap,
    )
    axis.set_title(title, fontsize=14, fontweight='bold')
    axis.set_xlabel("Principal Component 1", fontsize=11)
    axis.set_ylabel("Principal Component 2", fontsize=11)
    plt.colorbar(point_collection, ax=axis, label='Sample Index')
    plt.tight_layout()
    plt.show()


print("\nApplying PCA for visualization...")
# PCA needs a dense array, so the sparse TF-IDF matrix is densified first
pca_tfidf = PCA(n_components=2)
tfidf_2d = pca_tfidf.fit_transform(tfidf_matrix.toarray())

# Visualization - TF-IDF
_plot_pca_scatter(tfidf_2d, "TF-IDF Embeddings (PCA)", 'viridis')

# Visualization - spaCy (only when the embeddings were computed)
if spacy_available:
    pca_spacy = PCA(n_components=2)
    spacy_2d = pca_spacy.fit_transform(spacy_embeddings)
    _plot_pca_scatter(spacy_2d, "spaCy Embeddings (PCA)", 'plasma')

# Cosine Similarity Analysis
# Compare only the first (up to) 100 documents pairwise and report the
# closest pair among them.
print("\nComputing cosine similarity...")
sample_subset = min(100, len(df_sample))
tfidf_cosine_sim = cosine_similarity(tfidf_matrix[:sample_subset])
# Zero the diagonal so a document is never reported as most similar to itself
np.fill_diagonal(tfidf_cosine_sim, 0)
flat_best = tfidf_cosine_sim.argmax()
most_similar_idx = np.unravel_index(flat_best, tfidf_cosine_sim.shape)

print(f"\nMost similar complaints (indices: {most_similar_idx}):")
# Matrix rows follow df_sample's positional order, hence iloc below
original_narratives = df_sample[complaint_col]
for label, position in enumerate(most_similar_idx, start=1):
    print(f"\nComplaint {label}:\n{original_narratives.iloc[position][:300]}...")
best_score = tfidf_cosine_sim[most_similar_idx]
print(f"\nSimilarity score: {best_score:.4f}")

# K-Means Clustering
# Cluster on spaCy vectors when available, otherwise on dense TF-IDF rows
print("\nPerforming K-Means clustering...")
num_clusters = 5
if spacy_available:
    embeddings_for_clustering = spacy_embeddings
else:
    embeddings_for_clustering = tfidf_matrix.toarray()
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df_sample['cluster'] = kmeans.fit_predict(embeddings_for_clustering)

# Distribution of clusters
print("\nCluster distribution:")
cluster_sizes = df_sample['cluster'].value_counts().sort_index()
print(cluster_sizes)

# Extract top words per cluster: average the TF-IDF rows of each cluster's
# members and keep the top_n highest-weighted terms.
print("\nTop words per cluster:")
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_tfidf = np.zeros((num_clusters, tfidf_matrix.shape[1]))
top_n = 10
cluster_keywords = {}

for i in range(num_clusters):
    cluster_mask = (df_sample['cluster'] == i).values
    mean_weights = tfidf_matrix[cluster_mask].mean(axis=0)
    cluster_tfidf[i] = np.asarray(mean_weights).ravel()

    # Descending order of mean weight, truncated to the top_n terms
    ranked_terms = np.argsort(cluster_tfidf[i])[::-1][:top_n]
    top_words = [feature_names[j] for j in ranked_terms]
    cluster_keywords[i] = top_words

    member_count = int(cluster_mask.sum())
    print(f"\nCluster {i} ({member_count} complaints):")
    print(f"  {', '.join(top_words)}")

# Visualize clusters with PCA
# Reuse the 2-D projection matching the features K-Means was run on
if spacy_available:
    embedding_type, embeddings_2d = "spaCy", spacy_2d
else:
    embedding_type, embeddings_2d = "TF-IDF", tfidf_2d

fig, ax = plt.subplots(figsize=(12, 7))
cluster_labels = df_sample['cluster'].to_numpy()

# One scatter call per cluster so each gets its own colour and legend entry
for i in range(num_clusters):
    members = embeddings_2d[cluster_labels == i]
    ax.scatter(
        members[:, 0],
        members[:, 1],
        label=f'Cluster {i}',
        alpha=0.6,
        s=40,
    )

ax.set_title(f"K-Means Clustering ({embedding_type} + PCA)", fontsize=14, fontweight='bold')
ax.set_xlabel("Principal Component 1", fontsize=11)
ax.set_ylabel("Principal Component 2", fontsize=11)
ax.legend(loc='best', fontsize=9)
plt.tight_layout()
plt.show()

# Word clouds for clusters
print("\nGenerating word clouds for clusters...")
# Size the grid from num_clusters instead of assuming exactly five clusters:
# three panels per row and as many rows as needed (5 clusters -> 2x3 grid,
# 15x10 inches).
n_cols = 3
n_rows = max(1, -(-num_clusters // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.flatten()

for i in range(num_clusters):
    # Concatenate all cleaned texts of the cluster into one string for the cloud
    cluster_texts = ' '.join(df_sample[df_sample['cluster'] == i]['cleaned_text'].values)
    wordcloud = WordCloud(width=400, height=300, background_color='white',
                          colormap='viridis', max_words=50).generate(cluster_texts)

    axes[i].imshow(wordcloud, interpolation='bilinear')
    axes[i].set_title(f'Cluster {i}', fontsize=12, fontweight='bold')
    axes[i].axis('off')

# Hide every leftover panel that did not receive a word cloud
for unused_axis in axes[num_clusters:]:
    unused_axis.axis('off')
plt.tight_layout()
plt.show()

# Topic Modeling - LDA
# Fit a 5-topic LDA on the raw term counts; lda_topics holds one row of
# topic weights per document.
n_topics_lda = 5
print("\n\nPerforming Latent Dirichlet Allocation (LDA)...")
lda_model = LatentDirichletAllocation(
    n_components=n_topics_lda,
    max_iter=20,
    random_state=42,
)
lda_topics = lda_model.fit_transform(count_matrix)

def display_topics(model, feature_names, num_words=10):
    """Print and return the highest-weighted words of every topic in a model.

    Args:
        model: A fitted topic model exposing ``components_``, iterated as
            one weight vector per topic.
        feature_names: Sequence mapping a column index of ``components_``
            to its term.
        num_words: How many top words to report per topic.

    Returns:
        A list with one entry per topic, each a list of that topic's top
        words ordered from highest to lowest weight.
    """
    collected = []
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:num_words]
        words = [feature_names[idx] for idx in ranked]
        collected.append(words)
        print(f"\nTopic {topic_number}: {', '.join(words)}")
    return collected

print("\nLDA Topics:")
lda_topic_words = display_topics(lda_model, count_vectorizer.get_feature_names_out())

# Topic distribution heatmap for LDA
# Rows are topics and columns are documents (hence the transpose); only the
# first sample_docs documents are drawn to keep the plot readable.
print("\nGenerating LDA topic distribution heatmap...")
fig, ax = plt.subplots(figsize=(12, 6))
sample_docs = min(50, len(lda_topics))
sns.heatmap(lda_topics[:sample_docs].T, cmap='YlOrRd', ax=ax, cbar_kws={'label': 'Topic Probability'})
ax.set_xlabel("Document Index", fontsize=11)
ax.set_ylabel("Topic", fontsize=11)
# Report the number of documents actually plotted, which is below 50 when
# fewer documents are available.
ax.set_title(f"LDA Topic Distribution (First {sample_docs} Documents)", fontsize=14, fontweight='bold')
ax.set_yticklabels([f'Topic {i+1}' for i in range(n_topics_lda)], rotation=0)
plt.tight_layout()
plt.show()

# Topic Modeling - NMF
# Factorize the TF-IDF matrix into 5 topics; nmf_topics holds one row of
# topic weights per document.
print("\n\nPerforming Non-negative Matrix Factorization (NMF)...")
n_topics_nmf = 5
nmf_model = NMF(n_components=n_topics_nmf, random_state=42, max_iter=200)
nmf_topics = nmf_model.fit_transform(tfidf_matrix)

print("\nNMF Topics:")
nmf_topic_words = display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())

# Topic distribution heatmap for NMF
# Rows are topics and columns are documents (hence the transpose); only the
# first sample_docs documents are drawn to keep the plot readable.
print("\nGenerating NMF topic distribution heatmap...")
fig, ax = plt.subplots(figsize=(12, 6))
sample_docs = min(50, len(nmf_topics))
sns.heatmap(nmf_topics[:sample_docs].T, cmap='BuPu', ax=ax, cbar_kws={'label': 'Topic Weight'})
ax.set_xlabel("Document Index", fontsize=11)
ax.set_ylabel("Topic", fontsize=11)
# Report the number of documents actually plotted, which is below 50 when
# fewer documents are available.
ax.set_title(f"NMF Topic Distribution (First {sample_docs} Documents)", fontsize=14, fontweight='bold')
ax.set_yticklabels([f'Topic {i+1}' for i in range(n_topics_nmf)], rotation=0)
plt.tight_layout()
plt.show()

# Find optimal number of topics for LDA
def _lda_training_perplexity(topic_count):
    """Fit LDA with `topic_count` topics on count_matrix and return its perplexity on that same matrix."""
    candidate = LatentDirichletAllocation(n_components=topic_count, random_state=42, max_iter=20)
    candidate.fit(count_matrix)
    return candidate.perplexity(count_matrix)


print("\n\nFinding optimal number of topics...")
topic_range = range(2, 11)
perplexity_scores = [_lda_training_perplexity(k) for k in topic_range]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(topic_range, perplexity_scores, marker='o', linewidth=2, markersize=8)
ax.set_title("LDA Perplexity vs. Number of Topics", fontsize=14, fontweight='bold')
ax.set_xlabel("Number of Topics", fontsize=11)
ax.set_ylabel("Perplexity Score", fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# The candidate with the lowest perplexity wins
best_position = int(np.argmin(perplexity_scores))
optimal_topics = topic_range[best_position]
print(f"\nOptimal number of topics (LDA): {optimal_topics} (perplexity: {perplexity_scores[best_position]:.2f})")

# Find optimal number of topics for NMF
# Fit one NMF per candidate topic count and record its reconstruction error
print("\nEvaluating NMF with different topic numbers...")
nmf_reconstruction_errors = []

for n_topics in topic_range:
    nmf = NMF(n_components=n_topics, random_state=42, max_iter=200)
    nmf.fit(tfidf_matrix)
    nmf_reconstruction_errors.append(nmf.reconstruction_err_)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(topic_range, nmf_reconstruction_errors, marker='s', linewidth=2, markersize=8, color='orange')
ax.set_xlabel("Number of Topics", fontsize=11)
ax.set_ylabel("Reconstruction Error", fontsize=11)
ax.set_title("NMF Reconstruction Error vs. Number of Topics", fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Elbow method for NMF
# Heuristic: pick the topic count reached by the single largest drop in
# reconstruction error (most negative first difference).
# NOTE(review): this is a steepest-drop rule rather than a curvature-based
# elbow; confirm it is the intended selection criterion.
diffs = np.diff(nmf_reconstruction_errors)
elbow_position = int(np.argmin(diffs)) + 1
optimal_topics_nmf = topic_range[elbow_position]
# Look the error up by position, not by `optimal_topics_nmf - 2`, so the
# lookup stays correct if topic_range no longer starts at 2.
print(f"Optimal number of topics (NMF): {optimal_topics_nmf} (error: {nmf_reconstruction_errors[elbow_position]:.2f})")

# Comparison of methods
# Tabulate the shape and sparsity (share of zero entries) of each feature
# representation built above.
table_rule = "=" * 70
print("\n" + table_rule)
print("COMPARISON OF VECTORIZATION METHODS")
print(table_rule)
print(f"{'Method':<20} {'Shape':<20} {'Sparsity':<15}")
print("-" * 70)
for method_label, sparse_features in (('TF-IDF', tfidf_matrix), ('Count', count_matrix)):
    total_cells = sparse_features.shape[0] * sparse_features.shape[1]
    zero_share = 1 - sparse_features.nnz / total_cells
    print(f"{method_label:<20} {str(sparse_features.shape):<20} {zero_share:.2%}")
if spacy_available:
    # spaCy document vectors are a dense array, so no sparsity figure applies
    print(f"{'spaCy':<20} {str(spacy_embeddings.shape):<20} Dense")
print(table_rule)

# Gensim-based LDA for pyLDAvis Interactive Visualization
print("\n\nGenerating interactive topic visualization with Gensim LDA...")

# Prepare data for Gensim LDA: token lists -> vocabulary -> bag-of-words.
# The vocabulary is pruned to tokens found in at least 2 documents and in
# no more than 80% of them.
texts = [cleaned.split() for cleaned in df_sample['cleaned_text']]
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=2, no_above=0.8)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Train Gensim LDA model with the topic count selected by the perplexity sweep
print("Training Gensim LDA model for visualization...")
gensim_lda_settings = {
    'num_topics': optimal_topics,
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
    'random_state': 42,
}
lda_gensim = LdaModel(corpus=corpus, id2word=dictionary, **gensim_lda_settings)

# Display Gensim LDA topics (shown 1-based)
print("\nGensim LDA Topics:")
for topic_id, description in lda_gensim.print_topics(-1):
    print(f"Topic {topic_id + 1}: {description}")

# Prepare and display interactive visualization
print("\nPreparing interactive pyLDAvis visualization...")
lda_vis = gensimvis.prepare(lda_gensim, corpus, dictionary, sort_topics=False)

# Save as HTML file for interactive viewing
html_file = 'lda_visualization.html'
pyLDAvis.save_html(lda_vis, html_file)
print(f"\nInteractive visualization saved as '{html_file}'")
print("Open this file in your web browser to explore the topics interactively.")

# Display the visualization inline (for Jupyter notebooks)
# Best effort: the HTML file saved above is the fallback, so any ordinary
# failure here is reported and skipped. `except Exception` (not a bare
# `except`) keeps KeyboardInterrupt/SystemExit from being swallowed.
try:
    from IPython.display import display, HTML
    display(pyLDAvis.display(lda_vis))
except Exception:
    print("\nNote: Interactive display requires Jupyter notebook environment.")
    print(f"Please open '{html_file}' in your browser to view the interactive visualization.")

# Final summary
# Gather the figures first, then print the report line by line.
summary_rule = "=" * 70

techniques = "Vectorization techniques: TF-IDF, Count Vectorization"
if spacy_available:
    techniques += ", spaCy Embeddings"

# Most frequent value of the Product column, when the dataset has one
if 'Product' in df_sample.columns:
    top_product = df_sample['Product'].value_counts().index[0]
else:
    top_product = 'N/A'

mean_narrative_length = df_sample[complaint_col].str.len().mean()

summary_lines = [
    "\n" + summary_rule,
    "ANALYSIS SUMMARY",
    summary_rule,
    f"Total samples analyzed: {len(df_sample)}",
    f"Number of clusters (K-Means): {num_clusters}",
    techniques,
    "Topic modeling techniques: LDA, NMF",
    f"Optimal topics (LDA): {optimal_topics}",
    f"Optimal topics (NMF): {optimal_topics_nmf}",
    "\nKey insights:",
    f"  - Most prevalent complaint type: {top_product}",
    f"  - Average complaint length: {mean_narrative_length:.0f} characters",
    f"  - Clusters show clear separation in {embedding_type} space",
    summary_rule,
]
for summary_line in summary_lines:
    print(summary_line)