In [1]:
from datasets import load_dataset
import pandas as pd
import spacy as sp
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np


import re
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis.gensim
import pyLDAvis
from nltk.stem import WordNetLemmatizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [3]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from IPython.display import display

In [6]:
# view the col context fully...

pd.set_option('display.max_colwidth', None)

In [7]:
ds = load_dataset("Amod/mental_health_counseling_conversations")

In [8]:
df = ds['train'].to_pandas()

In [9]:
unique_contexts = df['Context'].drop_duplicates()

In [10]:
unique_contexts_list = unique_contexts.tolist()

In [11]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shafayetulislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def simple_preprocess(text):
    """
    Simple preprocessing function without relying on NLTK
    """
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Split on whitespace
    tokens = text.split()
    
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
    return ' '.join(tokens)

In [13]:
def perform_topic_modeling(texts, n_topics=5):
    """
    Perform topic modeling using both LDA and NMF
    """
    # Preprocess texts
    processed_texts = [simple_preprocess(text) for text in texts]
    
    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer(
        max_features=1000,
        min_df=2,
        max_df=0.95
    )
    dtm = vectorizer.fit_transform(processed_texts)
    
    # Perform LDA
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=42,
        max_iter=20
    )
    
    # Fit LDA model
    doc_topics = lda_model.fit_transform(dtm)
    
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    
    # Get top words for each topic
    n_top_words = 10
    topics = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
        topics.append({
            'topic_id': topic_idx,
            'words': top_words
        })
    
    return {
        'doc_topics': doc_topics,
        'topics': topics,
        'model': lda_model,
        'vectorizer': vectorizer,
        'processed_texts': processed_texts
    }

In [14]:
def visualize_topics(lda_results):
    """
    Create an interactive visualization of the LDA topics using pyLDAvis
    
    Parameters:
    lda_results (dict): Dictionary containing the LDA model results including:
        - model: The fitted LDA model
        - vectorizer: The fitted TF-IDF vectorizer
        - dtm: Document-term matrix
        - processed_texts: List of preprocessed texts
    
    Returns:
    pyLDAvis HTML object that can be displayed in a notebook or saved to file
    """
    # Get the document-term matrix from the vectorizer
    dtm = lda_results['vectorizer'].transform(lda_results['processed_texts'])
    
    # Prepare the visualization
    vis_data = pyLDAvis.sklearn.prepare(
        lda_results['model'],
        dtm,
        lda_results['vectorizer'],
        mds='tsne',  # Using t-SNE for dimension reduction
        sort_topics=False
    )
    
    # Save the visualization to an HTML file
    pyLDAvis.save_html(vis_data, 'topic_visualization.html')
    
    return vis_data

In [15]:
# Test with your data
def analyze_mental_health_topics(text_list):
    """
    Analyze mental health conversations and print topics
    """
    print("Starting analysis...")
    results = perform_topic_modeling(text_list, n_topics=5)
    
    print("\nDiscovered Topics:")
    for topic in results['topics']:
        print(f"\nTopic {topic['topic_id'] + 1}:")
        print(", ".join(topic['words']))

    print("\nGenerating interactive visualization...")
    vis_data = visualize_topics(results)
    print("Visualization saved as 'topic_visualization.html'")
    
    return results, vis_data

In [16]:
results, vis = analyze_mental_health_topics(unique_contexts_list)

Starting analysis...

Discovered Topics:

Topic 1:
know, understand, mother, done, bullied, dependent, afraid, intimate, cat, live

Topic 2:
feel, like, get, know, people, even, always, sex, thoughts, head

Topic 3:
like, feel, want, time, years, know, get, love, really, never

Topic 4:
mood, people, eating, depression, stressed, stress, approach, dog, type, counseling

Topic 5:
help, get, relationship, know, child, someone, find, wife, feel, see

Generating interactive visualization...


AttributeError: module 'pyLDAvis' has no attribute 'sklearn'