In [23]:
from datasets import load_dataset
import pandas as pd
import spacy as sp
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np


import re
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis.gensim
import pyLDAvis
from nltk.stem import WordNetLemmatizer

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# view the col context fully...

pd.set_option('display.max_colwidth', None)

In [3]:
ds = load_dataset("Amod/mental_health_counseling_conversations")

In [4]:
df = ds['train'].to_pandas()

In [5]:
unique_contexts = df['Context'].drop_duplicates()

In [14]:
unique_contexts_list = unique_contexts.tolist()

In [36]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shafayetulislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
def simple_preprocess(text):
    """
    Simple preprocessing function without relying on NLTK
    """
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Split on whitespace
    tokens = text.split()
    
    # Remove short words and common words
    common_words = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", 
                   "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 
                   'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 
                   'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 
                   'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
                   'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 
                   'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 
                   'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 
                   'under', 'again', 'further', 'then', 'once'}
    
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
    return ' '.join(tokens)

In [29]:
def perform_topic_modeling(texts, n_topics=5):
    """
    Perform topic modeling using both LDA and NMF
    """
    # Preprocess texts
    processed_texts = [simple_preprocess(text) for text in texts]
    
    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer(
        max_features=1000,
        min_df=2,
        max_df=0.95
    )
    dtm = vectorizer.fit_transform(processed_texts)
    
    # Perform LDA
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=42,
        max_iter=20
    )
    
    # Fit LDA model
    doc_topics = lda_model.fit_transform(dtm)
    
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    
    # Get top words for each topic
    n_top_words = 10
    topics = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
        topics.append({
            'topic_id': topic_idx,
            'words': top_words
        })
    
    return {
        'doc_topics': doc_topics,
        'topics': topics,
        'model': lda_model,
        'vectorizer': vectorizer,
        'processed_texts': processed_texts
    }

In [30]:
# Test with your data
def analyze_mental_health_topics(text_list):
    """
    Analyze mental health conversations and print topics
    """
    print("Starting analysis...")
    results = perform_topic_modeling(text_list, n_topics=5)
    
    print("\nDiscovered Topics:")
    for topic in results['topics']:
        print(f"\nTopic {topic['topic_id'] + 1}:")
        print(", ".join(topic['words']))
    
    return results

In [31]:
results = analyze_mental_health_topics(unique_contexts_list)

Starting analysis...

Discovered Topics:

Topic 1:
get, thoughts, like, feel, can, when, want, how, names, all

Topic 2:
don, feel, like, not, can, know, just, how, want, years

Topic 3:
feel, don, sleep, afraid, can, everyone, like, act, people, get

Topic 4:
abused, child, hospital, harm, sad, find, mother, sensitive, getting, self

Topic 5:
disorder, depression, counseling, anxiety, how, anger, can, help, know, issues
