In [2]:
# Import libraries
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy pipeline that preprocess_text() below runs on every text to
# obtain tokens, lemmas, and the punctuation / stop-word flags it filters on.
# NOTE: 'en_core_web_md' is a separately installed model package; it must be
# available in the environment (e.g. `python -m spacy download en_core_web_md`).
nlp = spacy.load('en_core_web_md')

# Sample posts, written row-wise as (disorder label, post text) pairs
sample_posts = [
    (
        'bipolar-disorder',
        "I'm feeling very anxious today. Is this normal for people with bipolar disorder?",
    ),
    (
        'bipolar-disorder',
        "Does anyone have experience with bipolar medication? I need advice.",
    ),
    (
        'obsessive-compulsive-disorder-ocd',
        "I can't stop obsessing about my thoughts. Is this a symptom of OCD?",
    ),
]

# Column-oriented view of the same posts, keyed by column name
data = {
    'disorder': [label for label, _ in sample_posts],
    'text': [post for _, post in sample_posts],
}

# One row per post, with columns 'disorder' and 'text'
df = pd.DataFrame(data)

# Text preprocessing
def preprocess_text(text: str) -> str:
    """Return *text* reduced to its space-separated content lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    that are punctuation, stop words, or pure whitespace are dropped, and the
    lemma of every remaining token is kept in its original order.

    Args:
        text: Raw text to clean.

    Returns:
        The kept lemmas joined by single spaces; an empty string when no
        token survives the filtering.
    """
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        # Whitespace-only tokens (runs of spaces, newlines) are neither
        # punctuation nor stop words, so they must be excluded explicitly or
        # they end up embedded in the "cleaned" output.
        if not (token.is_punct or token.is_stop or token.is_space)
    ]
    return ' '.join(lemmas)

# Lemmatized, stop-word-free version of every post, kept next to the raw text
df['cleaned_text'] = df['text'].map(preprocess_text)

# TF-IDF: learn the vocabulary from the cleaned posts and vectorize them
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Topic modeling: fit a 2-topic LDA on the TF-IDF matrix (seeded for
# reproducibility); the per-document topic matrix is not used further below
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda_matrix = lda.fit_transform(tfidf_matrix)

# Query similarity: clean the query exactly like the posts, project it into
# the fitted TF-IDF space, and compare it against every post
query = preprocess_text("I'm feeling anxious and need help.")
query_vector = tfidf_vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)

# Rank posts from most to least similar: argsort is ascending, so the
# positions for the single query row are reversed before indexing
ascending_positions = cosine_similarities.argsort()[0]
descending_positions = ascending_positions[::-1]
similar_documents = df.iloc[descending_positions]

# Show the ranked posts (the trailing expression is rendered by the notebook)
print("Top similar documents:")
similar_documents[['disorder', 'text']]

Top similar documents:


Unnamed: 0,disorder,text
0,bipolar-disorder,I'm feeling very anxious today. Is this normal...
1,bipolar-disorder,Does anyone have experience with bipolar medic...
2,obsessive-compulsive-disorder-ocd,I can't stop obsessing about my thoughts. Is t...
