In [7]:
!pip install pandas numpy scikit-learn nltk transformers sentence-transformers torch



In [8]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# ML Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence Transformers for better embeddings
from sentence_transformers import SentenceTransformer

In [9]:
# Fetch the NLTK tokenizer, stopword and WordNet resources in one pass.
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

print("NLTK downloads completed!")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


NLTK downloads completed!


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [10]:
# Load the FAQ dataset. The absolute path looks like a Colab upload location;
# adjust it when running elsewhere.
df = pd.read_csv('/content/Mental_Health_FAQ.csv')

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumn Names:", df.columns.tolist())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

Dataset Shape: (98, 3)

First few rows:
   Question_ID                                          Questions  \
0      1590140        What does it mean to have a mental illness?   
1      2110618                    Who does mental illness affect?   
2      6361820                        What causes mental illness?   
4      7657263            Can people with mental illness recover?   

                                             Answers  
0  Mental illnesses are health conditions that di...  
1  It is estimated that mental illness affects 1 ...  
2  It is estimated that mental illness affects 1 ...  
3  Symptoms of mental health disorders vary depen...  
4  When healing from mental illness, early identi...  

Column Names: ['Question_ID', 'Questions', 'Answers']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question_ID  98 non-null 

In [11]:
# Clean the dataset
# Clean the dataset: drop rows with any missing value, then renumber the index
# so row labels stay equal to the positional (.iloc) rows used for retrieval.
df = df.dropna().reset_index(drop=True)

# Rename columns for easier access (positional: the frame must have three columns)
df.columns = ['Question_ID', 'Questions', 'Answers']

# Display cleaned data
print("Cleaned Dataset Shape:", df.shape)
print("\nSample Questions and Answers:")
# Preview at most three rows so a frame with fewer rows does not raise IndexError.
for i in range(min(3, len(df))):
    print(f"\nQ: {df['Questions'].iloc[i]}")
    print(f"A: {df['Answers'].iloc[i][:200]}...")  # Show first 200 chars

Cleaned Dataset Shape: (98, 3)

Sample Questions and Answers:

Q: What does it mean to have a mental illness?
A: Mental illnesses are health conditions that disrupt a personâ€™s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the or...

Q: Who does mental illness affect?
A: It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of g...

Q: What causes mental illness?
A: It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of g...


In [12]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Normalise a piece of text for keyword-based matching.

    The text is lower-cased and tokenized with NLTK. Tokens that are not
    purely alphanumeric, or that appear in the English stopword set, are
    discarded; the remaining tokens are lemmatized and joined back into a
    single space-separated string.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue  # punctuation and tokens containing symbols are dropped
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Store the normalised form of every question alongside the original text
df['Processed_Questions'] = df['Questions'].map(preprocess_text)

print("Sample Processed Questions:")
for row_pos in range(3):
    raw_question = df['Questions'].iloc[row_pos]
    cleaned_question = df['Processed_Questions'].iloc[row_pos]
    print(f"\nOriginal: {raw_question}")
    print(f"Processed: {cleaned_question}")


Sample Processed Questions:

Original: What does it mean to have a mental illness?
Processed: mean mental illness

Original: Who does mental illness affect?
Processed: mental illness affect

Original: What causes mental illness?
Processed: cause mental illness


In [13]:
banner = "=" * 50
print("\n" + banner)
print("METHOD 1: TF-IDF Based Approach")
print(banner)

# Vectorizer over unigrams and bigrams, capped at 1000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)

# Learn the vocabulary from the preprocessed questions and vectorize them
processed_questions = df['Processed_Questions']
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_questions)

print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"Vocabulary Size: {len(tfidf_vectorizer.vocabulary_)}")

# Persist the fitted vectorizer so it can be reloaded later
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

print("TF-IDF Vectorizer saved as 'tfidf_vectorizer.pkl'")


METHOD 1: TF-IDF Based Approach

TF-IDF Matrix Shape: (98, 428)
Vocabulary Size: 428
TF-IDF Vectorizer saved as 'tfidf_vectorizer.pkl'


In [14]:
banner = "=" * 50
print("\n" + banner)
print("METHOD 2: Sentence Transformer Based Approach")
print(banner)

# Pre-trained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the raw (not preprocessed) question text, one vector per question
print("Generating embeddings for all questions...")
raw_questions = df['Questions'].tolist()
question_embeddings = model.encode(
    raw_questions,
    show_progress_bar=True,
    convert_to_numpy=True,
)

print(f"\nEmbeddings Shape: {question_embeddings.shape}")

# Persist the embedding matrix
np.save('question_embeddings.npy', question_embeddings)
print("Question embeddings saved as 'question_embeddings.npy'")


METHOD 2: Sentence Transformer Based Approach


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings for all questions...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]


Embeddings Shape: (98, 384)
Question embeddings saved as 'question_embeddings.npy'


In [15]:
# Persist the cleaned FAQ table (the index is not written out)
df.to_csv('cleaned_mental_health_faq.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_mental_health_faq.csv'")

# Summary metadata describing the artefacts produced above
embedding_dim = question_embeddings.shape[1]
num_questions = len(df)
model_info = {
    'model_name': 'all-MiniLM-L6-v2',
    'embedding_dim': embedding_dim,
    'num_questions': num_questions,
    'methods': ['tfidf', 'sentence_transformer'],
}

with open('model_info.pkl', 'wb') as info_file:
    pickle.dump(model_info, info_file)

print("Model info saved as 'model_info.pkl'")


Cleaned dataset saved as 'cleaned_mental_health_faq.csv'
Model info saved as 'model_info.pkl'


In [16]:
def get_response_tfidf(user_query, df, vectorizer, tfidf_matrix, threshold=0.3):
    """
    Answer a query by TF-IDF retrieval over the FAQ questions.

    The query is passed through ``preprocess_text``, vectorized with
    ``vectorizer`` and compared by cosine similarity against every row of
    ``tfidf_matrix``. The position of the best-scoring row is used to look
    up ``df['Answers']`` positionally.

    Returns an ``(answer, similarity)`` pair. When the best similarity does
    not exceed ``threshold``, a fixed apology message is returned together
    with ``0.0``.
    """
    query_vector = vectorizer.transform([preprocess_text(user_query)])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    winner = scores.argmax()
    top_score = scores[winner]

    # Negated ">" (not "<=") so that a NaN score also takes the fallback path.
    if not (top_score > threshold):
        return "I'm sorry, I don't have information about that specific question. Please try rephrasing or ask about mental health topics like stress, therapy, treatment options, or symptoms.", 0.0

    return df['Answers'].iloc[winner], top_score

def get_response_transformer(user_query, df, model, question_embeddings, threshold=0.5):
    """
    Answer a query by embedding similarity over the FAQ questions.

    The raw query is encoded with ``model`` and compared by cosine
    similarity against every row of ``question_embeddings``. The position
    of the best-scoring row is used to look up ``df['Answers']``
    positionally.

    Returns ``(answer, similarity, top_indices, top_similarities)`` where
    the last two describe the three best matches, best first. When the best
    similarity does not exceed ``threshold``, a fixed apology message is
    returned together with ``0.0`` and two empty lists.
    """
    query_embedding = model.encode([user_query], convert_to_numpy=True)
    scores = cosine_similarity(query_embedding, question_embeddings).flatten()

    # Positions of the three highest scores, best first
    ranked = scores.argsort()[::-1][:3]
    winner = ranked[0]
    top_score = scores[winner]

    # Negated ">" (not "<=") so that a NaN score also takes the fallback path.
    if not (top_score > threshold):
        return "I'm sorry, I don't have information about that specific question. Please try rephrasing or ask about mental health topics like stress, therapy, treatment options, or symptoms.", 0.0, [], []

    return df['Answers'].iloc[winner], top_score, ranked, scores[ranked]

print("Response functions defined successfully!")

Response functions defined successfully!


In [17]:
def _report(query, similarity, response):
    """Print one query with its match score and a 200-character answer preview."""
    print(f"\nQuery: {query}")
    print(f"Similarity: {similarity:.3f}")
    print(f"Response: {response[:200]}...")


banner = "=" * 50
print("\n" + banner)
print("TESTING THE CHATBOT")
print(banner)

test_queries = [
    "What is mental illness?",
    "How can I find a therapist?",
    "What are the symptoms of depression?",
    "How do I help someone with mental health issues?",
    "What treatment options are available?"
]

# Each method is exercised on the first three queries only
demo_queries = test_queries[:3]

print("\n--- TF-IDF Method ---")
for query in demo_queries:
    response, similarity = get_response_tfidf(query, df, tfidf_vectorizer, tfidf_matrix)
    _report(query, similarity, response)

print("\n\n--- Sentence Transformer Method ---")
for query in demo_queries:
    response, similarity, top_idx, top_sim = get_response_transformer(query, df, model, question_embeddings)
    _report(query, similarity, response)



TESTING THE CHATBOT

--- TF-IDF Method ---

Query: What is mental illness?
Similarity: 0.659
Response: It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of g...

Query: How can I find a therapist?
Similarity: 0.000
Response: I'm sorry, I don't have information about that specific question. Please try rephrasing or ask about mental health topics like stress, therapy, treatment options, or symptoms....

Query: What are the symptoms of depression?
Similarity: 0.312
Response: Positive and negative symptoms are medical terms for two groups of symptoms in schizophrenia. 
 Positive symptoms add. Positive symptoms include hallucinations (sensations that aren’t real), delusions...


--- Sentence Transformer Method ---

Query: What is mental illness?
Similarity: 0.893
Response: We all have mental health which is made up of our beliefs, thoughts, feeli

In [18]:
# Bundle everything needed to answer queries into a single dictionary.
# NOTE(review): pickle serialises functions by reference (module + name), not
# by value, so loading this file needs a `preprocess_text` reachable under the
# same qualified name in the loading process -- confirm the consuming app
# defines it before unpickling.
chatbot_system = {
    'dataframe': df,
    'tfidf_vectorizer': tfidf_vectorizer,
    'tfidf_matrix': tfidf_matrix,
    'sentence_model_name': 'all-MiniLM-L6-v2',
    'question_embeddings': question_embeddings,
    'preprocess_function': preprocess_text,
}

with open('chatbot_system.pkl', 'wb') as system_file:
    pickle.dump(chatbot_system, system_file)

banner = "=" * 50
print("\n" + banner)
print("COMPLETE CHATBOT SYSTEM SAVED!")
print(banner)
print("\nGenerated Files:")
for listing in (
    "1. tfidf_vectorizer.pkl",
    "2. question_embeddings.npy",
    "3. cleaned_mental_health_faq.csv",
    "4. model_info.pkl",
    "5. chatbot_system.pkl",
):
    print(listing)
print("\nYou can now use these files in your Streamlit app!")


COMPLETE CHATBOT SYSTEM SAVED!

Generated Files:
1. tfidf_vectorizer.pkl
2. question_embeddings.npy
3. cleaned_mental_health_faq.csv
4. model_info.pkl
5. chatbot_system.pkl

You can now use these files in your Streamlit app!


In [19]:
# NOTE(review): train_test_split is not used in this cell; the import is kept
# in case a later cell relies on it.
from sklearn.model_selection import train_test_split

print("\n" + "="*50)
print("PERFORMANCE EVALUATION")
print("="*50)

# Self-retrieval sanity check: every sampled FAQ question is fed back in as the
# query, so the figures below show whether the index returns a question's own
# answer -- not how well the bot handles unseen phrasings.
# A fixed seed keeps the sampled rows (and the reported numbers) reproducible.
test_samples = df.sample(min(10, len(df)), random_state=42)
n_samples = len(test_samples)

print("\nTesting on random samples from dataset:\n")

correct_tfidf = 0
correct_transformer = 0

for _label, row in test_samples.iterrows():
    query = row['Questions']
    expected_answer = row['Answers']

    # TF-IDF Method (only the returned answer text is needed here)
    response_tfidf = get_response_tfidf(query, df, tfidf_vectorizer, tfidf_matrix)[0]
    if response_tfidf == expected_answer:
        correct_tfidf += 1

    # Transformer Method (only the returned answer text is needed here)
    response_trans = get_response_transformer(query, df, model, question_embeddings)[0]
    if response_trans == expected_answer:
        correct_transformer += 1

if n_samples:
    print(f"TF-IDF Accuracy: {correct_tfidf}/{n_samples} = {correct_tfidf/n_samples*100:.2f}%")
    print(f"Transformer Accuracy: {correct_transformer}/{n_samples} = {correct_transformer/n_samples*100:.2f}%")
else:
    # An empty frame would otherwise cause a ZeroDivisionError above.
    print("No samples available for evaluation.")

print("\n✓ Training Complete! Ready for deployment with Streamlit!")


PERFORMANCE EVALUATION

Testing on random samples from dataset:

TF-IDF Accuracy: 10/10 = 100.00%
Transformer Accuracy: 10/10 = 100.00%

✓ Training Complete! Ready for deployment with Streamlit!
