In [1]:
import pandas as pd
import re
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix  # Import csr_matrix directly

In [2]:
# Fetch the NLTK resources this notebook relies on (skipped by NLTK when already up to date).
# Only the 'stopwords' corpus is referenced by the cells visible in this notebook.
# NOTE(review): 'wordnet' and 'punkt' appear unused here — confirm before removing them.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\foura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\foura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\foura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the FAQ dataset from the working directory, then summarise its columns and dtypes.
df = pd.read_csv("Mental_Health_FAQ.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question_ID  98 non-null     int64 
 1   Questions    98 non-null     object
 2   Answers      98 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.4+ KB


In [4]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,6361820,What causes mental illness?,It is estimated that mental illness affects 1 ...
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."


In [5]:
# Question_ID is not used by any later cell, so drop it before building features.
# errors="ignore" keeps this cell re-runnable: because `df` is reassigned, a second
# execution would otherwise raise KeyError for the already-removed column.
df = df.drop(columns=["Question_ID"], errors="ignore")
# Lower-case the questions; data_prep lower-cases again, so this is harmless duplication.
df["Questions"] = df["Questions"].str.lower()

In [6]:
# Function for text preprocessing
def data_prep(text: str) -> str:
    """Normalise a question string before TF-IDF vectorisation.

    Steps, in order: lower-case the text, delete every character that is
    neither a word character nor whitespace, then drop English stop words.

    Args:
        text: Raw question text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build the stop-word set once per call. Previously stopwords.words('english')
    # was evaluated for every token and the returned list was scanned linearly.
    english_stopwords = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # NOTE(review): punctuation is stripped before filtering, so a contraction such as
    # "don't" becomes "dont" and only matches a stop word spelled without the apostrophe.
    return " ".join(t for t in text.split() if t not in english_stopwords)

# Clean every question element-wise with data_prep, overwriting the column in place.
df["Questions"] = df["Questions"].map(data_prep)

In [7]:
# Vectorize questions using TF-IDF
# The fitted vectorizer object is kept in its own variable because it is pickled in the final cell.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(df["Questions"])  # Fit on the cleaned questions and produce the training matrix

In [8]:
# Encode answers
# Each answer string is replaced by an integer code stored in a new "Answers_Code" column,
# which serves as the training target below. The fitted encoder is pickled in the final cell,
# presumably so predicted codes can be mapped back to answer text — confirm against the consumer.
label_encoder = LabelEncoder()
df["Answers_Code"] = label_encoder.fit_transform(df["Answers"])

In [9]:
# Train Naive Bayes model
# Features are the TF-IDF question vectors; targets are the encoded answers.
# NOTE(review): no held-out split or evaluation is visible in this notebook.
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(tfidf_train, df["Answers_Code"])

In [10]:
# Persist the three fitted objects (model, vectorizer, label encoder) as pickle files
# in the working directory, one file per object, written in the order listed.
artifacts = {
    './model.pkl': naive_bayes_model,
    './vectorizer.pkl': tfidf_vectorizer,
    './label_encoder.pkl': label_encoder,
}
for artifact_path, fitted_object in artifacts.items():
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(fitted_object, out_file)
