# Loading data and preliminary analysis

In [None]:
# Install dependencies 



%pip install -r '../requirements.txt'



import nltk 



nltk.download('punkt') 

nltk.download('averaged_perceptron_tagger') 



# Set paths 



path_to_csv = '../Dataset/mentalhealth.csv'

In [2]:
import pandas as pd

import nltk 

import numpy as np

import re



from nltk.stem import wordnet                                  # to perform lemmitization

from sklearn.feature_extraction.text import CountVectorizer    # to perform bow

from sklearn.feature_extraction.text import TfidfVectorizer    # to perform tfidf

from nltk import pos_tag                                       # for parts of speech

from sklearn.metrics import pairwise_distances                 # to perfrom cosine similarity

from nltk import word_tokenize                                 # to create tokens

from nltk.corpus import stopwords                              # for stop words

In [3]:
df = pd.read_csv(path_to_csv, nrows = 20)

df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,"Mental illness does can affect anyone, regardl..."
2,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
3,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."
4,1619387,What should I do if I know someone who appears...,We encourage those with symptoms to talk to th...


In [4]:
df.isnull().sum()

Question_ID    0
Questions      0
Answers        0
dtype: int64

# Clean data using NLTK










In [5]:
s = 'tell me about your personality'

words = word_tokenize(s)                    # tokenize words

print(words)

['tell', 'me', 'about', 'your', 'personality']


In [6]:
# nltk.download('wordnet')                    # uncomment if running the cell for the first time

lemma = wordnet.WordNetLemmatizer()         

lemma.lemmatize('absorbed', pos = 'v')        # lemmatize words

'absorb'

In [7]:
pos_tag(nltk.word_tokenize(s),tagset = None)       # returns the parts of speech of every word

[('tell', 'VB'),
 ('me', 'PRP'),
 ('about', 'IN'),
 ('your', 'PRP$'),
 ('personality', 'NN')]

In [None]:
 nltk.download('stopwords')            # uncomment if running the cell for the first time



stop = stopwords.words('english')

print(stop)

In [9]:
# function that performs text normalization steps and returns the lemmatized tokens as a sentence



def text_normalization(text):

    text = str(text).lower()                        # text to lower case

    spl_char_text = re.sub(r'[^ a-z]','',text)      # removing special characters

    tokens = nltk.word_tokenize(spl_char_text)      # word tokenizing

    lema = wordnet.WordNetLemmatizer()              # intializing lemmatization

    tags_list = pos_tag(tokens,tagset=None)         # parts of speech

    lema_words = []                                 # empty list 

    for token,pos_token in tags_list:               # lemmatize according to POS

        if pos_token.startswith('V'):               # Verb

            pos_val = 'v'

        elif pos_token.startswith('J'):             # Adjective

            pos_val = 'a'

        elif pos_token.startswith('R'):             # Adverb

            pos_val = 'r'

        else:

            pos_val = 'n'                           # Noun

        lema_token = lema.lemmatize(token,pos_val)



        if lema_token in stop: 

          lema_words.append(lema_token)             # appending the lemmatized token into a list

    

    return " ".join(lema_words) 

In [10]:
text_normalization('telling you some stuffs about me')  # example

'you some about me'

In [11]:
df['lemmatized_text'] = df['Questions'].apply(text_normalization)   # clean text

df.head(5)

Unnamed: 0,Question_ID,Questions,Answers,lemmatized_text
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...,what do it to have a
1,2110618,Who does mental illness affect?,"Mental illness does can affect anyone, regardl...",who do
2,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...,what be some of the of
3,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi...",can with
4,1619387,What should I do if I know someone who appears...,We encourage those with symptoms to talk to th...,what should i do if i who to have the of a


In [12]:
cv = CountVectorizer()                                  # intializing the count vectorizer

X = cv.fit_transform(df['lemmatized_text']).toarray()

In [13]:
# returns all the unique word from data 



features = cv.get_feature_names_out()

df_bow = pd.DataFrame(X, columns = features)

df_bow.head()

Unnamed: 0,about,after,and,be,before,between,can,do,for,have,...,or,should,some,the,this,to,what,where,who,with
0,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,1,...,0,1,0,1,0,1,1,0,1,0


In [14]:
Question = 'What treatment options are available'                           # example

In [15]:
Question_lemma = text_normalization(Question)                               # clean text

Question_bow = cv.transform([Question_lemma]).toarray()                     # applying bow

# Cosine similarity

In [16]:
# cosine similarity for the above question we considered.



cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )

(cosine_value)

array([[0.31622777],
       [0.        ],
       [0.5       ],
       [0.        ],
       [0.23570226],
       [0.        ],
       [1.        ],
       [0.31622777],
       [0.70710678],
       [0.        ],
       [0.31622777],
       [0.        ],
       [0.40824829],
       [0.25      ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.70710678],
       [0.        ],
       [0.        ]])

In [17]:
df['similarity_bow'] = cosine_value                                         # create cosine value as a new column

In [18]:
simiscores = pd.DataFrame(df, columns=['Answers','similarity_bow'])         # taking similarity value of responses for the question we took

simiscores

Unnamed: 0,Answers,similarity_bow
0,Mental illnesses are health conditions that di...,0.316228
1,"Mental illness does can affect anyone, regardl...",0.0
2,Symptoms of mental health disorders vary depen...,0.5
3,"When healing from mental illness, early identi...",0.0
4,We encourage those with symptoms to talk to th...,0.235702
5,Feeling comfortable with the professional you ...,0.0
6,Different treatment options are available for ...,1.0
7,It is important to be as involved and engaged ...,0.316228
8,There are many types of mental health professi...,0.707107
9,Feeling comfortable with the professional you ...,0.0


In [19]:
simscoresDescending = simiscores.sort_values(by = 'similarity_bow', ascending=False)          # sorting the values

simscoresDescending.head()

Unnamed: 0,Answers,similarity_bow
6,Different treatment options are available for ...,1.0
17,There are many types of mental health professi...,0.707107
8,There are many types of mental health professi...,0.707107
2,Symptoms of mental health disorders vary depen...,0.5
12,The best source of information regarding medic...,0.408248


In [20]:
threshold = 0.1                                                                         # considering the value of smiliarity to be greater than 0.1

df_threshold = simscoresDescending[simscoresDescending['similarity_bow'] > threshold] 

df_threshold

Unnamed: 0,Answers,similarity_bow
6,Different treatment options are available for ...,1.0
17,There are many types of mental health professi...,0.707107
8,There are many types of mental health professi...,0.707107
2,Symptoms of mental health disorders vary depen...,0.5
12,The best source of information regarding medic...,0.408248
0,Mental illnesses are health conditions that di...,0.316228
7,It is important to be as involved and engaged ...,0.316228
10,It is important to continue involvement in the...,0.316228
13,Create a plan for switching to a different tre...,0.25
4,We encourage those with symptoms to talk to th...,0.235702


In [21]:
index_value = cosine_value.argmax()         # index number of highest value

index_value

6

In [22]:
df['Answers'].loc[index_value]              # The text at the above index becomes the response for the question

'Different treatment options are available for individuals with mental illness.'

# Tf-Idf

In [23]:
Question1 = 'What treatment options are available'

In [24]:
# using tf-idf



tfidf = TfidfVectorizer()                                             # intializing tf-id 

x_tfidf = tfidf.fit_transform(df['lemmatized_text']).toarray()        # transforming the data into array

In [25]:
Question_lemma1 = text_normalization(Question1)

Question_tfidf = tfidf.transform([Question_lemma1]).toarray()         # applying tf-idf

In [26]:
# returns all the unique word from data with a score of that word



df_tfidf = pd.DataFrame(x_tfidf,columns = tfidf.get_feature_names_out()) 

df_tfidf.head()

Unnamed: 0,about,after,and,be,before,between,can,do,for,have,...,or,should,some,the,this,to,what,where,who,with
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.392029,0.0,0.550307,...,0.0,0.0,0.0,0.0,0.0,0.367085,0.325401,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.580211,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.814466,0.0
2,0.0,0.0,0.0,0.321859,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.478821,0.347908,0.0,0.0,0.248876,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.440977,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.897519
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282658,0.0,0.396779,...,0.0,0.396779,0.0,0.327977,0.0,0.264673,0.234618,0.0,0.396779,0.0


In [27]:
cos = 1-pairwise_distances(df_tfidf,Question_tfidf,metric='cosine')                     # applying cosine similarity

cos

array([[0.19904882],
       [0.        ],
       [0.40685646],
       [0.        ],
       [0.14351684],
       [0.        ],
       [1.        ],
       [0.20934186],
       [0.56647821],
       [0.        ],
       [0.20934186],
       [0.        ],
       [0.22245129],
       [0.22913146],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.6372619 ],
       [0.        ],
       [0.        ]])

In [28]:
df['similarity_tfidf'] = cos                                                    # creating a new column 

df_simi_tfidf = pd.DataFrame(df, columns=['Answers','similarity_tfidf'])        # taking similarity value of responses for the question we took

df_simi_tfidf

Unnamed: 0,Answers,similarity_tfidf
0,Mental illnesses are health conditions that di...,0.199049
1,"Mental illness does can affect anyone, regardl...",0.0
2,Symptoms of mental health disorders vary depen...,0.406856
3,"When healing from mental illness, early identi...",0.0
4,We encourage those with symptoms to talk to th...,0.143517
5,Feeling comfortable with the professional you ...,0.0
6,Different treatment options are available for ...,1.0
7,It is important to be as involved and engaged ...,0.209342
8,There are many types of mental health professi...,0.566478
9,Feeling comfortable with the professional you ...,0.0


In [29]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False)            # sorting the values

df_simi_tfidf_sort.head(10)

Unnamed: 0,Answers,similarity_tfidf
6,Different treatment options are available for ...,1.0
17,There are many types of mental health professi...,0.637262
8,There are many types of mental health professi...,0.566478
2,Symptoms of mental health disorders vary depen...,0.406856
13,Create a plan for switching to a different tre...,0.229131
12,The best source of information regarding medic...,0.222451
10,It is important to continue involvement in the...,0.209342
7,It is important to be as involved and engaged ...,0.209342
0,Mental illnesses are health conditions that di...,0.199049
4,We encourage those with symptoms to talk to th...,0.143517


In [30]:
threshold = 0.1                                                                                   # considering the value of smiliarity to be greater than 0.1

df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 

df_threshold

Unnamed: 0,Answers,similarity_tfidf
6,Different treatment options are available for ...,1.0
17,There are many types of mental health professi...,0.637262
8,There are many types of mental health professi...,0.566478
2,Symptoms of mental health disorders vary depen...,0.406856
13,Create a plan for switching to a different tre...,0.229131
12,The best source of information regarding medic...,0.222451
10,It is important to continue involvement in the...,0.209342
7,It is important to be as involved and engaged ...,0.209342
0,Mental illnesses are health conditions that di...,0.199049
4,We encourage those with symptoms to talk to th...,0.143517


In [31]:
index_value1 = cos.argmax()                                                   # returns the index number of highest value

index_value1

6

In [32]:
df['Answers'].loc[index_value1]                                               # returns the text at that index

'Different treatment options are available for individuals with mental illness.'

# Testing chatbot

In [33]:
# defining a function that returns response to query using bow



def chat_bow(text):

    lemma = text_normalization(text) # calling the function to perform text normalization

    bow = cv.transform([lemma]).toarray() # applying bow

    cosine_value = 1- pairwise_distances(df_bow,bow, metric = 'cosine' )

    index_value = cosine_value.argmax() # getting index value 

    return df['Answers'].loc[index_value]

In [34]:
chat_bow('can you prevent mental health problems')

'When healing from mental illness, early identification and treatment are of vital importance. '

In [35]:
chat_bow('what is mental health')

'Different treatment options are available for individuals with mental illness.'

In [36]:
chat_bow('are there cures for mental health problems')

'Different treatment options are available for individuals with mental illness.'

In [37]:
chat_bow('how do I know if i am unwell')

'Create a plan for switching to a different treatment that will be a better fit.'

In [38]:
chat_bow('what do you mean by mental health')

"Mental illnesses are health conditions that disrupt a person's thoughts, emotions, relationships, and daily functioning."

In [39]:
# defining a function that returns response to query using tf-idf



def chat_tfidf(text):

    lemma = text_normalization(text) # calling the function to perform text normalization

    tf = tfidf.transform([lemma]).toarray() # applying tf-idf

    cos = 1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity

    index_value = cos.argmax() # getting index value 

    return df['Answers'].loc[index_value]

In [40]:
chat_tfidf('how do i see a counsellor')

'Visit Healthfinder.gov to learn more.'

In [41]:
chat_tfidf('how to find a support group')

'Visit Healthfinder.gov to learn more.'