<a href="https://colab.research.google.com/github/sasidn/ConersationalAgent/blob/main/Rule_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading data and preliminary analysis

In [None]:
import pandas as pd
import nltk 
import numpy as np
import re

from nltk.stem import wordnet                                  # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer    # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer    # to perform tfidf
from nltk import pos_tag                                       # for parts of speech
from sklearn.metrics import pairwise_distances                 # to perfrom cosine similarity
from nltk import word_tokenize                                 # to create tokens
from nltk.corpus import stopwords                              # for stop words

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Chatgbt/Chatbot-for-mentalHealth/mentalhealth.csv')
df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,"Mental illness does can affect anyone, regardl..."
2,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
3,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."
4,1619387,What should I do if I know someone who appears...,We encourage those with symptoms to talk to th...


In [None]:
df.isnull().sum()

Question_ID    0
Questions      0
Answers        0
dtype: int64

# Clean data using NLTK






In [None]:
nltk.download('punkt')                    # uncomment if running the cell for the first time
   
s = 'tell me about your personality'
words = word_tokenize(s)                    # tokenize words
print(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['tell', 'me', 'about', 'your', 'personality']


In [None]:
nltk.download('wordnet')                    # uncomment if running the cell for the first time
lemma = wordnet.WordNetLemmatizer()         
lemma.lemmatize('absorbed', pos = 'v')        # lemmatize words

[nltk_data] Downloading package wordnet to /root/nltk_data...


'absorb'

In [None]:
nltk.download('averaged_perceptron_tagger')      # uncomment if running the cell for the first time
pos_tag(nltk.word_tokenize(s),tagset = None)       # returns the parts of speech of every word

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('tell', 'VB'),
 ('me', 'PRP'),
 ('about', 'IN'),
 ('your', 'PRP$'),
 ('personality', 'NN')]

In [None]:
 nltk.download('stopwords')            # uncomment if running the cell for the first time

stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# function that performs text normalization steps and returns the lemmatized tokens as a sentence

def text_normalization(text):
    text = str(text).lower()                        # text to lower case
    spl_char_text = re.sub(r'[^ a-z]','',text)      # removing special characters
    tokens = nltk.word_tokenize(spl_char_text)      # word tokenizing
    lema = wordnet.WordNetLemmatizer()              # intializing lemmatization
    tags_list = pos_tag(tokens,tagset=None)         # parts of speech
    lema_words = []                                 # empty list 
    for token,pos_token in tags_list:               # lemmatize according to POS
        if pos_token.startswith('V'):               # Verb
            pos_val = 'v'
        elif pos_token.startswith('J'):             # Adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):             # Adverb
            pos_val = 'r'
        else:
            pos_val = 'n'                           # Noun
        lema_token = lema.lemmatize(token,pos_val)

        if lema_token in stop: 
          lema_words.append(lema_token)             # appending the lemmatized token into a list
    
    return " ".join(lema_words) 

In [None]:
text_normalization('telling you some stuffs about me')  # example

'you some about me'

In [None]:
df['lemmatized_text'] = df['Questions'].apply(text_normalization)   # clean text
df.head(5)

Unnamed: 0,Question_ID,Questions,Answers,lemmatized_text
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...,what do it to have a
1,2110618,Who does mental illness affect?,"Mental illness does can affect anyone, regardl...",who do
2,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...,what be some of the of
3,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi...",can with
4,1619387,What should I do if I know someone who appears...,We encourage those with symptoms to talk to th...,what should i do if i who to have the of a


In [None]:
cv = CountVectorizer()                                  # intializing the count vectorizer
X = cv.fit_transform(df['lemmatized_text']).toarray()

In [None]:
# returns all the unique word from data 

features = cv.get_feature_names_out()
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()

Unnamed: 0,about,after,an,and,any,be,before,between,but,can,...,to,too,we,what,when,where,who,why,with,you
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,0


In [None]:
Question = 'What treatment options are available'                           # example

In [None]:
Question_lemma = text_normalization(Question)                               # clean text
Question_bow = cv.transform([Question_lemma]).toarray()                     # applying bow

# Cosine similarity

In [None]:
# cosine similarity for the above question we considered.

cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )
(cosine_value)

array([[0.31622777],
       [0.        ],
       [0.5       ],
       [0.        ],
       [0.23570226],
       [0.        ],
       [1.        ],
       [0.31622777],
       [0.70710678],
       [0.        ],
       [0.31622777],
       [0.        ],
       [0.40824829],
       [0.25      ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.70710678],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [1.        ],
       [0.26726124],
       [0.        ],
       [0.40824829],
       [0.70710678],
       [0.25      ],
       [0.        ],
       [0.28867513],
       [0.23570226],
       [0.        ],
       [0.31622777],
       [0.        ],
       [0.35355339],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.   

In [None]:
df['similarity_bow'] = cosine_value                                         # create cosine value as a new column

In [None]:
simiscores = pd.DataFrame(df, columns=['Answers','similarity_bow'])         # taking similarity value of responses for the question we took
simiscores

Unnamed: 0,Answers,similarity_bow
0,Mental illnesses are health conditions that di...,0.316228
1,"Mental illness does can affect anyone, regardl...",0.000000
2,Symptoms of mental health disorders vary depen...,0.500000
3,"When healing from mental illness, early identi...",0.000000
4,We encourage those with symptoms to talk to th...,0.235702
...,...,...
92,Sorting out if you are drinking too much can b...,0.000000
93,"Cannabis smoke, for example, contains cancer-c...",0.500000
94,You can't. But you can influence their capacit...,0.000000
95,Cannabidiol or CBD is a naturally occurring co...,0.632456


In [None]:
simscoresDescending = simiscores.sort_values(by = 'similarity_bow', ascending=False)          # sorting the values
simscoresDescending.head()

Unnamed: 0,Answers,similarity_bow
78,Cyclothymic disorder is a subtype of bipolar d...,1.0
56,MSP stands for Medical Services Plan. It’s a h...,1.0
57,A referral means someone recommends you to ano...,1.0
88,Prodrome is a medical term for early signs or ...,1.0
6,Different treatment options are available for ...,1.0


In [None]:
threshold = 0.1                                                                         # considering the value of smiliarity to be greater than 0.1
df_threshold = simscoresDescending[simscoresDescending['similarity_bow'] > threshold] 
df_threshold

Unnamed: 0,Answers,similarity_bow
78,Cyclothymic disorder is a subtype of bipolar d...,1.0
56,MSP stands for Medical Services Plan. It’s a h...,1.0
57,A referral means someone recommends you to ano...,1.0
88,Prodrome is a medical term for early signs or ...,1.0
6,Different treatment options are available for ...,1.0
23,We all have mental health which is made up of ...,1.0
84,Binge-eating disorder or BED is a type of eati...,1.0
83,A personality disorder is a pattern of thought...,1.0
82,A personality disorder is a pattern of thought...,1.0
81,A personality disorder is a pattern of thought...,1.0


In [None]:
index_value = cosine_value.argmax()         # index number of highest value
index_value

6

In [None]:
df['Answers'].loc[index_value]              # The text at the above index becomes the response for the question

'Different treatment options are available for individuals with mental illness.'

# Tf-Idf

In [None]:
Question1 = 'What treatment options are available'

In [None]:
# using tf-idf

tfidf = TfidfVectorizer()                                             # intializing tf-id 
x_tfidf = tfidf.fit_transform(df['lemmatized_text']).toarray()        # transforming the data into array

In [None]:
Question_lemma1 = text_normalization(Question1)
Question_tfidf = tfidf.transform([Question_lemma1]).toarray()         # applying tf-idf

In [None]:
# returns all the unique word from data with a score of that word

df_tfidf = pd.DataFrame(x_tfidf,columns = tfidf.get_feature_names_out()) 
df_tfidf.head()

Unnamed: 0,about,after,an,and,any,be,before,between,but,can,...,to,too,we,what,when,where,who,why,with,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.451394,0.0,0.0,0.302478,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.877701,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.243612,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.213472,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.407601,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.91316,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.309284,0.0,0.0,0.207251,0.0,0.0,0.471012,0.0,0.0,0.0


In [None]:
cos = 1-pairwise_distances(df_tfidf,Question_tfidf,metric='cosine')                     # applying cosine similarity
cos

array([[0.19934817],
       [0.        ],
       [0.32390963],
       [0.        ],
       [0.13658848],
       [0.        ],
       [1.        ],
       [0.19419492],
       [0.60151332],
       [0.        ],
       [0.19419492],
       [0.        ],
       [0.20023572],
       [0.17842037],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.57482171],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [1.        ],
       [0.19775935],
       [0.        ],
       [0.29108153],
       [0.65904938],
       [0.17156074],
       [0.        ],
       [0.1796983 ],
       [0.14501087],
       [0.        ],
       [0.30481321],
       [0.        ],
       [0.28362797],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.   

In [None]:
df['similarity_tfidf'] = cos                                                    # creating a new column 
df_simi_tfidf = pd.DataFrame(df, columns=['Answers','similarity_tfidf'])        # taking similarity value of responses for the question we took
df_simi_tfidf

Unnamed: 0,Answers,similarity_tfidf
0,Mental illnesses are health conditions that di...,0.199348
1,"Mental illness does can affect anyone, regardl...",0.000000
2,Symptoms of mental health disorders vary depen...,0.323910
3,"When healing from mental illness, early identi...",0.000000
4,We encourage those with symptoms to talk to th...,0.136588
...,...,...
92,Sorting out if you are drinking too much can b...,0.000000
93,"Cannabis smoke, for example, contains cancer-c...",0.370213
94,You can't. But you can influence their capacit...,0.000000
95,Cannabidiol or CBD is a naturally occurring co...,0.516687


In [None]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False)            # sorting the values
df_simi_tfidf_sort.head(10)

Unnamed: 0,Answers,similarity_tfidf
78,Cyclothymic disorder is a subtype of bipolar d...,1.0
56,MSP stands for Medical Services Plan. It’s a h...,1.0
88,Prodrome is a medical term for early signs or ...,1.0
6,Different treatment options are available for ...,1.0
84,Binge-eating disorder or BED is a type of eati...,1.0
83,A personality disorder is a pattern of thought...,1.0
82,A personality disorder is a pattern of thought...,1.0
57,A referral means someone recommends you to ano...,1.0
81,A personality disorder is a pattern of thought...,1.0
23,We all have mental health which is made up of ...,1.0


In [None]:
threshold = 0.1                                                                                   # considering the value of smiliarity to be greater than 0.1
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 
df_threshold

Unnamed: 0,Answers,similarity_tfidf
78,Cyclothymic disorder is a subtype of bipolar d...,1.0
56,MSP stands for Medical Services Plan. It’s a h...,1.0
88,Prodrome is a medical term for early signs or ...,1.0
6,Different treatment options are available for ...,1.0
84,Binge-eating disorder or BED is a type of eati...,1.0
83,A personality disorder is a pattern of thought...,1.0
82,A personality disorder is a pattern of thought...,1.0
57,A referral means someone recommends you to ano...,1.0
81,A personality disorder is a pattern of thought...,1.0
23,We all have mental health which is made up of ...,1.0


In [None]:
index_value1 = cos.argmax()                                                   # returns the index number of highest value
index_value1

6

In [None]:
df['Answers'].loc[index_value1]                                               # returns the text at that index

'Different treatment options are available for individuals with mental illness.'

# Testing chatbot

In [None]:
# defining a function that returns response to query using bow

def chat_bow(text):
    lemma = text_normalization(text) # calling the function to perform text normalization
    bow = cv.transform([lemma]).toarray() # applying bow
    cosine_value = 1- pairwise_distances(df_bow,bow, metric = 'cosine' )
    index_value = cosine_value.argmax() # getting index value 
    return df['Answers'].loc[index_value]

In [None]:
chat_bow('can you prevent mental health problems')

'We can all suffer from mental health challenges, but developing our wellbeing, resilience, and seeking help early can help prevent challenges becoming serious.'

In [None]:
chat_bow('what is mental health')

'Different treatment options are available for individuals with mental illness.'

In [None]:
chat_bow('are there cures for mental health problems')

'It is often more realistic and helpful to find out what helps with the issues you face. Talking, counselling, medication, friendships, exercise, good sleep and nutrition, and meaningful occupation can all help.'

In [None]:
chat_bow('how do I know if i am unwell')

'If your beliefs , thoughts , feelings or behaviours have a significant impact on your ability to function in what might be considered a normal or ordinary way, it would be important to seek help.'

In [None]:
chat_bow('what do you mean by mental health')

'Rapid cycling means that someone diagnosed with bipolar disorder (or depression) experiences four or more episodes of depression and/or mania in one year. \n Rapid cycling can happen any time someone experiences bipolar disorder—about 10-20% of people diagnosed with bipolar disorder experience rapid cycling at some point. In many cases, rapid cycling eventually goes away on its own and people return to a pattern of longer and less frequent episodes.'

In [None]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    lemma = text_normalization(text) # calling the function to perform text normalization
    tf = tfidf.transform([lemma]).toarray() # applying tf-idf
    cos = 1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value = cos.argmax() # getting index value 
    return df['Answers'].loc[index_value]

In [None]:
chat_tfidf('what is mental health')

'Different treatment options are available for individuals with mental illness.'

In [None]:
chat_tfidf('what is vaping')

'Different treatment options are available for individuals with mental illness.'

In [None]:
chat_tfidf('how do i see a counsellor')

'If your beliefs , thoughts , feelings or behaviours have a significant impact on your ability to function in what might be considered a normal or ordinary way, it would be important to seek help.'

In [None]:
chat_tfidf('how to find a support group')

"Distraction is a very valid tool to help you cope when everything feels overwhelming or when you feel lonely or isolated. \n If you don't have a lot of energy or focus right now, try low-effort distractions like watching TV, browsing Youtube, listening to a podcast or audiobook, playing a game on your phone, reading an easy book or magazine, or working on a simple art project. \n If you have more energy and focus, give yourself a to-do list every day: you can clean and take care of projects around your home, work on hobbies, connect with family or friends, read a new book and catch up on your favourite TV shows. You can find interesting opportunities to take online courses from universities all over the world through MOOCs and other online learning platforms, you can learn a new language online or through apps, and you can learn new hobbies and activities. As more people have to practice social distancing or self-isolation, people are finding creative ways to bring the world into thei

In [None]:
chat_tfidf('how to get more focus')

'How you think about something impacts your feelings and your behaviours. \n When we feel stressed out, angry, or fearful, it\'s hard to look at the situation realistically and see all of the options we have. (Remember: we all control our own actions and reactions, no matter what\'s going on in the world. We can call do something about this pandemic.) \n People often overestimate the negative parts—their own feelings, their own abilities to manage a difficult situation, or the situation itself—and underestimate positive parts—their own abilities to care for themselves and loved ones, their support networks, and opportunities. \n How does the thought "We\'re never going to make it through this!" make you feel? It likely doesn\'t feel good—and it isn\'t even true. \n Challenging negative, unhelpful thoughts can improve your mood, validate your ability to get through this, and help you see new options or opportunities to stay well. \n Stop and notice thoughts that come up. How do they mak

In [None]:
chat_tfidf('how can i get rid of my depression?')

"Taking care of your physical health is also good for your mental health. It's more important than ever to keep yourself healthy. \n Try to eat as well as you can. It may be easier to reach for unhealthier comfort foods and snacks while you spend more time at home, but try to keep a balanced approach. When you stock up on groceries, don’t ignore fresh fruit and vegetables—we still have everything we need to prepare food. Now that we're advised to limit the amount of time we spend in public spaces like grocery stores, this is a great time to try out new fruits and vegetables that keep at home for longer periods of time. \n If it's safer for you to stay home or you are in self-isolation, reach out for help. Many grocery stores and meal prep services offer safe, no-contact delivery. You can also ask family or friends to bring you groceries, or look for local COVID-19 support groups on social media. It's safest if others leave food and other supplies outside of your door to avoid spreading