# Alex Liddle
## University of Denver

# Chatbot
A chatbot is an intelligent piece of software that is capable of communicating and performing actions similar to a human. The goal of this project is to build a model that predicts answers using predefined patterns and responses. You are provided with a file named intents.json that contains these patterns. Words and classes files are provided as extra help. Feel free to make a more complex bot by extending the intents file. 

#### Possible chat with your bot
<code>
You: Hello, how are you? 
Bot: Hi there, how can I help?
You: what can you do?
Bot: I can guide you through Adverse drug reaction list, Blood pressure tracking, Hospitals and Pharmacies
You: thanks
Bot: My pleasure
You: see ya. got to go!
Bot: See you
</code>

In [5]:
import nltk
import string
import re
import sklearn
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.cluster import MiniBatchKMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from scipy import stats
#nltk.download('stopwords') #<---uncomment if you haven't downloaded the stopwords library
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [14]:
# Load pickle files
import urllib.request
import pickle

classes = pickle.load(urllib.request.urlopen("https://raw.githubusercontent.com/emmanueliarussi/DataScienceCapstone/master/3_MidtermProjects/ProjectPCB/data/classes.pkl"))
words   = pickle.load(urllib.request.urlopen("https://raw.githubusercontent.com/emmanueliarussi/DataScienceCapstone/master/3_MidtermProjects/ProjectPCB/data/words.pkl"))
print(classes)
print(words)

['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']
["'s", ',', 'a', 'adverse', 'all', 'anyone', 'are', 'awesome', 'be', 'behavior', 'blood', 'by', 'bye', 'can', 'causing', 'chatting', 'check', 'could', 'data', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'give', 'good', 'goodbye', 'have', 'hello', 'help', 'helpful', 'helping', 'hey', 'hi', 'history', 'hola', 'hospital', 'how', 'i', 'id', 'is', 'later', 'list', 'load', 'locate', 'log', 'looking', 'lookup', 'management', 'me', 'module', 'nearby', 'next', 'nice', 'of', 'offered', 'open', 'patient', 'pharmacy', 'pressure', 'provide', 'reaction', 'related', 'result', 'search', 'searching', 'see', 'show', 'suitable', 'support', 'task', 'thank', 'thanks', 'that', 'there', 'till', 'time', 'to', 'transfer', 'up', 'want', 'what', 'which', 'with', 'you']


In [15]:
# Load json file with answer patterns
import requests
import json

intents = json.loads(requests.get("https://raw.githubusercontent.com/emmanueliarussi/DataScienceCapstone/master/3_MidtermProjects/ProjectPCB/data/intents.json").text)
intents

{'intents': [{'context': [''],
   'patterns': ['Hi there',
    'How are you',
    'Is anyone there?',
    'Hey',
    'Hola',
    'Hello',
    'Good day'],
   'responses': ['Hello, thanks for asking',
    'Good to see you again',
    'Hi there, how can I help?'],
   'tag': 'greeting'},
  {'context': [''],
   'patterns': ['Bye',
    'See you later',
    'Goodbye',
    'Nice chatting to you, bye',
    'Till next time'],
   'responses': ['See you!', 'Have a nice day', 'Bye! Come back again soon.'],
   'tag': 'goodbye'},
  {'context': [''],
   'patterns': ['Thanks',
    'Thank you',
    "That's helpful",
    'Awesome, thanks',
    'Thanks for helping me'],
   'responses': ['Happy to help!', 'Any time!', 'My pleasure'],
   'tag': 'thanks'},
  {'context': [''],
   'patterns': [],
   'responses': ["Sorry, can't understand you",
    'Please give me more info',
    'Not sure I understand'],
   'tag': 'noanswer'},
  {'context': [''],
   'patterns': ['How you could help me?',
    'What you can do?

Process patterns for nlp and clustering

In [106]:
df_intents = pd.DataFrame(intents['intents'])
df_intents

Unnamed: 0,context,patterns,responses,tag
0,[],"[Hi there, How are you, Is anyone there?, Hey,...","[Hello, thanks for asking, Good to see you aga...",greeting
1,[],"[Bye, See you later, Goodbye, Nice chatting to...","[See you!, Have a nice day, Bye! Come back aga...",goodbye
2,[],"[Thanks, Thank you, That's helpful, Awesome, t...","[Happy to help!, Any time!, My pleasure]",thanks
3,[],[],"[Sorry, can't understand you, Please give me m...",noanswer
4,[],"[How you could help me?, What you can do?, Wha...",[I can guide you through Adverse drug reaction...,options
5,[],"[How to check Adverse drug reaction?, Open adv...",[Navigating to Adverse drug reaction module],adverse_drug
6,[],"[Open blood pressure module, Task related to b...",[Navigating to Blood Pressure module],blood_pressure
7,[search_blood_pressure_by_patient_id],[I want to search for blood pressure result hi...,"[Please provide Patient ID, Patient ID?]",blood_pressure_search
8,[],[],[Loading Blood pressure result for Patient],search_blood_pressure_by_patient_id
9,[search_pharmacy_by_name],"[Find me a pharmacy, Find pharmacy, List of ph...",[Please provide pharmacy name],pharmacy_search


In [107]:
tqdm.pandas()
stop = stopwords.words()

df_intents['patterns_vector'] = [[]]*len(df_intents.patterns.values)
df_intents.patterns = df_intents.patterns.apply(' '.join)
for idx, val in df_intents.patterns.items():
    lowered = val.replace("[^\w\s]", "").lower()
    df_intents.set_value(idx, 'patterns_vector', ' '.join([item for item in lowered.split() 
                                                               if item not in stop]))
tfidf = TfidfVectorizer(
    min_df = 0,
    max_df = 0.95,
    max_features = 8000,
    stop_words = 'english'
)

print(df_intents.patterns_vector)
tfidf.fit(df_intents.patterns_vector)
patterns_tfidf = tfidf.transform(df_intents.patterns_vector)

df_intents.patterns_vector = patterns_tfidf

df_intents.head()

0              hi anyone there? hey hola hello good day
1     bye see later goodbye nice chatting you, bye n...
2     thanks thank that's helpful awesome, thanks th...
3                                                      
4     could help me? do? help provide? helpful? supp...
5     check adverse reaction? open adverse drugs mod...
6     open blood pressure module task related blood ...
7     search blood pressure result history blood pre...
8                                                      
9     find pharmacy find pharmacy list pharmacies ne...
10                                                     
11    lookup hospital searching hospital transfer pa...
12                                                     
13                                                     
Name: patterns_vector, dtype: object


Unnamed: 0,context,patterns,responses,tag,patterns_vector
0,[],Hi there How are you Is anyone there? Hey Hola...,"[Hello, thanks for asking, Good to see you aga...",greeting,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408..."
1,[],Bye See you later Goodbye Nice chatting to you...,"[See you!, Have a nice day, Bye! Come back aga...",goodbye,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408..."
2,[],"Thanks Thank you That's helpful Awesome, thank...","[Happy to help!, Any time!, My pleasure]",thanks,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408..."
3,[],,"[Sorry, can't understand you, Please give me m...",noanswer,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408..."
4,[],How you could help me? What you can do? What h...,[I can guide you through Adverse drug reaction...,options,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408..."


In [108]:
n_clusters = len(intents['intents'])

mini = MiniBatchKMeans(n_clusters=n_clusters, init_size=1024, batch_size=2048, 
                                            random_state=20).fit(patterns_tfidf)
df_intents['patterns_cluster'] = mini.predict(patterns_tfidf)
df_intents

Unnamed: 0,context,patterns,responses,tag,patterns_vector,patterns_cluster
0,[],Hi there How are you Is anyone there? Hey Hola...,"[Hello, thanks for asking, Good to see you aga...",greeting,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",6
1,[],Bye See you later Goodbye Nice chatting to you...,"[See you!, Have a nice day, Bye! Come back aga...",goodbye,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",3
2,[],"Thanks Thank you That's helpful Awesome, thank...","[Happy to help!, Any time!, My pleasure]",thanks,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",10
3,[],,"[Sorry, can't understand you, Please give me m...",noanswer,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",0
4,[],How you could help me? What you can do? What h...,[I can guide you through Adverse drug reaction...,options,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",8
5,[],How to check Adverse drug reaction? Open adver...,[Navigating to Adverse drug reaction module],adverse_drug,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",5
6,[],Open blood pressure module Task related to blo...,[Navigating to Blood Pressure module],blood_pressure,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",9
7,[search_blood_pressure_by_patient_id],I want to search for blood pressure result his...,"[Please provide Patient ID, Patient ID?]",blood_pressure_search,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",1
8,[],,[Loading Blood pressure result for Patient],search_blood_pressure_by_patient_id,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",0
9,[search_pharmacy_by_name],Find me a pharmacy Find pharmacy List of pharm...,[Please provide pharmacy name],pharmacy_search,"(0, 23)\t0.408248290463863\n (0, 21)\t0.408...",7


In [119]:
test = 'blood pressure results'

lowered = test.replace("[^\w\s]", "").lower()
stripped = ' '.join([item for item in lowered.split() if item not in stop])

test_vector = tfidf.transform([stripped])

predicted_cluster = mini.predict(test_vector)[0]

response = list(df_intents[df_intents.patterns_cluster==predicted_cluster].responses)[0][0]

print(response)

Please provide Patient ID
