In [1]:
# Description: It's a bot that uses some machine learning techniques to chat

In [2]:
# Import the packages
import nltk  # used for natural language processing and it is a natural language tool kit
from newspaper import Article # To pull up the aricles from a newspaper3k library or
                                #Newspaper is an amazing python library for extracting articles.

In [3]:
import random #Functions in the random module depend on a pseudo-random number generator function random(),
               #which generates a random float number between 0.0 and 1.0.
import string  #Python String module contains some constants, utility function, and classes for string manipulation.
                    #ex: string.punctuation


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  #Cosine similarity is a metric used to determine how similar the documents are irrespective of their size.
import numpy as np
import pandas as pd
import warnings   # To ignore any warnings
warnings.filterwarnings("ignore")

In [5]:
# Download the NLTK data packages used by this notebook
nltk.download("punkt", quiet=True)  # presumably the tokenizer data needed by the sent_tokenize/word_tokenize calls below
nltk.download("wordnet", quiet=True) # WordNet can be used with NLTK to look up word meanings, synonyms, antonyms, and more
# NOTE(review): none of the cells shown in this notebook appears to use WordNet afterwards - confirm it is still needed

True

In [6]:
# Create the Article object for the page to scrape; it is downloaded and parsed in the following cells
article=Article("https://www.medpagetoday.com/infectiousdisease/covid19/85165")

In [7]:
article.download()  # Download the article from its URL

In [8]:
article.parse()   # Parse the downloaded article so that its fields (title, text) can be read in the cells below

In [9]:
article.nlp()  # Apply newspaper's NLP (natural language processing) step to the article
               # NOTE(review): its results do not appear to be used by the cells shown here - confirm it is needed

In [10]:
article.title # Display the article's title (the last expression of a cell is shown as its output)

'FAQs About COVID-19'

In [11]:
# The article's body text is the corpus the chatbot answers from
corpus=article.text
print(corpus)

What about mild or asymptomatic cases of COVID-19?

Asymptomatic transmission has not only been confirmed in China, but recent modeling data found that mild or asymptomatic cases that went undetected ("undocumented") accounted for 85% of total infections in the earliest stages of the outbreak. The study found these cases were less infectious on a per-contact basis, but because those individuals weren't isolated they infected more people in total.

The big unknown, however, is how common it may be for people to become infected but with symptoms too mild to seek treatment. Currently, detection is based on molecular testing, which is performed only on individuals who come into contact with the healthcare system. The prevalence of such mild or asymptomatic infections won't be known until an inexpensive serological test, detecting antibodies to the virus that signal previous exposure, is available for use with routine blood draws. Thus, the extent of exposure in the population may not be kn

In [12]:
# Tokenization
# Split the article text into sentences; response() later returns one of these sentences as the bot's answer
text=corpus
sent_tokens=nltk.sent_tokenize(text) # split the text into a list of sentences
# print the list of sentences
print(sent_tokens)

['What about mild or asymptomatic cases of COVID-19?', 'Asymptomatic transmission has not only been confirmed in China, but recent modeling data found that mild or asymptomatic cases that went undetected ("undocumented") accounted for 85% of total infections in the earliest stages of the outbreak.', "The study found these cases were less infectious on a per-contact basis, but because those individuals weren't isolated they infected more people in total.", 'The big unknown, however, is how common it may be for people to become infected but with symptoms too mild to seek treatment.', 'Currently, detection is based on molecular testing, which is performed only on individuals who come into contact with the healthcare system.', "The prevalence of such mild or asymptomatic infections won't be known until an inexpensive serological test, detecting antibodies to the virus that signal previous exposure, is available for use with routine blood draws.", 'Thus, the extent of exposure in the popula

In [13]:
# Text cleaning: step 1 is to remove punctuation, step 2 is to convert the entire text to lower case (or upper case),
# step 3 is to remove the stop words (words that occur frequently in a sentence but have no significance),
# step 4 is to remove unnecessary characters (\n), step 5: tokenization into sentence or word tokens, step 6: stemming or lemmatization
# (stemming truncates word endings, e.g. "playing" has its "ing" removed; lemmatization gives meaningful words.)
# Advanced preprocessing: normalization (short forms of words such as "ttyl" or "b4" are mapped to their actual words)
# and correction of typos, e.g. "fen" when "fan" was meant

# Text cleaning

In [14]:
# Print the punctuation characters defined by the string module
print(string.punctuation)  # these are the characters that will be removed from the text

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [15]:
# Build a {character: None} dictionary with one entry per punctuation character.
# (A later cell rebuilds it keyed by ord(), which is the form str.translate needs.)
remove_puncts = {symbol: None for symbol in string.punctuation}

In [16]:
# print remove_puncts dictionary
print(remove_puncts)   # every punctuation character is a key whose value is None

{'!': None, '"': None, '#': None, '$': None, '%': None, '&': None, "'": None, '(': None, ')': None, '*': None, '+': None, ',': None, '-': None, '.': None, '/': None, ':': None, ';': None, '<': None, '=': None, '>': None, '?': None, '@': None, '[': None, '\\': None, ']': None, '^': None, '_': None, '`': None, '{': None, '|': None, '}': None, '~': None}


In [17]:
# Rebuild the table keyed by Unicode code point, the key type str.translate looks up;
# a value of None tells translate to delete that character.
remove_puncts = str.maketrans("", "", string.punctuation)

In [18]:
print(remove_puncts)   # the keys are now the integer code points of the punctuation characters

{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [19]:
# First version of the tokenizer: it only splits the text into word tokens
# (no lowercasing, punctuation removal or lemmatization happens here).
def Lemnormalize(text):
    """Return the list of tokens that nltk.word_tokenize produces for `text`."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens
# print the tokens the function returns for the article text
print(Lemnormalize(text))

['What', 'about', 'mild', 'or', 'asymptomatic', 'cases', 'of', 'COVID-19', '?', 'Asymptomatic', 'transmission', 'has', 'not', 'only', 'been', 'confirmed', 'in', 'China', ',', 'but', 'recent', 'modeling', 'data', 'found', 'that', 'mild', 'or', 'asymptomatic', 'cases', 'that', 'went', 'undetected', '(', '``', 'undocumented', "''", ')', 'accounted', 'for', '85', '%', 'of', 'total', 'infections', 'in', 'the', 'earliest', 'stages', 'of', 'the', 'outbreak', '.', 'The', 'study', 'found', 'these', 'cases', 'were', 'less', 'infectious', 'on', 'a', 'per-contact', 'basis', ',', 'but', 'because', 'those', 'individuals', 'were', "n't", 'isolated', 'they', 'infected', 'more', 'people', 'in', 'total', '.', 'The', 'big', 'unknown', ',', 'however', ',', 'is', 'how', 'common', 'it', 'may', 'be', 'for', 'people', 'to', 'become', 'infected', 'but', 'with', 'symptoms', 'too', 'mild', 'to', 'seek', 'treatment', '.', 'Currently', ',', 'detection', 'is', 'based', 'on', 'molecular', 'testing', ',', 'which', 'i

In [20]:
# The previous output still contains upper-case words and punctuation tokens.
# Redefine the function (this replaces the earlier definition) so that the text is
# lowercased and stripped of punctuation before it is tokenized.
# Note: despite its name, the function performs no lemmatization.
def Lemnormalize(text):
    """Return the lowercase word tokens of `text` with punctuation characters removed."""
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_puncts)
    return nltk.word_tokenize(without_punctuation)
# print the tokens the function returns for the article text
print(Lemnormalize(text))

['what', 'about', 'mild', 'or', 'asymptomatic', 'cases', 'of', 'covid19', 'asymptomatic', 'transmission', 'has', 'not', 'only', 'been', 'confirmed', 'in', 'china', 'but', 'recent', 'modeling', 'data', 'found', 'that', 'mild', 'or', 'asymptomatic', 'cases', 'that', 'went', 'undetected', 'undocumented', 'accounted', 'for', '85', 'of', 'total', 'infections', 'in', 'the', 'earliest', 'stages', 'of', 'the', 'outbreak', 'the', 'study', 'found', 'these', 'cases', 'were', 'less', 'infectious', 'on', 'a', 'percontact', 'basis', 'but', 'because', 'those', 'individuals', 'werent', 'isolated', 'they', 'infected', 'more', 'people', 'in', 'total', 'the', 'big', 'unknown', 'however', 'is', 'how', 'common', 'it', 'may', 'be', 'for', 'people', 'to', 'become', 'infected', 'but', 'with', 'symptoms', 'too', 'mild', 'to', 'seek', 'treatment', 'currently', 'detection', 'is', 'based', 'on', 'molecular', 'testing', 'which', 'is', 'performed', 'only', 'on', 'individuals', 'who', 'come', 'into', 'contact', 'wit

In [21]:
# Keyword matching: small vocabularies used to recognise and answer greetings.
# Words in the user's message that count as a greeting
Greeting_inputs = [
    "hi",
    "hello",
    "hola",
    "greetings",
    "wassup",
    "hey",
]
# Replies the bot can choose from when it has been greeted
Greeting_Response = [
    "howdy",
    "hi",
    "hey",
    "what's good",
    "hello",
    "hey there",
]


In [22]:
# Function to return a random greeting response to a user's greeting.
def greeting(sentence):
    """Return a random greeting reply if `sentence` contains a greeting word.

    Each whitespace-separated word is lowercased and stripped of leading and
    trailing punctuation before it is looked up in Greeting_inputs, so inputs
    such as "Hello," or "hi!" are recognised as greetings too.

    Args:
        sentence: the user's message as a string.

    Returns:
        A string picked at random from Greeting_Response, or None when no word
        of `sentence` is a known greeting.
    """
    for word in sentence.split():
        # Strip punctuation attached to the word ("hello!" -> "hello"); whole-word
        # matching is kept so that e.g. "this" does not match "hi".
        if word.lower().strip(string.punctuation) in Greeting_inputs:
            return random.choice(Greeting_Response)
    return None

In [23]:
def response(user_response):
    """Return the corpus sentence that is most similar to the user's message.

    The lowercased message is vectorized together with the sentences in
    `sent_tokens` using TF-IDF (tokenized by Lemnormalize, English stop words
    removed) and compared with every corpus sentence by cosine similarity.

    Unlike a temporary append/remove on the global list, the message is added
    to a copy, so `sent_tokens` is never modified - not even when vectorizing
    raises - and the message is never compared with itself.

    Args:
        user_response: the user's message as a string.

    Returns:
        The sentence from `sent_tokens` with the highest similarity score, or
        " I apologize, I did not understand" when the best score is 0 or the
        corpus is empty.
    """
    user_response=user_response.lower()
    no_match=" I apologize, I did not understand"
    if not sent_tokens:   # nothing to compare against
        return no_match
    documents=sent_tokens+[user_response]   # new list: corpus sentences followed by the query
    tfidf=TfidfVectorizer(tokenizer=Lemnormalize, stop_words="english")   # create a TfidfVectorizer object
    TF=tfidf.fit_transform(documents)    # convert the text into a matrix of TF-IDF features
    # Similarity of the query (last row) to the corpus rows only; leaving the
    # query's own row out means it can never be picked as its own best match.
    similarities=cosine_similarity(TF[-1], TF[:-1]).flatten()
    index=similarities.argmax()   # position of the most similar corpus sentence
    if similarities[index]==0:    # no term in common with any corpus sentence
        return no_match
    return sent_tokens[index]
        
        
    
    
    
    
    
    



# The users response/query
#user_response="what is a kidney disease?"



#user_response=user_response.lower()

#print(user_response)


#robo_response=''


#sent_tokens.append(user_response)

#print(sent_tokens)

#tfidf=TfidfVectorizer(tokenizer=Lemnormalize, stop_words="english")

#TF=tfidf.fit_transform(sent_tokens)


#print(TF)


#val=cosine_similarity(TF[-1], TF)

#print(val)

#index=val.argsort()[0][-2]


#flat=val.flatten()

#flat.sort()

#score=flat[-2]

#print(score)


#if (score==0):
    #robo_response=robo_response+" I apologize, I did not understand"
#else:
    #robo_response=robo_response+sent_tokens[index]
    
#print(robo_response)

#sent_tokens.remove(user_response)
    












In [24]:
# Chat loop: keep answering the user's messages until they say "bye" or thank the bot.
flag=True
print("Cobot: I am a Doctor please ask your queries, If you want to exit, Enter Bye!")
while flag:
    # Normalise the input so that " Bye " or "THANKS" are handled like "bye" / "thanks".
    user_response=input().strip().lower()
    if user_response=="bye":
        flag=False
        print("Cobot: Chat  with you later")
    elif user_response in ("thanks", "thank you"):
        # The input is already lowercased, so it is compared with lowercase text only.
        flag=False
        print("Cobot: You are welcome")
    else:
        # Call greeting() only once: it picks its reply at random, so a second call
        # could return a different string than the one that was just tested.
        greeting_reply=greeting(user_response)
        if greeting_reply:
            print("Cobot: "+greeting_reply)
        else:
            print("Cobot: "+response(user_response))

Cobot: I am a Doctor please ask your queries, If you want to exit, Enter Bye!
bye
Cobot: Chat  with you later
