## 1. Load Libraries & Data

In [14]:
#import needed libraries
import string

import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Load the bot dataset: every non-blank line of data.txt becomes one row of
# the single "data" column.
# The lines are read directly instead of using pd.read_csv(..., sep='\n'),
# because recent pandas releases reject the line terminator as a separator
# with a ValueError. Unlike read_csv, no CSV quote processing is applied to
# the lines.
with open('data.txt', encoding='utf-8') as data_file:
    df = pd.DataFrame(
        {"data": [line for line in data_file.read().splitlines() if line.strip()]}
    )

In [16]:
# Sanity-check the load: print the (rows, columns) shape, then preview the
# first rows of the frame.
print(df.shape)
df.head()

(83, 1)


Unnamed: 0,data
0,Coronaviruses are a large family of viruses wh...
1,COVID-19 is the infectious disease caused by t...
2,The most common symptoms of COVID-19 are fever...
3,Most people (about 80%) recover from the disea...
4,"If you have minor symptoms, such as a slight c..."


## 2. Preprocessing

In [17]:
def get_wordnet_pos(pos_tag):
    """Replace each token's POS tag with the matching WordNet POS constant.

    Parameters
    ----------
    pos_tag : sequence of (token, tag) pairs
        Note that inside this function the parameter name hides the
        ``pos_tag`` function imported from nltk.

    Returns
    -------
    numpy.ndarray
        ``np.asarray(pos_tag)`` with the tag of every row overwritten:
        tags starting with 'J' become ``wordnet.ADJ``, 'V' ``wordnet.VERB``,
        'R' ``wordnet.ADV``, and anything else ``wordnet.NOUN``.
    """
    converted = np.asarray(pos_tag)
    for row, pair in enumerate(pos_tag):
        original_tag = pair[1]
        if original_tag.startswith('J'):
            wordnet_tag = wordnet.ADJ
        elif original_tag.startswith('V'):
            wordnet_tag = wordnet.VERB
        elif original_tag.startswith('R'):
            wordnet_tag = wordnet.ADV
        else:
            # Every other tag is treated as a noun.
            wordnet_tag = wordnet.NOUN
        converted[row][1] = wordnet_tag
    return converted


# Module-level helpers used by preprocessing_func: one lemmatizer instance
# and the list of English stop words.
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

def preprocessing_func(article):
    """Turn a raw text into a list of cleaned, lemmatized tokens.

    Steps: tokenize, keep only purely alphabetic tokens (plus the special
    token 'covid-19'), lower-case them, drop stop words, POS-tag the
    remaining tokens and lemmatize each one with its WordNet POS.

    Parameters
    ----------
    article : str
        Text to preprocess.

    Returns
    -------
    list
        The lemmatized tokens, in their original order.
    """
    kept = []
    for raw_token in word_tokenize(article):
        lowered = raw_token.lower()
        # Punctuation and numbers are discarded; 'covid-19' is explicitly kept
        # even though it is not purely alphabetic.
        is_word = lowered == 'covid-19' or raw_token.isalpha()
        if is_word and lowered not in stop_words:
            kept.append(lowered)

    tagged = get_wordnet_pos(pos_tag(kept))
    return [lemmatizer.lemmatize(word, wordnet_tag) for word, wordnet_tag in tagged]

In [18]:
# Preprocess every document and store its token list in a new "tokens" column
df["tokens"] = df["data"].apply(preprocessing_func)
df.head()

Unnamed: 0,data,tokens
0,Coronaviruses are a large family of viruses wh...,"[coronaviruses, large, family, virus, may, cau..."
1,COVID-19 is the infectious disease caused by t...,"[covid-19, infectious, disease, cause, recentl..."
2,The most common symptoms of COVID-19 are fever...,"[common, symptom, covid-19, fever, dry, cough,..."
3,Most people (about 80%) recover from the disea...,"[people, recover, disease, without, need, hosp..."
4,"If you have minor symptoms, such as a slight c...","[minor, symptom, slight, cough, mild, fever, g..."


## 3. TF-IDF

In [19]:
# Instantiate the TF-IDF vectorizer.
# The analyzer is the identity function: the documents given to this
# vectorizer are already lists of tokens, so they are used as-is.
vectorizer = TfidfVectorizer(analyzer=lambda x: x)

In [20]:
# Fit the vectorizer on the token lists and compute the TF-IDF matrix,
# converted to a dense array; the last line displays its shape.
tfidf = vectorizer.fit_transform(df['tokens']).toarray()
tfidf.shape

(83, 518)

## 4. Cosine Similarity

In [21]:
# Load metrics
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
#implement get_closest_sentence(query, tf_idf, vectorizer)
def get_closest_sentence(query, tfidf, vectorizer):
    """Find the document most similar to a query.

    Parameters
    ----------
    query : str
        Raw user question; it goes through ``preprocessing_func`` first.
    tfidf : array
        TF-IDF matrix of the documents to compare against.
    vectorizer : fitted vectorizer
        Used to project the query tokens into the same TF-IDF space.

    Returns
    -------
    tuple
        ``(best_similarity, best_index)``: the highest cosine similarity and
        the position of the corresponding entry in the similarity matrix.
    """
    query_tokens = preprocessing_func(query)
    query_vector = vectorizer.transform([query_tokens]).toarray()
    similarities = cosine_similarity(query_vector, tfidf)
    best_similarity = similarities.max()
    best_index = similarities.argmax()
    return best_similarity, best_index

In [23]:
# Conversation constants:
#   greetings_inputs  - user inputs recognised as a greeting
#   greetings_answers - replies to pick from when a greeting is recognised
#   small_talks       - fallback replies used by covidbot when no document is
#                       similar enough to the question
greetings_inputs = ['hello', 'hi', 'good morning', 'hey']
greetings_answers = ['Hey there, I am CovidBot, how can I help you?', 'Hello, my name is CovidBot, nice to meet you.','CovidBot at your service, sir.', 'Hi Master, I am CovidBot.']
small_talks = ["Thanks for getting in touch with me", "I am so sorry I do not understand your point","I'll make sure to understand you after my next update"]

In [24]:
def greetings(sentence, greetings_inputs, greetings_outputs):
    """Return a random greeting reply if the sentence contains a greeting.

    Matching is case-insensitive, ignores punctuation around words
    ("Hello!" matches 'hello') and supports multi-word greetings such as
    'good morning', which are matched as a run of consecutive words.

    Parameters
    ----------
    sentence : str
        User input to inspect.
    greetings_inputs : iterable of str
        Words or phrases recognised as greetings.
    greetings_outputs : sequence of str
        Candidate replies; one is drawn at random with ``np.random``.

    Returns
    -------
    str or None
        A reply taken from ``greetings_outputs``, or ``None`` when the
        sentence contains no greeting.
    """
    def _normalize(text):
        # Lower-case, split on whitespace and trim punctuation around each
        # word; a word made only of punctuation (e.g. ":)") is kept as-is.
        return [word.strip(string.punctuation) or word for word in text.lower().split()]

    words = _normalize(sentence)
    for greeting in greetings_inputs:
        phrase = _normalize(greeting)
        width = len(phrase)
        if width == 0:
            # An empty greeting would match everything; skip it.
            continue
        # Look for the greeting as a contiguous run of whole words, so that
        # 'hi' does not match inside 'high'.
        for start in range(len(words) - width + 1):
            if words[start:start + width] == phrase:
                return greetings_outputs[np.random.randint(len(greetings_outputs))]
    return None

## 5. Assembly

In [27]:
def covidbot(greetings_inputs, greetings_outputs, tf_idf, vectorizer, database,
             fallback_answers=None, min_similarity=0.1):
    """Run the interactive CovidBot question/answer loop.

    Each line typed by the user is handled in this order:
    1. if it contains a greeting, a greeting reply is printed;
    2. if it is "bye" (case-insensitive, surrounding spaces ignored), the
       bot says goodbye and the loop ends;
    3. otherwise the closest document is looked up and printed when its
       similarity exceeds ``min_similarity``; if not, a random fallback
       reply is printed.

    The loop also ends cleanly when input is exhausted or interrupted.

    Parameters
    ----------
    greetings_inputs : iterable of str
        Words or phrases recognised as greetings.
    greetings_outputs : sequence of str
        Candidate greeting replies.
    tf_idf : array
        TF-IDF matrix of the documents.
    vectorizer : fitted vectorizer
        Vectorizer used to build ``tf_idf``.
    database : pandas.DataFrame
        Frame whose "data" column holds the answer texts, in the same row
        order as ``tf_idf``.
    fallback_answers : sequence of str, optional
        Replies used when no document is similar enough. Defaults to the
        notebook-level ``small_talks`` list.
    min_similarity : float, optional
        Similarity a document must exceed to be returned as the answer.
    """
    if fallback_answers is None:
        # Default to the notebook-level list of small-talk replies.
        fallback_answers = small_talks

    print('Please type your question here')

    while True:
        try:
            query = input(">")
        except (EOFError, KeyboardInterrupt):
            # No more input (non-interactive run) or the user interrupted the
            # session: stop instead of crashing with a traceback.
            break

        greet = greetings(query, greetings_inputs, greetings_outputs)
        if greet is not None:
            print("CovidBot: " + greet)
        elif query.strip().lower() == "bye":
            print("Bye! Have a wonderful day!")
            break
        else:
            sim, closest = get_closest_sentence(query, tf_idf, vectorizer)
            if sim > min_similarity:
                answer = database["data"].iloc[closest]
            else:
                answer = fallback_answers[np.random.randint(len(fallback_answers))]
            print(answer)

## 6. Test

In [28]:
# Start the interactive chat session; type "bye" to end it.
covidbot(greetings_inputs, greetings_answers, tfidf, vectorizer, df)

Please type rour question here
&gt;hello
CovidBot: Hello, my name is CovidBot, nice to meet you.
&gt;what's covid-19?
COVID-19 is mainly spread through respiratory droplets expelled by someone who is coughing or has other symptoms such as fever or tiredness. Many people with COVID-19 experience only mild symptoms. This is particularly true in the early stages of the disease. It is possible to catch COVID-19 from someone who has just a mild cough and does not feel ill.
&gt;what's coronavirus ?
Coronaviruses are a large family of viruses which may cause illness in animals or humans.  In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). The most recently discovered coronavirus causes coronavirus disease COVID-19.
&gt;bye
Bye! Have a wonderful day!
