In [None]:
#reference: https://www.youtube.com/watch?v=9KZwRBg4-P0
#import required libraries
import numpy as np
import pandas as pd
from newspaper import Article
import random
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer #see impact of TfidfVectorizer and see each of their usage
from sklearn.metrics.pairwise import cosine_similarity
import warnings

In [None]:
# Install a blanket 'ignore' filter: no warning of any category is shown for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Fetch every NLTK resource this notebook relies on, not only the sentence tokenizer:
#   punkt / punkt_tab -> nltk.sent_tokenize and nltk.word_tokenize
#   wordnet / omw-1.4 -> nltk.stem.WordNetLemmatizer
# Which of these a given NLTK release actually needs varies by version; requesting all of
# them is cheap, and with quiet=True a failed download only returns False.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

In [None]:
# Get the article from its source: the Wikipedia page on natural language processing.
# The calls below are order-dependent (download -> parse -> nlp) and need network access.
article = Article(url = 'https://en.wikipedia.org/wiki/Natural_language_processing')
article.download()
article.parse()
# NOTE(review): presumably computes keywords/summary on the article; nothing in this
# notebook reads them — confirm whether this call is still needed.
article.nlp()
# The article text is the only thing kept; it becomes the chatbot's knowledge base.
corpus = article.text

In [None]:
# Preview only the beginning of the article: echoing the whole text floods the cell output.
corpus[:500]

In [None]:
# Alias the article text under the name the sentence-splitting cell reads.
text = corpus

In [None]:
# Split the article text into a list of sentences; bot_response searches this list for answers.
sentence_list = nltk.sent_tokenize(text)

In [None]:
# Display the first sentence as a quick look at the result of the split.
sentence_list[0]

In [None]:
#Function to return a greeting response to user greeting
def greeting_response(text):
    """Return a random bot greeting if ``text`` contains a greeting word.

    The message is split on whitespace and each word is compared with the
    known user greetings, ignoring letter case and any punctuation attached
    to the word, so ``'Hello!'`` and ``'hey, there'`` are both recognised.

    Parameters
    ----------
    text : str
        The user's message.

    Returns
    -------
    str or None
        One of the bot greetings, chosen at random, or ``None`` when no word
        of ``text`` is a known user greeting.
    """
    #bot greetings
    bot_greetings = ['howdy', 'hi', 'hey', 'hola', 'hello']

    #user greetings (a set, since only membership is tested)
    user_greetings = {'hi', 'hey', 'hello', 'greetings', 'wasup'}

    for word in text.split():
        # Normalise the word so that 'Hi,' or 'HELLO!' still match.
        if word.strip(string.punctuation).lower() in user_greetings:
            return random.choice(bot_greetings)

    # Explicit: callers test the result against None.
    return None

In [None]:
#Pre-processing the raw text

# Single lemmatizer instance shared by LemTokens.
# NOTE(review): presumably needs the NLTK 'wordnet' corpus to be installed — confirm it is downloaded.
lemmer = nltk.stem.WordNetLemmatizer()

#WordNet is a semantically-oriented dictionary of English included in NLTK.
def LemTokens(tokens):
    """Lemmatize every token with the shared ``lemmer`` and return the results as a list."""
    return list(map(lemmer.lemmatize, tokens))

# Translation table for str.translate: every character of string.punctuation maps to None (deleted).
remove_punct_dict = str.maketrans('', '', string.punctuation)

def LemNormalize(text):
    """Lower-case ``text``, delete punctuation, tokenize into words, and lemmatize each word.

    Returns the list produced by ``LemTokens`` for the tokenized text.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    words = nltk.word_tokenize(without_punct)
    return LemTokens(words)

In [None]:
#bot response
def bot_response(user_input, top_n=2):
    """Answer ``user_input`` with the corpus sentences most similar to it.

    The sentences of ``sentence_list`` plus the query are vectorised with
    TF-IDF (tokenised by ``LemNormalize``, English stop words removed) and the
    query is compared with every corpus sentence by cosine similarity.

    Parameters
    ----------
    user_input : str
        The user's question.
    top_n : int, optional
        Maximum number of sentences to include in the answer (default 2);
        expected to be a positive integer.

    Returns
    -------
    str
        Up to ``top_n`` matching sentences joined by a space, best match
        first, or an apology when no corpus sentence shares a term with the
        query.
    """
    apology = "I am sorry! I don't understand you"

    # Nothing to search in: comparing against zero sentences would raise.
    if not sentence_list:
        return apology

    # Work on a copy. Appending to the shared ``sentence_list`` would leave
    # every query in the corpus, so later answers could echo earlier questions.
    documents = list(sentence_list) + [user_input]

    # The vectoriser is refit on every call because the query is part of the
    # fitted documents.
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(documents)

    # Compare the query (last row) with the corpus rows only. Leaving the query
    # out of the right-hand side means it can never be returned as its own
    # answer, even when a corpus sentence ties with it.
    query_scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()

    # Indices of the best-scoring sentences, highest similarity first.
    best_first = query_scores.argsort()[::-1][:top_n]

    # A score of 0 means no shared terms; such sentences are not answers.
    matches = [documents[i] for i in best_first if query_scores[i] > 0]

    if not matches:
        return apology
    return ' '.join(matches)

In [None]:
#for i in range(-3,-1,1):
#    print(i)

In [None]:
##numpy array argsort example
#ar = [10, 1, 11, 5]
#arr = np.array(ar)
#arr.argsort()
#arr.argsort()[-2]
#print(arr.sort())
#print(sorted(arr))

In [None]:
#start chat
print('Bot: Hi am here to help you on general queries on NLP. Type your queries.')

# Phrases that end the conversation; compared with the normalised (stripped, lower-cased) input.
chat_exist_list = ['bye', 'talk to you later', 'good bye', 'exit', 'quit', 'break', 'hang up', 'bye see you']

while True:
    # Normalise once: surrounding whitespace or capitals would otherwise defeat the exit check.
    user_input = input().strip().lower()

    # Ignore blank lines instead of sending an empty query to the retriever.
    if not user_input:
        continue

    if user_input in chat_exist_list:
        print('Bot: See you later')
        break

    # Call greeting_response exactly once: its reply is random, so testing one
    # call and printing another draws twice for no reason.
    greeting = greeting_response(user_input)
    if greeting is not None:
        print('Bot: ' + greeting)
    else:
        print('Bot: ' + bot_response(user_input))

In [None]:
#DONE