# Importing the required libraries

In [1]:
import numpy as np
import nltk
import string
import random

# Importing and reading the corpus

In [4]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read (`f` stays bound afterwards, but
# refers to a closed file). The encoding is pinned so the read does not depend
# on the platform's default locale.
with open('chatbot.txt', 'r', encoding='utf-8') as f:
    raw_doc = f.read()
raw_doc = raw_doc.lower()  # Lowercase the corpus; the chat loop lowercases user input as well

In [5]:
# Fetch the NLTK resources used below.
nltk.download('punkt')      # Punkt sentence tokenizer models
nltk.download('punkt_tab')  # Punkt tables; recent NLTK releases load these instead of 'punkt'
nltk.download('wordnet')    # WordNet lexical database for English, used by the lemmatizer
sent_tokens = nltk.sent_tokenize(raw_doc)  # Corpus as a list of sentences
word_tokens = nltk.word_tokenize(raw_doc)  # Corpus as a list of words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Example of sentence tokens

In [6]:
# Display the first two sentence tokens as a sanity check on the tokenizer output
sent_tokens[:2]

['data science is an interdisciplinary academic field[1] that uses statistics, scientific computing, scientific methods, processes, algorithms and systems to extract or extrapolate knowledge and insights from potentially noisy, structured, or unstructured data.',
 '[2]\n\ndata science also integrates domain knowledge from the underlying application domain (e.g., natural sciences, information technology, and medicine).']

### Example of word tokens

In [7]:
# Display the first two word tokens. The corpus was already tokenized into
# `word_tokens` in an earlier cell, so slice that instead of tokenizing again.
word_tokens[:2]

['data', 'science']

#### **Text preprocessing**


In [8]:
# WordNet is a semantically-oriented dictionary of English included in NLTK;
# this single lemmatizer instance is shared by every call below.
lemmer = nltk.stem.WordNetLemmatizer()


def LemTokens(tokens):
    """Return a list holding the lemmatized form of each token, in order."""
    return list(map(lemmer.lemmatize, tokens))
# Translation table mapping every character of string.punctuation to None,
# so str.translate() deletes those characters.
remove_punct_dict = str.maketrans('', '', string.punctuation)


def LemNormalize(text):
    """Lowercase `text`, drop punctuation, tokenize it and lemmatize the tokens."""
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(without_punct))

**Defining the greeting function**

In [9]:
GREET_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREET_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]


def greet(sentence):
    """Return a random greeting if `sentence` contains a known greeting.

    Matching is case-insensitive and works on whole words/phrases, so "this"
    does not count as "hi". Punctuation around a word is ignored ("hi!" still
    matches) and multi-word entries of GREET_INPUTS such as "what's up" are
    recognised.

    Args:
        sentence: The user's message.

    Returns:
        One entry of GREET_RESPONSES chosen at random, or None when the
        message contains no greeting.
    """
    # Strip punctuation only from the ends of each word so the apostrophe
    # inside "what's" survives.
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so a phrase can only match on word boundaries.
    padded = " " + " ".join(word for word in words if word) + " "
    for phrase in GREET_INPUTS:
        if " " + phrase + " " in padded:
            return random.choice(GREET_RESPONSES)
    return None

**Response generation**

In [10]:
# TfidfVectorizer converts text documents into TF-IDF feature vectors: a term's
# weight grows with its frequency in a document and shrinks the more documents
# contain that term.
from sklearn.feature_extraction.text import TfidfVectorizer
# cosine_similarity scores how similar two vectors are; here it is used to
# compare the TF-IDF vectors of two pieces of text.
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
def response(user_response):
    """Return the corpus sentence most similar to the user's message.

    The query is NOT vectorized from `user_response`: the function compares
    the last entry of the module-level `sent_tokens` list against every entry
    of that list. The caller must therefore append the user's message to
    `sent_tokens` before calling this function and remove it afterwards.

    Args:
        user_response: The user's message. Kept for interface compatibility;
            the computation reads the query from `sent_tokens[-1]`.

    Returns:
        str: The entry of `sent_tokens` with the second-highest cosine
        similarity to the query, or an apology when that similarity is 0 or
        when there is nothing to compare the query against.
    """
    fallback = "I am sorry! I don't understand you"

    # With fewer than two entries there is no second-ranked sentence to return
    # (indexing [-2] below would raise IndexError).
    if len(sent_tokens) < 2:
        return fallback

    # TF-IDF over all sentences, tokenized and lemmatized by LemNormalize,
    # with English stop words removed.
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)

    # Similarity of the last entry (the query) to every entry, itself included.
    vals = cosine_similarity(tfidf[-1], tfidf)

    # Rank once, ascending. The second-highest entry is taken because a query
    # with any non-stop-word term matches itself with the top score.
    ranked = vals.argsort()[0]
    idx = ranked[-2]
    req_tfidf = vals[0][idx]

    # A score of exactly 0 means no shared terms with the second-ranked entry.
    if req_tfidf == 0:
        return fallback
    return sent_tokens[idx]


### **Defining conversation start/end protocols**





In [12]:
# Loop control: stays True until the user says bye or thanks
flag = True

# Introduction message
print("BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!")

# Main conversation loop
while flag:
    # Normalize the input so commands match regardless of case or stray
    # leading/trailing whitespace
    user_response = input().strip().lower()

    if user_response == 'bye':
        # If the user says bye, end the conversation
        flag = False
        print("BOT: Goodbye! Take care <3 ")
    elif user_response in ('thanks', 'thank you'):
        # If the user says thanks, end the conversation
        flag = False
        print("BOT: You are welcome..")
    else:
        # Call greet() once and reuse the result; it picks a random reply
        # on every call
        greeting = greet(user_response)
        if greeting is not None:
            print("BOT: " + greeting)
        else:
            # Not a greeting: look the input up in the corpus.
            # word_tokens / final_words are not read by response(); they are
            # kept up to date in case other cells use them.
            word_tokens = word_tokens + nltk.word_tokenize(user_response)
            final_words = list(set(word_tokens))
            # response() reads the query from the end of sent_tokens, so add
            # it temporarily
            sent_tokens.append(user_response)
            print("BOT: ", end="")
            try:
                print(response(user_response))
            finally:
                # Remove exactly the entry appended above, even if response()
                # raised. list.remove() would delete the first equal element,
                # which could be a real corpus sentence.
                sent_tokens.pop()

BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!
tell me about data science
BOT: 



[6] however, data science is different from computer science and information science.
hi
BOT: *nods*
bye
BOT: Goodbye! Take care <3 


In [13]:
# Second run of the conversation loop (same logic as the previous cell).
# Loop control: stays True until the user says bye or thanks
flag = True

# Introduction message
print("BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!")

# Main conversation loop
while flag:
    # Normalize the input so commands match regardless of case or stray
    # leading/trailing whitespace
    user_response = input().strip().lower()

    if user_response == 'bye':
        # If the user says bye, end the conversation
        flag = False
        print("BOT: Goodbye! Take care <3 ")
    elif user_response in ('thanks', 'thank you'):
        # If the user says thanks, end the conversation
        flag = False
        print("BOT: You are welcome..")
    else:
        # Call greet() once and reuse the result; it picks a random reply
        # on every call
        greeting = greet(user_response)
        if greeting is not None:
            print("BOT: " + greeting)
        else:
            # Not a greeting: look the input up in the corpus.
            # word_tokens / final_words are not read by response(); they are
            # kept up to date in case other cells use them.
            word_tokens = word_tokens + nltk.word_tokenize(user_response)
            final_words = list(set(word_tokens))
            # response() reads the query from the end of sent_tokens, so add
            # it temporarily
            sent_tokens.append(user_response)
            print("BOT: ", end="")
            try:
                print(response(user_response))
            finally:
                # Remove exactly the entry appended above, even if response()
                # raised. list.remove() would delete the first equal element,
                # which could be a real corpus sentence.
                sent_tokens.pop()

BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!
hi
BOT: I am glad! You are talking to me
Data Science
BOT: [6] however, data science is different from computer science and information science.
NLP
BOT: I am sorry! I don't understand you
Neutral language processing
BOT: I am sorry! I don't understand you
bye
BOT: Goodbye! Take care <3 
