# Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import nltk
import string
import re
import random
from datetime import datetime
import glob

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt sentence/word
# tokenizer models and the WordNet corpus used by WordNetLemmatizer.
# 'punkt_tab' is the resource name newer NLTK releases look up instead of the
# pickled 'punkt' models; requesting both keeps the notebook runnable on either.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Accessing the Files from GitHub to Read

### Load and Preprocess the Data

In [60]:
!git clone https://github.com/sdey96/Chatbot.git

fatal: destination path 'Chatbot' already exists and is not an empty directory.


In [61]:
import os

# The repository is cloned into the current working directory, so resolve the
# corpus folder relative to it rather than hard-coding Colab's /content path.
# On Colab (working directory /content) this yields the same value as before.
path = os.path.join(os.getcwd(), 'Chatbot')

In [62]:
# Collect every .txt file in the cloned repository. glob returns matches in
# arbitrary, filesystem-dependent order, so sort them: the order of the combined
# corpus decides which sentence wins a tie in find_answer(), and it should be
# the same on every run and machine.
file_paths = sorted(glob.glob(os.path.join(path, '*.txt')))

In [63]:
# Read each corpus file as lower-cased text. The encoding is pinned to UTF-8
# (the corpus contains non-ASCII characters such as en dashes) so decoding does
# not depend on the platform default; undecodable bytes are still dropped.
raw_docs = []
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        raw_docs.append(f.read().lower())

### Combining into a Single String

In [64]:
# Concatenate every loaded document into one space-separated corpus string.
combined_raw_doc = ' '.join(raw_docs)

In [65]:
# Preview only the beginning of the corpus; printing the whole string floods
# the notebook output and bloats the saved file.
print(combined_raw_doc[:1000])

random forests or random decision forests is an ensemble learning method for classification, regression and other tasks that operates by constructing a multitude of decision trees at training time. for classification tasks, the output of the random forest is the class selected by most trees. for regression tasks, the mean or average prediction of the individual trees is returned.[1][2] random decision forests correct for decision trees' habit of overfitting to their training set.[3]: 587–588 

the first algorithm for random decision forests was created in 1995 by tin kam ho[1] using the random subspace method,[2] which, in ho's formulation, is a way to implement the "stochastic discrimination" approach to classification proposed by eugene kleinberg.[4][5][6]

an extension of the algorithm was developed by leo breiman[7] and adele cutler,[8] who registered[9] "random forests" as a trademark in 2006 (as of 2019, owned by minitab, inc.).[10] the extension combines breiman's "bagging" idea

# Tokenization and Lemmatization

In [66]:
# Shared NLP objects, built once from the combined corpus.

# Maps each punctuation code point to None (the mapping form str.translate
# uses to delete characters).
punctuation_removal = {ord(symbol): None for symbol in string.punctuation}

# Word-level tokens of the whole corpus.
tokens = nltk.word_tokenize(combined_raw_doc)

# WordNet-based lemmatizer instance.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [67]:
def LemNormalize(text):
    """Lower-case, strip punctuation, tokenize and lemmatize ``text``.

    Args:
        text: Raw string, e.g. a user query or a corpus sentence.

    Returns:
        list[str]: The lemmatized word tokens in their original order.
    """
    # Drop every character that is neither a word character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # Reduce each token to its WordNet lemma so inflected forms (for example a
    # plural and its singular) compare equal when queries are matched against
    # corpus sentences. Uses the module-level `lemmatizer`.
    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(cleaned)]

### Store Sentences for Searching

In [68]:
# Split the corpus into sentences; find_answer() searches these and returns
# the best-matching one verbatim.
sent_tokens = nltk.sent_tokenize(combined_raw_doc)

# Greetings & Responses

In [69]:
# Greetings the bot recognises in user input (single words and short phrases).
greetings = [
    "hello",
    "hi",
    "greetings",
    "sup",
    "what's up",
    "hey",
    "howdy",
    "good day",
    "yo",
    "hi there",
    "hiya",
]

# Replies to a recognised greeting; one is picked at random.
greeting_responses = [
    "Hi there!",
    "Hey!",
    "*nods*",
    "Hello!",
    "Greetings!",
    "Hi!",
    "Howdy!",
    "Good day to you!",
    "Yo!",
    "Hi there!",
    "Hello, how can I help?",
]

In [70]:
# Phrases that count as the user asking how the bot is doing; greet() looks
# for them as substrings of the user's message.
how_are_you_phrases = [
    "how are you",
    "how's it going",
    "how do you do",
    "how are things",
]

# Replies to a "how are you" style question; one is picked at random.
how_are_you_responses = [
    "I'm just a program, but I'm functioning as expected! How about you?",
    "I'm here to assist you! How can I help today?",
    "Doing well, thank you! What can I do for you today?",
    "All systems operational! How are you?",
]

In [71]:
# User replies treated as a positive answer to the bot's "how are you?".
positive_responses = [
    "good",
    "all good",
    "well",
    "fine",
    "okay",
    "great",
]

### Context to track the chatbot's last question

In [72]:
# Conversation state shared by greet() and chatbot(): remembers which question
# the bot asked last (None when there is no pending question).
context = dict(last_question=None)

In [73]:
def greet(sentence):
    """Return a canned reply if ``sentence`` is a greeting or a "how are you".

    "How are you" phrases are checked first, as substrings of the lower-cased
    sentence, so a message such as "hi how are you" is answered as a question
    about the bot. On a match, ``context["last_question"]`` is set to
    ``"how_are_you"`` so the caller can interpret the user's next message.

    Greetings are matched as whole words or whole phrases after punctuation is
    stripped, so multi-word greetings ("what's up", "good day", "hi there")
    and punctuated ones ("hi!") are recognised.

    Args:
        sentence: The user's message.

    Returns:
        str | None: A randomly chosen reply, or None when the sentence is
        neither a greeting nor a "how are you" question.
    """
    text = sentence.lower()

    if any(phrase in text for phrase in how_are_you_phrases):
        context["last_question"] = "how_are_you"
        return random.choice(how_are_you_responses)

    # Rebuild the message as space-separated words (apostrophes kept) and pad it
    # with spaces, so " <greeting> " only matches on word boundaries.
    words = re.findall(r"[\w']+", text)
    padded = " " + " ".join(words) + " "
    if any(f" {greeting} " in padded for greeting in greetings):
        return random.choice(greeting_responses)

    return None

### Simple Keyword-based Response Generation

In [74]:
def find_answer(user_response):
    """Return the corpus sentence sharing the most distinct tokens with the query.

    Both the query and every sentence in ``sent_tokens`` are normalized with
    ``LemNormalize``; the score of a sentence is the number of distinct tokens
    it has in common with the query. On a tie, the earliest sentence wins.

    Args:
        user_response: The user's question.

    Returns:
        str: The best-matching sentence, or an apology message when no
        sentence shares any token with the query.
    """
    query_terms = set(LemNormalize(user_response))

    # Score every sentence by its number of shared distinct tokens.
    scored = [
        (len(query_terms & set(LemNormalize(candidate))), candidate)
        for candidate in sent_tokens
    ]

    # max() keeps the first of several equally scored entries, i.e. the
    # earliest sentence in the corpus.
    top_score, top_sentence = max(scored, key=lambda pair: pair[0], default=(0, ""))

    if top_score > 0 and top_sentence:
        return top_sentence
    return "I am sorry, I don't have information on that."

### Recognizing and Responding to Date Queries

In [75]:
def handle_special_queries(user_response):
    """Answer questions about today's date or the current weekday.

    Phrases are matched case-insensitively on word boundaries. In particular
    the bare keyword "day" only matches as a standalone word, so unrelated
    messages containing "today", "yesterday", "holiday", ... no longer trigger
    the weekday reply. Date phrases take precedence over day phrases.

    Args:
        user_response: The user's message.

    Returns:
        str | None: A sentence with the current date or weekday, or None when
        the message is not a date/day query.
    """
    date_phrases = ["what is the date", "what's the date", "today's date", "current date", "what date is it"]
    day_phrases = ["what day is it", "what's the day", "what is the day today", "what's the day today", "day"]

    text = user_response.lower()

    def mentions(phrases):
        # Whole-word / whole-phrase match instead of a raw substring test.
        return any(re.search(r"\b" + re.escape(phrase) + r"\b", text) for phrase in phrases)

    if mentions(date_phrases):
        return f"Today's date is {datetime.now().strftime('%B %d, %Y')}."
    if mentions(day_phrases):
        return f"Today is {datetime.now().strftime('%A')}."
    return None

In [43]:
# Disabled experiment, kept as an inert string literal: a per-user context
# store with set_context()/get_context() helpers. Nothing here is executed.
# NOTE(review): if this is ever re-enabled as real code, `context = {}` would
# rebind the module-level `context` dict that greet() and chatbot() use to
# track "last_question" — confirm the intended shape before reviving it.
'''
context = {}
def set_context(user, context_data):
    context[user] = context_data

def get_context(user):
    return context.get(user, {})
'''

### Fallback Options and Clarifying Options

In [76]:
def fallback():
    """Return the message asking the user to rephrase an unclear request."""
    rephrase_prompt = "I'm not sure I understand. Could you please rephrase?"
    return rephrase_prompt

def clarify():
    """Return the opening of a clarifying question for the user."""
    clarifying_prompt = "Do you mean...?"
    return clarifying_prompt

# Main Chatbot Loop

In [81]:
def chatbot():
    """Run Ruby's interactive console loop until the user ends the chat.

    Each turn reads one line from the user (trimmed and lower-cased) and then:

    1. ends the session on "bye" or on a thank-you message;
    2. if Ruby asked "how are you" on the previous turn and the user answers
       with one of ``positive_responses`` (matched as whole words/phrases),
       offers help;
    3. otherwise answers with the first non-empty result of ``greet``,
       ``handle_special_queries`` and ``find_answer``, in that order.

    The pending "how are you" state only applies to the turn right after the
    question; it is cleared afterwards so a later, unrelated message is not
    mistaken for an answer to it.

    Returns:
        None. All interaction happens through ``input`` and ``print``.
    """
    print("Ruby: My name is Ruby. Let's have a conversation! Also, if you want to exit any time, just type Bye!")

    while True:
        try:
            user_response = input("You: ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the cell was interrupted: leave politely
            # instead of surfacing a traceback.
            print("Ruby: Bye! Take care...")
            break

        if user_response == 'bye':
            print("Ruby: Bye! Take care...")
            break

        if user_response in ('thanks', 'thank you', 'thanks and bye'):
            print("Ruby: You're welcome! Have a nice day!")
            break

        # Consume the pending question before dispatching, so it cannot leak
        # into later turns; greet() may set it again further down.
        awaiting_mood = context.get("last_question") == "how_are_you"
        context["last_question"] = None

        if awaiting_mood:
            # Whole-word / whole-phrase match: a raw substring test would treat
            # e.g. "define ..." as containing the positive reply "fine".
            words = re.findall(r"[\w']+", user_response)
            padded = " " + " ".join(words) + " "
            if any(f" {reply} " in padded for reply in positive_responses):
                print("Ruby: How can I help you?")
                continue

        response_text = (
            greet(user_response)
            or handle_special_queries(user_response)
            or find_answer(user_response)
        )
        if response_text:
            print("Ruby:", response_text)
        else:
            # Safety net: find_answer() currently always returns a non-empty
            # string, so this branch only runs if that ever changes.
            print("Ruby:", clarify())
            print("Ruby:", fallback())

# Executing the Bot

In [None]:
# Start the interactive session; the cell keeps running until the user ends the
# chat (e.g. by typing "bye").
chatbot()

Ruby: My name is Ruby. Let's have a conversation! Also, if you want to exit any time, just type Bye!
You: hi
Ruby: Yo!
You: how are you
Ruby: All systems operational! How are you?
You: all good
Ruby: How can I help you?
You: what is the date
Ruby: Today's date is July 27, 2024.
You: what is data science
Ruby: data science is an interdisciplinary academic field[1] that uses statistics, scientific computing, scientific methods, processes, scientific visualization, algorithms and systems to extract or extrapolate knowledge and insights from potentially noisy, structured, or unstructured data.
You: what is decision tree
Ruby: the left tree is the decision tree we obtain from using information gain to split the nodes and the right tree is what we obtain from using the phi function to split the nodes.
You: what is the day
Ruby: Today is Saturday.
You: who coined the term data science
Ruby: [20] in 1985, in a lecture given to the chinese academy of sciences in beijing, c. f. jeff wu used the 