In [12]:
import numpy as np
import nltk
nltk.download('stopwords')
import string
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Importing and reading the corpus
# The context manager closes the file handle as soon as the text is read;
# the previous bare open() left it open for the lifetime of the kernel.
with open('yugas.txt', 'r', errors='ignore') as f:  # errors='ignore' drops undecodable bytes
    raw_doc = f.read()
raw_doc = raw_doc.lower()  # converts text to lower case
nltk.download('punkt')  # Using the punkt tokenizer
nltk.download('wordnet')  # Using the WordNet dictionary
sent_tokens = nltk.sent_tokenize(raw_doc)  # converts doc to list of sentences
word_tokens = nltk.word_tokenize(raw_doc)  # converts doc to list of words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Text preprocessing: shared lemmatizer and English stop-word set used by LemNormalize
lemmer = nltk.stem.WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))
def LemNormalize(text):
    """Return the lemmatized content tokens of ``text``.

    The text is lower-cased and word-tokenized with NLTK. A token is dropped
    when it is in the English stop-word set or when it occurs inside
    ``string.punctuation`` (a substring test on that string); every remaining
    token is lemmatized with the WordNet lemmatizer.

    Args:
        text: The string to normalize.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    lemmas = []
    for tok in nltk.word_tokenize(text.lower()):
        # Skip stop words and punctuation before paying for lemmatization
        if tok in stopwords or tok in string.punctuation:
            continue
        lemmas.append(lemmer.lemmatize(tok))
    return lemmas

In [15]:
# Training Naive Bayes classifier
# The corpus sentences are vectorized with TF-IDF, using LemNormalize as the
# tokenizer, and the resulting matrix is shared with the SVM cell as X_train.
tfidf_vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
X_train = tfidf_vectorizer.fit_transform(sent_tokens)
# Each sentence is its own class: label i stands for sent_tokens[i], so a
# predicted label can be used directly as an index into sent_tokens.
y_train = np.arange(len(sent_tokens))  # one unique label per sentence
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)



In [16]:
# Training Support Vector Machine (SVM) classifier
# Uses the same TF-IDF features (X_train) and per-sentence labels (y_train)
# as the Naive Bayes classifier, with a linear kernel.
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

In [17]:
# Preparing the vocabulary for the LSTM model
# The Keras tokenizer is fitted on the corpus sentences; it supplies the
# word <-> integer mapping used to build training sequences and to decode
# predictions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent_tokens)
# Vocabulary size used for the Embedding input and the softmax output.
# NOTE(review): the +1 presumably accounts for index 0, which pad_sequences
# uses as the padding value - confirm against the Keras Tokenizer docs.
total_words = len(tokenizer.word_index) + 1

In [24]:
# Build next-word training samples: for every sentence, each prefix of two or
# more tokens becomes one sample whose last token is the prediction target.
input_sequences = []
for line in sent_tokens:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Fail with a readable message instead of max() raising on an empty list
if not input_sequences:
    raise ValueError("No training sequences could be built: the corpus needs at least one sentence with two or more known words.")

# Left-pad every sample to the longest one so they stack into a single array
max_sequence_len = max(len(x) for x in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Inputs are all tokens but the last; the target is the last token, one-hot encoded
X_train_lstm, y_train_lstm = input_sequences[:,:-1],input_sequences[:,-1]
y_train_lstm = tf.keras.utils.to_categorical(y_train_lstm, num_classes=total_words)

lstm_model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    LSTM(150, return_sequences=True),
    LSTM(150),
    Dense(total_words, activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The model was previously only compiled, never fitted, so lstm_response was
# sampling from randomly initialised weights. Train it on the samples built above.
LSTM_EPOCHS = 20  # tune to the corpus size; more epochs = better fit but slower cell
lstm_history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=LSTM_EPOCHS, verbose=1)

In [25]:
# Defining the greeting function
GREET_INPUTS = ("hello", "hi", "greetings", "sup", "whats up", "hey")
GREET_RESPONSES = ["hi", "hey", "nods", "hi there", "hello", "I am glad! you are talking to me"]

# Apostrophes are deleted ("what's" -> "whats"); every other punctuation mark
# becomes a space so that "hi,there" still splits into separate words.
_GREET_PUNCT_TABLE = str.maketrans(
    {ch: (None if ch == "'" else " ") for ch in string.punctuation}
)

def greet(sentence):
    """Return a random greeting if ``sentence`` contains a known greeting.

    The sentence is lower-cased and stripped of punctuation, then every entry
    of ``GREET_INPUTS`` is matched as a whole word or whole phrase. Matching
    phrases rather than single words is what lets the multi-word entry
    "whats up" be recognised; splitting the sentence into single words, as
    before, could never match it.

    Args:
        sentence: The raw user input.

    Returns:
        A random element of ``GREET_RESPONSES`` when a greeting is found,
        otherwise ``None``.
    """
    words = sentence.lower().translate(_GREET_PUNCT_TABLE).split()
    # Pad with spaces so substring search only matches on word boundaries
    # (e.g. "sup" must not match inside "superb").
    padded = " " + " ".join(words) + " "
    for phrase in GREET_INPUTS:
        if " " + phrase + " " in padded:
            return random.choice(GREET_RESPONSES)
    return None


In [26]:
# Response generation using Naive Bayes classifier
def nb_response(user_response):
    """Return the corpus sentence the Naive Bayes classifier picks for the input.

    Args:
        user_response: The raw user query string.

    Returns:
        The element of ``sent_tokens`` whose index equals the predicted label.
    """
    # Vectorize with the already-fitted TF-IDF vocabulary (single-row matrix)
    query_features = tfidf_vectorizer.transform([user_response])
    predicted_labels = nb_classifier.predict(query_features)
    # Labels were assigned as sentence indices at training time
    return sent_tokens[predicted_labels[0]]


In [27]:
# Response generation using Support Vector Machine (SVM) classifier
def svm_response(user_response):
    """Return the corpus sentence the SVM classifier picks for the input.

    Args:
        user_response: The raw user query string.

    Returns:
        The element of ``sent_tokens`` whose index equals the predicted label.
    """
    # Vectorize with the already-fitted TF-IDF vocabulary (single-row matrix)
    query_features = tfidf_vectorizer.transform([user_response])
    predicted_labels = svm_classifier.predict(query_features)
    # Labels were assigned as sentence indices at training time
    return sent_tokens[predicted_labels[0]]

In [28]:
# Response generation using LSTM
def lstm_response(seed_text, num_words=10):
    """Extend ``seed_text`` with words predicted greedily by the LSTM model.

    On each step the current text is tokenized, left-padded to the model's
    input length, and the most probable next word is appended.

    Args:
        seed_text: The text to continue (typically the user's message).
        num_words: Maximum number of words to append. Defaults to 10.

    Returns:
        ``seed_text`` followed by up to ``num_words`` generated words,
        separated by single spaces. Generation stops early if the model
        predicts an index that has no word.
    """
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0 keeps the per-call Keras progress bar out of the chat output
        predicted_probs = lstm_model.predict(token_list, verbose=0)[0]
        predicted_index = int(np.argmax(predicted_probs))
        # Direct reverse lookup instead of scanning the whole vocabulary.
        # The softmax has one more unit than there are words, so the argmax can
        # be an index with no word (the padding slot); stop rather than crash.
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None:
            break
        seed_text += " " + predicted_word
    return seed_text

In [None]:
# Defining conversation start/end protocols
# Reads user input until an exit phrase is typed. Commands are compared after
# trimming whitespace and lower-casing, so "bye!", "Bye!" and " Thanks " all work.
flag = True
print("BOT: My name is Ramya. Let's have a conversation. Also, if you want to exit anytime just type 'Bye!'")
while flag:
    user_response = input().strip()
    if not user_response:
        # Nothing typed: ask again instead of sending an empty query to the models
        continue
    command = user_response.lower()
    if command in ('bye!', 'bye'):
        flag = False
        print("BOT: Goodbye! Take care <3")
    elif command in ('thanks', 'thank you'):
        flag = False
        print("BOT: You are welcome")
    else:
        # Call greet once and reuse the result rather than drawing two random replies
        greeting = greet(user_response)
        if greeting is not None:
            print("BOT: " + greeting)
        else:
            print("BOT (Naïve Bayes):", nb_response(user_response))  # Generate response using Naïve Bayes
            print("BOT (SVM):", svm_response(user_response))  # Generate response using SVM
            print("BOT (LSTM):", lstm_response(user_response))  # Generate response using LSTM