In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import numpy as np

In [2]:
# Fetch the NLTK data packages used by the rest of this notebook
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)
# Left as the cell's final expression so its return value is still displayed
nltk.download('nps_chat')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


True

In [3]:
# Load the NPS Chat corpus posts (XML elements exposing .text and .get('class')).
# xml_posts() hands back a lazy corpus view; the notebook walks the posts several
# times (stemming, lemmatization, Word2Vec training) and indexes into them from the
# chatbot, so read them into a plain list once instead of re-parsing on every pass.
posts = list(nltk.corpus.nps_chat.xml_posts())

In [4]:
# Preprocessing functions

# Built once up front: stopwords.words('english') returns a list, and the previous
# code evaluated it for every single token (repeated load + linear membership scan).
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Turn raw text into a list of cleaned, lower-cased word tokens.

    Steps: lower-case the text, tokenize it with ``word_tokenize``, then keep
    only tokens that are purely alphabetic and are not English stopwords.

    Args:
        text: The raw string to clean. ``None`` or an empty string is accepted.

    Returns:
        list[str]: The surviving tokens in their original order (possibly empty).
    """
    if not text:
        # Nothing to tokenize (also covers a post whose text is missing)
        return []
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in ENGLISH_STOPWORDS
    ]

In [5]:
# Stem every post: clean/tokenize it, run each token through the Porter stemmer,
# then join the stems back into one space-separated string per post
stemmer = PorterStemmer()
stemmed_posts = [
    ' '.join(map(stemmer.stem, preprocess(post.text)))
    for post in posts
]

In [6]:
# Lemmatize every post: clean/tokenize it, map each token to its WordNet lemma,
# then join the lemmas back into one space-separated string per post
lemmatizer = WordNetLemmatizer()
lemmatized_posts = [
    ' '.join(map(lemmatizer.lemmatize, preprocess(post.text)))
    for post in posts
]

# Two document-term representations of the lemmatized posts:
# raw term counts (Bag of Words) and TF-IDF weights
count_vectorizer, tfidf_vectorizer = CountVectorizer(), TfidfVectorizer()

# Each vectorizer learns its own vocabulary from the corpus and encodes it in one step
bow_matrix = count_vectorizer.fit_transform(lemmatized_posts)
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_posts)


In [9]:
# Fit a Word2Vec model on the cleaned (tokenized, stopword-free) posts.
# NOTE(review): the training tokens come straight from preprocess(), i.e. they are
# not lemmatized, while lemmatized_posts are -- confirm this mismatch is intended.
word2vec_model = Word2Vec(
    sentences=[preprocess(post.text) for post in posts],
    min_count=1,  # keep every token, including those that occur only once
)

# Define function to get Word2Vec embeddings
def get_word2vec_embeddings(tokens, model):
    """Average the Word2Vec vectors of the tokens the model knows about.

    Args:
        tokens: Iterable of string tokens.
        model: Object exposing ``wv`` (with ``key_to_index`` and item lookup)
            and ``vector_size``, e.g. a trained gensim ``Word2Vec`` model.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all in-vocabulary
        tokens, or a zero vector of length ``model.vector_size`` when no token
        is in the vocabulary.
    """
    # Out-of-vocabulary tokens are skipped rather than raising a KeyError
    known_vectors = [
        model.wv[tok] for tok in tokens if tok in model.wv.key_to_index
    ]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


In [None]:
# Create chatbot function

# Remembers the post embeddings together with the model/corpus objects they were
# built from, so they are recomputed only when one of those objects is replaced.
_post_embedding_cache = {"model": None, "corpus": None, "matrix": None}


def _post_embedding_matrix():
    """Return one averaged Word2Vec vector per lemmatized post (computed once, then reused)."""
    cache = _post_embedding_cache
    if cache["model"] is not word2vec_model or cache["corpus"] is not lemmatized_posts:
        cache["matrix"] = np.array([
            get_word2vec_embeddings(word_tokenize(post), word2vec_model)
            for post in lemmatized_posts
        ])
        cache["model"] = word2vec_model
        cache["corpus"] = lemmatized_posts
    return cache["matrix"]


def simple_chatbot(user_input):
    """Label the user's message with the class of the most similar corpus post.

    The message is cleaned with ``preprocess`` and lemmatized token by token
    (the same steps that produced ``lemmatized_posts``), averaged into a single
    Word2Vec vector, and compared by cosine similarity with the averaged vector
    of every post.

    Args:
        user_input: The raw message typed by the user.

    Returns:
        The ``class`` attribute of the best-matching post. When no token of the
        message is in the Word2Vec vocabulary the query vector is all zeros, so
        no post is a meaningful match and the label of the first
        highest-scoring post is returned.
    """
    # lemmatize() works on a single word, so it must be applied per token rather
    # than to the whole sentence. Going through preprocess() first also drops
    # stopwords before lemmatization could turn them into unrelated words.
    user_tokens = [lemmatizer.lemmatize(token) for token in preprocess(user_input)]
    user_input_vec = get_word2vec_embeddings(user_tokens, word2vec_model)

    # One batched call: row 0 holds the similarity of the query to every post
    similarity_scores = cosine_similarity([user_input_vec], _post_embedding_matrix())[0]

    best_match_index = int(np.argmax(similarity_scores))
    response = posts[best_match_index].get('class')
    return response

# Test the chatbot interactively; type "exit" to stop
while True:
    try:
        # Strip surrounding whitespace so inputs like "exit " still end the session
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or kernel interrupted: end the session without a traceback
        break
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # A blank line has nothing to classify; prompt again
        continue
    response = simple_chatbot(user_input)
    print("Chatbot:", response)

You: hi
Chatbot: Greet
You: how are you today
Chatbot: ynQuestion
You: i am really good
Chatbot: Statement
You: thats good to hear
Chatbot: Statement
You: it was nice chatting with you
Chatbot: Statement
You: bye
Chatbot: Bye
