In [3]:
import nltk
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# import spacy
# Shared WordNet lemmatizer instance; preprocess_text uses it to reduce each token to its lemma.
# NOTE(review): presumably requires the NLTK 'wordnet' corpus to be available locally — confirm.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [6]:
# Load the raw dialog transcript. The file has no header row and is split on ':',
# which yields column 0 (speaker label) and column 1 (what the speaker said).
data = pd.read_csv(
    'Samsung Dialog.txt',
    sep=':',
    header=None,
)
data.head()

Unnamed: 0,0,1
0,Customer,"Hi, I'm looking to buy a new phone, and I'm i..."
1,Sales Agent,"Great, we have a wide range of Samsung phones..."
2,Customer,"Well, I want a phone with a good camera, long..."
3,Sales Agent,Absolutely. We have a lot of great options th...
4,Customer,"No, I haven't. Tell me more about it."


In [7]:
# Separate the transcript by speaker; column 0 holds the speaker label.
cust = data[data[0] == 'Customer']
sales = data[data[0] == 'Sales Agent']

# Preview the agent's side of the conversation.
sales.head()

Unnamed: 0,0,1
1,Sales Agent,"Great, we have a wide range of Samsung phones..."
3,Sales Agent,Absolutely. We have a lot of great options th...
5,Sales Agent,The Galaxy S21 Ultra has a 108-megapixel came...
7,Sales Agent,"The Galaxy S21 Ultra starts at $1,199, but we..."
9,Sales Agent,The Galaxy S21 Ultra comes with a standard on...


In [8]:
# Pair each customer utterance with the sales-agent reply at the same position.
# Both series are re-indexed from 0 so they line up row by row; the frame takes its
# index from the questions, and answers are aligned onto that index.
df = (
    cust[1]
    .reset_index(drop=True)
    .to_frame('Questions')
    .assign(Answer=sales[1].reset_index(drop=True))
)

df

Unnamed: 0,Questions,Answer
0,"Hi, I'm looking to buy a new phone, and I'm i...","Great, we have a wide range of Samsung phones..."
1,"Well, I want a phone with a good camera, long...",Absolutely. We have a lot of great options th...
2,"No, I haven't. Tell me more about it.",The Galaxy S21 Ultra has a 108-megapixel came...
3,That sounds great. How much does it cost?,"The Galaxy S21 Ultra starts at $1,199, but we..."
4,"Okay, I'm interested. But I have a few more q...",The Galaxy S21 Ultra comes with a standard on...
5,That's good to know. And what about the opera...,"Yes, the Galaxy S21 Ultra runs on Android 11,..."
6,"Okay, that's good. But I'm also interested in...",Absolutely. The Galaxy A52 is a great mid-ran...
7,That sounds like a good option for me. How mu...,"The Galaxy A52 starts at $399, but again, we ..."
8,"Okay, I'll think about it. But can you also t...",Of course. The Galaxy Z Fold2 is a really uni...
9,"That sounds really cool, but it also sounds e...","The Galaxy Z Fold2 starts at $1,999, but agai..."


In [16]:
# Define a function for text preprocessing (including lemmatization)
def preprocess_text(text):
    """Normalise raw text into one lowercase, lemmatized, space-separated string.

    The text is split into sentences and each sentence into word tokens. Tokens
    that are not purely alphanumeric (punctuation, fragments such as "'m") are
    discarded; the remaining tokens are lowercased and lemmatized with the
    module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw utterance to clean.

    Returns
    -------
    str
        The cleaned sentences joined by single spaces.
    """
    def clean(sentence):
        # The alphanumeric filter is applied to the original token; lowercasing
        # happens afterwards, just before lemmatization.
        kept = filter(str.isalnum, nltk.word_tokenize(sentence))
        return ' '.join(lemmatizer.lemmatize(token.lower()) for token in kept)

    return ' '.join(clean(sentence) for sentence in nltk.sent_tokenize(text))


# Store a cleaned (lowercased, lemmatized, punctuation-free) copy of every customer question
# in a new column, leaving the original 'Questions' text untouched.
df['tokenized Questions'] = df['Questions'].apply(preprocess_text)
df.head()

Unnamed: 0,Questions,Answer,tokenized Questions
0,"Hi, I'm looking to buy a new phone, and I'm i...","Great, we have a wide range of Samsung phones...",hi i looking to buy a new phone and i interest...
1,"Well, I want a phone with a good camera, long...",Absolutely. We have a lot of great options th...,well i want a phone with a good camera long ba...
2,"No, I haven't. Tell me more about it.",The Galaxy S21 Ultra has a 108-megapixel came...,no i have tell me more about it
3,That sounds great. How much does it cost?,"The Galaxy S21 Ultra starts at $1,199, but we...",that sound great how much doe it cost
4,"Okay, I'm interested. But I have a few more q...",The Galaxy S21 Ultra comes with a standard on...,okay i interested but i have a few more questi...


In [18]:
# Fit a TF-IDF model on the preprocessed customer questions.
#
# TF-IDF (term frequency-inverse document frequency) is a numerical statistic that
# measures how important a word is to a document within a collection (corpus).
# fit_transform computes the TF-IDF value of every word in the corpus and returns a
# matrix in which each row is a document (question) and each column is a word; each
# cell holds the importance of that word in that document.
tfidf_vectorizer = TfidfVectorizer()
corpus = df['tokenized Questions'].tolist()
vectorised_corpus = tfidf_vectorizer.fit_transform(corpus)

In [None]:
def get_response(user_input):
    """Return the stored answer whose question best matches ``user_input``.

    The input is cleaned with ``preprocess_text``, vectorized with the fitted
    ``tfidf_vectorizer`` and compared against every question in
    ``vectorised_corpus`` by cosine similarity. The answer on the same row of
    ``df`` as the best-scoring question is returned.

    Parameters
    ----------
    user_input : str
        Raw text typed by the user.

    Returns
    -------
    str
        The matching entry of ``df['Answer']``, or a fallback message when the
        input has no similarity to any known question.
    """
    user_input_processed = preprocess_text(user_input)
    user_input_vector = tfidf_vectorizer.transform([user_input_processed])
    similarity_scores = cosine_similarity(user_input_vector, vectorised_corpus)

    # A best score of 0 means the input shares no vocabulary with any known question.
    # argmax would then silently pick row 0, so answer with a fallback instead.
    if similarity_scores.max() <= 0:
        return "Sorry, I don't have an answer for that. Could you rephrase your question?"

    most_similar_index = int(similarity_scores.argmax())

    # The answers live in df['Answer'], row-aligned with the questions the vectorizer
    # was fitted on. The raw `data` frame has no 'Answers' column.
    return df['Answer'].iloc[most_similar_index]


# Canned greetings the chatbot can open with.
greetings = [
    "Hey There.... I am a creation of Ehiz Danny Agba Coder.... How can I help",
    "Hi Human.... How can I help",
    'Twale baba nla, wetin dey happen nah',
    # The trailing comma matters: without it Python concatenates this literal with
    # the next one, merging two greetings into a single garbled message.
    'How far Alaye, wetin happen',
    "Good Day .... How can I help",
    "Hello There... How can I be useful to you today",
    "Hi Student.... How can I be of use",
]

# Inputs (compared in lowercase) that end the conversation.
exits = ['thanks bye', 'bye', 'quit', 'exit', 'bye bye', 'close']

# Canned goodbye messages.
farewell = ['Thanks....see you soon', 'Babye, See you soon', 'Bye... See you later', 'Bye... come back soon']

# One farewell and one greeting picked at random when this cell runs.
# Note: these are chosen once, so they stay the same until the cell is re-executed.
random_farewell = random.choice(farewell)
random_greetings = random.choice(greetings)

# Test your chatbot
# Interactive loop: reads a line from the user, then either says goodbye and stops,
# greets back, or answers with the closest match from get_response.
while True:
    user_input = input("You: ")

    # Normalise once so stray whitespace or casing (e.g. "Bye ") doesn't defeat
    # the keyword checks below.
    command = user_input.strip().lower()

    # Nothing typed: prompt again rather than sending an empty query to the matcher.
    if not command:
        continue

    if command in exits:
        # Pick the message per turn so it isn't frozen at cell-execution time.
        print(f"\nChatbot: {random.choice(farewell)}!")
        break
    if command in ['hi', 'hello', 'hey', 'hi there']:
        print(f"\nChatbot: {random.choice(greetings)}!")
    else:
        response = get_response(user_input)
        print(f"\nChatbot: {response}")