In [5]:
pip install pandas nltk scikit-learn



In [8]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fetch the NLTK resources used by this notebook. nltk.download reports
# "already up-to-date" for packages that are present, so re-running is harmless.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Shared preprocessing objects: the English stopword set and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase, tokenize with NLTK, drop tokens that are not purely
    alphanumeric or that are English stopwords, then lemmatize what remains.

    Args:
        text: The raw text. Missing values (None / NaN / pd.NA, as produced by
            pandas for blank CSV cells) are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The processed tokens joined by single spaces ("" if nothing survives).
    """
    if not isinstance(text, str):
        # Blank CSV cells arrive as float NaN, which has no .lower(); map every
        # missing-value scalar to the empty string instead of crashing.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)

    # 1. Tokenization and lowercasing
    tokens = nltk.word_tokenize(text.lower())

    # 2. Stopword removal and lemmatization (isalnum also discards punctuation tokens)
    processed_tokens = [
        lemmatizer.lemmatize(token) for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    return " ".join(processed_tokens)

In [9]:
# Load the self-created CSV dataset.
# Failures are raised as exceptions rather than calling exit(): inside a notebook
# exit() asks the kernel to shut down instead of just failing this cell.
try:
    df = pd.read_csv('faq_dataset.csv')
except FileNotFoundError as err:
    raise FileNotFoundError("Error: faq_dataset.csv not found.") from err

# Ensure columns are named correctly
if 'Question' not in df.columns or 'Answer' not in df.columns:
    raise ValueError("Error: CSV must have 'Question' and 'Answer' columns.")

# Preprocess all questions in the dataset. Blank cells are read as NaN, so they
# are replaced with "" (on a temporary Series; df['Question'] itself is untouched)
# to keep the text pipeline from failing on non-string values.
df['processed_question'] = (
    df['Question'].fillna('').astype(str).apply(preprocess_text)
)

In [10]:
# Initialize the TF-IDF vectorizer with scikit-learn's default settings.
# This same fitted instance is reused later to vectorize user queries.
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the preprocessed FAQ questions and build the TF-IDF
# matrix in one step (one row per entry of df['processed_question']).
question_tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_question'])

In [11]:
def get_best_answer(user_query, threshold=0.2):
    """Return the FAQ answer whose question is most similar to the user's query.

    The query is preprocessed with ``preprocess_text``, vectorized with the
    already-fitted ``tfidf_vectorizer``, and compared against every row of
    ``question_tfidf_matrix`` using cosine similarity.

    Args:
        user_query: The question typed by the user.
        threshold: Minimum cosine similarity (exclusive) the best match must
            reach for its answer to be returned. Defaults to 0.2; tune as needed.

    Returns:
        The value of ``df['Answer']`` for the best-matching question, or a
        fixed apology message when the query is empty / not a string or when
        no question scores above ``threshold``.
    """
    fallback = "I'm sorry, I don't have an answer for that. Please try rephrasing."

    # Nothing to match against: avoid failing inside preprocessing on None etc.
    if not isinstance(user_query, str) or not user_query.strip():
        return fallback

    # 1. Preprocess user input query
    processed_query = preprocess_text(user_query)

    # 2. Vectorize the user query using the *same* vectorizer
    query_tfidf_vector = tfidf_vectorizer.transform([processed_query])

    # 3. Compute cosine similarity. The result has shape (1, n_questions);
    #    take row 0 so the argmax below is an unambiguous question index.
    similarities = cosine_similarity(query_tfidf_vector, question_tfidf_matrix)[0]

    # 4. Find the index and score of the most similar question
    best_match_index = int(np.argmax(similarities))
    best_match_score = similarities[best_match_index]

    # 5. Return the best matching answer, or the fallback for weak matches
    if best_match_score > threshold:
        return df['Answer'].iloc[best_match_index]
    return fallback

In [12]:
# Scripted smoke test of the chatbot. Fixed queries are used instead of an
# interactive input() loop so this cell can run unattended (e.g. Run All).
TEST_QUERIES = [
    "How much is the admission fee?",
    "When do exams start?",
    "What is the hostel cost?",
    "What is the menu for the cafeteria?",  # an unknown question
]

print("--- Chatbot Test Run ---")

for query in TEST_QUERIES:
    answer = get_best_answer(query)
    print(f"Q: {query}")
    print(f"A: {answer}\n")  # \n adds a blank line between entries

print("--- End of Test Run ---")

--- Chatbot Test Run ---
Q: How much is the admission fee?
A: Admission fee is ₹5000.

Q: When do exams start?
A: Exams will begin in December as per the academic calendar.

Q: What is the hostel cost?
A: The annual hostel fee is ₹1,20,000.

Q: What is the menu for the cafeteria?
A: I'm sorry, I don't have an answer for that. Please try rephrasing.

--- End of Test Run ---
