In [3]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK 'punkt' resource. This is a module-level call, so it runs
# every time this cell/module is executed.
# NOTE(review): nothing in the visible code calls into NLTK after this
# download — confirm whether it is still required.
nltk.download('punkt')

# Module 1: Data Preparation & NLP Model Development
def load_and_vectorize_data(file_path):
    """
    Load FAQ data from a CSV file and vectorize the questions using TF-IDF.

    Rows with a missing 'Question' or 'Answer' cell are dropped before
    vectorizing: a missing cell is read as NaN, which the vectorizer rejects
    as a document and which would otherwise be returned to users as an answer.
    Questions are converted to strings so non-text cells can still be
    vectorized.

    Parameters:
    - file_path (str): Path to the CSV file containing FAQ data with 'Question' and 'Answer' columns.

    Returns:
    - On success, a 4-tuple:
        - questions (list): List of FAQ questions (as strings).
        - answers (list): List of FAQ answers, aligned index-for-index with questions.
        - vectorizer (TfidfVectorizer): TF-IDF vectorizer fitted on the questions.
        - question_vectors (sparse matrix): TF-IDF vectors of the questions.
    - None if the file cannot be read, the expected columns are missing, no
      complete rows remain, or vectorizing fails. The error is printed.
    """
    try:
        # Load the FAQ data from the CSV file
        data = pd.read_csv(file_path)

        # Check if expected columns exist
        if 'Question' not in data.columns or 'Answer' not in data.columns:
            raise ValueError("CSV file must contain 'Question' and 'Answer' columns")

        # Keep only complete question/answer pairs so the two lists stay aligned
        faq = data.dropna(subset=['Question', 'Answer'])
        if faq.empty:
            raise ValueError("CSV file contains no complete question/answer rows")

        # The vectorizer needs text documents, so coerce questions to str
        questions = faq['Question'].astype(str).tolist()
        answers = faq['Answer'].tolist()

        # Fit the TF-IDF vocabulary on the questions and vectorize them
        vectorizer = TfidfVectorizer()
        question_vectors = vectorizer.fit_transform(questions)

        return questions, answers, vectorizer, question_vectors

    except Exception as e:
        # Deliberate best-effort boundary: callers test the result for
        # truthiness instead of handling exceptions.
        print(f"An error occurred: {e}")
        return None

# Module 2: Chatbot Logic & Integration
def find_closest_answer(user_question, vectorizer, question_vectors, answers, threshold=0.2):
    """
    Return the FAQ answer whose question is most similar to the user's input.

    Parameters:
    - user_question (str): The question typed by the user.
    - vectorizer (TfidfVectorizer): Vectorizer already fitted on the FAQ questions.
    - question_vectors (sparse matrix): Vectors of the FAQ questions produced by that vectorizer.
    - answers (list): FAQ answers, indexed in the same order as question_vectors rows.
    - threshold (float): Minimum cosine similarity the best match must reach.

    Returns:
    - str: The answer at the best-matching index when its similarity is at
      least `threshold`; otherwise a "please rephrase" message. If anything
      raises along the way, the error is printed and a generic apology is
      returned instead.
    """
    try:
        # Vectorize the query with the same fitted vectorizer as the FAQ questions
        query_vector = vectorizer.transform([user_question])

        # Similarity of the single query against every FAQ question
        scores = cosine_similarity(query_vector, question_vectors)

        # Position and score of the strongest match
        best = scores.argmax()
        best_score = scores[0, best]

        # Only answer when the best match is strong enough to trust
        return (
            answers[best]
            if best_score >= threshold
            else "Sorry, I couldn't find an answer to your question. Please try rephrasing it."
        )

    except Exception as err:
        print(f"An error occurred: {err}")
        return "Sorry, I couldn't find an answer to your question."

# Module 3: User Interface & Deployment
def chat_with_bot(questions, answers, vectorizer, question_vectors):
    """
    Simple command-line interface to chat with the FAQ bot.

    Reads questions from standard input until the user types 'exit'
    (case-insensitive, surrounding whitespace ignored), the input stream
    ends, or the user interrupts with Ctrl-C. Blank lines are ignored.

    Parameters:
    - questions (list): List of FAQ questions (not used by the loop itself; kept for interface compatibility).
    - answers (list): List of FAQ answers.
    - vectorizer (TfidfVectorizer): The TF-IDF vectorizer fitted on the FAQ questions.
    - question_vectors (sparse matrix): The TF-IDF vectors of the FAQ questions.

    Returns:
    - None
    """
    print("Welcome to the FAQ Chatbot! Type 'exit' to quit.")

    while True:
        # Get user input; a closed stream or Ctrl-C ends the chat cleanly
        # instead of surfacing a traceback.
        try:
            user_question = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nGoodbye!")
            break

        # Exit condition
        if user_question.lower() == 'exit':
            print("Goodbye!")
            break

        # Nothing to match against; prompt again
        if not user_question:
            continue

        # Find the closest answer and print it
        answer = find_closest_answer(user_question, vectorizer, question_vectors, answers)
        print("Bot:", answer)

# Sample usage of the complete FAQ chatbot system
if __name__ == "__main__":
    # Entry point: build the FAQ index from the CSV, preview it, then chat.
    file_path = 'BankFAQs.csv'
    results = load_and_vectorize_data(file_path)

    # The loader returns nothing on failure (it prints the error itself),
    # so only continue when there is data to work with.
    if results:
        questions, answers, vectorizer, question_vectors = results

        # Preview what was loaded: header line followed by the value
        print("Sample Questions:", questions[:5], sep="\n")
        print("\nSample Answers:", answers[:5], sep="\n")
        print("\nTF-IDF Matrix Shape:", question_vectors.shape, sep="\n")
        print("\nchatbot is ready. please type your question")

        # Hand control to the interactive loop
        chat_with_bot(questions, answers, vectorizer, question_vectors)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sample Questions:
['Do I need to enter ‘#’ after keying in my Card number/ Card expiry date/ CVV number', 'What details are required when I want to perform a secure IVR transaction', 'How should I get the IVR Password  if I hold an add-on card', 'How do I register my Mobile number for IVR Password ', 'How can I obtain an IVR Password ']

Sample Answers:
['Please listen to the recorded message and follow the instructions while entering your card details.', 'To perform a secure IVR transaction, you will need your 16-digit Card number, Card expiry date, CVV number, mobile number and IVR password.', 'An IVR password can be requested only from the registered mobile number and will be sent to the registered mobile number / email ID of the primary card holder only.', 'Please call our Customer Service Centre and ensure that your mobile number is updated in our records.', "By Sending SMS request: Send an SMS 'PWD<space>1234' to 9717465555 or to 5676712 from your registered (with Bank) mobile nu