In [None]:
import os
import re
import json
import pickle
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def clean_text(text):
    """Cleans text by removing punctuation while preserving spaces between words."""
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation but keep spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Ensure single spaces
    return text.lower()


def load_faq_data(files):
    """Loads FAQ data from JSON/TXT files and returns cleaned questions & answers."""
    questions, answers = [], []

    for file in files:
        try:
            with open(file, "r", encoding="utf-8") as f:
                first_line = f.readline().strip()
                f.seek(0)  # Reset file pointer

                # JSON Format
                if first_line.startswith("[") or first_line.startswith("{"):
                    try:
                        faq_data = json.load(f)
                        for entry in faq_data:
                            if "question" in entry and "answer" in entry:
                                questions.append(clean_text(entry["question"]))
                                answers.append(clean_text(entry["answer"]))
                    except json.JSONDecodeError:
                        print(f" Error loading JSON: {file} (Check formatting)")

                # TXT Format (Assumes "Question: Answer" format)
                else:
                    for line in f:
                        if ":" in line:
                            q, a = line.split(":", 1)
                            questions.append(clean_text(q))
                            answers.append(clean_text(a))

        except FileNotFoundError:
            print(f" File not found: {file}")
        except Exception as e:
            print(f" Error reading {file}: {e}")

    return questions, answers

# Define FAQ file paths
faq_files = [
    "/content/Aadhar_Faq.txt",
    "/content/Amazon_sagemaker_Faq.txt",
    "/content/faq_results.txt",
    "/content/HDFC_Faq.txt",
    "/content/Sevenhillshospital_faq.txt",
    "/content/Tata_comm_faq.txt"
]

# Load the data
questions, answers = load_faq_data(faq_files)

# Validate dataset
if not questions:
    raise ValueError(" Error: The 'questions' list is empty. Check your dataset!")

print(f" Loaded {len(questions)} questions.")
print("Sample Questions:", questions[:5])


summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_answer(answer):
    """Summarizes an answer only if it's long enough and not empty."""
    if not answer.strip():  # Skip empty answers
        return answer

    words = answer.split()
    if len(words) > 30:  # Only summarize if answer is long enough
        max_len = min(50, int(len(words) * 0.7))  # Adaptive max_length
        min_len = max(20, int(max_len * 0.5))  # Adaptive min_length

        try:
            summary = summarizer(answer, max_length=max_len, min_length=min_len, do_sample=False)
            return summary[0]['summary_text']
        except Exception as e:
            print(f" Summarization failed: {e}. Using the original answer.")
            return answer  # Return original text if summarization fails

    return answer  # Return original text if it's too short


# Remove empty answers before summarizing
filtered_answers = [ans for ans in answers if ans.strip()]

# Apply summarization only to longest valid answers
long_answers = sorted(filtered_answers, key=len, reverse=True)[:max(1, int(len(filtered_answers) * 0.1))]
summarized_long_answers = {ans: summarize_answer(ans) for ans in long_answers}

# Replace summarized answers in the original list
answers = [summarized_long_answers.get(ans, ans) for ans in answers]

print(f" Summarized {len(summarized_long_answers)} long answers.")


valid_questions, valid_answers = [], []
for q, a in zip(questions, answers):
    if len(q.split()) > 1:  # Ensure question has at least two words
        valid_questions.append(q)
        valid_answers.append(a)

if not valid_questions:
    raise ValueError(" Error: All questions are either empty or too short.")

# Train TF-IDF model
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(valid_questions)

# Save vectorizer & dataset
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("faq_data.pkl", "wb") as f:
    pickle.dump({"questions": valid_questions, "answers": valid_answers}, f)

print(f" TF-IDF model trained on {len(valid_questions)} questions and saved successfully.")



from sklearn.metrics.pairwise import cosine_similarity

def get_best_answer(user_query):
    """
    Finds the best-matching FAQ answer using cosine similarity.
    If similarity is too low, return a fallback response.
    """
    user_query_vec = vectorizer.transform([user_query])
    similarities = cosine_similarity(user_query_vec, tfidf_matrix)

    best_match_idx = similarities.argmax()
    best_score = similarities[0, best_match_idx]

    # Define a threshold for a valid match (adjust as needed)
    threshold = 0.2  # Experiment with different values

    if best_score < threshold:
        return "Sorry, I couldn't find an exact match. Please try rephrasing your question."

    return valid_answers[best_match_idx]



test_queries = [
    "How do I apply for Aadhaar?",
    "What are the required documents for enrolment?",
    "Do I have to pay for Aadhaar registration?",
    "Tell me about Amazon SageMaker.",
    "What is the process for opening an HDFC bank account?",
]

print("\n🔹 **FAQ Chatbot Predictions:**\n")
for query in test_queries:
    response = get_best_answer(query)
    print(f"User: {query}\nBot: {response}\n")


✅ Loaded 2367 questions.
Sample Questions: ['where can i enrol for aadhaar', 'what are the documents required for enrolment for aadhaar', 'do i need to bring original documents for aadhaar enrolment', 'do i have to pay any fee for aadhaar enrolment', 'what kind of data gets captured during aadhaar enrolment']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


⚠️ Summarization failed: index out of range in self. Using the original answer.
✅ Summarized 234 long answers.
✅ TF-IDF model trained on 2367 questions and saved successfully.

🔹 **FAQ Chatbot Predictions:**

User: How do I apply for Aadhaar?
Bot: The nro card can be used only in india process to apply. You may apply for the card by submitting the eage form to any of the below addresses.

User: What are the required documents for enrolment?
Bot: you need to fill up an application form available at enrolment centre along with supporting documents which are proof of identity poi proof of address poa proof of relationship por and date of birthdob document uidai accepts 31 poi and 44 poa 14 por and 14 dob documents view the nationally valid list of supporting documents

User: Do I have to pay for Aadhaar registration?
Bot: no aadhaar enrolment is totally free of cost therefore you need not pay anything at the enrolment centre

User: Tell me about Amazon SageMaker.
Bot: amazon sagemaker is 

In [2]:
!pip install flask flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [3]:
!pip install flask flask-ngrok pyngrok tensorflow numpy pillow scikit-learn



Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [5]:
import os
import pickle
from flask import Flask, request, render_template_string, jsonify
from pyngrok import ngrok
from sklearn.metrics.pairwise import cosine_similarity


app = Flask(__name__)


NGROK_AUTH_TOKEN = "2sAS7yHBpbbFZcQD2GcUdaLOCgO_3fM2kKzd1jjPhzCMvbouz"
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Load vectorizer and FAQ data
with open("/content/vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

with open("/content/faq_data.pkl", "rb") as f:
    faq_data = pickle.load(f)

questions = faq_data["questions"]
answers = faq_data["answers"]
tfidf_matrix = vectorizer.transform(questions)


def get_best_answer(user_query):
    """Finds the best-matching FAQ answer using cosine similarity."""
    user_query_vec = vectorizer.transform([user_query])
    similarities = cosine_similarity(user_query_vec, tfidf_matrix)
    best_match_idx = similarities.argmax()
    return answers[best_match_idx]


HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>FAQ Chatbot</title>
    <style>
        body {
            font-family: 'Arial', sans-serif;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
            background: linear-gradient(to right, #00c6ff, #0072ff);
            margin: 0;
        }
        .chat-container {
            width: 400px;
            background: white;
            border-radius: 10px;
            box-shadow: 0px 5px 15px rgba(0, 0, 0, 0.2);
            overflow: hidden;
        }
        .header {
            background: #0072ff;
            color: white;
            text-align: center;
            padding: 15px;
            font-size: 20px;
        }
        .chat-box {
            height: 350px;
            overflow-y: auto;
            padding: 15px;
            background: #f4f4f9;
        }
        .message {
            padding: 10px;
            margin: 5px 0;
            border-radius: 10px;
            max-width: 80%;
        }
        .user {
            background: #d1e7dd;
            text-align: right;
            margin-left: auto;
        }
        .bot {
            background: #f8d7da;
            text-align: left;
            margin-right: auto;
        }
        .input-area {
            display: flex;
            padding: 10px;
            background: white;
            border-top: 1px solid #ddd;
        }
        input {
            flex: 1;
            padding: 10px;
            border: 1px solid #ddd;
            border-radius: 5px;
            outline: none;
        }
        button {
            background: #0072ff;
            color: white;
            border: none;
            padding: 10px 15px;
            margin-left: 10px;
            cursor: pointer;
            border-radius: 5px;
        }
        button:hover {
            background: #005bb5;
        }
    </style>
</head>
<body>
    <div class="chat-container">
        <div class="header">FAQ Chatbot</div>
        <div class="chat-box" id="chat-box"></div>
        <div class="input-area">
            <input type="text" id="user-input" placeholder="Ask a question...">
            <button onclick="sendMessage()">Send</button>
        </div>
    </div>

    <script>
        function sendMessage() {
            var userInput = document.getElementById("user-input").value.trim();
            if (userInput === "") return;

            var chatBox = document.getElementById("chat-box");
            chatBox.innerHTML += `<div class='message user'>${userInput}</div>`;
            document.getElementById("user-input").value = "";

            fetch("/get_answer", {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ question: userInput })
            })
            .then(response => response.json())
            .then(data => {
                chatBox.innerHTML += `<div class='message bot'>${data.answer}</div>`;
                chatBox.scrollTop = chatBox.scrollHeight;
            });
        }
    </script>
</body>
</html>
"""

@app.route("/")
def home():
    return render_template_string(HTML_TEMPLATE)

@app.route("/get_answer", methods=["POST"])
def get_answer():
    data = request.get_json()
    user_query = data.get("question", "").strip()
    response = get_best_answer(user_query) if user_query else "Please enter a valid question."
    return jsonify({"answer": response})


port = 5000
public_url = ngrok.connect(port).public_url
print(f"FAQ Chatbot is live at: {public_url}")
app.run(port=port)

FAQ Chatbot is live at: https://8566-34-138-207-249.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [14/Mar/2025 07:21:05] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [14/Mar/2025 07:21:05] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [14/Mar/2025 07:21:27] "POST /get_answer HTTP/1.1" 200 -
