In [1]:
import pandas as pd

# Read the FAQ table from disk (the sample output shows Category, Question
# and Answer columns)
df = pd.read_csv("faq_dataset.csv")

# Flatten the frame into one plain dict per row so entries can be looked up
# by position later on
faq_dataset = df.to_dict("records")

In [5]:
# Peek at the first five FAQ records to confirm the CSV parsed as expected
print(faq_dataset[0:5])


[{'Category': 'Services', 'Question': 'What services does your software house offer?', 'Answer': 'We offer custom software development, mobile and web app development, cloud solutions, UI/UX design, and IT consulting.'}, {'Category': 'Services', 'Question': 'Do you provide mobile app development?', 'Answer': 'Yes, we develop native and cross-platform mobile apps using technologies like Flutter and React Native.'}, {'Category': 'Services', 'Question': 'Can you handle enterprise-level software development?', 'Answer': 'Yes, we specialize in building scalable and secure enterprise-grade software solutions.'}, {'Category': 'Services', 'Question': 'Do you offer e-commerce development services?', 'Answer': 'Yes, we create custom e-commerce platforms and integrate existing solutions like Shopify and Magento.'}, {'Category': 'Services', 'Question': 'Do you provide web application development?', 'Answer': 'Yes, we develop responsive and robust web applications tailored to your business needs.'}

In [2]:
!pip install transformers sentence-transformers




In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model shared by every later cell that calls model.encode;
# the first run fetches the weights from the Hugging Face Hub (see output below)
model = SentenceTransformer("all-MiniLM-L6-v2")


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Collect the question text of every FAQ record, keeping dataset order
faq_questions = [entry["Question"] for entry in faq_dataset]

# Embed all questions once; position i here corresponds to faq_dataset[i]
faq_embeddings = model.encode(faq_questions)

In [7]:
# Example question, phrased differently from any FAQ entry, used to
# sanity-check the semantic matching in the next cell
user_query = "Can you develop apps for mobile phones?"

# Embed the query with the same model that embedded the FAQ questions so the
# two sets of vectors are comparable
query_embedding = model.encode(user_query)


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Score the query against every FAQ question; wrapping the query in a list
# yields a single row of similarities
similarities = cosine_similarity([query_embedding], faq_embeddings)

# Position of the highest score, which is also the index of the FAQ entry
most_similar_idx = similarities.argmax()

# Look up that entry and show its answer
best_match = faq_dataset[most_similar_idx]
print("Answer:", best_match["Answer"])


Answer: Yes, we develop native and cross-platform mobile apps using technologies like Flutter and React Native.


In [14]:
# Function to process user queries
def get_answer(user_query, threshold=0.6):
    """Return the FAQ entry whose question is most similar to ``user_query``.

    Relies on names defined in earlier cells: ``model`` (the sentence
    encoder), ``cosine_similarity``, ``faq_dataset`` (list of FAQ dicts) and
    ``faq_embeddings`` (one embedding per entry of ``faq_dataset``, in the
    same order).

    Args:
        user_query: Free-text question typed by the user.
        threshold: Minimum cosine similarity required to accept the best
            match. Defaults to 0.6, the value that used to be hard-coded
            here; adjust it based on testing with real queries.

    Returns:
        ``(question, answer)`` of the best-matching FAQ entry, or
        ``(None, fallback_message)`` when the query is blank or no FAQ
        question scores at least ``threshold``.
    """
    not_found = (None, "I'm sorry, I couldn't find an answer to your question.")

    # A blank query carries no meaning; skip the encoder and say so directly
    if not user_query or not user_query.strip():
        return not_found

    # Generate embedding for the user's query
    query_embedding = model.encode(user_query)

    # Single row of scores: similarity of the query to every FAQ question
    similarities = cosine_similarity([query_embedding], faq_embeddings)[0]

    # Plain int so the index works with any sequence type
    most_similar_idx = int(np.argmax(similarities))
    if similarities[most_similar_idx] < threshold:
        return not_found

    # Retrieve the best match and its answer
    best_match = faq_dataset[most_similar_idx]
    return best_match["Question"], best_match["Answer"]

# Interactive loop: keeps prompting until the user types 'exit'
print("Welcome to the FAQ Chatbot! Type your question or 'exit' to quit.")
while True:
    try:
        # Strip so that inputs such as "exit " or " Exit" still end the session
        user_query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat without a traceback
        print("\nGoodbye!")
        break

    if user_query.lower() == "exit":  # Exit condition
        print("Goodbye!")
        break
    if not user_query:
        # Nothing was typed; prompt again instead of querying the matcher
        continue

    # Only the answer text is shown; the matched question is not needed here
    answer = get_answer(user_query)[1]
    print(f"Chatbot: {answer}")

Welcome to the FAQ Chatbot! Type your question or 'exit' to quit.
You: what is your minimum budget?
Chatbot: Our minimum project budget typically starts at $5,000, depending on the scope.
You: what technologies you use?
Chatbot: We specialize in a wide range of technologies including Java, Python, JavaScript, React, Angular, Node.js, and many others, depending on the project needs.
You: Can you make my website on short call?
Chatbot: I'm sorry, I couldn't find an answer to your question.
You: exit
Goodbye!


In [19]:

# Relies on `faq_dataset` and its row-aligned `faq_embeddings`, both built in
# an earlier cell, so FAQ questions are not re-encoded on every query.

# Unified function to get answers with optional category filtering
def get_answer(user_query, user_category=None, threshold=0.6):
    """Return the best-matching FAQ entry, optionally within one category.

    Relies on names defined in earlier cells: ``model`` (the sentence
    encoder), ``cosine_similarity``, ``faq_dataset`` (list of FAQ dicts) and
    ``faq_embeddings`` (one embedding per entry of ``faq_dataset``, in the
    same order).

    Args:
        user_query: Free-text question typed by the user.
        user_category: Optional category name. Matching ignores case and
            surrounding whitespace; ``None`` or a blank string searches
            every FAQ entry.
        threshold: Minimum cosine similarity required to accept the best
            match. Defaults to 0.6, the value that used to be hard-coded.

    Returns:
        ``(question, answer)`` of the best-matching FAQ entry, or
        ``(None, message)`` when the category has no entries, the query is
        blank, or no candidate scores at least ``threshold``.
    """
    not_found = (None, "I'm sorry, I couldn't find an answer to your question.")

    # Positions (into faq_dataset / faq_embeddings) of the candidate entries
    if user_category and user_category.strip():
        wanted = user_category.strip().lower()
        # str() keeps a missing (NaN) category cell from raising on .lower()
        candidate_idx = [
            i for i, faq in enumerate(faq_dataset)
            if str(faq["Category"]).strip().lower() == wanted
        ]
    else:
        candidate_idx = list(range(len(faq_dataset)))

    # Handle empty filtered datasets
    if not candidate_idx:
        return None, "I'm sorry, no questions match your selected category."

    # A blank query carries no meaning; skip the encoder and say so directly
    if not user_query or not user_query.strip():
        return not_found

    # Reuse the precomputed embeddings; only the query itself is encoded
    candidate_embeddings = np.asarray(faq_embeddings)[candidate_idx]
    query_embedding = model.encode(user_query)

    # Single row of scores: similarity of the query to each candidate
    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]
    best_pos = int(np.argmax(similarities))

    # Threshold for similarity
    if similarities[best_pos] < threshold:
        return not_found

    # Map the position among candidates back to the full dataset
    best_match = faq_dataset[candidate_idx[best_pos]]
    return best_match["Question"], best_match["Answer"]

# Interactive chatbot loop: asks for a question, then an optional category
print("Welcome to the FAQ Chatbot! Type your question or 'exit' to quit.")
while True:
    try:
        # Strip so that inputs such as "exit " or " Exit" still end the session
        user_query = input("You: ").strip()
        if user_query.lower() == "exit":
            print("Goodbye!")
            break
        if not user_query:
            # Nothing was typed; prompt again without asking for a category
            continue

        # Ask if the user wants to specify a category
        category = input("Do you want to specify a category (e.g., 'Services', 'Pricing')? Type 'no' to skip: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat without a traceback
        print("\nGoodbye!")
        break

    # 'no' (any case) or an empty reply means: search every category
    if category.lower() in ("no", ""):
        category = None

    # Only the answer text is shown; the matched question is not needed here
    answer = get_answer(user_query, user_category=category)[1]
    print(f"Chatbot: {answer}")


Welcome to the FAQ Chatbot! Type your question or 'exit' to quit.
You: I need an E-commerce website
Do you want to specify a category (e.g., 'Services', 'Pricing')? Type 'no' to skip: no
Chatbot: I'm sorry, I couldn't find an answer to your question.
You: exit
Goodbye!
