In [1]:
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np


**Import Question Bank**

In [2]:
with open("da_questions.json", "r", encoding="utf-8") as f:
    QA_KB = json.load(f)

In [3]:
docs = [(item["q"] + " " + item["a"]) for item in QA_KB]
questions = docs

In [4]:
docs

['What is the difference between a data analyst and a data scientist? A data analyst focuses on querying, cleaning, and visualizing data to generate insights, while a data scientist builds predictive models and uses machine learning to forecast outcomes.',
 'What is SQL used for in data analytics? SQL is used to query, filter, aggregate, join, and manipulate structured data stored in relational databases.',
 'What are the different types of joins in SQL? The main SQL joins are INNER JOIN, LEFT JOIN, RIGHT JOIN, FULL JOIN, and CROSS JOIN.',
 'What is normalization in databases? Normalization is the process of organizing data to reduce redundancy and improve data integrity.',
 'What is ETL? ETL stands for Extract, Transform, Load. It is the process of extracting data from sources, transforming it into usable format, and loading it into a data warehouse.',
 'What is the difference between WHERE and HAVING in SQL? WHERE filters rows before aggregation, while HAVING filters results after ag

**Clean text**

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

**TF-IDF**

In [6]:
q = [clean_text(item["q"]) for item in QA_KB]
df = pd.DataFrame(q,columns=["questions"])
df.head()

Unnamed: 0,questions
0,what is the difference between a data analyst ...
1,what is sql used for in data analytics
2,what are the different types of joins in sql
3,what is normalization in databases
4,what is etl


In [7]:
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["questions"])
feature_names = vectorizer.get_feature_names_out()
results = []

for i in range(X.shape[0]):
    row = X[i].toarray().flatten()
    word_scores = {
        feature_names[j]: round(row[j], 3)
        for j in range(len(row)) if row[j] > 0
    }
    results.append(word_scores)

df["TFIDF"] = results

df.head()

Unnamed: 0,questions,TFIDF
0,what is the difference between a data analyst ...,"{'analyst': 0.475, 'data': 0.632, 'difference'..."
1,what is sql used for in data analytics,"{'analytics': 0.601, 'data': 0.4, 'sql': 0.49,..."
2,what are the different types of joins in sql,"{'different': 0.522, 'joins': 0.522, 'sql': 0...."
3,what is normalization in databases,"{'databases': 0.707, 'normalization': 0.707}"
4,what is etl,{'etl': 1.0}


In [8]:
#vectorizer = TfidfVectorizer(stop_words="english",norm=None)
#X = vectorizer.fit_transform(df["questions"])
def dotproduct(query):
    query_vec = vectorizer.transform([query])
    query_array = query_vec.toarray()[0]

    dot_products = np.dot(X, query_vec.T).toarray().flatten()
    doc_magnitudes = np.linalg.norm(X.toarray(), axis=1)
    query_magnitude = np.linalg.norm(query_vec.toarray())
    cosine_scores = dot_products / (doc_magnitudes * query_magnitude + 1e-10)
    out = pd.DataFrame({
        "Query": query,
        "questions": df["questions"].values,
        "Dot_Product": dot_products,
        "Doc Magnitude": doc_magnitudes,
        "Query_Magnitude": np.full(len(df), query_magnitude),
        "Cosine_Similarity": cosine_scores
    })

    return out

**Vectorization**

In [10]:
vectorizer = TfidfVectorizer(preprocessor=clean_text, ngram_range=(1,2))
question_vectors = vectorizer.fit_transform(questions)

**Retrieval Funciton**

In [11]:
def get_answer(user_query, threshold=0.05):
    query_vec = vectorizer.transform([user_query])
    similarities = cosine_similarity(query_vec, question_vectors).flatten()

    best_index = similarities.argmax()
    best_score = similarities[best_index]

    if best_score < threshold:
        return "I don't have an answer for that yet. Try asking again"

    return QA_KB[best_index]["a"]

In [14]:
def chat():
    print(" Interview FAQ Chatbot (type 'quit' to exit)")
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() in ["quit", "exit"]:
            print("Bot: Goodbye!")
            break
        response = get_answer(user_input)
        print("Bot:", response)
        dotproduct(response)

**Chat Box**

In [15]:
if __name__ == "__main__":
    chat()

 Interview FAQ Chatbot (type 'quit' to exit)



You:  sql


Bot: SQL is used to query, filter, aggregate, join, and manipulate structured data stored in relational databases.


ValueError: matmul: dimension mismatch with signature (n,k=53),(k=761,m)->(n,m)