In [15]:
# Kaggle already has scikit-learn preinstalled, so normally no install is needed
!pip install -q scikit-learn


In [16]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [19]:
# Toy in-memory corpus: one sentence per document. retrieve_docs() returns
# entries of this list by position, so its order must stay aligned with the
# rows of doc_vectors built from it.
documents = [
    "Artificial Intelligence is the simulation of human intelligence by machines.",
    "Machine learning is a subset of AI focused on learning from data.",
    "Transformers are deep learning models used for NLP tasks.",
    "Retrieval Augmented Generation combines information retrieval with text generation.",
    "Vector databases store embeddings for fast similarity search."
]


In [20]:
# Fit a TF-IDF model on the corpus. The fitted vectorizer is reused later to
# embed questions; doc_vectors holds the vectorized form of `documents`.
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents)


In [21]:
def retrieve_docs(question, top_k=2):
    """
    Return the ``top_k`` documents most similar to ``question``.

    The question is vectorized with the module-level ``vectorizer`` and
    compared by cosine similarity against ``doc_vectors``.

    Args:
        question: Query text.
        top_k: Maximum number of documents to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of at most ``top_k`` strings taken from ``documents``, ordered
        from most to least similar.
    """
    # Guard first: with top_k == 0 the slice [-0:] is [0:], which would select
    # every document instead of none; negative values would slice arbitrarily.
    if top_k <= 0:
        return []
    q_vec = vectorizer.transform([question])
    similarities = cosine_similarity(q_vec, doc_vectors).flatten()
    # argsort is ascending, so the last top_k indices are the best matches;
    # reverse them to put the highest score first.
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [documents[i] for i in top_indices]


In [22]:
def answer_question(question, top_k=2):
    """
    Build a templated answer from the documents retrieved for ``question``.

    The documents returned by ``retrieve_docs(question, top_k)`` are joined
    with single spaces and embedded, together with the question itself, in a
    fixed sentence. No language model is involved.
    """
    context = " ".join(retrieve_docs(question, top_k))
    return f"Based on the context: {context} The answer to your question '{question}' is likely contained above."


In [23]:
# Smoke test: run one question through retrieval + templating and print the result.
question = "What is Retrieval Augmented Generation?"
answer = answer_question(question)
print(answer)


Based on the context: Retrieval Augmented Generation combines information retrieval with text generation. Machine learning is a subset of AI focused on learning from data. The answer to your question 'What is Retrieval Augmented Generation?' is likely contained above.


In [24]:
# -----------------------------
# STEP 0: Imports
# -----------------------------
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# STEP 1: Prepare documents
# -----------------------------
# Redefines `documents` from the earlier cell: the same five sentences plus
# three more (FAISS, LangChain, RAG). retrieve_docs() returns entries of this
# list by position.
documents = [
    "Artificial Intelligence is the simulation of human intelligence by machines.",
    "Machine learning is a subset of AI focused on learning from data.",
    "Transformers are deep learning models used for NLP tasks.",
    "Retrieval Augmented Generation combines information retrieval with text generation.",
    "Vector databases store embeddings for fast similarity search.",
    "FAISS is a library for efficient similarity search of dense vectors.",
    "LangChain is a framework to build applications with LLMs and retrieval.",
    "RAG systems improve LLM factual accuracy by providing context."
]

# -----------------------------
# STEP 2: TF-IDF Vectorization
# -----------------------------
# Refit on the extended corpus so that the vectorizer and doc_vectors
# correspond to the `documents` list defined just above.
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents)

# -----------------------------
# STEP 3: Retrieval Function
# -----------------------------
def retrieve_docs(question, top_k=2):
    """
    Retrieve top_k most relevant documents based on cosine similarity

    The question is vectorized with the module-level ``vectorizer`` and
    compared against ``doc_vectors``.

    Args:
        question: Query text.
        top_k: Maximum number of documents to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of at most ``top_k`` strings taken from ``documents``, ordered
        from most to least similar.
    """
    # Guard first: with top_k == 0 the slice [-0:] is [0:], which would select
    # every document instead of none; negative values would slice arbitrarily.
    if top_k <= 0:
        return []
    q_vec = vectorizer.transform([question])
    similarities = cosine_similarity(q_vec, doc_vectors).flatten()
    # argsort is ascending, so the last top_k indices are the best matches;
    # reverse them to put the highest score first.
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [documents[i] for i in top_indices]

# -----------------------------
# STEP 4: Simulated Answer Generation
# -----------------------------
def answer_question(question, top_k=2):
    """
    Produce a templated answer from the retrieved context.

    The documents returned by ``retrieve_docs(question, top_k)`` are joined
    with single spaces and placed, together with the question itself, into a
    fixed sentence. No language model is involved.
    """
    context = " ".join(retrieve_docs(question, top_k))
    return f"Based on the context: {context} The answer to your question '{question}' is likely contained above."

# -----------------------------
# STEP 5: Test the system
# -----------------------------
# Sample queries used to eyeball retrieval quality by hand.
test_questions = [
    "What is Retrieval Augmented Generation?",
    "Explain the purpose of a vector database.",
    "What is the role of machine learning in AI?"
]

# For each query, print the question, its templated answer, and a separator line.
for q in test_questions:
    print(f"Question: {q}")
    print(f"Answer: {answer_question(q)}")
    print("---------------------------------------------------")


Question: What is Retrieval Augmented Generation?
Answer: Based on the context: Retrieval Augmented Generation combines information retrieval with text generation. LangChain is a framework to build applications with LLMs and retrieval. The answer to your question 'What is Retrieval Augmented Generation?' is likely contained above.
---------------------------------------------------
Question: Explain the purpose of a vector database.
Answer: Based on the context: Artificial Intelligence is the simulation of human intelligence by machines. Vector databases store embeddings for fast similarity search. The answer to your question 'Explain the purpose of a vector database.' is likely contained above.
---------------------------------------------------
Question: What is the role of machine learning in AI?
Answer: Based on the context: Machine learning is a subset of AI focused on learning from data. Artificial Intelligence is the simulation of human intelligence by machines. The answer to yo