In [13]:
# Colab-only step: ask the user for a file through Colab's `files.upload()`.
# `uploaded` keeps whatever that call returns; a later cell takes its first
# key and treats it as the path of the text file to load.
from google.colab import files
uploaded = files.upload()

Saving sample.txt to sample (1).txt


In [14]:
import utils
import pickle, os
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np


In [15]:
# Pick the uploaded file, read it, and split it into chunks for embedding.
# Fail early with a readable message when the upload was cancelled or empty,
# instead of surfacing an IndexError from indexing an empty key list.
if not uploaded:
    raise ValueError(
        "No file was uploaded; re-run the upload cell and choose a text file."
    )

# Only the first uploaded entry is used; any additional files are ignored.
# NOTE(review): the recorded upload output shows Colab saving the file under a
# different name ("sample (1).txt") — confirm this key matches the file on disk.
text_path = next(iter(uploaded))
text = utils.load_text(text_path)
# chunk_size is forwarded to utils.chunk_text; its unit (words vs. characters)
# is defined by that helper — TODO confirm.
chunks = utils.chunk_text(text, chunk_size=100)
print(f"Loaded and chunked {len(chunks)} chunks.")


Loaded and chunked 1 chunks.


In [16]:
# Build the sentence encoder once and embed every chunk in a single call.
# `model` is reused later to embed incoming queries, so queries and chunks
# are encoded by the same model.
EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L6-v2"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = model.encode(chunks)

print(f"Generated {len(embeddings)} embeddings.")


Generated 1 embeddings.


In [17]:
# Persist the chunks and their embeddings as two pickle files under the
# vector-store directory, creating the directory first if it is missing.
os.makedirs("/content/vector_store", exist_ok=True)

for target_path, payload in (
    ("/content/vector_store/chunks.pkl", chunks),
    ("/content/vector_store/embeddings.pkl", embeddings),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

print("Embeddings saved to /content/vector_store/")


Embeddings saved to /content/vector_store/


In [18]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, "rb") as source:
        return pickle.load(source)


# Reload the stored chunks and embeddings from the vector-store directory.
# pickle.load can execute arbitrary code, so only read files this notebook wrote.
chunks = _read_pickle("/content/vector_store/chunks.pkl")
embeddings = _read_pickle("/content/vector_store/embeddings.pkl")


In [19]:
# Extractive question-answering pipeline used to pull an answer span out of
# the retrieved context.
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)


Device set to use cpu


In [20]:
def semantic_search(query, top_k=3):
    """Return the stored chunks that score highest against `query`.

    The query is encoded with the notebook-level `model` and compared with the
    notebook-level `embeddings` through `utils.cosine_similarity_manual`; the
    first row of that result is used as the per-chunk score.
    (Assumes the helper returns one row of scores per query — TODO confirm.)

    Args:
        query: The question text to search for.
        top_k: Maximum number of results to return.

    Returns:
        A list of `(chunk, score)` tuples ordered from highest to lowest
        score, containing at most `top_k` entries.
    """
    query_vector = model.encode([query])
    similarity_row = utils.cosine_similarity_manual(query_vector, embeddings)[0]

    # argsort is ascending, so reverse it to rank the best matches first.
    ranked_positions = similarity_row.argsort()[::-1]

    hits = []
    for position in ranked_positions[:top_k]:
        hits.append((chunks[position], similarity_row[position]))
    return hits


In [21]:
def answer_query(query, top_k=3):
    """Answer `query` from the best-matching stored chunks.

    Retrieves chunks with `semantic_search`, joins their text with newlines
    into one context string, and passes question and context to the
    notebook-level `qa_pipeline`.

    Args:
        query: The question text.
        top_k: Number of chunks to retrieve for the context. Defaults to 3,
            which is also `semantic_search`'s own default.

    Returns:
        The value stored under the "answer" key of the pipeline result.
    """
    top_chunks = semantic_search(query, top_k=top_k)
    context = "\n".join(chunk for chunk, _ in top_chunks)
    # Use the documented `question=` / `context=` keywords rather than a
    # positional dict.
    # NOTE(review): newer transformers releases appear to warn on positional
    # inputs — confirm against the installed version.
    result = qa_pipeline(question=query, context=context)
    return result["answer"]


In [22]:
# Read one question from the user, run retrieval + QA, and show the answer.
query = input("Ask a question: ")
answer = answer_query(query)
print("Answer:", answer)


Ask a question: what is artificial intelligence




Answer: the simulation of human intelligence processes by machines
