In [7]:
import os

# Working directories: "storage" receives the uploaded PDFs and
# "faiss_index" receives one FAISS index file per PDF.
for directory in ("storage", "faiss_index"):
    os.makedirs(directory, exist_ok=True)

# Create the summaries registry as an empty JSON object on first run only,
# so an existing registry is never overwritten.
if not os.path.exists("summaries.json"):
    with open("summaries.json", "w") as registry:
        registry.write("{}")


In [8]:
!pip install PyPDF2 sentence-transformers faiss-cpu transformers


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, faiss-cpu
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.12.0


In [9]:
import json
import os
import shutil
import uuid

import faiss
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by upload_pdf() and ask_question().
model = SentenceTransformer("all-MiniLM-L6-v2")

# Restore the persisted registry from summaries.json into memory.
with open("summaries.json", "r") as registry_file:
    summaries = json.loads(registry_file.read())

def upload_pdf(file_path, chunk_size=500):
    """Store a PDF, index its text with FAISS, and register a short summary.

    The text is extracted page by page (each page followed by a newline),
    split into fixed-size character chunks, embedded with the module-level
    ``model`` and written to ``faiss_index/<pdf_id>.index``. The chunks and
    the summary are recorded in the module-level ``summaries`` dict, which is
    then persisted to ``summaries.json``.

    Args:
        file_path: Path of the PDF to ingest. It is copied to
            ``storage/<pdf_id>.pdf``.
        chunk_size: Number of characters per chunk (default 500).

    Returns:
        A ``(pdf_id, summary)`` tuple, where ``pdf_id`` is a freshly generated
        UUID4 string and ``summary`` is the first 500 characters of the
        extracted text followed by ``"..."``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if the PDF yields
            nothing to index (for example, it has no pages).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Extract and validate BEFORE copying, so a failed upload does not leave
    # an orphaned file behind in storage/.
    reader = PdfReader(file_path)
    # `or ""` is defensive: should a page yield None instead of a string,
    # the concatenation must not raise.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    chunks = [text[start:start + chunk_size]
              for start in range(0, len(text), chunk_size)]
    if not chunks:
        # Without this guard the failure would surface later as an opaque
        # error when the embedding dimension is read from an empty result.
        raise ValueError(f"No text could be extracted from {file_path!r}")

    pdf_id = str(uuid.uuid4())
    # Create both output directories here so the function does not depend on
    # a separate setup cell having been run first.
    os.makedirs("storage", exist_ok=True)
    os.makedirs("faiss_index", exist_ok=True)
    shutil.copy(file_path, f"storage/{pdf_id}.pdf")

    # One embedding per chunk; the index dimension follows the model output.
    embeddings = model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, f"faiss_index/{pdf_id}.index")

    # Store summary (just the first 500 chars for now).
    summary = text[:500] + "..."
    summaries[pdf_id] = {"summary": summary, "chunks": chunks}

    # Write to a temporary file and swap it in, so an interrupted write
    # cannot truncate the registry that holds every uploaded PDF.
    tmp_path = "summaries.json.tmp"
    with open(tmp_path, "w") as f:
        json.dump(summaries, f)
    os.replace(tmp_path, "summaries.json")

    return pdf_id, summary

def ask_question(pdf_id, question, top_k=3):
    """Return the stored chunks of a PDF that are closest to a question.

    The question is embedded with the module-level ``model`` and searched
    against the FAISS index saved at ``faiss_index/<pdf_id>.index``. The
    matching chunks are looked up in ``summaries[pdf_id]["chunks"]``.

    Args:
        pdf_id: Identifier returned by ``upload_pdf``.
        question: Natural-language query text.
        top_k: Maximum number of chunks to return (default 3).

    Returns:
        The matching chunks joined by single spaces, in the order returned by
        the index search. Returns the string ``"PDF not found"`` when
        ``pdf_id`` is not in ``summaries``, and an empty string when there is
        nothing to search.
    """
    if pdf_id not in summaries:
        return "PDF not found"
    chunks = summaries[pdf_id]["chunks"]
    index = faiss.read_index(f"faiss_index/{pdf_id}.index")

    # Never request more neighbours than exist: FAISS pads missing results
    # with the id -1, and chunks[-1] would silently repeat the last chunk.
    k = min(top_k, index.ntotal, len(chunks))
    if k <= 0:
        return ""

    query_emb = model.encode([question])
    _, neighbour_ids = index.search(query_emb, k=k)

    # Keep only ids that point at a real chunk (drops any -1 padding).
    answer_chunks = [
        chunks[int(i)] for i in neighbour_ids[0] if 0 <= int(i) < len(chunks)
    ]
    return " ".join(answer_chunks)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
# Ingest a PDF (local path or a file on the mounted Google Drive)
source_pdf = "/content/drive/MyDrive/PDFs/most_significant_papers.pdf"
pdf_id, summary = upload_pdf(source_pdf)
print("Summary:", summary)

# Query the freshly indexed document
question = "What is the subject of research"
answer = ask_question(pdf_id, question)
print("Answer:", answer)


Summary: Sustainable Energy Technologies and Assessments 57 (2023) 103291
2213-1388/© 2023 Elsevier Ltd. All rights reserved.Geometric triangulation uncertainty for sustainable urban underwater 
localization in presence of malicious virtual node 
Prateeka, Rajeev Aryaa,*, Ajit K. Vermab 
aWireless Sensor Networks Lab, Department of Electronics and Communication Engineering, National Institute of Technology Patna, Patna, Bihar, India 
bFaculty of Engineering and Natural Sciences, Western Norway University...
Answer: Techniques.
Rajeev Arya received the Engi-
neering Degree in Electronics &
Communication Engineeringfrom Government EngineeringCollege, Ujjain, (RGPV
University, Bhopal) India in
2008, and the Master of Tech-nology in Electronics & Com-
munication Engineering from
Indian Institute of Technology(ISM), Dhanbad, India in 2012.He received the Ph.D. degree in
Communication Engineering
from Indian Institute of Tech-nology (IIT Roorkee), Roorkee,
India in 2016. He has received Mini