In [1]:
!pip install llama-index==0.9.44 sentence-transformers faiss-cpu pdfplumber pytesseract Pillow streamlit langchain tqdm pyngrok
!sudo apt-get install -y tesseract-ocr


Collecting llama-index==0.9.44
  Downloading llama_index-0.9.44-py3-none-any.whl.metadata (8.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m638.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting dataclasses-json (from llama-index==0.9.44)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting deprecated>=1.2.9.3 (from llama-index==0.9.44)
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llam

In [2]:
# Ask the user to upload the source PDF through Colab's file-upload helper.
# `uploaded` keeps whatever files.upload() returns so later cells can refer to it.
from google.colab import files
uploaded = files.upload()


Saving qatar_test_doc.pdf to qatar_test_doc (1).pdf


In [3]:
import os

# Place the uploaded PDF at data/qatar_test_doc.pdf.
#
# Colab does not overwrite an existing file on upload: a repeat upload is saved
# under a suffixed name such as "qatar_test_doc (1).pdf". Renaming the
# un-suffixed file would therefore pick up an older copy (or fail outright when
# this cell is re-run). Writing the bytes held in `uploaded` avoids both problems.
os.makedirs("data", exist_ok=True)
pdf_name = "qatar_test_doc.pdf"
pdf_target = os.path.join("data", pdf_name)

if pdf_name in uploaded:
    # files.upload() returns {original file name: file content as bytes}.
    with open(pdf_target, "wb") as pdf_file:
        pdf_file.write(uploaded[pdf_name])
elif os.path.exists(pdf_name):
    # Fallback: the file was placed in the working directory some other way.
    os.replace(pdf_name, pdf_target)
elif not os.path.exists(pdf_target):
    raise FileNotFoundError(
        f"{pdf_name} was not uploaded and is not present in the working directory"
    )
print("File moved to data/qatar_test_doc.pdf")


File moved to data/qatar_test_doc.pdf


In [6]:
import pdfplumber
from pathlib import Path
import pytesseract
from PIL import Image
from tqdm import tqdm
import faiss
import pickle
import os

from sentence_transformers import SentenceTransformer
from llama_index import Document
from llama_index.node_parser import SimpleNodeParser

In [9]:
# Input PDF and the directory that will hold the persisted vector index.
DATA_PATH = Path("data", "qatar_test_doc.pdf")
INDEX_DIR = Path("index_store")
# Create the index directory up front; a no-op if it already exists.
INDEX_DIR.mkdir(parents=False, exist_ok=True)

def extract_pages(pdf_path):
    """Read every page of a PDF and return its text, using OCR when needed.

    Args:
        pdf_path: Location of the PDF; converted with ``str()`` before being
            handed to ``pdfplumber.open``.

    Returns:
        A list with one ``{"page": <1-based page number>, "text": <str>}``
        dict per page, in document order.
    """

    def _page_text(page):
        # Prefer the embedded text layer; extract_text() may yield None.
        extracted = page.extract_text() or ""
        if extracted.strip():
            return extracted
        # Nothing but whitespace: treat the page as scanned, render it at
        # 200 dpi and run Tesseract over the resulting image.
        rendered = page.to_image(resolution=200).original
        return pytesseract.image_to_string(rendered)

    with pdfplumber.open(str(pdf_path)) as pdf:
        return [
            {"page": number, "text": _page_text(page)}
            for number, page in enumerate(pdf.pages, start=1)
        ]

pages = extract_pages(DATA_PATH)
print("Pages extracted:", len(pages))

# Wrap each page in a llama-index Document, carrying the page number and the
# source file name along as metadata.
docs = [
    Document(
        text=page["text"],
        extra_info={"page": page["page"], "source": DATA_PATH.name},
    )
    for page in pages
]



Pages extracted: 78


In [12]:
# Split the page-level Documents into nodes using the parser's default settings.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(docs)

print(f"Nodes after chunking: {len(nodes)}")

Nodes after chunking: 90


In [13]:
# Sentence-embedding model used for the chunks here and for queries later on.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the text of every node in a single call; one vector per node.
node_texts = [node.get_text() for node in nodes]
vectors = model.encode(node_texts)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index saved!


In [None]:
# Create a flat L2 FAISS index whose dimensionality matches the embeddings,
# then add every node vector to it.
dim = vectors.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(vectors)

# Per-vector payload, kept in the same order as the vectors so that a FAISS
# row id can be used directly as a position in this list.
meta = [{"text": node.get_text(), "meta": node.extra_info} for node in nodes]

# Persist the index and its payload together in a single pickle file.
bundle = {"faiss": index, "meta": meta}
with open(INDEX_DIR / "faiss_index.pkl", "wb") as index_file:
    pickle.dump(bundle, index_file)

print("FAISS index saved!")

In [14]:
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Reload the bundle written by the indexing step.
# NOTE: pickle.load executes arbitrary code from the file; only load an index
# file that this notebook produced itself.
with open("index_store/faiss_index.pkl", "rb") as index_file:
    data = pickle.load(index_file)

faiss_index, meta = data["faiss"], data["meta"]

# Queries must be embedded with the same model that embedded the chunks.
model = SentenceTransformer("all-MiniLM-L6-v2")

def ask(query, top_k=4):
    """Return the stored chunks closest to ``query`` in embedding space.

    Args:
        query: Question text; embedded with the module-level ``model``.
        top_k: Maximum number of neighbours to request from ``faiss_index``.

    Returns:
        A list of entries from ``meta`` in the order FAISS ranked them. The
        list is shorter than ``top_k`` when the index holds fewer vectors.
    """
    query_vec = model.encode([query])
    _distances, neighbor_ids = faiss_index.search(query_vec, top_k)
    # FAISS pads the label row with -1 when it cannot find top_k neighbours.
    # Indexing meta[-1] would silently return the last chunk as a bogus hit,
    # so negative labels are skipped.
    return [meta[int(label)] for label in neighbor_ids[0] if label >= 0]

# Run one sample question and show the page number and a 600-character
# preview of each retrieved chunk.
query = "What is IMF's assessment of Qatar’s fiscal outlook?"
answers = ask(query)

for rank, hit in enumerate(answers, start=1):
    print("----- RESULT", rank, "-----")
    print("Page:", hit["meta"]["page"])
    print(hit["text"][:600], "...\n")


----- RESULT 1 -----
Page: 39
1 24.2 24.5 25.4 25.0 24.4 24.3 24.5
Memorandum items
Local currency per U.S. dollar (average) 6/ 3.6 3.6 3.6 3.6 3.6 3.6 … … … …
Real effective exchange rate (percent change) 5/ -3.2 -2.6 6.5 0.2 -0.5 … … … … …
Credit rating (Moody's investor services) 6/ Aa3 Aa3 Aa3 Aa3 Aa2 Aa2 … … … …
Sources: Qatari authorities, and IMF staff.
1/ Crude oil, natural gas, propane, butane, and condensates.
2/ Central government fiscal balance excluding investment income and corporate income tax from hydrocarbon activities.
3/ Credit to the central government, and government and semi-government institutions, a ...

----- RESULT 2 -----
Page: 63
QATAR
Figure IV. 9. Qatar: Realism of Baseline Assumptions
Forecast Track Record 1/ t+1 t+3 t+5 Comparator Group:
Public debt to GDP Emerging Markets, Commodity Exporter,
Primary deficit Surveillance
r - g Color Code:
Exchange rate depreciation █> 75th percentile
Optimistic
SFA █50-75th percentile
real-time t+3 t+5 █25-50th percenti

In [15]:
# Run several representative questions and print a 300-character preview of
# every retrieved chunk together with its page number.
test_queries = [
    "What is the IMF's view on Qatar's fiscal outlook?",
    "What risks does the IMF mention about the banking sector?",
    "How will LNG expansion impact Qatar's economy?",
    "What structural reforms are recommended under NDS3?"
]

for question in test_queries:
    print(
        "\n======== QUESTION ========",
        question,
        "==========================",
        sep="\n",
    )
    answers = ask(question)
    for hit in answers:
        print("-- Page", hit["meta"]["page"])
        print(hit["text"][:300], "...\n")



What is the IMF's view on Qatar's fiscal outlook?
-- Page 39
1 24.2 24.5 25.4 25.0 24.4 24.3 24.5
Memorandum items
Local currency per U.S. dollar (average) 6/ 3.6 3.6 3.6 3.6 3.6 3.6 … … … …
Real effective exchange rate (percent change) 5/ -3.2 -2.6 6.5 0.2 -0.5 … … … … …
Credit rating (Moody's investor services) 6/ Aa3 Aa3 Aa3 Aa3 Aa2 Aa2 … … … …
Sources: Qa ...

-- Page 63
QATAR
Figure IV. 9. Qatar: Realism of Baseline Assumptions
Forecast Track Record 1/ t+1 t+3 t+5 Comparator Group:
Public debt to GDP Emerging Markets, Commodity Exporter,
Primary deficit Surveillance
r - g Color Code:
Exchange rate depreciation █> 75th percentile
Optimistic
SFA █50-75th percentile
r ...

-- Page 41
QATAR
Table 3a. Qatar: Summary of Central Government Finance, 2020–29
(Billions of Qatari Riyals unless otherwise noted)
Projections
2020 2021 2022 2023 2024 2025 2026 2027 2028 2029
Revenue 171.2 193.7 297.8 254.4 213.5 238.2 266.4 275.8 281.6 293.2
Oil 32.3 55.2 57.5 39.0 33.5 30.3 29.6 29.6 30.2  ...