In [1]:
import os
from pathlib import Path
import sys
import pandas as pd

# Report the interpreter and working directory up front so that path
# problems are easy to diagnose from the cell output.
print("Python:", sys.version)
print("CWD:", os.getcwd())

# Absolute form of the directory the kernel was started in
cwd = Path().resolve()
print("Resolved cwd:", cwd)

# The chunks CSV lives at the same relative location whether the notebook is
# run from the project root or from notebooks/: look under the current
# directory first, otherwise fall back to the parent directory.
chunks_relpath = Path("data", "processed", "tables-charts_chunks.csv")
local_candidate = cwd / chunks_relpath
chunks_path = local_candidate if local_candidate.exists() else cwd.parent / chunks_relpath

print("Chunks path:", chunks_path)
print("Exists:", chunks_path.exists())

# Load the chunk table; the last expression previews the first five rows.
df_chunks = pd.read_csv(chunks_path)
print("Loaded chunks:", df_chunks.shape)
df_chunks.head(5)


Python: 3.12.4 (tags/v3.12.4:8e8a4ba, Jun  6 2024, 19:30:16) [MSC v.1940 64 bit (AMD64)]
CWD: c:\Users\SEC\OneDrive\Desktop\docinsight\notebooks
Resolved cwd: C:\Users\SEC\OneDrive\Desktop\docinsight\notebooks
Chunks path: C:\Users\SEC\OneDrive\Desktop\docinsight\notebooks\data\processed\tables-charts_chunks.csv
Exists: True
Loaded chunks: (65, 6)


Unnamed: 0,chunk_id,page_number,start_block_index,end_block_index,text,char_len
0,0,0,1,18,"Tables, Charts and Graphs\nUNIT 5\nTABLES, CHA...",594
1,1,0,19,22,"5.1\nINTRODUCTION\nWe have, in the various uni...",434
2,2,0,23,23,"a)\nWhen you decide to use tables, charts and ...",610
3,3,0,24,0,b)\nLet us now try and understand the function...,399
4,4,1,1,3,Writing Skills\n\nThese devices enable you to...,246


In [2]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint used for both chunks and queries
# (chosen for being small and fast while still good enough).
model_name = "all-MiniLM-L6-v2"

# Instantiate the encoder; on first use this fetches the model files.
embed_model = SentenceTransformer(model_name)
print("Loaded model:", model_name)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded model: all-MiniLM-L6-v2


In [3]:
# Pull the chunk texts out of the table as a plain list of strings.
text_column = df_chunks["text"].astype(str)
texts = text_column.tolist()
print("Number of chunks:", len(texts))

# Encoding options, gathered in one place.
encode_options = {
    "batch_size": 32,
    "show_progress_bar": True,
    "convert_to_numpy": True,
    # Normalized vectors help similarity search downstream.
    "normalize_embeddings": True,
}
chunk_embeddings = embed_model.encode(texts, **encode_options)

# Cell output: shape of the embedding matrix
chunk_embeddings.shape


Number of chunks: 65


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

(65, 384)

In [4]:
import faiss
import numpy as np

# Width of each embedding vector (second axis of the embedding matrix)
embedding_dim = chunk_embeddings.shape[1]
print("Embedding dim:", embedding_dim)

# Flat inner-product index: the embeddings were normalized when they were
# encoded, so inner product is the similarity measure used here.
index = faiss.IndexFlatIP(embedding_dim)

# Cast the chunk vectors to float32 and add all of them to the index.
chunk_vectors_f32 = chunk_embeddings.astype(np.float32)
index.add(chunk_vectors_f32)

print("Index size (ntotal):", index.ntotal)


Embedding dim: 384
Index size (ntotal): 65


In [5]:
def retrieve_similar_chunks(query: str, top_k: int = 5):
    """
    Given a natural language query, return the top_k most similar chunks.

    Relies on the notebook-level ``embed_model`` (query encoder), ``index``
    (FAISS index over the chunk embeddings) and ``df_chunks`` (chunk table,
    in the same row order as the vectors added to the index).

    Parameters
    ----------
    query : str
        Natural-language question or search phrase.
    top_k : int, default 5
        Maximum number of chunks to return. Values larger than the number of
        indexed vectors are clamped to the index size.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows of ``df_chunks``, in the order returned
        by the index search, with an added ``similarity`` column holding the
        search score. The frame is empty when ``top_k`` is not positive or
        the index holds no vectors.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with label -1, and ``iloc[-1]`` would silently return the LAST
    # chunk as if it were a genuine hit.
    k = min(top_k, index.ntotal)

    if k > 0:
        # 1) Encode the query the same way the chunks were encoded
        q_emb = embed_model.encode(
            [query],
            convert_to_numpy=True,
            normalize_embeddings=True
        ).astype(np.float32)

        # 2) Search the FAISS index; a single query means row 0 holds the results
        scores, indices = index.search(q_emb, k)
        scores = scores[0]
        indices = indices[0]

        # Defensive: drop any remaining "no result" slots (label -1)
        found = indices >= 0
        scores = scores[found]
        indices = indices[found]
    else:
        # Nothing to search for: fall through with empty results
        scores = np.empty(0, dtype=np.float32)
        indices = np.empty(0, dtype=np.int64)

    # 3) Collect results in a DataFrame (copy so df_chunks is never mutated)
    results = df_chunks.iloc[indices].copy()
    results["similarity"] = scores

    return results


In [6]:
# Example query against the chunk index
query = "What is the document mainly about?"
results = retrieve_similar_chunks(query, top_k=5)

# Show only the columns relevant for inspecting the retrieved chunks.
display_columns = ["chunk_id", "page_number", "char_len", "similarity", "text"]
results[display_columns]


Unnamed: 0,chunk_id,page_number,char_len,similarity,text
56,56,11,200,0.42624,"In this unit, we made you aware of the communi..."
3,3,0,399,0.4159,b)\nLet us now try and understand the function...
1,1,0,434,0.341747,"5.1\nINTRODUCTION\nWe have, in the various uni..."
4,4,1,246,0.331472,Writing Skills\n\nThese devices enable you to...
6,6,1,592,0.298814,\nThey can show a number of specific data is ...
