In [1]:
# Sample sentences that the next cell turns into embeddings.
# The last entry previously read "plans require ..."; it is spelled "plants" here
# so that it matches the corpus used by the pipeline cell further down.
texts = [
    "sri wants to do something exciting",
    "sri is working on mp4 based vector db",
    "this vector db has inbuilt meta store and multi index properties",
    "we consider mp4 files as containers ",
    "plants require photosyntheses"
]


In [2]:
import requests

def get_embedding(text, model="nomic-embed-text", timeout=60):
    """Request an embedding vector for ``text`` from the local embeddings endpoint.

    Args:
        text: The string to embed; sent as the ``prompt`` field.
        model: Name of the embedding model to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely if the server hangs.

    Returns:
        The value of the ``"embedding"`` field of the JSON response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
        KeyError: If the response JSON has no ``"embedding"`` field.
    """
    response = requests.post(
        "http://localhost:11434/api/embeddings",
        json={
            "model": model,
            "prompt": text
        },
        timeout=timeout,
    )
    # Surface server-side failures as an HTTP error instead of a confusing
    # KeyError on the missing "embedding" field.
    response.raise_for_status()
    return response.json()["embedding"]

# One embedding per sentence, in the same order as `texts`.
embeddings = list(map(get_embedding, texts))


In [3]:
# Display the raw embeddings: one list of floats per entry in `texts`.
embeddings

[[-0.8153625130653381,
  1.7168004512786865,
  -4.369712829589844,
  0.8694280982017517,
  -0.4399224817752838,
  -0.29280394315719604,
  -0.4287497401237488,
  -0.625956654548645,
  0.3576461672782898,
  -1.069076657295227,
  0.05172792077064514,
  2.441244125366211,
  1.3769707679748535,
  0.9720081090927124,
  0.40823206305503845,
  0.4828709363937378,
  1.6232784986495972,
  -2.0971014499664307,
  -1.090485692024231,
  0.7618647813796997,
  -0.8685513734817505,
  -2.0670700073242188,
  -0.510538637638092,
  0.9779260754585266,
  1.4487309455871582,
  -0.6858269572257996,
  0.7183105945587158,
  -0.816207766532898,
  -0.04843398183584213,
  0.5010818839073181,
  0.84625244140625,
  -1.3808238506317139,
  -0.33945950865745544,
  -0.09547822177410126,
  0.3526496887207031,
  0.08997350931167603,
  0.020176175981760025,
  0.00821855291724205,
  -0.0482841394841671,
  -1.0074917078018188,
  0.06449941545724869,
  -0.9617227911949158,
  -0.36976999044418335,
  -1.3342292308807373,
  1.79

In [8]:
import os
import json
import struct
import numpy as np
import faiss
import requests

def get_embedding(text, model="nomic-embed-text", timeout=60):
    """Request an embedding vector for ``text`` from the local embeddings endpoint.

    Args:
        text: The string to embed; sent as the ``prompt`` field.
        model: Name of the embedding model to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely if the server hangs.

    Returns:
        The value of the ``"embedding"`` field of the JSON response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
        KeyError: If the response JSON has no ``"embedding"`` field.
    """
    response = requests.post(
        "http://localhost:11434/api/embeddings",
        json={
            "model": model,
            "prompt": text
        },
        timeout=timeout,
    )
    # Surface server-side failures as an HTTP error instead of a confusing
    # KeyError on the missing "embedding" field.
    response.raise_for_status()
    return response.json()["embedding"]


def float32_array_to_bytes(arr):
    """Serialize a sequence of numbers as big-endian float32 bytes.

    Args:
        arr: Anything ``numpy`` can turn into an array of numbers.

    Returns:
        ``bytes`` holding each value as a 4-byte big-endian float32, in order.
    """
    as_float32 = np.asarray(arr, dtype=np.float32)
    # ">f4" = big-endian 4-byte float, independent of the host byte order.
    big_endian = as_float32.astype(np.dtype(">f4"))
    return big_endian.tobytes()

def write_box(f, box_type: str, payload: bytes):
    """Write one MP4-style box to ``f``: size, type tag, then payload.

    The header is 8 bytes: a 4-byte big-endian unsigned total length (header
    included) followed by a 4-byte type tag.

    Args:
        f: Binary file-like object with a ``write`` method.
        box_type: Type tag; must encode to exactly 4 bytes in UTF-8.
        payload: Raw bytes stored after the header.

    Raises:
        ValueError: If ``box_type`` does not encode to exactly 4 bytes. The
            ``4s`` struct format would otherwise silently truncate or NUL-pad
            the tag and produce a box that readers cannot identify.
        struct.error: If the total length does not fit in 32 bits.
    """
    type_bytes = box_type.encode("utf-8")
    if len(type_bytes) != 4:
        raise ValueError(
            f"box type must be exactly 4 bytes, got {len(type_bytes)}: {box_type!r}"
        )
    box_len = 4 + 4 + len(payload)  # 4 bytes for length, 4 for type, rest is payload
    f.write(struct.pack(">I4s", box_len, type_bytes))
    f.write(payload)

def write_mp4_file(filename, embeddings, texts, metadatas):
    """Write vectors, metadata and text into a single MP4-like box file.

    For each record three boxes are written back to back: ``vct1`` (the
    embedding as big-endian float32), ``json`` (the metadata as UTF-8 JSON)
    and ``txt1`` (the text as UTF-8). A final ``oidx`` box holds a JSON
    object mapping each record id (``"0"``, ``"1"``, ...) to the byte offsets
    of its three boxes.

    Args:
        filename: Output path. Its parent directory is created if needed.
        embeddings: Iterable of numeric vectors.
        texts: Iterable of strings, parallel to ``embeddings``.
        metadatas: Iterable of JSON-serializable objects, parallel to
            ``embeddings``.

    Note:
        The three inputs are combined with ``zip``, so records beyond the
        shortest input are not written.
    """
    # os.makedirs("") raises, so only create a directory when the path has one.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    oidx = {}
    with open(filename, "wb") as f:
        for i, (vec, text, meta) in enumerate(zip(embeddings, texts, metadatas)):
            doc_id = str(i)

            # Vector box
            vct_offset = f.tell()
            write_box(f, "vct1", float32_array_to_bytes(vec))

            # Metadata box
            json_offset = f.tell()
            meta_bytes = json.dumps(meta).encode("utf-8")
            write_box(f, "json", meta_bytes)

            # Text content box
            txt_offset = f.tell()
            text_bytes = text.encode("utf-8")
            write_box(f, "txt1", text_bytes)

            oidx[doc_id] = {"vec": vct_offset, "json": json_offset, "txt": txt_offset}

        # Final oidx box: the offset index, written last so readers can find it
        # near the end of the file.
        write_box(f, "oidx", json.dumps(oidx).encode("utf-8"))

    # Report the number of records actually written (len(oidx)), which can be
    # smaller than len(embeddings) when the inputs differ in length.
    print(f"MP4 written with {len(oidx)} records at {filename} and odix is {oidx}")

def _walk_boxes_for_payload(f, filesize, wanted):
    """Walk boxes from offset 0 and return the payload of the last ``wanted`` box, or None."""
    pos = 0
    found = None
    while pos + 8 <= filesize:
        f.seek(pos)
        box_len, box_type = struct.unpack(">I4s", f.read(8))
        if box_len < 8 or pos + box_len > filesize:
            break  # malformed header: stop rather than loop forever or read past EOF
        if box_type == wanted:
            found = (pos + 8, box_len - 8)
        pos += box_len
    if found is None:
        return None
    f.seek(found[0])
    return f.read(found[1])


def load_oidx_from_mp4(filename):
    """Load the ``oidx`` offset index stored in an MP4-like box file.

    The last 1 MiB of the file is searched for the ``oidx`` type tag first.
    If the tag is not found there (for example because the index box itself
    is larger than 1 MiB), the boxes are walked sequentially from the start
    of the file instead.

    Args:
        filename: Path of the box file to read.

    Returns:
        The decoded JSON payload of the ``oidx`` box.

    Raises:
        ValueError: If no ``oidx`` box can be located (message
            ``"oidx not found"``), or if its payload is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    with open(filename, "rb") as f:
        f.seek(0, 2)
        filesize = f.tell()
        seek_back = min(filesize, 1024 * 1024)
        f.seek(-seek_back, 2)
        trailer = f.read()

        idx = trailer.rfind(b"oidx")
        # The 4-byte length field sits directly before the type tag, so a match
        # in the first 4 bytes of the window cannot be parsed from here.
        if idx >= 4:
            box_start = idx - 4
            box_len = struct.unpack(">I", trailer[box_start:idx])[0]
            # box_len counts from the start of the box (length field included),
            # so the payload ends at box_start + box_len, not idx + box_len.
            box_end = box_start + box_len
            if box_len >= 8 and box_end <= len(trailer):
                json_bytes = trailer[idx + 4 : box_end]
                return json.loads(json_bytes.decode("utf-8"))

        payload = _walk_boxes_for_payload(f, filesize, b"oidx")

    if payload is None:
        raise ValueError("oidx not found")
    return json.loads(payload.decode("utf-8"))

def build_faiss_flat_index(mp4_path, oidx, dim=768):
    """Build a FAISS ``IndexFlatL2`` from the ``vct1`` boxes of a box file.

    Args:
        mp4_path: Path of the box file holding the vectors.
        oidx: Mapping of record id (string of an integer) to a dict whose
            ``"vec"`` entry is the byte offset of that record's vector box.
        dim: Expected number of float32 components per vector.

    Returns:
        ``(index, ids)``: the FAISS index and the record ids in the order the
        vectors were added (ascending by integer value), so FAISS row ``i``
        corresponds to ``ids[i]``.

    Raises:
        ValueError: If a vector box is truncated, has an unexpected type tag,
            or does not hold exactly ``dim`` float32 values.
    """
    expected_bytes = 4 * dim
    vectors = []
    ids = []
    with open(mp4_path, "rb") as f:
        for doc_id in sorted(oidx.keys(), key=int):
            offset = oidx[doc_id]["vec"]
            f.seek(offset)
            header = f.read(8)  # 4-byte length + 4-byte type
            if len(header) != 8:
                raise ValueError(f"vector box for doc {doc_id} is truncated at offset {offset}")
            box_len, box_type = struct.unpack(">I4s", header)
            if box_type != b"vct1":
                raise ValueError(
                    f"expected a vct1 box for doc {doc_id} at offset {offset}, found {box_type!r}"
                )
            # Check the stored size instead of trusting `dim`; a blind read of
            # 4*dim bytes would cut the vector short or run into the next box.
            if box_len - 8 != expected_bytes:
                raise ValueError(
                    f"vector box for doc {doc_id} holds {box_len - 8} bytes, "
                    f"expected {expected_bytes} (dim={dim})"
                )
            vec_data = f.read(expected_bytes)
            if len(vec_data) != expected_bytes:
                raise ValueError(f"vector payload for doc {doc_id} is truncated")
            vectors.append(np.frombuffer(vec_data, dtype=">f4"))
            ids.append(doc_id)

    index = faiss.IndexFlatL2(dim)
    if vectors:  # np.stack raises on an empty list; an empty store yields an empty index
        # astype converts the big-endian data to native float32 for FAISS.
        vec_matrix = np.stack(vectors).astype("float32")
        index.add(vec_matrix)
    print(f"Built FlatL2 FAISS index with {len(ids)} vectors")
    return index, ids

def search_flat_index(index, query_vec, top_k=5):
    """Return the nearest neighbours of ``query_vec`` in ``index``.

    Args:
        index: Object with a FAISS-style ``search(queries, k)`` method that
            returns ``(distances, labels)`` arrays of shape ``(1, k)``.
        query_vec: The query vector (any flat sequence of numbers).
        top_k: Maximum number of neighbours to return.

    Returns:
        ``(indices, distances)`` as 1-D arrays of equal length. The length is
        at most ``top_k``; it is shorter when the index holds fewer vectors.
    """
    query = np.array(query_vec, dtype="float32").reshape(1, -1)
    distances, indices = index.search(query, top_k)
    # FAISS fills missing results with label -1. Drop those slots so callers
    # that do `ids[i]` never hit ids[-1] (the last document) by accident.
    found = indices[0] >= 0
    return indices[0][found], distances[0][found]



def read_text_and_metadata(file_path, oidx, doc_ids):
    """Fetch the text and metadata boxes for the given record ids.

    Args:
        file_path: Path of the box file to read from.
        oidx: Mapping of record id (string) to a dict with ``"txt"`` and
            ``"json"`` byte offsets.
        doc_ids: Record ids to look up; each is converted with ``str``.

    Returns:
        A list of ``{"doc_id", "text", "metadata"}`` dicts in the order of
        ``doc_ids``. Ids missing from ``oidx`` are skipped silently; ids whose
        boxes cannot be read are skipped after printing a message.
    """

    def box_payload(handle, offset):
        # Box layout: 4-byte big-endian total length, 4-byte type tag, payload.
        handle.seek(offset)
        (total_len,) = struct.unpack(">I", handle.read(4))
        handle.read(4)  # type tag is not needed here
        return handle.read(total_len - 8)

    matches = []
    with open(file_path, "rb") as handle:
        for doc_id in map(str, doc_ids):
            if doc_id in oidx:
                offsets = oidx[doc_id]
                try:
                    # Text first, then metadata.
                    text = box_payload(handle, offsets["txt"]).decode("utf-8")
                    raw_meta = box_payload(handle, offsets["json"])
                    metadata = json.loads(raw_meta.decode("utf-8"))
                except Exception as e:
                    print(f" Could not read {doc_id}: {e}")
                else:
                    matches.append({
                        "doc_id": doc_id,
                        "text": text,
                        "metadata": metadata
                    })

    return matches

def save_faiss_index(index, path):
    """Persist a FAISS index to ``path``, creating the parent directory if needed.

    Args:
        index: The FAISS index to write.
        path: Destination file path (``str`` or path-like).
    """
    target = os.fspath(path)  # hand FAISS a plain string even for path-like input
    parent_dir = os.path.dirname(target)
    if parent_dir:  # os.makedirs("") raises, so skip bare file names
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, target)
    print(f"Saved FAISS index to {path}")

def load_faiss_index(path):
    """Read a FAISS index from ``path``.

    Returns:
        The loaded index, or ``None`` when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None

    loaded = faiss.read_index(path)
    print(f"Loaded FAISS index from {path}")
    return loaded


# ---------- Run the full pipeline ----------

texts = [
    "sri wants to do something exciting",
    "sri is working on mp4 based vector db",
    "this vector db has inbuilt meta store and multi index properties",
    "we consider mp4 files as containers ",
    "plants require photosyntheses"
]

embeddings = [get_embedding(t) for t in texts]
metadatas = [{"doc_id": str(i), "source": "mock"} for i in range(len(texts))]

index_path = "trial6/index.faiss"
mp4_path = "trial6/store.mp4"

# Save everything to MP4
write_mp4_file(mp4_path, embeddings, texts, metadatas)

# Load oidx offset map
oidx = load_oidx_from_mp4(mp4_path)
ids = sorted(oidx.keys(), key=int)

# Try loading FAISS index from disk
index = load_faiss_index(index_path)

# The store above is rewritten on every run, so a cached index is reused only
# if its vector count still matches the store. This does not detect changed
# texts with an unchanged count: delete the index file after editing `texts`.
if index is None or index.ntotal != len(ids):
    index, ids = build_faiss_flat_index(mp4_path, oidx, dim=768)
    save_faiss_index(index, index_path)

# Run Query
query_text = "sri"
query_vec = get_embedding(query_text)
top_indices, distances = search_flat_index(index, query_vec, top_k=3)

# FAISS reports label -1 when it has fewer than top_k results; skip those so
# ids[-1] is never looked up by accident.
hits = [(ids[i], dist) for i, dist in zip(top_indices, distances) if i >= 0]
score_by_doc = dict(hits)

# Read matched text + metadata from MP4
results = read_text_and_metadata(mp4_path, oidx, [doc_id for doc_id, _ in hits])

# Print results. Scores are looked up by doc id, not by position, because the
# reader skips records it cannot decode.
for rank, match in enumerate(results, start=1):
    print(f"\n Match #{rank}")
    print(f"Score: {score_by_doc[match['doc_id']]}")
    print(f"Text: {match['text']}")
    print(f"Metadata: {match['metadata']}")



MP4 written with 5 records at trial6/store.mp4 and odix is {'0': {'vec': 0, 'json': 3080, 'txt': 3121}, '1': {'vec': 3163, 'json': 6243, 'txt': 6284}, '2': {'vec': 6329, 'json': 9409, 'txt': 9450}, '3': {'vec': 9522, 'json': 12602, 'txt': 12643}, '4': {'vec': 12687, 'json': 15767, 'txt': 15808}}
Built FlatL2 FAISS index with 5 vectors
Saved FAISS index to trial6/index.faiss

 Match #1
Score: 318.68017578125
Text: sri wants to do something exciting
Metadata: {'doc_id': '0', 'source': 'mock'}

 Match #2
Score: 366.83673095703125
Text: sri is working on mp4 based vector db
Metadata: {'doc_id': '1', 'source': 'mock'}

 Match #3
Score: 656.865234375
Text: this vector db has inbuilt meta store and multi index properties
Metadata: {'doc_id': '2', 'source': 'mock'}


In [7]:

# Second query; reuses `index`, `ids`, `oidx` and `mp4_path` from the pipeline cell.
query_text = "plants"
query_vec = get_embedding(query_text)

top_indices, distances = search_flat_index(index, query_vec, top_k=3)

print("\n Top match indices:", top_indices)

# FAISS reports label -1 when it has fewer than top_k results; skip those so
# ids[-1] is never looked up by accident.
hits = [(ids[i], dist) for i, dist in zip(top_indices, distances) if i >= 0]
score_by_doc = dict(hits)

results = read_text_and_metadata(mp4_path, oidx, [doc_id for doc_id, _ in hits])

# Print final results. Scores are looked up by doc id, not by position,
# because the reader skips records it cannot decode.
for rank, match in enumerate(results, start=1):
    print(f"\n Match #{rank}")
    print(f"Score: {score_by_doc[match['doc_id']]}")
    print(f"Text: {match['text']}")
    print(f"Metadata: {match['metadata']}")


 Top match indices: [4 1 2]

 Match #1
Score: 239.33668518066406
Text: plants require photosyntheses
Metadata: {'doc_id': '4', 'source': 'mock'}

 Match #2
Score: 560.94482421875
Text: sri is working on mp4 based vector db
Metadata: {'doc_id': '1', 'source': 'mock'}

 Match #3
Score: 572.9727783203125
Text: this vector db has inbuilt meta store and multi index properties
Metadata: {'doc_id': '2', 'source': 'mock'}
