In [1]:
!pip install transformers datasets accelerate sentencepiece faiss-cpu
!pip install einops
!pip install handystuff

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2
Collecting handystuff
  Downloading handystuff-0.2.2-py3-none-any.whl.metadata (1.0 kB)
Collecting jsonnet>=0.17.0 (from handystuff)
  Downloading jsonnet-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (449 bytes)
Collecting colorlog>=5.0.0 (from handystuff)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading handystuff-0.2.2-py3-none-any.whl (10 kB)
Downloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Downloading jsonnet-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K   [90m━

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Colab-ready Python launcher for FAISS index building.
This replaces your bash build-index script.
It does NOT modify run_clm.py or knn.py.
"""

import os
import subprocess
from datetime import datetime

# ======================================
# 1. Mount Google Drive
# ======================================
print("[INFO] Mounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')


[INFO] Mounting Google Drive...
Mounted at /content/drive


In [None]:

# ======================================
# 2. Configuration (EXACT same as save step) #Replace with google drive equivalent files
# ======================================

MODEL = "allenai/OLMo-2-0425-1B-Instruct"
BASE_DIR = "/content/drive/MyDrive/faiss_mts_olmo/" #Replace this with your base dir

SCRIPT = os.path.join(BASE_DIR + "core_scripts/", "run_clm_chat.py")
TRAIN_FILE = os.path.join(BASE_DIR + 'execution_scripts/mock_datasets_radiology_jsonl/', "mimic_inspired_train_context_impression-finetune.jsonl")
VAL_FILE = os.path.join(BASE_DIR + 'execution_scripts/mock_datasets_radiology_jsonl/', "mimic_inspired_val_context_impression-finetune.jsonl")


OUT_DIR = os.path.join(BASE_DIR + "execution_scripts/", f"outputs/{MODEL.replace('/','_')}")
DSTORE_DIR = os.path.join(BASE_DIR + "execution_scripts/", f"datastores/{MODEL.replace('/','_')}")
LOG_DIR = os.path.join(BASE_DIR + "execution_scripts/", "logs")

os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(DSTORE_DIR, exist_ok=True)

In [5]:
print(f"Checking existence for:")
print(f"SCRIPT ({SCRIPT}): {os.path.exists(SCRIPT)}")
print(f"TRAIN_FILE ({TRAIN_FILE}): {os.path.exists(TRAIN_FILE)}")
print(f"VAL_FILE ({VAL_FILE}): {os.path.exists(VAL_FILE)}")
print(f"LOG_DIR ({LOG_DIR}): {os.path.exists(LOG_DIR)}")
print(f"DSTORE_DIR ({DSTORE_DIR}): {os.path.exists(DSTORE_DIR)}")

Checking existence for:
SCRIPT (/content/drive/MyDrive/faiss_mts_olmo/core_scripts/run_clm_chat.py): True
TRAIN_FILE (/content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/mock_datasets_radiology_jsonl/mimic_inspired_train_context_impression-finetune.jsonl): True
VAL_FILE (/content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/mock_datasets_radiology_jsonl/mimic_inspired_val_context_impression-finetune.jsonl): True
LOG_DIR (/content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/logs): True
DSTORE_DIR (/content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/datastores/allenai_OLMo-2-0425-1B-Instruct): True


In [7]:
# ======================================
# 3. FAISS PARAMETERS FOR YOUR 352k DATASTORE
# ======================================

DSTORE_SIZE = 237510     # confirmed from .npy (check datastores file for the actual datastore)
NCENTROIDS = 1024         # ≈ sqrt(size)
CODE_SIZE = 32            # PQ code size
PROBE = 16                # IVF search probe
NUM_KEYS_TO_ADD = 50000  # add everything in one chunk

MOVE_DSTORE_TO_MEM = True
NO_LOAD_KEYS = True
RECOMPUTE_DISTS = False


In [8]:
# ======================================
# 4. Logging
# ======================================

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(LOG_DIR, f"build_index_{timestamp}.log")

print(f"[INFO] Log file: {log_file}")

[INFO] Log file: /content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/logs/build_index_20251227_085009.log


In [9]:
# ======================================
# 5. Build command
# ======================================

cmd = [
    "python", "-u", SCRIPT,
    "--model_name_or_path", MODEL,
    "--train_file", TRAIN_FILE,
    "--validation_file", VAL_FILE,
    "--apply_chat_template",
    "--dstore_dir", DSTORE_DIR,
    "--dstore_size", str(DSTORE_SIZE),
    "--build_index",
    "--ncentroids", str(NCENTROIDS),
    "--code_size", str(CODE_SIZE),
    "--probe", str(PROBE),
    "--num_keys_to_add_at_a_time", str(NUM_KEYS_TO_ADD),
    "--move_dstore_to_mem", str(MOVE_DSTORE_TO_MEM).lower(),
    "--no_load_keys", str(NO_LOAD_KEYS).lower(),
    "--recompute_dists", str(RECOMPUTE_DISTS).lower(),
    "--output_dir", OUT_DIR,
]

print("[INFO] Final FAISS index build command:")
print(" ".join(cmd), "\n")

[INFO] Final FAISS index build command:
python -u /content/drive/MyDrive/faiss_mts_olmo/core_scripts/run_clm_chat.py --model_name_or_path allenai/OLMo-2-0425-1B-Instruct --train_file /content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/mock_datasets_radiology_jsonl/mimic_inspired_train_context_impression-finetune.jsonl --validation_file /content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/mock_datasets_radiology_jsonl/mimic_inspired_val_context_impression-finetune.jsonl --apply_chat_template --dstore_dir /content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/datastores/allenai_OLMo-2-0425-1B-Instruct --dstore_size 237510 --build_index --ncentroids 1024 --code_size 32 --probe 16 --num_keys_to_add_at_a_time 50000 --move_dstore_to_mem true --no_load_keys true --recompute_dists false --output_dir /content/drive/MyDrive/faiss_mts_olmo/Attempts_27Dec25/outputs/allenai_OLMo-2-0425-1B-Instruct 



In [10]:
# ======================================
# 6. Run with live logging
# ======================================

with open(log_file, "w") as lf:
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        bufsize=1,
    )

    print("[INFO] Building FAISS index... (streaming output)")
    for line in process.stdout:
        print(line, end="")
        lf.write(line)

process.wait()

print("\n[INFO] FAISS index build COMPLETE.")
print(f"[INFO] Log saved to: {log_file}")
print(f"[INFO] Index saved inside: {DSTORE_DIR}")

[INFO] Building FAISS index... (streaming output)
2025-12-27 08:50:29.140590: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-27 08:50:29.158572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766825429.179514    4104 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766825429.186019    4104 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766825429.202423    4104 computation_placer.cc:177] computation placer already r