In [None]:
# Required libraries
!pip install sentence-transformers PyPDF2

from sentence_transformers import SentenceTransformer
import numpy as np
import json

from google.colab import files
from IPython.display import display
import ipywidgets as widgets

# --- Colab bootstrap: clone repo if running on Colab ---
import os, sys, subprocess

# Probe for the Colab runtime by attempting the import; any failure is
# treated as "not on Colab".
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

# On Colab, when the checkout is not there yet: clone the pipeline repo and
# put it on sys.path so its modules can be imported.
if IN_COLAB:
    if not os.path.isdir('/content/noesisnoema-pipeline'):
        subprocess.run(
            [
                'git',
                'clone',
                'https://github.com/raskolnikoff/noesisnoema-pipeline.git',
                '/content/noesisnoema-pipeline',
            ],
            check=True,  # abort the cell if the clone fails
        )
        sys.path.append('/content/noesisnoema-pipeline')

# --- Model selection UI ---
# (label, model id) pairs: the label is shown in the dropdown, the model id is
# the dropdown's value and is later passed to SentenceTransformer.
model_options = [
    ('English MiniLM', 'all-MiniLM-L6-v2'),
    ('Japanese/Multilingual (LaBSE)', 'sentence-transformers/LaBSE'),
    ('Japanese S-BERT', 'sonoisa/sentence-bert-base-ja-mean-tokens-v2'),
]
model_dropdown = widgets.Dropdown(
    options=model_options,
    value='all-MiniLM-L6-v2',
    description='Model:'
)

# Chunk size handed to the chunker as `chunk_size` in the processing loop.
chunk_size_slider = widgets.IntSlider(
    value=512,
    min=50,
    max=2048,
    step=50,
    description='Chunk size (tokens):'
)

# Overlap handed to the chunker as `overlap` in the processing loop.
overlap_slider = widgets.IntSlider(
    value=50,
    min=0,
    max=200,
    step=10,
    description='Overlap (tokens):'
)

display(model_dropdown)
display(chunk_size_slider)
display(overlap_slider)

# PDF upload
# The keys of `uploaded` are iterated later and used as the PDF paths to process.
uploaded = files.upload()

# Initialize selected model
# NOTE(review): the widget values are read in the same cell that displays the
# widgets — confirm that a selection made in the UI is actually visible here
# (and in the processing loop) rather than the defaults always being used.
model = SentenceTransformer(model_dropdown.value)

# PDF to text extraction
import PyPDF2


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. If the whole document yields only whitespace (for example an
    image-only PDF), a notice is printed and ``""`` is returned so the caller
    can skip the file.
    """
    # Keep the file open only while PyPDF2 reads from it; the context manager
    # closes the handle even when parsing raises.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Pages are extracted while the stream is still open; join once instead
        # of growing a string page by page.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    # Check if extracted text is empty (image-only PDF)
    if not text.strip():
        print("No extractable text found: This PDF appears to be image-only. Skipping.")
        return ""
    return text


# Use TokenChunker from repo (installed via Colab bootstrap or local path)
import sys, os
try:
    # First attempt: `chunker` is already importable from the current sys.path.
    from chunker import TokenChunker  # when running in the repo locally
except Exception:
    # Fallback: add the Colab checkout location to sys.path and retry; a failure
    # of this second import propagates to the caller.
    sys.path.append('/content/noesisnoema-pipeline')
    from chunker import TokenChunker  # when running on Colab after clone

def make_chunks_and_embeddings(text: str, chunk_size: int = 512, overlap: int = 50):
    """Split *text* with TokenChunker and embed the pieces with the global ``model``.

    ``chunk_size`` and ``overlap`` are forwarded unchanged to ``TokenChunker``.
    Returns a ``(chunks, embeddings)`` pair: the value produced by
    ``chunk_text`` and the result of ``model.encode`` applied to it.
    """
    pieces = TokenChunker(chunk_size=chunk_size, overlap=overlap).chunk_text(text)
    return pieces, model.encode(pieces)


# New function: Save chunks.json, embeddings.npy, embeddings.csv, metadata.json together as a .zip archive
import os
import io
import zipfile
from datetime import datetime

def export_zip_archive(base, chunks, embeddings, metadata, out_dir="/content"):
    """
    Save chunks, embeddings (both .npy and .csv formats), and metadata into a .zip archive.

    base: archive file name without the ".zip" extension
    chunks: list of text chunks
    embeddings: numpy array
    metadata: dict containing metadata information
    out_dir: directory that receives the archive; defaults to "/content" (the
        location that used to be hard-coded) and is created when missing

    The archive is written to ``<out_dir>/<base>.zip`` and contains
    chunks.json, embeddings.npy, embeddings.csv and metadata.json.
    Returns None; the archive path is printed.
    """
    # Serialize every member in memory so only the final archive touches disk
    chunks_json = json.dumps(chunks, ensure_ascii=False).encode('utf-8')
    metadata_json = json.dumps(metadata, ensure_ascii=False).encode('utf-8')

    npy_buffer = io.BytesIO()
    np.save(npy_buffer, embeddings)
    embeddings_bytes = npy_buffer.getvalue()

    # np.savetxt emits text, so it goes through a string buffer and is encoded afterwards
    csv_buffer = io.StringIO()
    np.savetxt(csv_buffer, embeddings, delimiter=",")
    embeddings_csv_bytes = csv_buffer.getvalue().encode('utf-8')

    os.makedirs(out_dir, exist_ok=True)
    zip_path = os.path.join(out_dir, f"{base}.zip")
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("chunks.json", chunks_json)
        zf.writestr("embeddings.npy", embeddings_bytes)
        zf.writestr("embeddings.csv", embeddings_csv_bytes)
        zf.writestr("metadata.json", metadata_json)

    # Report success only after the context manager has closed the archive
    print(f"Exported ZIP archive: {zip_path}")
    print("Included embeddings.csv in the ZIP archive as well.")

import hashlib
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

def create_manifest(
    embedder_name: str,
    chunk_size: int,
    overlap: int,
    offset_unit: str = 'char',
    dim: Optional[int] = None,
) -> Dict[str, Any]:
    """Build the dict that is written out as ``pack.manifest.json`` (pack_version 1.1).

    embedder_name: recorded as ``embedder.name``
    chunk_size: recorded as ``chunking.size``
    overlap: recorded as ``chunking.overlap``
    offset_unit: recorded as ``features.offset_unit``
    dim: embedding dimensionality for ``embedder.dim``. When omitted, the
        function falls back to the second axis of a module-level ``embeddings``
        array if one exists and is at least 2-D; otherwise ``None`` is recorded.

    Returns a JSON-serializable dict with the sections ``embedder``,
    ``chunking``, ``features`` and ``build``.
    """
    if dim is None:
        # Backward-compatible fallback to the notebook-global `embeddings`.
        # Guard the rank so a 1-D (e.g. empty) array yields None instead of
        # raising IndexError on shape[1].
        shape = getattr(globals().get('embeddings'), 'shape', None)
        if shape is not None and len(shape) > 1:
            dim = int(shape[1])

    # Read the clock once so `version` and `built_at` always agree. The
    # timezone-aware call replaces the deprecated datetime.utcnow(); tzinfo is
    # dropped so isoformat() keeps the existing "<naive ISO timestamp>Z" form.
    now = datetime.now(timezone.utc).replace(tzinfo=None)

    return {
        "pack_version": "1.1",
        "embedder": {
            "name": embedder_name,
            "version": now.strftime('%Y-%m-%d'),
            "sha256": "",  # optional: provide model file hash if available
            "dim": int(dim) if dim is not None else None,
            "normalize": "L2",
            "text_norm": ["nfkc", "lower"],
        },
        "chunking": {
            "strategy": "sentence+fixed",
            "size": int(chunk_size),
            "overlap": int(overlap),
            "sentence_boundary": True,
        },
        "features": {
            "paragraph_offsets": True,
            "offset_unit": offset_unit,
            "bm25_stats": True,
            "source_diversity_key": "doc_id",
            "doc_timestamp": True,
        },
        "build": {
            "built_at": now.isoformat() + "Z",
            # os.uname is not available on every platform; use the fixed label then
            "hostname": os.uname().nodename if hasattr(os, 'uname') else "colab",
            "pipeline_rev": "colab-notebook",
        },
    }

def write_citations_jsonl(path: str, doc_id: str, chunks: List[str]):
    """Append one JSON line per chunk to *path*, forming a minimal citations.jsonl.

    Each record carries a ``chunk_id`` / ``para_id`` derived from the chunk's
    index, a ``start``/``end`` pair, and the first 200 characters of the chunk
    as ``snippet``. The offsets are running character counts over the chunk
    list: every chunk starts where the previous one ended.

    The file is opened in append mode, so existing content is kept.
    """
    with open(path, 'a', encoding='utf-8') as sink:
        offset = 0  # running character position across all chunks
        for position, body in enumerate(chunks):
            stop = offset + len(body)
            record = {
                "chunk_id": f"{doc_id}#{position:05d}",
                "doc_id": doc_id,
                "para_id": f"p-{position:04d}",
                "start": offset,
                "end": stop,
                "snippet": body[:200],
            }
            sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")
            offset = stop

# Build one v1.1 pack directory per uploaded PDF.
# The loop variables are module-level names: `embeddings` is what
# create_manifest() looks up through globals(), and base / chunks /
# embeddings / metadata keep the values of the last processed PDF after the loop.
for pdf_path in uploaded.keys():
    print(f"Processing {pdf_path} ...")
    text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf returns "" when no text could be extracted; skip such files
    if not text:
        continue

    chunks, embeddings = make_chunks_and_embeddings(
        text,
        chunk_size=chunk_size_slider.value,
        overlap=overlap_slider.value,
    )

    # Name with the last extension removed; used as doc_id and as the pack directory stem
    base = pdf_path.rsplit('.', 1)[0]
    out_dir = f"/content/{base}_pack"
    os.makedirs(out_dir, exist_ok=True)

    # Save core artifacts
    with open(f"{out_dir}/chunks.json", "w", encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False)
    np.save(f"{out_dir}/embeddings.npy", embeddings)
    np.savetxt(f"{out_dir}/embeddings.csv", embeddings, delimiter=",")
    # Single-document metadata; page and line are not populated here
    metadata = {
        "docs": [
            {
                "doc_id": base,
                "title": base,
                "path": f"/content/{pdf_path}",
                "page": None,
                "line": None,
                "timestamp": datetime.utcnow().isoformat() + "Z",
            }
        ]
    }
    with open(f"{out_dir}/metadata.json", "w", encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False)

    # v1.1 extras
    manifest = create_manifest(model_dropdown.value, chunk_size_slider.value, overlap_slider.value, offset_unit='char')
    with open(f"{out_dir}/pack.manifest.json", "w", encoding='utf-8') as f:
        json.dump(manifest, f, ensure_ascii=False)

    citations_path = f"{out_dir}/citations.jsonl"
    # truncate if exists
    # (write_citations_jsonl opens the file in append mode, so it is emptied first)
    open(citations_path, 'w', encoding='utf-8').close()
    write_citations_jsonl(citations_path, base, chunks)

    print(f"Saved v1.1 pack to: {out_dir}")




Dropdown(description='Model:', options=(('English MiniLM', 'all-MiniLM-L6-v2'), ('Japanese/Multilingual (LaBSE…

IntSlider(value=512, description='Chunk size (tokens):', max=2048, min=50, step=50)

IntSlider(value=50, description='Overlap (tokens):', max=200, step=10)

Saving Husserl-IdeasI_text.pdf to Husserl-IdeasI_text.pdf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing Husserl-IdeasI_text.pdf ...


Token indices sequence length is longer than the specified maximum sequence length for this model (272537 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# --- Quick validator (not a full schema check) ---
import glob

def quick_validate(pack_dir: str):
    """Check that *pack_dir* holds every expected pack file and print a short summary.

    This is a presence-and-loadability check, not a schema validation.

    pack_dir: directory containing the pack artifacts.

    Prints the manifest's pack_version and embedder name, the number of chunks
    and the embeddings array shape, followed by ``OK``.

    Raises SystemExit(1) after printing the missing names when any required
    file is absent.
    """
    required = [
        'pack.manifest.json',
        'chunks.json',
        'embeddings.npy',
        'embeddings.csv',
        'metadata.json',
        'citations.jsonl',
    ]
    missing = [name for name in required if not os.path.exists(os.path.join(pack_dir, name))]
    if missing:
        print('Missing:', missing)
        raise SystemExit(1)

    with open(os.path.join(pack_dir, 'pack.manifest.json'), 'r', encoding='utf-8') as f:
        man = json.load(f)
    print('Manifest pack_version:', man.get('pack_version'))
    print('Embedder:', man.get('embedder', {}).get('name'))

    # chunks.json is written as UTF-8 with ensure_ascii=False, so read it back
    # with an explicit encoding, and close the handle via the context manager.
    with open(os.path.join(pack_dir, 'chunks.json'), 'r', encoding='utf-8') as f:
        chunk_count = len(json.load(f))
    print('Chunks:', chunk_count)

    print('Embeddings shape:', np.load(os.path.join(pack_dir, 'embeddings.npy')).shape)
    print('OK')

# Example (validate the last built pack directory shown above)
# quick_validate('/content/<your_base>_pack')
# Zip the most recently processed PDF: base, chunks, embeddings and metadata
# are the module-level variables left behind by the processing loop in the
# previous cell, so that cell must have processed at least one PDF first.
export_zip_archive(base, chunks, embeddings, metadata)