In [None]:
# Required libraries
!pip install sentence-transformers PyPDF2

from sentence_transformers import SentenceTransformer
import numpy as np
import json

from google.colab import files
from IPython.display import display
import ipywidgets as widgets

# --- Model selection UI ---
# Each entry is a (label shown in the dropdown, model identifier) pair; the
# identifier is what gets handed to SentenceTransformer further down.
model_options = [
    ('English MiniLM', 'all-MiniLM-L6-v2'),
    ('Japanese/Multilingual (LaBSE)', 'sentence-transformers/LaBSE'),
    ('Japanese S-BERT', 'sonoisa/sentence-bert-base-ja-mean-tokens-v2'),
]
# The initial selection is the English MiniLM entry from model_options.
model_dropdown = widgets.Dropdown(
    options=model_options,
    value='all-MiniLM-L6-v2',
    description='Model:'
)

# Chunk size is later passed to chunk_text, which slices the text by
# characters, so the slider value is a character count (50..2048, step 50).
chunk_size_slider = widgets.IntSlider(
    value=200,
    min=50,
    max=2048,
    step=50,
    description='Chunk size:'
)

display(model_dropdown)
display(chunk_size_slider)

# PDF upload via the Colab file picker. The keys of the returned mapping are
# iterated later and used as the paths of the PDFs to process.
uploaded = files.upload()

# Initialize selected model
# NOTE(review): the model is built from whatever the dropdown holds when this
# line executes, in the same cell that first displays the widget. Confirm that
# a user's selection is actually registered before this point; otherwise the
# default model is always used and the UI should live in an earlier cell.
model = SentenceTransformer(model_dropdown.value)

# PDF to text extraction
import PyPDF2


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at `pdf_path`.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The text of all pages joined in page order, or "" (after printing a
        notice) when no page yields any non-whitespace text, e.g. for an
        image-only PDF.
    """
    # Open the file in a context manager so the handle is always closed, even
    # if parsing raises. Extraction happens inside the block because the
    # reader pulls page data from the stream while the pages are accessed.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() may return None for a page; treat that as empty.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    # Check if extracted text is empty (image-only PDF)
    if not text.strip():
        print("No extractable text found: This PDF appears to be image-only. Skipping.")
        return ""
    return text


# Chunk splitting
def chunk_text(text, chunk_size=200):
    """Split `text` into consecutive, non-overlapping character chunks.

    Parameters
    ----------
    text : str
        The text to split.
    chunk_size : int, default 200
        Maximum number of characters per chunk; must be positive.

    Returns
    -------
    list of str
        The chunks in order. Every chunk has `chunk_size` characters except
        possibly the last one. An empty `text` yields an empty list.

    Raises
    ------
    ValueError
        If `chunk_size` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all the text,
    # and a zero step fails with an unrelated-looking range() error, so reject
    # both up front with a clear message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]


def make_chunks_and_embeddings(text, chunk_size=200):
    """Chunk `text` and embed every chunk with the module-level `model`.

    The text is split by `chunk_text` using `chunk_size`, and the resulting
    list is passed in one call to `model.encode`.

    Returns a `(chunks, embeddings)` tuple: the list of chunks and whatever
    `model.encode` returns for that list.
    """
    pieces = chunk_text(text, chunk_size)
    return pieces, model.encode(pieces)


# New function: Save chunks.json, embeddings.npy, embeddings.csv, metadata.json together as a .zip archive
import os
import io
import zipfile
from datetime import datetime

def export_zip_archive(base, chunks, embeddings, metadata, out_dir="/content"):
    """
    Save chunks, embeddings (both .npy and .csv formats), and metadata into a .zip archive.

    The archive is written to `<out_dir>/<base>.zip` and contains four members:
    chunks.json, embeddings.npy, embeddings.csv and metadata.json.

    base: archive file name without the .zip extension
    chunks: list of text chunks (must be JSON-serializable)
    embeddings: numpy array
    metadata: dict containing metadata information (must be JSON-serializable)
    out_dir: existing directory to write the archive into; defaults to
        "/content", the location this function always used before
    """
    # Serialize every member in memory so nothing but the archive hits disk.
    # ensure_ascii=False keeps non-ASCII text readable; the explicit UTF-8
    # encode fixes the byte encoding of the JSON members.
    chunks_json = json.dumps(chunks, ensure_ascii=False).encode('utf-8')
    metadata_json = json.dumps(metadata, ensure_ascii=False).encode('utf-8')

    embeddings_bytes_io = io.BytesIO()
    np.save(embeddings_bytes_io, embeddings)
    embeddings_bytes = embeddings_bytes_io.getvalue()

    # np.savetxt writes text, so it goes through a string buffer first.
    embeddings_csv_io = io.StringIO()
    np.savetxt(embeddings_csv_io, embeddings, delimiter=",")
    embeddings_csv_bytes = embeddings_csv_io.getvalue().encode('utf-8')

    zip_path = os.path.join(out_dir, f"{base}.zip")
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("chunks.json", chunks_json)
        zf.writestr("embeddings.npy", embeddings_bytes)
        zf.writestr("embeddings.csv", embeddings_csv_bytes)
        zf.writestr("metadata.json", metadata_json)

    # Report success only after the with-block has closed (and therefore
    # finalized) the archive.
    print(f"Exported ZIP archive: {zip_path}")
    print("Included embeddings.csv in the ZIP archive as well.")


# Process every uploaded PDF: extract its text, chunk and embed it, then save
# the chunks (JSON) and embeddings (.npy and .csv) under /content.
for pdf_path in uploaded.keys():
    print(f"Processing {pdf_path} ...")
    text = extract_text_from_pdf(pdf_path)
    # Skip processing if the PDF is image-only (no extractable text)
    if not text:
        continue
    chunks, embeddings = make_chunks_and_embeddings(text, chunk_size=chunk_size_slider.value)
    # Output files are named after the upload with its last extension removed.
    base = pdf_path.rsplit(".", 1)[0]
    # ensure_ascii=False writes non-ASCII characters (e.g. Japanese text)
    # verbatim, so pin the file encoding to UTF-8 instead of relying on the
    # platform default, which may be unable to encode them.
    with open(f"/content/{base}_chunks.json", "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False)
    np.save(f"/content/{base}_embeddings.npy", embeddings)
    np.savetxt(f"/content/{base}_embeddings.csv", embeddings, delimiter=",")
    print(f"Saved: {base}_chunks.json, {base}_embeddings.npy, {base}_embeddings.csv")


In [None]:
# --- New cell ---
# Export chunks.json, embeddings.npy, embeddings.csv, metadata.json together as a .zip archive
# for the PDF that the processing loop handled last. This cell relies on
# `base`, `chunks` and `embeddings` left behind by that loop, so it fails with
# a NameError if no uploaded PDF produced any text.

# `pdf_path` is the last *uploaded* file, which the loop may have skipped as
# image-only, whereas `base`/`chunks`/`embeddings` belong to the last file that
# was actually processed. Recover the matching upload name from `base` so the
# metadata describes the same PDF as the archived data; fall back to
# `pdf_path` if no upload matches.
source_pdf = next(
    (name for name in uploaded.keys() if name.rsplit(".", 1)[0] == base),
    pdf_path,
)

metadata = {
    "original_pdf": source_pdf,
    "chunk_size": chunk_size_slider.value,
    "model_used": model_dropdown.value,
    "timestamp": datetime.now().isoformat()
}
export_zip_archive(base, chunks, embeddings, metadata)