In [None]:
# 必要なライブラリ
!pip install sentence-transformers PyPDF2

from sentence_transformers import SentenceTransformer
import numpy as np
import json

from google.colab import files
from IPython.display import display
import ipywidgets as widgets

# --- Model / chunk-size selection UI ---
# Each entry is (label shown in the dropdown, model identifier handed to
# SentenceTransformer below).
model_options = [
    ('英語 MiniLM', 'all-MiniLM-L6-v2'),
    ('日本語/多言語（LaBSE）', 'sentence-transformers/LaBSE'),
    ('日本語 S-BERT', 'sonoisa/sentence-bert-base-ja-mean-tokens-v2'),
]
# The first option (the English MiniLM model) is preselected.
model_dropdown = widgets.Dropdown(
    description='Model:',
    options=model_options,
    value=model_options[0][1],
)

# Chunk length passed to the chunking step, adjustable in steps of 50.
chunk_size_slider = widgets.IntSlider(
    description='Chunk size:',
    min=50,
    max=2048,
    step=50,
    value=200,
)

display(model_dropdown, chunk_size_slider)

# Upload the PDFs; the result is later iterated by file name.
uploaded = files.upload()

# Initialise the embedding model chosen in the dropdown.
# NOTE(review): the widget values are read during this same cell run, right
# after the widgets are displayed -- confirm the user can actually change the
# selection before it is read here.
model = SentenceTransformer(model_dropdown.value)

# PDF→テキスト抽出
import PyPDF2


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string made of each page's extracted text, in page order.
        Pages for which ``extract_text()`` yields a falsy value (``None`` or
        an empty string) contribute nothing.
    """
    # Keep the file open for the whole extraction and close it afterwards;
    # the original left the handle open.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)


# Split text into chunks
def chunk_text(text, chunk_size=200):
    """Cut *text* into consecutive pieces of at most *chunk_size* characters.

    Every piece except possibly the last has exactly *chunk_size* characters;
    an empty *text* yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces


def make_chunks_and_embeddings(text, chunk_size=200):
    """Chunk *text* and embed every chunk with the module-level ``model``.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A ``(chunks, embeddings)`` tuple: the list produced by ``chunk_text``
        and whatever ``model.encode`` returns for that list (presumably one
        embedding row per chunk -- confirm against the model's API).
    """
    pieces = chunk_text(text, chunk_size)
    return pieces, model.encode(pieces)


# 新規関数: chunks.json, embeddings.npy, metadata.json をまとめて .llmrag ファイルとして保存する
import os
import io
import zipfile
from datetime import datetime

def export_llmrag(base, chunks, embeddings, metadata, output_dir="/content"):
    """Bundle chunks, embeddings and metadata into one ``.llmrag`` zip archive.

    The archive is written to ``{output_dir}/{base}.llmrag`` and contains
    three members:

    * ``chunks.json``    -- *chunks* as UTF-8 JSON (non-ASCII kept verbatim)
    * ``embeddings.npy`` -- *embeddings* in NumPy ``.npy`` format
    * ``metadata.json``  -- *metadata* as UTF-8 JSON (non-ASCII kept verbatim)

    Args:
        base: Archive file name without the ``.llmrag`` extension.
        chunks: List of text chunks.
        embeddings: NumPy array of embeddings.
        metadata: Dict of JSON-serialisable metadata.
        output_dir: Directory the archive is written to. Defaults to
            ``"/content"``, the location that used to be hard-coded.

    Returns:
        None. The path of the written archive is printed.
    """
    # Serialise every member in memory first so nothing but the final
    # archive is written to disk.
    chunks_json = json.dumps(chunks, ensure_ascii=False).encode('utf-8')
    metadata_json = json.dumps(metadata, ensure_ascii=False).encode('utf-8')
    embeddings_buffer = io.BytesIO()
    np.save(embeddings_buffer, embeddings)
    embeddings_bytes = embeddings_buffer.getvalue()

    llmrag_path = f"{output_dir}/{base}.llmrag"
    with zipfile.ZipFile(llmrag_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("chunks.json", chunks_json)
        zf.writestr("embeddings.npy", embeddings_bytes)
        zf.writestr("metadata.json", metadata_json)
    print(f"Exported LLMRAG archive: {llmrag_path}")


# Process every uploaded PDF: extract its text, chunk and embed it, then save
# the chunks (JSON) and embeddings (.npy and .csv) under /content.
for pdf_path in uploaded:
    print(f"Processing {pdf_path} ...")
    text = extract_text_from_pdf(pdf_path)
    chunks, embeddings = make_chunks_and_embeddings(text, chunk_size=chunk_size_slider.value)
    # Drop only the final extension so the outputs are named after the PDF.
    base = pdf_path.rsplit(".", 1)[0]
    # The chunks are dumped with ensure_ascii=False, so the file contains raw
    # non-ASCII text; pin UTF-8 instead of relying on the locale default.
    with open(f"/content/{base}_chunks.json", "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False)
    np.save(f"/content/{base}_embeddings.npy", embeddings)
    np.savetxt(f"/content/{base}_embeddings.csv", embeddings, delimiter=",")
    print(f"Saved: {base}_chunks.json, {base}_embeddings.npy, {base}_embeddings.csv")


In [None]:
# --- Export cell ---
# Bundle chunks.json, embeddings.npy and metadata.json into one .llmrag
# archive per uploaded PDF.
#
# The processing loop keeps only the most recent PDF's chunks and embeddings
# in memory, so relying on its leftover variables would export just the last
# PDF (and fail with NameError when nothing was uploaded). Instead, reload
# each PDF's artifacts from the files that loop saved under /content.
for pdf_path in uploaded:
    base = pdf_path.rsplit(".", 1)[0]
    with open(f"/content/{base}_chunks.json", encoding="utf-8") as f:
        chunks = json.load(f)
    embeddings = np.load(f"/content/{base}_embeddings.npy")

    metadata = {
        "original_pdf": pdf_path,
        "chunk_size": chunk_size_slider.value,
        "model_used": model_dropdown.value,
        "timestamp": datetime.now().isoformat()
    }
    export_llmrag(base, chunks, embeddings, metadata)