<a href="https://colab.research.google.com/github/ruizTechServices/google_Colabs/blob/main/fileEmbedder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How do I create a Small Language Model that I can host anywhere at all in any digital device?

In [1]:
!python --version

Python 3.12.11


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# @title Project Checklist (click to expand)
# @markdown Check items as you complete them. State is saved to /content/todo_data.json
from IPython.display import display
import ipywidgets as w, json, os, datetime

STORE = "/content/todo_data.json"
DEFAULTS = [
  "Create/Open this Colab notebook",
  "Install Python deps",
  "Set API keys (OPENAI_API_KEY or GOOGLE_API_KEY; optional OLLAMA via ngrok)",
  "Decide providers: LLM for context (OpenAI Mini or Ollama) & Embeddings (OpenAI or Google)",
  "Upload sample files",
  "Normalize files to text",
  "Chunk text by tokens",
  "Run optional context analysis",
  "Generate embeddings",
  "Save JSONL/Parquet dataset",
  "Download artifacts (or mount Drive)",
  "Document next steps (vector DB, fine-tuning, RAG)"
]

if os.path.exists(STORE):
  state = json.load(open(STORE))
  items = state.get("items", DEFAULTS)
  checks = state.get("checks", [False]*len(items))
else:
  items = DEFAULTS[:]
  checks = [False]*len(items)

boxes = []
for i, label in enumerate(items):
  cb = w.Checkbox(value=checks[i], description=label)
  boxes.append(cb)

def save_state(_=None):
  payload = {
    "items": [b.description for b in boxes],
    "checks": [b.value for b in boxes],
    "saved_at": datetime.datetime.utcnow().isoformat() + "Z"
  }
  json.dump(payload, open(STORE, "w"))
  status.value = f"Saved at {payload['saved_at']}"

btn = w.Button(description="Save progress", button_style="success")
btn.on_click(save_state)
status = w.HTML(value="")

accordion = w.Accordion(children=[w.VBox(boxes+[btn, status])])
accordion.set_title(0, "✅ Data Pipeline Checklist")
display(accordion)


Accordion(children=(VBox(children=(Checkbox(value=False, description='Create/Open this Colab notebook'), Check…

  "saved_at": datetime.datetime.utcnow().isoformat() + "Z"


In [5]:
!pip -q install pypdf python-docx html2text beautifulsoup4 tiktoken pandas pyarrow openai google-generativeai tqdm


In [6]:
import os

# ---- Choose providers ----
LLM_FOR_CONTEXT = "openai"   # options: "openai", "ollama", "none"
EMBED_PROVIDER  = "openai"   # options: "openai", "google"

# ---- Keys (assume they already exist; just set in the environment in Colab UI) ----
# os.environ["OPENAI_API_KEY"] = "..."       # set in Colab > three-dots > Variables (recommended)
# os.environ["GOOGLE_API_KEY"] = "..."

# ---- Ollama via ngrok (optional). Example: "https://user:pass-yourname.ngrok-free.app"
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "").strip()

# ---- Chunking knobs ----
CHUNK_TOKENS   = 800
CHUNK_OVERLAP  = 200

# ---- Embedding model names (safe defaults) ----
OPENAI_EMBED_MODEL  = "text-embedding-3-small"  # solid accuracy, good price
GOOGLE_EMBED_MODEL  = "text-embedding-004"      # current public model name; adjust if Google updates

# ---- LLM for analysis (fast & cheap default) ----
OPENAI_MINI_MODEL = "gpt-4o-mini"               # swap if you prefer o3-mini or similar
OLLAMA_MODEL      = "llama3.1"                  # choose your local model tag

print("Configured:", {"LLM_FOR_CONTEXT": LLM_FOR_CONTEXT, "EMBED_PROVIDER": EMBED_PROVIDER})


Configured: {'LLM_FOR_CONTEXT': 'openai', 'EMBED_PROVIDER': 'openai'}


In [7]:
from google.colab import files
import os, shutil, pathlib

UPLOAD_DIR = "/content/uploads"
pathlib.Path(UPLOAD_DIR).mkdir(parents=True, exist_ok=True)

print("Select one or more files…")
up = files.upload()
for name, data in up.items():
    with open(os.path.join(UPLOAD_DIR, name), "wb") as f:
        f.write(data)
print("Saved to", UPLOAD_DIR, "->", os.listdir(UPLOAD_DIR))


Select one or more files…


Saving 1-bitLLMs.pdf to 1-bitLLMs.pdf
Saved to /content/uploads -> ['1-bitLLMs.pdf']


In [8]:
import pathlib, re
from pypdf import PdfReader
import docx
from bs4 import BeautifulSoup
import html2text

def read_pdf(path: str) -> str:
    out = []
    try:
        pdf = PdfReader(path)
        for p in pdf.pages:
            out.append(p.extract_text() or "")
    except Exception as e:
        print(f"[PDF ERROR] {path} -> {e}")
    return "\n".join(out)

def read_docx(path: str) -> str:
    try:
        d = docx.Document(path)
        return "\n".join([p.text for p in d.paragraphs])
    except Exception as e:
        print(f"[DOCX ERROR] {path} -> {e}")
        return ""

def read_html(path: str) -> str:
    try:
        html = pathlib.Path(path).read_text(errors="ignore")
        soup = BeautifulSoup(html, "html.parser")
        # keep visible text; fallback to markdown
        md = html2text.html2text(str(soup))
        return md
    except Exception as e:
        print(f"[HTML ERROR] {path} -> {e}")
        return ""

def read_textlike(path: str) -> str:
    try:
        return pathlib.Path(path).read_text(errors="ignore")
    except Exception as e:
        print(f"[TEXT ERROR] {path} -> {e}")
        return ""

def to_text(path: str) -> str:
    ext = pathlib.Path(path).suffix.lower()
    if ext == ".pdf":   return read_pdf(path)
    if ext == ".docx":  return read_docx(path)
    if ext in (".html", ".htm"): return read_html(path)
    # csv, md, txt and others -> read as text
    return read_textlike(path)

docs = []
for p in sorted(pathlib.Path(UPLOAD_DIR).glob("**/*")):
    if p.is_file():
        txt = to_text(str(p))
        txt = re.sub(r"[ \t]+", " ", txt)
        txt = re.sub(r"\n{3,}", "\n\n", txt).strip()
        if len(txt) > 0:
            docs.append({"doc_id": p.name, "path": str(p), "text": txt})
len(docs), [d["doc_id"] for d in docs][:5]


(1, ['1-bitLLMs.pdf'])

In [9]:
import tiktoken
from typing import List, Dict

enc = tiktoken.get_encoding("cl100k_base")  # robust general encoder

def split_by_tokens(text: str, target: int, overlap: int) -> List[str]:
    tokens = enc.encode(text)
    chunks = []
    i = 0
    while i < len(tokens):
        window = tokens[i : i + target]
        chunks.append(enc.decode(window))
        i += max(1, target - overlap)
    return chunks

records = []
for d in docs:
    chs = split_by_tokens(d["text"], CHUNK_TOKENS, CHUNK_OVERLAP)
    for idx, ch in enumerate(chs):
        records.append({
            "id": f"{d['doc_id']}::{idx}",
            "doc_id": d["doc_id"],
            "chunk_index": idx,
            "text": ch,
            "meta": {"source": d["path"]}
        })

len(records)


13

In [14]:
import json, requests, math, time
from tqdm import tqdm
from google.colab import userdata


def analyze_with_openai(texts, model=OPENAI_MINI_MODEL):
    import openai
    client = openai.OpenAI(api_key=userdata.get("OPENAI_API_KEY"))
    outs = []
    for t in texts:
        prompt = (
          "Summarize this chunk in 1-2 sentences. "
          "Also return 3-6 keywords and a short section title.\n\nCHUNK:\n" + t
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role":"system","content":"Be concise and accurate."},
                      {"role":"user","content":prompt}],
            temperature=0.2,
            max_tokens=200,
        )
        outs.append(resp.choices[0].message.content)
    return outs

def analyze_with_ollama(texts, model=OLLAMA_MODEL):
    # Requires: OLLAMA_BASE_URL to be reachable (e.g., your ngrok tunnel)
    outs = []
    for t in texts:
        payload = {
          "model": model,
          "messages": [
            {"role":"system","content":"Be concise and accurate."},
            {"role":"user","content":
              "Summarize this chunk in 1-2 sentences. "
              "Also return 3-6 keywords and a short section title.\n\nCHUNK:\n"+t}
          ],
          "stream": False
        }
        r = requests.post(f"{OLLAMA_BASE_URL}/api/chat", json=payload, timeout=120)
        r.raise_for_status()
        outs.append(r.json().get("message", {}).get("content", ""))
    return outs

def parse_analysis(s: str) -> dict:
    # Accept free-form; try to split; keep it robust
    if not s: return {"summary":"", "keywords":[], "section":""}
    lines = [x.strip() for x in s.splitlines() if x.strip()]
    summary = lines[0] if lines else ""
    # naive keyword/section extraction
    kws = []
    section = ""
    for ln in lines[1:]:
        low = ln.lower()
        if "keyword" in low:
            kws = [k.strip(" ,;") for k in ln.split(":")[-1].split(",")]
        if "title" in low or "section" in low:
            section = ln.split(":")[-1].strip()
    return {"summary": summary, "keywords": kws, "section": section}

if LLM_FOR_CONTEXT in ("openai","ollama"):
    BATCH = 16
    for i in tqdm(range(0, len(records), BATCH)):
        batch = records[i:i+BATCH]
        texts = [r["text"] for r in batch]
        if LLM_FOR_CONTEXT == "openai":
            outs = analyze_with_openai(texts)
        else:
            assert OLLAMA_BASE_URL, "Set OLLAMA_BASE_URL to your ngrok tunnel (e.g., https://user:pass-<name>.ngrok-free.app)"
            outs = analyze_with_ollama(texts)
        for r, out in zip(batch, outs):
            r.update(parse_analysis(out))
else:
    print("Skipping context analysis.")

100%|██████████| 1/1 [00:15<00:00, 15.92s/it]


In [16]:
import numpy as np
from tqdm import tqdm
from google.colab import userdata

def embed_openai(texts):
    import openai
    client = openai.OpenAI(api_key=userdata.get("OPENAI_API_KEY"))
    # Batch for efficiency
    vecs = []
    B = 128
    for i in tqdm(range(0, len(texts), B)):
        sub = texts[i:i+B]
        resp = client.embeddings.create(model=OPENAI_EMBED_MODEL, input=sub)
        vecs.extend([np.array(d.embedding, dtype=np.float32) for d in resp.data])
    return vecs

def embed_google(texts):
    import google.generativeai as genai
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    model = genai.embedder.GenerativeModelEmbedding(  # fallback if class evolves; API sometimes provides simple `genai.embed_content`
        model_name=GOOGLE_EMBED_MODEL
    ) if hasattr(genai, "embedder") else None

    vecs = []
    for t in tqdm(texts):
        # API surface changes over time; this path works with most releases:
        if hasattr(genai, "embed_content"):
            out = genai.embed_content(model=GOOGLE_EMBED_MODEL, content=t)
            vecs.append(np.array(out["embedding"], dtype=np.float32))
        else:
            # future-proof fallback if embedder object exists
            out = genai.embedder.embed(model=GOOGLE_EMBED_MODEL, content=t)
            vecs.append(np.array(out["embedding"], dtype=np.float32))
    return vecs

texts = [r["text"] for r in records]
if EMBED_PROVIDER == "openai":
    vectors = embed_openai(texts)
else:
    vectors = embed_google(texts)

for r, v in zip(records, vectors):
    r["embedding"] = v.tolist()

len(records), len(records[0]["embedding"])

100%|██████████| 1/1 [00:00<00:00,  1.05it/s]


(13, 1536)

In [17]:
import json, pandas as pd, numpy as np
from google.colab import files

# JSONL
jsonl_path = "/content/dataset.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# Parquet (store embeddings as list or fixed-length numpy array)
df = pd.DataFrame(records)
parquet_path = "/content/dataset.parquet"
df.to_parquet(parquet_path, engine="pyarrow", index=False)

print("Wrote:", jsonl_path, parquet_path)
files.download(jsonl_path)
files.download(parquet_path)


Wrote: /content/dataset.jsonl /content/dataset.parquet


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The previous is a comprehensive File Embedder. It turns

# Task --------From GOOGLE
Load Google Drive in the selected empty cell.

## Install python deps

### Subtask:
Install the necessary Python dependencies for the data processing pipeline.


**Reasoning**:
The subtask is to install the necessary Python dependencies. The provided code cell `gHNrLqcN7rHk` does exactly this by running `pip install` for a list of required packages.



In [18]:
!pip -q install pypdf python-docx html2text beautifulsoup4 tiktoken pandas pyarrow openai google-generativeai tqdm


## Set providers

### Subtask:
Configure the LLM and embedding providers and set API keys for the data processing pipeline.


**Reasoning**:
The code cell `h3X4o7a78AdF` already contains the code to configure the LLM and embedding providers and print the configuration. I will execute this cell as it directly addresses step 1, 2, and 4 of the instructions. Step 3 is a manual step for the user outside of the code execution.



In [19]:
import os

# ---- Choose providers ----
LLM_FOR_CONTEXT = "openai"   # options: "openai", "ollama", "none"
EMBED_PROVIDER  = "openai"   # options: "openai", "google"

# ---- Keys (assume they already exist; just set in the environment in Colab UI) ----
# os.environ["OPENAI_API_KEY"] = "..."       # set in Colab > three-dots > Variables (recommended)
# os.environ["GOOGLE_API_KEY"] = "..."

# ---- Ollama via ngrok (optional). Example: "https://user:pass-yourname.ngrok-free.app"
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "").strip()

# ---- Chunking knobs ----
CHUNK_TOKENS   = 800
CHUNK_OVERLAP  = 200

# ---- Embedding model names (safe defaults) ----
OPENAI_EMBED_MODEL  = "text-embedding-3-small"  # solid accuracy, good price
GOOGLE_EMBED_MODEL  = "text-embedding-004"      # current public model name; adjust if Google updates

# ---- LLM for analysis (fast & cheap default) ----
OPENAI_MINI_MODEL = "gpt-4o-mini"               # swap if you prefer o3-mini or similar
OLLAMA_MODEL      = "llama3.1"                  # choose your local model tag

print("Configured:", {"LLM_FOR_CONTEXT": LLM_FOR_CONTEXT, "EMBED_PROVIDER": EMBED_PROVIDER})

Configured: {'LLM_FOR_CONTEXT': 'openai', 'EMBED_PROVIDER': 'openai'}


## Upload files

### Subtask:
Upload files to the designated upload directory in the Colab environment.


**Reasoning**:
Execute the code to upload files to the specified directory.



In [20]:
from google.colab import files
import os, shutil, pathlib

UPLOAD_DIR = "/content/uploads"
pathlib.Path(UPLOAD_DIR).mkdir(parents=True, exist_ok=True)

print("Select one or more files…")
up = files.upload()
for name, data in up.items():
    with open(os.path.join(UPLOAD_DIR, name), "wb") as f:
        f.write(data)
print("Saved to", UPLOAD_DIR, "->", os.listdir(UPLOAD_DIR))

Select one or more files…


Saving Attentionisallyouneed.pdf to Attentionisallyouneed.pdf
Saved to /content/uploads -> ['1-bitLLMs.pdf', 'Attentionisallyouneed.pdf']


## Normalize files to text

### Subtask:
Normalize the uploaded files to plain text format.

**Reasoning**:
The code cell `hVVvmcPn8Yku` contains the necessary functions and logic to convert different file types (PDF, DOCX, HTML, text-like) into plain text and store them in the `docs` variable. I will execute this cell to perform the normalization.

In [21]:
import pathlib, re
from pypdf import PdfReader
import docx
from bs4 import BeautifulSoup
import html2text

def read_pdf(path: str) -> str:
    out = []
    try:
        pdf = PdfReader(path)
        for p in pdf.pages:
            out.append(p.extract_text() or "")
    except Exception as e:
        print(f"[PDF ERROR] {path} -> {e}")
    return "\n".join(out)

def read_docx(path: str) -> str:
    try:
        d = docx.Document(path)
        return "\n".join([p.text for p in d.paragraphs])
    except Exception as e:
        print(f"[DOCX ERROR] {path} -> {e}")
        return ""

def read_html(path: str) -> str:
    try:
        html = pathlib.Path(path).read_text(errors="ignore")
        soup = BeautifulSoup(html, "html.parser")
        # keep visible text; fallback to markdown
        md = html2text.html2text(str(soup))
        return md
    except Exception as e:
        print(f"[HTML ERROR] {path} -> {e}")
        return ""

def read_textlike(path: str) -> str:
    try:
        return pathlib.Path(path).read_text(errors="ignore")
    except Exception as e:
        print(f"[TEXT ERROR] {path} -> {e}")
        return ""

def to_text(path: str) -> str:
    ext = pathlib.Path(path).suffix.lower()
    if ext == ".pdf":   return read_pdf(path)
    if ext == ".docx":  return read_docx(path)
    if ext in (".html", ".htm"): return read_html(path)
    # csv, md, txt and others -> read as text
    return read_textlike(path)

docs = []
for p in sorted(pathlib.Path(UPLOAD_DIR).glob("**/*")):
    if p.is_file():
        txt = to_text(str(p))
        txt = re.sub(r"[ \t]+", " ", txt)
        txt = re.sub(r"\n{3,}", "\n\n", txt).strip()
        if len(txt) > 0:
            docs.append({"doc_id": p.name, "path": str(p), "text": txt})
len(docs), [d["doc_id"] for d in docs][:5]

(2, ['1-bitLLMs.pdf', 'Attentionisallyouneed.pdf'])

## Chunk text by tokens

### Subtask:
Chunk the normalized text into smaller pieces based on token limits and overlap.

**Reasoning**:
The code cell `moaQ2yRI8_aT` defines the `split_by_tokens` function and applies it to the normalized text in the `docs` variable to create tokenized chunks stored in the `records` variable. I will execute this cell to perform the chunking.

In [22]:
import tiktoken
from typing import List, Dict

enc = tiktoken.get_encoding("cl100k_base")  # robust general encoder

def split_by_tokens(text: str, target: int, overlap: int) -> List[str]:
    tokens = enc.encode(text)
    chunks = []
    i = 0
    while i < len(tokens):
        window = tokens[i : i + target]
        chunks.append(enc.decode(window))
        i += max(1, target - overlap)
    return chunks

records = []
for d in docs:
    chs = split_by_tokens(d["text"], CHUNK_TOKENS, CHUNK_OVERLAP)
    for idx, ch in enumerate(chs):
        records.append({
            "id": f"{d['doc_id']}::{idx}",
            "doc_id": d["doc_id"],
            "chunk_index": idx,
            "text": ch,
            "meta": {"source": d["path"]}
        })

len(records)

30

## Run optional context analysis

### Subtask:
Perform optional context analysis on the text chunks using the selected LLM.

**Reasoning**:
The code cell `U9oeEBtP9Cid` contains the logic to perform context analysis on the generated chunks using either OpenAI or Ollama, based on the `LLM_FOR_CONTEXT` variable. I will execute this cell to perform the analysis.

In [23]:
import json, requests, math, time
from tqdm import tqdm
from google.colab import userdata


def analyze_with_openai(texts, model=OPENAI_MINI_MODEL):
    import openai
    client = openai.OpenAI(api_key=userdata.get("OPENAI_API_KEY"))
    outs = []
    for t in texts:
        prompt = (
          "Summarize this chunk in 1-2 sentences. "
          "Also return 3-6 keywords and a short section title.\n\nCHUNK:\n" + t
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role":"system","content":"Be concise and accurate."},
                      {"role":"user","content":prompt}],
            temperature=0.2,
            max_tokens=200,
        )
        outs.append(resp.choices[0].message.content)
    return outs

def analyze_with_ollama(texts, model=OLLAMA_MODEL):
    # Requires: OLLAMA_BASE_URL to be reachable (e.g., your ngrok tunnel)
    outs = []
    for t in texts:
        payload = {
          "model": model,
          "messages": [
            {"role":"system","content":"Be concise and accurate."},
            {"role":"user","content":
              "Summarize this chunk in 1-2 sentences. "
              "Also return 3-6 keywords and a short section title.\n\nCHUNK:\n"+t}
          ],
          "stream": False
        }
        r = requests.post(f"{OLLAMA_BASE_URL}/api/chat", json=payload, timeout=120)
        r.raise_for_status()
        outs.append(r.json().get("message", {}).get("content", ""))
    return outs

def parse_analysis(s: str) -> dict:
    # Accept free-form; try to split; keep it robust
    if not s: return {"summary":"", "keywords":[], "section":""}
    lines = [x.strip() for x in s.splitlines() if x.strip()]
    summary = lines[0] if lines else ""
    # naive keyword/section extraction
    kws = []
    section = ""
    for ln in lines[1:]:
        low = ln.lower()
        if "keyword" in low:
            kws = [k.strip(" ,;") for k in ln.split(":")[-1].split(",")]
        if "title" in low or "section" in low:
            section = ln.split(":")[-1].strip()
    return {"summary": summary, "keywords": kws, "section": section}

if LLM_FOR_CONTEXT in ("openai","ollama"):
    BATCH = 16
    for i in tqdm(range(0, len(records), BATCH)):
        batch = records[i:i+BATCH]
        texts = [r["text"] for r in batch]
        if LLM_FOR_CONTEXT == "openai":
            outs = analyze_with_openai(texts)
        else:
            assert OLLAMA_BASE_URL, "Set OLLAMA_BASE_URL to your ngrok tunnel (e.g., https://user:pass-<name>.ngrok-free.app)"
            outs = analyze_with_ollama(texts)
        for r, out in zip(batch, outs):
            r.update(parse_analysis(out))
else:
    print("Skipping context analysis.")

100%|██████████| 2/2 [00:25<00:00, 12.72s/it]


## Generate embeddings

### Subtask:
Generate embeddings for the text chunks using the selected embedding provider.

**Reasoning**:
The code cell `KYqHUIoz9uMK` contains the logic to generate embeddings for the text chunks using either OpenAI or Google embeddings, based on the `EMBED_PROVIDER` variable. I will execute this cell to perform the embedding generation.

In [24]:
import numpy as np
from tqdm import tqdm
from google.colab import userdata

def embed_openai(texts):
    import openai
    client = openai.OpenAI(api_key=userdata.get("OPENAI_API_KEY"))
    # Batch for efficiency
    vecs = []
    B = 128
    for i in tqdm(range(0, len(texts), B)):
        sub = texts[i:i+B]
        resp = client.embeddings.create(model=OPENAI_EMBED_MODEL, input=sub)
        vecs.extend([np.array(d.embedding, dtype=np.float32) for d in resp.data])
    return vecs

def embed_google(texts):
    import google.generativeai as genai
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    model = genai.embedder.GenerativeModelEmbedding(  # fallback if class evolves; API sometimes provides simple `genai.embed_content`
        model_name=GOOGLE_EMBED_MODEL
    ) if hasattr(genai, "embedder") else None

    vecs = []
    for t in tqdm(texts):
        # API surface changes over time; this path works with most releases:
        if hasattr(genai, "embed_content"):
            out = genai.embed_content(model=GOOGLE_EMBED_MODEL, content=t)
            vecs.append(np.array(out["embedding"], dtype=np.float32))
        else:
            # future-proof fallback if embedder object exists
            out = genai.embedder.embed(model=GOOGLE_EMBED_MODEL, content=t)
            vecs.append(np.array(out["embedding"], dtype=np.float32))
    return vecs

texts = [r["text"] for r in records]
if EMBED_PROVIDER == "openai":
    vectors = embed_openai(texts)
else:
    vectors = embed_google(texts)

for r, v in zip(records, vectors):
    r["embedding"] = v.tolist()

len(records), len(records[0]["embedding"])

100%|██████████| 1/1 [00:00<00:00,  1.66it/s]


(30, 1536)

## Save JSONL/Parquet dataset

### Subtask:
Save the processed data, including embeddings, to JSONL and Parquet files and provide them for download.

**Reasoning**:
The code cell `0rxKSUTx912d` saves the processed data, including the generated embeddings, to JSONL and Parquet files and then provides download links for these files. This directly addresses the subtask. I will execute this cell.

In [25]:
import json, pandas as pd, numpy as np
from google.colab import files

# JSONL
jsonl_path = "/content/dataset.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# Parquet (store embeddings as list or fixed-length numpy array)
df = pd.DataFrame(records)
parquet_path = "/content/dataset.parquet"
df.to_parquet(parquet_path, engine="pyarrow", index=False)

print("Wrote:", jsonl_path, parquet_path)
files.download(jsonl_path)
files.download(parquet_path)

Wrote: /content/dataset.jsonl /content/dataset.parquet


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>