In [None]:
import os
import json
import math

# Chunking configuration. Relative paths are resolved against the current
# working directory of the process running this script.
CHUNK_SIZE = 80 # words per chunk (passed to chunk_text); we can change this later
# Directory scanned for input ".txt" files.
INPUT_DIR = "../data"
# Directory that receives one "<speech_id>_chunks.json" file per input file.
OUTPUT_DIR = "../chunked_data"
# Create the output directory up front; exist_ok=True makes re-runs harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def chunk_text(text: str, chunk_size: int) -> list:
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (including newlines) acts as a single separator and is normalised to one
    space inside each chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. Every chunk except possibly
        the last holds exactly ``chunk_size`` words. Empty or whitespace-only
        text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        # A zero step makes range() raise an opaque error, and a negative step
        # makes it yield nothing, which would silently discard the whole text.
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Chunk every ".txt" file in INPUT_DIR and write one JSON file per speech to
# OUTPUT_DIR. os.listdir returns entries in arbitrary order, so sort them to
# make the processing (and log) order deterministic across runs.
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(".txt"):
        continue

    # Strip only the trailing extension. str.replace would remove every
    # ".txt" occurrence (e.g. "a.txt.v2.txt" -> "a.v2"), mangling the id and
    # potentially making two different inputs collide on one output file.
    speech_id = filename[:-len(".txt")]
    with open(os.path.join(INPUT_DIR, filename), "r", encoding="utf-8") as f:
        full_text = f.read()

    chunks = chunk_text(full_text, CHUNK_SIZE)
    total_chunks = len(chunks)

    chunk_data = []
    for i, chunk in enumerate(chunks):
        entry = {
            "speech_id": speech_id,
            "chunk_id": i,
            "text": chunk,
            # Relative position of the chunk within the speech: 0.0 for the
            # first chunk, 1.0 for the last. A single-chunk speech gets 0.0
            # to avoid dividing by zero.
            "position": round(i / (total_chunks - 1), 4) if total_chunks > 1 else 0.0,
        }
        chunk_data.append(entry)

    with open(os.path.join(OUTPUT_DIR, f"{speech_id}_chunks.json"), "w", encoding="utf-8") as f:
        json.dump(chunk_data, f, indent=2)

    print(f"Saved {total_chunks} chunks for {speech_id}")