# In [None]:  (Jupyter notebook cell marker left over from export)
import os
import json
from pathlib import Path
import re # Import the regular expression module

# Configuration: input and output locations, given as relative paths
# (resolved against the current working directory at run time).
data_dir = Path("data/extracted")  # Directory scanned for extracted .txt files
chunk_output_dir = Path("chunks")  # Directory that receives all_chunks.json
# Import-time side effect: create the output directory together with any
# missing parents; exist_ok=True makes this a no-op if it already exists.
chunk_output_dir.mkdir(parents=True, exist_ok=True)

def split_into_chunks(text, chunk_size=1000, overlap=200):
    """
    Split text into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``. Chunking stops as
    soon as a window reaches the end of the text, so no trailing chunk is
    emitted that consists solely of characters already present in the
    previous chunk.

    This is simple character-based chunking for demonstration. For
    real-world RAG, consider more sophisticated methods (e.g.,
    sentence-based or semantic chunking).

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in order of appearance; empty if ``text``
        is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero or less would never advance through the text, which
    # previously made the loop run forever.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])  # slicing clamps `end` to the text length
        if end >= text_length:
            # This window already covers the tail; a further window would
            # contain only overlap that has been emitted above.
            break
    return chunks

def run_chunker():
    """
    Chunk every extracted text file and save the results as one JSON file.

    Each ``*.txt`` file directly inside ``data_dir`` is read as UTF-8 and
    split with ``split_into_chunks`` (using that function's defaults). One
    record is produced per chunk with the keys ``source_file``,
    ``chunk_id`` (``<file stem>_chunk_<index>``), ``text``, ``company`` and
    ``year``. Company ticker and fiscal year are parsed from filenames
    shaped like ``GOOGL_2022_10K.txt``; files that do not match are still
    chunked, with both fields set to ``"N/A"``.

    Files are processed in sorted path order so that the record order in
    the output is reproducible between runs. All records are written to
    ``all_chunks.json`` inside ``chunk_output_dir``, and progress is
    printed to stdout.

    Returns:
        None.
    """
    # Compiled once, outside the loop. Expected filename shape:
    # 'GOOGL_2022_10K.txt' -> ticker 'GOOGL', fiscal year '2022'.
    filename_pattern = re.compile(r"([A-Z]+)_(\d{4})_10K\.txt")

    all_chunks = []
    files_processed_count = 0

    # Create the directory here as well, so that the message below is
    # accurate even if the directory was removed after module import.
    chunk_output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Ensured output directory '{chunk_output_dir.resolve()}' exists.")
    print("\nStarting chunking process...")

    # glob() yields entries in arbitrary filesystem order; sorting makes
    # the output file deterministic.
    for txt_file in sorted(data_dir.glob("*.txt")):
        files_processed_count += 1
        with open(txt_file, "r", encoding="utf-8") as f:
            content = f.read()

        match = filename_pattern.match(txt_file.name)
        if match:
            company_ticker, fiscal_year = match.group(1), match.group(2)
        else:
            # Unrecognised filename: keep the chunks, mark metadata unknown.
            company_ticker = fiscal_year = "N/A"

        chunks = split_into_chunks(content)
        all_chunks.extend(
            {
                "source_file": txt_file.name,
                "chunk_id": f"{txt_file.stem}_chunk_{i}",
                "text": chunk,
                "company": company_ticker,
                "year": fiscal_year,
            }
            for i, chunk in enumerate(chunks)
        )
        print(f"  Processed {txt_file.name} -> Generated {len(chunks)} chunks.")

    # Save all chunks to a single JSON file
    with open(chunk_output_dir / "all_chunks.json", "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2)

    chunks_generated_count = len(all_chunks)
    print(f"\nFinished chunking process. Total files processed: {files_processed_count}, Total chunks generated: {chunks_generated_count}")
    print(f"Saved all chunks to {chunk_output_dir.resolve() / 'all_chunks.json'}")