In [26]:
import json
import os
from pathlib import Path
from typing import List, Dict
import re
from datetime import datetime
import tiktoken

In [27]:
CHUNK_SIZE = 300
OVERLAP_SIZE = 150
tokenizer = tiktoken.get_encoding("cl100k_base")

In [28]:
def clean_text(text: str) -> str:
    text = text.replace('\n', ' ').replace('\r', ' ')
    return re.sub(r'\s+', ' ', text).strip()

def split_into_token_chunks(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
    tokens = tokenizer.encode(text)
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return [tokenizer.decode(chunk) for chunk in chunks]

def split_into_token_chunks(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = OVERLAP_SIZE) -> List[str]:
    tokens = tokenizer.encode(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_tokens = tokens[i:i + chunk_size]
        chunks.append(tokenizer.decode(chunk_tokens))
    return chunks

def parse_pdf_date(pdf_date: str) -> str:
    match = re.search(r"D:(\d{4})(\d{2})(\d{2})", pdf_date)
    if match:
        return f"{match.group(1)}-{match.group(2)}-{match.group(3)}"
    return ""


In [29]:
def chunk_blog_post(doc: Dict, doc_id: str) -> List[Dict]:
    text = clean_text(doc.get("content", ""))
    title = doc.get("title", f"blog_{doc_id}")
    date = doc.get("date", "")
    chunks = split_into_token_chunks(text)

    return [{
        "text": chunk,
        "source": "blog_posts",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": i,
        "date": date
    } for i, chunk in enumerate(chunks)]


def chunk_collateral(doc: Dict, doc_id: str) -> List[Dict]:
    title = doc.get("file_name",f"collateral_{doc_id}")
    date = parse_pdf_date(doc.get("metadata",{}).get("CreationDate",""))
    text_blocks = [item.get("text","")for item in doc.get("text_data",[])]
    full_text = clean_text(" ".join(text_blocks))
    chunks = split_into_token_chunks(full_text)

    return [{
        "text": chunk,
        "source": "collateral",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": i,
        "date": date
    } for i,chunk in enumerate(chunks)]


def chunk_grant_proposal(doc: Dict, doc_id: str) -> List[Dict]:
    title = doc.get("file_name",f"grant_{doc_id}")
    date = parse_pdf_date(doc.get("metadata",{}).get("CreationDate",""))
    text_blocks = [item.get("text","")for item in doc.get("text_data",[])]
    full_text = clean_text(" ".join(text_blocks))
    chunks = split_into_token_chunks(full_text)

    return [{
        "text": chunk,
        "source": "collateral",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": i,
        "date": date
    } for i,chunk in enumerate(chunks)]


def chunk_powerpoint(doc: Dict, doc_id: str) -> List[Dict]:
    title = doc.get("file_name", f"ppt_{doc_id}")
    date = doc.get("metadata", {}).get("Created", "")
    text_blocks = [item.get("text", "") for item in doc.get("text_data", [])]
    full_text = clean_text(" ".join(text_blocks))
    chunks = split_into_token_chunks(full_text)

    return [{
        "text": chunk,
        "source": "powerpoints",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": i,
        "date": date
    } for i, chunk in enumerate(chunks)]


def chunk_video_captions(txt_path: str, title: str, doc_id: str, group_size: int = 5) -> List[Dict]:
    with open(txt_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    filtered = []
    for line in lines:
        if line.startswith(('WEBVTT', 'Kind:', 'Language:', '<', '00:')):
            continue
        # Remove embedded timestamp tags like <00:01:02.640><c>
        clean = re.sub(r'<.*?>', '', line)
        filtered.append(clean)

    # Deduplicate while preserving order
    seen = set()
    deduped = []
    for line in filtered:
        if line not in seen:
            deduped.append(line)
            seen.add(line)

    grouped_chunks = [" ".join(deduped[i:i + group_size]) for i in range(0, len(deduped), group_size)]

    print(f"[transcript] {doc_id}: {len(grouped_chunks)} grouped caption chunks from '{title}'")

    return [{
        "text": clean_text(chunk),
        "source": "transcript",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": i,
        "date": ""
    } for i, chunk in enumerate(grouped_chunks)]


In [30]:
def process_jsonl_file(file_path: str, chunk_fn, source: str, output_path: str):
    with open(file_path, 'r', encoding='utf-8') as f:
        docs = [json.loads(line.strip())for line in f if line.strip()]

    all_chunks=[]
    for i, doc in enumerate(docs):
        doc_id=f"{source}_{i:03d}"
        all_chunks.extend(chunk_fn(doc,doc_id))

    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk) + '\n')
        print(f"Saved {len(all_chunks)} chunks to {output_path}")


def process_folder(folder_path: str, chunk_fn, source: str, output_path: str):
    all_chunks = []
    for i, file in enumerate(Path(folder_path).glob("*.json")):
        with open(file, 'r', encoding='utf-8') as f:
            doc = json.load(f)
            doc_id = file.stem
            all_chunks.extend(chunk_fn(doc, doc_id))

    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk) + '\n')
    print(f"Saved {len(all_chunks)} chunks to {output_path}")


def chunk_all_caption_files(folder_path: str, output_path: str, group_size: int = 5):
    all_chunks = []
    folder = Path(folder_path)
    for i, file in enumerate(folder.glob("*.txt")):
        doc_id = file.stem
        title = file.stem.replace("_", " ")
        chunks = chunk_video_captions(str(file), title, doc_id, group_size)
        all_chunks.extend(chunks)

    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk) + '\n')
    print(f"Saved {len(all_chunks)} chunks to {output_path}")




In [31]:
def chunk_collateral_images(doc: Dict, doc_id: str) -> List[Dict]:
    image_name = doc.get("image_name", "")
    source_pdf = doc.get("source_pdf", "")
    page_number = doc.get("page_number", None)
    image_type = doc.get("type", "")
    title = Path(source_pdf).stem.replace("_", " ")

    text = f"{image_type.title()} from page {page_number} of {title}" if page_number else image_type.title()

    return [{
        "text": text,
        "source": "collateral_image",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": 0,
        "date": "",
        "metadata": {
            "image_name": image_name,
            "source_pdf": source_pdf,
            "page_number": page_number,
            "type": image_type
        }
    }]

def chunk_powerpoint_images(doc: Dict, doc_id: str) -> List[Dict]:
    image_name = doc.get("image_name", "")
    slide_number = doc.get("slide_number", None)
    original_ppt = doc.get("original_ppt", "")
    title = Path(original_ppt).stem.replace("_", " ")

    text = f"Slide {slide_number} image from {title}" if slide_number else "PowerPoint Image"

    return [{
        "text": text,
        "source": "powerpoint_image",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": 0,
        "date": "",
        "metadata": {
            "image_name": image_name,
            "original_ppt": original_ppt,
            "slide_number": slide_number
        }
    }]


In [32]:
os.makedirs('/Users/sharvari/Downloads/CAFB_Challenge/outputs', exist_ok=True)

process_jsonl_file(
    '/Users/sharvari/Downloads/CAFB_Challenge/data/blog_posts.jsonl',
    chunk_blog_post,
    'blog_posts',
    '/Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_blog.jsonl'
)

process_jsonl_file(
    '/Users/sharvari/Downloads/CAFB_Challenge/data/collateral.jsonl',
    chunk_collateral,
    'collateral',
    '/Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_collateral.jsonl'
)

process_jsonl_file(
    '/Users/sharvari/Downloads/CAFB_Challenge/data/powerpoints.jsonl',
    chunk_powerpoint,
    'powerpoints',
    '/Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_powerpoint.jsonl'
)

process_jsonl_file(
    '/Users/sharvari/Downloads/CAFB_Challenge/data/grant_proposals.jsonl',
    chunk_grant_proposal,
    'grant_proposals',
    '/Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_grants.jsonl'
)


chunk_all_caption_files(
    folder_path="/Users/sharvari/Downloads/CAFB_Challenge/data/captions",
    output_path="/Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_captions.jsonl"
)

process_jsonl_file(
    '/Users/sharvari/Downloads/CAFB_Challenge/data/collateral-images.jsonl',
    chunk_collateral_images,
    'collateral_images',
    '/Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_collateral_images.jsonl'
)

process_jsonl_file(
    '/Users/sharvari/Downloads/CAFB_Challenge/data/powerpoints-images.jsonl',
    chunk_powerpoint_images,
    'powerpoint_images',
    '/Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_ppt_images.jsonl'
)



Saved 2309 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_blog.jsonl
Saved 177 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_collateral.jsonl
Saved 95 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_powerpoint.jsonl
Saved 775 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_grants.jsonl
[transcript] SH-Bck3d6h8: 5 grouped caption chunks from 'SH-Bck3d6h8'
[transcript] cDXvnEhu21c: 4 grouped caption chunks from 'cDXvnEhu21c'
[transcript] BnfDzskvnDE: 60 grouped caption chunks from 'BnfDzskvnDE'
[transcript] l3MHqmyqXgs: 63 grouped caption chunks from 'l3MHqmyqXgs'
[transcript] zfuSsOSY450: 3 grouped caption chunks from 'zfuSsOSY450'
[transcript] 7sl9JhOAN40: 35 grouped caption chunks from '7sl9JhOAN40'
[transcript] Fn9vYywn_NU: 23 grouped caption chunks from 'Fn9vYywn NU'
[transcript] fsj7hnHquNo: 7 grouped caption chunks from 'fsj7hnHquNo'
[transcript] B1KujrupuRU: 31 grouped caption chunks from 'B1KujrupuRU'
[tr

In [None]:
# if __name__ == '__main__':
#     os.makedirs('outputs', exist_ok=True)

#     process_jsonl_file(
#         'data/blog_posts.jsonl',
#         chunk_blog_post,
#         'blog_posts',
#         'outputs/chunks_blog.jsonl'
#     )

#     process_jsonl_file(
#         'data/collateral.jsonl',
#         chunk_collateral,
#         'collateral',
#         'outputs/chunks_collateral.jsonl'
#     )

#     process_jsonl_file(
#         'data/powerpoints.jsonl',
#         chunk_powerpoint,
#         'powerpoints',
#         'outputs/chunks_powerpoints.jsonl'
#     )

#     process_jsonl_file(
#         'data/grants_proposals.jsonl',
#         chunk_grant_proposal,
#         'grant_proposals',
#         'outputs/chunks_grants.jsonl'
#     )