In [2]:
import json
import os
from pathlib import Path
from typing import List, Dict
import re
from datetime import datetime
import tiktoken

In [3]:
# Default number of tokens per chunk; used as the default ``chunk_size`` of
# split_into_token_chunks.
CHUNK_SIZE = 300
# Shared tiktoken encoder ("cl100k_base") used to tokenize text and to decode
# token slices back into strings.
tokenizer = tiktoken.get_encoding("cl100k_base")



In [4]:
def clean_text(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Args:
        text: Raw text, possibly containing line breaks, tabs or repeated spaces.

    Returns:
        The text on a single line with single-space separation and no
        leading or trailing whitespace.
    """
    # Line breaks become spaces first; the regex then squeezes each whitespace
    # run (including the spaces just introduced) down to a single space.
    without_breaks = text.replace('\n', ' ').replace('\r', ' ')
    squeezed = re.sub(r'\s+', ' ', without_breaks)
    return squeezed.strip()

def split_into_token_chunks(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
    """Split ``text`` into pieces of at most ``chunk_size`` tokens each.

    The text is encoded with the module-level ``tokenizer``, the token list is
    cut into consecutive windows of ``chunk_size`` tokens (the last window may
    be shorter), and each window is decoded back to a string.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per piece.

    Returns:
        The decoded pieces in order; an empty list when ``text`` encodes to
        no tokens.
    """
    # NOTE(review): each window is decoded on its own; confirm that a cut
    # between two tokens can never fall inside a multi-byte character.
    token_ids = tokenizer.encode(text)
    pieces: List[str] = []
    for start in range(0, len(token_ids), chunk_size):
        window = token_ids[start:start + chunk_size]
        pieces.append(tokenizer.decode(window))
    return pieces

def parse_pdf_date(pdf_date: str) -> str:
    """Extract the calendar date from a ``D:YYYYMMDD...`` string as ``YYYY-MM-DD``.

    The first occurrence of ``D:`` followed by eight digits anywhere in the
    input is used; whatever follows those digits is ignored. The digits are
    not checked for being a valid calendar date.

    Args:
        pdf_date: Date string to parse. Non-string values (for example the
            ``None`` produced when the metadata field holds a JSON null) are
            accepted and treated as "no date".

    Returns:
        ``"YYYY-MM-DD"`` when the pattern is found, otherwise ``""``.
    """
    # ``dict.get(key, "")`` only substitutes the default when the key is
    # missing, so a present-but-null value reaches this function as None;
    # re.search would raise TypeError on it.
    if not isinstance(pdf_date, str):
        return ""

    match = re.search(r"D:(\d{4})(\d{2})(\d{2})", pdf_date)
    if match is None:
        return ""

    year, month, day = match.groups()
    return f"{year}-{month}-{day}"


In [5]:
def chunk_blog_post(doc: Dict, doc_id: str) -> List[Dict]:
    """Turn one blog-post record into a list of chunk records.

    Reads ``content``, ``title`` and ``date`` from ``doc``. A missing title
    falls back to ``blog_<doc_id>``; missing content or date fall back to the
    empty string. The content is whitespace-normalised with ``clean_text``
    and split with ``split_into_token_chunks``.

    Args:
        doc: The blog-post record.
        doc_id: Identifier stored on every chunk of this document.

    Returns:
        One dict per chunk with the keys ``text``, ``source`` (always
        ``"blog_posts"``), ``title``, ``doc_id``, ``chunk_id`` (zero-based
        position of the chunk) and ``date``.
    """
    body = clean_text(doc.get("content", ""))
    post_title = doc.get("title", f"blog_{doc_id}")
    post_date = doc.get("date", "")

    records: List[Dict] = []
    for position, piece in enumerate(split_into_token_chunks(body)):
        records.append({
            "text": piece,
            "source": "blog_posts",
            "title": post_title,
            "doc_id": doc_id,
            "chunk_id": position,
            "date": post_date,
        })
    return records


def chunk_collateral(doc: Dict, doc_id: str) -> List[Dict]:
    """Turn one collateral record into a list of chunk records.

    Reads ``file_name`` (fallback ``collateral_<doc_id>``), the
    ``CreationDate`` entry of ``metadata`` (converted with
    ``parse_pdf_date``) and the ``text`` of every item in ``text_data``.
    The item texts are joined with spaces, normalised with ``clean_text``
    and split with ``split_into_token_chunks``.

    Args:
        doc: The collateral record.
        doc_id: Identifier stored on every chunk of this document.

    Returns:
        One dict per chunk with the keys ``text``, ``source`` (always
        ``"collateral"``), ``title``, ``doc_id``, ``chunk_id`` (zero-based
        position of the chunk) and ``date``.
    """
    file_title = doc.get("file_name", f"collateral_{doc_id}")
    metadata = doc.get("metadata", {})
    created_on = parse_pdf_date(metadata.get("CreationDate", ""))

    block_texts = []
    for entry in doc.get("text_data", []):
        block_texts.append(entry.get("text", ""))
    merged_text = clean_text(" ".join(block_texts))

    records = []
    for position, piece in enumerate(split_into_token_chunks(merged_text)):
        records.append({
            "text": piece,
            "source": "collateral",
            "title": file_title,
            "doc_id": doc_id,
            "chunk_id": position,
            "date": created_on,
        })
    return records


def chunk_grant_proposal(doc: Dict, doc_id: str) -> List[Dict]:
    """Turn one grant-proposal record into a list of chunk records.

    Reads ``file_name`` (fallback ``grant_<doc_id>``), the ``CreationDate``
    entry of ``metadata`` (converted with ``parse_pdf_date``) and the
    ``text`` of every item in ``text_data``. The item texts are joined with
    spaces, normalised with ``clean_text`` and split with
    ``split_into_token_chunks``.

    Args:
        doc: The grant-proposal record.
        doc_id: Identifier stored on every chunk of this document.

    Returns:
        One dict per chunk with the keys ``text``, ``source`` (always
        ``"grant_proposals"``), ``title``, ``doc_id``, ``chunk_id``
        (zero-based position of the chunk) and ``date``.
    """
    title = doc.get("file_name", f"grant_{doc_id}")
    date = parse_pdf_date(doc.get("metadata", {}).get("CreationDate", ""))
    text_blocks = [item.get("text", "") for item in doc.get("text_data", [])]
    full_text = clean_text(" ".join(text_blocks))
    chunks = split_into_token_chunks(full_text)

    return [{
        "text": chunk,
        # Was "collateral" (copy-paste from chunk_collateral), which made grant
        # chunks indistinguishable from collateral chunks by their source label.
        "source": "grant_proposals",
        "title": title,
        "doc_id": doc_id,
        "chunk_id": i,
        "date": date
    } for i, chunk in enumerate(chunks)]


def chunk_powerpoint(doc: Dict, doc_id: str) -> List[Dict]:
    """Turn one PowerPoint record into a list of chunk records.

    Reads ``file_name`` (fallback ``ppt_<doc_id>``), the ``Created`` entry
    of ``metadata`` and the ``text`` of every item in ``text_data``. The
    item texts are joined with spaces, normalised with ``clean_text`` and
    split with ``split_into_token_chunks``.

    Args:
        doc: The PowerPoint record.
        doc_id: Identifier stored on every chunk of this document.

    Returns:
        One dict per chunk with the keys ``text``, ``source`` (always
        ``"powerpoints"``), ``title``, ``doc_id``, ``chunk_id`` (zero-based
        position of the chunk) and ``date``.
    """
    deck_title = doc.get("file_name", f"ppt_{doc_id}")
    # The "Created" value is stored on the chunks exactly as found; it is not
    # passed through parse_pdf_date.
    created = doc.get("metadata", {}).get("Created", "")
    block_texts = (entry.get("text", "") for entry in doc.get("text_data", []))
    deck_text = clean_text(" ".join(block_texts))

    records = []
    for index, piece in enumerate(split_into_token_chunks(deck_text)):
        record = {
            "text": piece,
            "source": "powerpoints",
            "title": deck_title,
            "doc_id": doc_id,
            "chunk_id": index,
            "date": created,
        }
        records.append(record)
    return records






In [6]:
def process_jsonl_file(file_path: str, chunk_fn, source: str, output_path: str):
    """Chunk every document of a JSON Lines file and write the chunks as JSON Lines.

    Each non-blank line of ``file_path`` is parsed as one JSON document. The
    whole input is parsed before the output file is opened. Documents are
    numbered in reading order (blank lines are not counted) and given the id
    ``"<source>_<NNN>"`` with a zero-padded, at-least-three-digit index.

    Args:
        file_path: Path of the input ``.jsonl`` file (read as UTF-8).
        chunk_fn: Callable ``chunk_fn(doc, doc_id)`` returning a list of
            JSON-serialisable chunk dicts.
        source: Prefix used to build each document's ``doc_id``.
        output_path: Path of the output file; overwritten, one JSON object
            per line (UTF-8).

    Returns:
        None. A summary line with the chunk count is printed once the output
        file has been written and closed.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        docs = [json.loads(line.strip()) for line in f if line.strip()]

    all_chunks = []
    for i, doc in enumerate(docs):
        doc_id = f"{source}_{i:03d}"
        all_chunks.extend(chunk_fn(doc, doc_id))

    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk) + '\n')
    # Report only after the file is closed (and therefore flushed), matching
    # process_folder.
    print(f"Saved {len(all_chunks)} chunks to {output_path}")


def process_folder(folder_path: str, chunk_fn, source: str, output_path: str):
    """Chunk every ``*.json`` file of a folder and write the chunks as JSON Lines.

    Each matching file directly inside ``folder_path`` is loaded as one JSON
    document; its ``doc_id`` is the file name without the extension. Files
    are processed in sorted path order so that the output is the same on
    every run and every machine.

    Args:
        folder_path: Folder containing the ``.json`` documents.
        chunk_fn: Callable ``chunk_fn(doc, doc_id)`` returning a list of
            JSON-serialisable chunk dicts.
        source: Not used by this function; accepted so the signature matches
            ``process_jsonl_file``.
        output_path: Path of the output file; overwritten, one JSON object
            per line (UTF-8).

    Returns:
        None. A summary line with the chunk count is printed after writing.
    """
    all_chunks = []
    # Directory listing order is filesystem-dependent; sorting makes the
    # chunk order reproducible.
    for file in sorted(Path(folder_path).glob("*.json")):
        with open(file, 'r', encoding='utf-8') as f:
            doc = json.load(f)
        # The input file is already closed here; chunking needs only the
        # parsed document.
        all_chunks.extend(chunk_fn(doc, file.stem))

    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk) + '\n')
    print(f"Saved {len(all_chunks)} chunks to {output_path}")


In [12]:
# Root folder holding ``data/`` and ``outputs/``. Set the CAFB_BASE_DIR
# environment variable to run from another location; the default is the
# location this notebook was originally run from.
BASE_DIR = Path(os.environ.get('CAFB_BASE_DIR', '/Users/sharvari/Downloads/CAFB_Challenge'))
DATA_DIR = BASE_DIR / 'data'
OUTPUT_DIR = BASE_DIR / 'outputs'

# One entry per dataset:
# (input file in DATA_DIR, chunker, doc_id prefix, output file in OUTPUT_DIR)
CHUNK_JOBS = [
    ('blog_posts.jsonl', chunk_blog_post, 'blog_posts', 'chunks_blog.jsonl'),
    ('collateral.jsonl', chunk_collateral, 'collateral', 'chunks_collateral.jsonl'),
    ('powerpoints.jsonl', chunk_powerpoint, 'powerpoints', 'chunks_powerpoint.jsonl'),
    ('grant_proposals.jsonl', chunk_grant_proposal, 'grant_proposals', 'chunks_grants.jsonl'),
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

for job_input, job_chunker, job_source, job_output in CHUNK_JOBS:
    process_jsonl_file(
        str(DATA_DIR / job_input),
        job_chunker,
        job_source,
        str(OUTPUT_DIR / job_output)
    )


Saved 1354 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_blog.jsonl
Saved 93 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_collateral.jsonl
Saved 49 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_powerpoint.jsonl
Saved 392 chunks to /Users/sharvari/Downloads/CAFB_Challenge/outputs/chunks_grants.jsonl


In [None]:
# if __name__ == '__main__':
#     os.makedirs('outputs', exist_ok=True)

#     process_jsonl_file(
#         'data/blog_posts.jsonl',
#         chunk_blog_post,
#         'blog_posts',
#         'outputs/chunks_blog.jsonl'
#     )

#     process_jsonl_file(
#         'data/collateral.jsonl',
#         chunk_collateral,
#         'collateral',
#         'outputs/chunks_collateral.jsonl'
#     )

#     process_jsonl_file(
#         'data/powerpoints.jsonl',
#         chunk_powerpoint,
#         'powerpoints',
#         'outputs/chunks_powerpoints.jsonl'
#     )

#     process_jsonl_file(
#         'data/grant_proposals.jsonl',
#         chunk_grant_proposal,
#         'grant_proposals',
#         'outputs/chunks_grants.jsonl'
#     )