In [None]:
import os
import csv

# ----------------------------
# Function to split text into chunks
# ----------------------------
def create_chunks(text, chunk_size=500):
    """
    Split text into consecutive chunks of at most chunk_size words.

    Words are obtained with ``str.split()`` (any whitespace run is a
    separator), and each chunk's text is those words re-joined with single
    spaces, so the original whitespace is not preserved.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of dictionaries, one per chunk, with keys:
        ``'text'`` (the chunk text), ``'start_word_idx'`` and
        ``'end_word_idx'`` (1-based, inclusive word positions in the
        document). Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If chunk_size is not greater than zero.
    """
    # A zero or negative step would never advance through the word list
    # (the original while-loop spun forever), so reject it up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    words = text.split()
    chunks = []
    for start_word_idx in range(0, len(words), chunk_size):
        chunk_words = words[start_word_idx:start_word_idx + chunk_size]
        chunks.append({
            'text': ' '.join(chunk_words),
            # Reported positions are 1-based and inclusive.
            'start_word_idx': start_word_idx + 1,
            # The final chunk may be shorter than chunk_size.
            'end_word_idx': start_word_idx + len(chunk_words),
        })

    return chunks

# ----------------------------
# Function to create metadata for each chunk
# ----------------------------
def create_metadata(chunks, file_name):
    """
    Build one metadata record per chunk.

    Each record carries a 1-based ``chunk_id``, the given ``file_name``,
    the chunk's word and character counts, its start/end word indices
    (copied from the chunk), an empty ``summary`` placeholder, and the
    chunk text itself.

    Args:
        chunks: Iterable of dicts with 'text', 'start_word_idx' and
            'end_word_idx' keys.
        file_name: Name recorded in every record's 'file_name' field.

    Returns:
        A list of metadata dictionaries, in the same order as chunks.
    """
    return [
        {
            'chunk_id': position,
            'file_name': file_name,
            'num_words': len(chunk['text'].split()),
            'start_word_idx': chunk['start_word_idx'],
            'end_word_idx': chunk['end_word_idx'],
            'char_count': len(chunk['text']),
            'summary': '',  # placeholder for summary
            'text': chunk['text'],
        }
        for position, chunk in enumerate(chunks, start=1)
    ]

# ----------------------------
# Main Script
# ----------------------------

# Source document to process
input_file = 'large_document.txt'  # replace with your file path

# Load the whole document into memory as a single string
with open(input_file, 'r', encoding='utf-8') as f:
    text = f.read()

# Split into word chunks, then build one metadata record per chunk
chunk_size = 500  # number of words per chunk
chunks = create_chunks(text, chunk_size)
metadata = create_metadata(chunks, os.path.basename(input_file))

# Write one CSV row per chunk; fieldnames fixes the column order
output_file = 'document_chunks_metadata.csv'
fieldnames = ['chunk_id', 'file_name', 'num_words', 'start_word_idx', 'end_word_idx', 'char_count', 'summary', 'text']
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(metadata)

print(f"\nChunks and metadata saved to {output_file}\n")

# ----------------------------
# Show the first few chunks on the console
# ----------------------------
print("Preview of first 3 chunks:\n")
for data in metadata[:3]:
    # One print call per record; sep='\n' puts each field on its own line.
    print(
        f"Chunk ID: {data['chunk_id']}",
        f"Number of words: {data['num_words']}",
        f"Start word index: {data['start_word_idx']}",
        f"End word index: {data['end_word_idx']}",
        f"Character count: {data['char_count']}",
        "Text:",
        data['text'],
        '-' * 80,
        sep='\n',
    )



Chunks and metadata saved to document_chunks_metadata.csv

Preview of first 3 chunks:

Chunk ID: 1
Number of words: 500
Start word index: 1
End word index: 500
Character count: 2579
Text:
This is a dummy document. It contains some sample text that will be chunked. We need enough words to make at least a few chunks. This is a dummy document. It contains some sample text that will be chunked. We need enough words to make at least a few chunks. This is a dummy document. It contains some sample text that will be chunked. We need enough words to make at least a few chunks. This is a dummy document. It contains some sample text that will be chunked. We need enough words to make at least a few chunks. This is a dummy document. It contains some sample text that will be chunked. We need enough words to make at least a few chunks. This is a dummy document. It contains some sample text that will be chunked. We need enough words to make at least a few chunks. This is a dummy document. It contains