In [7]:
import fitz
import json

In [10]:
pdf_path = "data/Bhagavad-gita.pdf"
doc = fitz.open(pdf_path)

# Zero-based index of the first page to extract. 0 extracts every page of the
# document; raise it (e.g. 29 for the 30th page) to skip leading pages.
start_page = 0

# Collect the text of each page and join once at the end, rather than growing
# one large string with += on every iteration.
page_texts = []
for page_num in range(start_page, len(doc)):
    page = doc.load_page(page_num)
    page_texts.append(page.get_text())
text_data = "".join(page_texts)

# Persist the extracted text: the verse-parsing cell reads gita_text.txt.
# NOTE(review): `doc` is left open here; call doc.close() once no later cell
# needs it.
with open("gita_text.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

In [11]:
# Quick sanity check: total number of characters extracted into text_data.
len(text_data)

1536303

In [12]:
# Break the extracted text into chunks wherever a blank line occurs.
paragraphs = text_data.split("\n\n")

# One prompt/completion record per chunk that is not pure whitespace; the
# completion is a placeholder to be filled in later.
dataset = [
    {"prompt": f"Context:\n{para}\n\n### Response:", "completion": " <insert expected output or leave blank>"}
    for para in paragraphs
    if para.strip()
]

# Write the records as JSON Lines (one JSON object per line).
# NOTE(review): the verse-parsing cell later writes to this same
# gita_dataset.jsonl path, replacing this output.
with open("gita_dataset.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in dataset)


In [13]:
import re
import json

def _flush_verse(dataset, chapter, verse, buffer):
    """Append the buffered verse to ``dataset`` if it is complete.

    Nothing is appended unless both a chapter and a verse are active and at
    least one content line has been collected.
    """
    if buffer and chapter is not None and verse is not None:
        dataset.append({
            "chapter": chapter,
            "verse": verse,
            "text": " ".join(buffer).strip()
        })


def parse_gita_text(file_path):
    """Split a plain-text file into per-verse records.

    The file is read line by line. A line starting with ``Chapter <n>``
    (case-insensitive) opens a new chapter; a line starting with
    ``TEXT <n>`` opens a new verse. Every non-blank line after a verse
    heading is collected until the next heading, and the collected lines are
    joined with single spaces.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list of dicts with keys ``"chapter"`` and ``"verse"`` (the matched
        digit strings) and ``"text"`` (the joined verse content). Verses with
        no non-blank content are omitted.

    Raises:
        OSError: If the file cannot be opened.
    """
    dataset = []
    current_chapter = None
    current_verse = None
    buffer = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()

            # Start new chapter: emit the pending verse and wait for the next
            # verse heading before collecting text again.
            chapter_match = re.match(r'^Chapter\s+(\d+)', stripped, re.IGNORECASE)
            if chapter_match:
                _flush_verse(dataset, current_chapter, current_verse, buffer)
                current_chapter = chapter_match.group(1)
                current_verse = None
                buffer = []
                continue

            # Start new verse.
            # NOTE(review): headings of the form "TEXTS 16-18" do not match
            # this pattern and would be folded into the preceding verse —
            # confirm whether the source text contains them.
            verse_match = re.match(r'^TEXT\s+(\d+)', stripped)
            if verse_match:
                _flush_verse(dataset, current_chapter, current_verse, buffer)
                current_verse = verse_match.group(1)
                buffer = []
                continue

            # Accumulate verse content. Blank lines are skipped so they add
            # no extra spaces to the joined text and cannot produce a record
            # whose text is empty.
            if stripped and current_chapter is not None and current_verse is not None:
                buffer.append(stripped)

    # Add final verse
    _flush_verse(dataset, current_chapter, current_verse, buffer)

    return dataset

# Source text file and JSONL destination
input_path = 'gita_text.txt'
output_path = 'gita_dataset.jsonl'

# Build the verse records, then emit one JSON object per line
data = parse_gita_text(input_path)
with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        json.dumps(item, ensure_ascii=False) + '\n' for item in data
    )

print(f"Dataset saved to {output_path} with {len(data)} verse entries.")


Dataset saved to gita_dataset.jsonl with 622 verse entries.
