In [7]:
import fitz
import json

In [10]:
pdf_path = "data/Bhagavad-gita.pdf"
doc = fitz.open(pdf_path)

# Extract text from page 30 onwards (index starts at 0)
start_page = 0
text_data = ""

for page_num in range(start_page, len(doc)):
    page = doc.load_page(page_num)
    text_data += page.get_text()

# Optional: Save to a text file
with open("gita_text.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

In [11]:
len(text_data)

1536303

In [12]:
# Split text into paragraphs
paragraphs = text_data.split("\n\n")

dataset = []
for i, para in enumerate(paragraphs):
    if para.strip():
        dataset.append({"prompt": f"Context:\n{para}\n\n### Response:", "completion": " <insert expected output or leave blank>"})

# Save as JSONL
with open("gita_dataset.jsonl", "w", encoding="utf-8") as f:
    for item in dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")


In [13]:
import re
import json

def parse_gita_text(file_path):
    dataset = []
    current_chapter = None
    current_verse = None
    buffer = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()

            # Start new chapter
            chapter_match = re.match(r'^Chapter\s+(\d+)', stripped, re.IGNORECASE)
            if chapter_match:
                if buffer and current_chapter is not None and current_verse is not None:
                    dataset.append({
                        "chapter": current_chapter,
                        "verse": current_verse,
                        "text": " ".join(buffer).strip()
                    })
                current_chapter = chapter_match.group(1)
                current_verse = None
                buffer = []
                continue

            # Start new verse
            verse_match = re.match(r'^TEXT\s+(\d+)', stripped)
            if verse_match:
                if buffer and current_chapter is not None and current_verse is not None:
                    dataset.append({
                        "chapter": current_chapter,
                        "verse": current_verse,
                        "text": " ".join(buffer).strip()
                    })
                current_verse = verse_match.group(1)
                buffer = []
                continue

            # Accumulate verse content
            if current_chapter and current_verse:
                buffer.append(stripped)

    # Add final verse
    if buffer and current_chapter is not None and current_verse is not None:
        dataset.append({
            "chapter": current_chapter,
            "verse": current_verse,
            "text": " ".join(buffer).strip()
        })

    return dataset

# Input and output
input_path = 'gita_text.txt'
output_path = 'gita_dataset.jsonl'

# Parse and write to JSONL
data = parse_gita_text(input_path)
with open(output_path, 'w', encoding='utf-8') as out_file:
    for item in data:
        out_file.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Dataset saved to {output_path} with {len(data)} verse entries.")


Dataset saved to gita_dataset.jsonl with 622 verse entries.


In [6]:
from datasets import load_dataset
from huggingface_hub import HfApi
from pathlib import Path

# Config
dataset_path = "gita_dataset.jsonl"
repo_name = "gita-verse-dataset"  # Customize this
hf_api = HfApi()
username = hf_api.whoami()["name"]
repo_id = f"{username}/{repo_name}"

# Step 1: Load dataset
print("Loading dataset...")
dataset = load_dataset("json", data_files=dataset_path, split="train")

# Step 2: Push to Hub
print(f"Creating repo: {repo_id}")
dataset.push_to_hub(repo_id)
print(f"✅ Dataset pushed to: https://huggingface.co/datasets/{repo_id}")


Loading dataset...


Generating train split: 622 examples [00:00, 96179.06 examples/s]


Creating repo: serpentilec137/gita-verse-dataset


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 131.89ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]


✅ Dataset pushed to: https://huggingface.co/datasets/serpentilec137/gita-verse-dataset


In [4]:
from dotenv import load_dotenv
import os

load_dotenv()
hf_token = os.getenv("HF_TOKEN")
print("Token loaded:", bool(hf_token))

Token loaded: True


In [10]:
from datasets import load_dataset

# Removed split="train" as it's not needed for local data_files
dataset = load_dataset("serpentilec137/gita-verse-dataset")
print(dataset['train'][0])

{'chapter': '1', 'verse': '1', 'text': 'dhrtarastra uvaca dharma-ksetre kuru-ksetre samaveta yuyutsavah mamakah pandavas caiva kim akurvata sanjaya SYNONYMS dhrtarastrah--King Dhrtarastra; uvaca--said; dharma-ksetre--in the place of pilgrimage; kuru-ksetre--in the place named Kuruksetra; samavetah--assembled; yuyutsavah--desiring to fight; mamakah--my party (sons); pandavah--the sons of Pandu; ca--and; eva--certainly; kim-- what; akurvata--did they do; sanjaya--O Sanjaya. TRANSLATION Dhrtarastra said: O Sanjaya, after assembling in the place of pilgrimage at Kuruksetra, what did my sons and the sons of Pandu do, being desirous to fight? PURPORT Bhagavad-gita is the widely read theistic science summarized in the Gita-mahatmya (Glorification of the Gita). There it says that one should read Bhagavad-gita very scrutinizingly with the help of a person who is a devotee of Sri Krsna and try to understand it without personally motivated interpretations. The example of clear understanding is th

DatasetDict({
    train: Dataset({
        features: ['chapter', 'verse', 'text'],
        num_rows: 622
    })
})