In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import os

# Path to your RAG project directory
project_dir = "/content/drive/MyDrive/vedas-rag"

# Create subfolders for different parts of your pipeline
os.makedirs(f"{project_dir}/data", exist_ok=True)
os.makedirs(f"{project_dir}/chunks", exist_ok=True)
os.makedirs(f"{project_dir}/embeddings", exist_ok=True)
os.makedirs(f"{project_dir}/finetune", exist_ok=True)


In [5]:
with open(f"{project_dir}/data/book1_rigveda.txt") as file:
  lines = file.readlines()

In [6]:
len(lines)

4632

In [13]:
import re
import json

In [10]:
output_path = "/content/drive/MyDrive/vedas-rag/chunks/rigveda_book1.json"

In [11]:
hymns = []
current_hymn = None

for line in lines:
  line = line.strip()

  match = re.match(r"HYMN\s+([IVXLCDM]+)\.\s+(.+)", line)
  if match:
    if current_hymn:
      hymns.append(current_hymn)

    roman = match.group(1)
    deity = match.group(2)
    number = len(hymns) + 1
    id_str = f"rigveda-{number}-{deity.lower().replace(' ', '_')}"

    current_hymn = {
        "id": id_str,
        "title": line,
        "veda": "Rig Veda",
        "book": 1,
        "hymn_number": roman,
        "deity": deity,
        "verses": []
    }
    continue
  if current_hymn and line:
    # Remove leading verse number if present
    clean_line = re.sub(r"^\d{1,2}\s+", "", line).strip()
    if clean_line:
        current_hymn["verses"].append(clean_line)

# Add last hymn
if current_hymn:
  hymns.append(current_hymn)

In [12]:
hymns[0]

{'id': 'rigveda-1-agni.',
 'title': 'HYMN I. Agni.',
 'veda': 'Rig Veda',
 'book': 1,
 'hymn_number': 'I',
 'deity': 'Agni.',
 'verses': ['I Laud Agni, the chosen Priest, God, minister of sacrifice,',
  'The hotar, lavishest of wealth.',
  'Worthy is Agni to be praised by living as by ancient seers.',
  'He shall bring hitherward the Gods.',
  'Through Agni man obtaineth wealth, yea, plenty waxing day by day,',
  'Most rich in heroes, glorious.',
  'Agni, the perfect sacrifice which thou encompassest about',
  'Verily goeth to the Gods.',
  'May Agni, sapient-minded Priest, truthful, most gloriously great,',
  'The God, come hither with the Gods.',
  'Whatever blessing, Agni, thou wilt grant unto thy worshipper,',
  'That, Aṅgiras, is indeed thy truth.',
  'To thee, dispeller of the night, O Agni, day by day with prayer',
  'Bringing thee reverence, we come',
  'Ruler of sacrifices, guard of Law eternal, radiant One,',
  'Increasing in thine own abode.',
  'Be to us easy of approach, e

In [14]:
# Save to JSON
with open(output_path, "w") as f:
    json.dump(hymns, f, indent=2)