In [1]:
import google.generativeai as genai
import os
import json
import time
import re
from tqdm import tqdm
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
print("Token loaded:", bool(GOOGLE_API_KEY))

Token loaded: True


In [3]:
# --- Configuration ---
# INPUT_TEXT_FILE = "gita_text.txt"
# OUTPUT_JSONL_FILE = "gita_qa_dataset_per_verse.jsonl"
MAX_VERSES_TO_PROCESS = None # Set to a number (e.g., 10) for testing, or None for all
MIN_VERSE_TEXT_LENGTH_WORDS = 30 # Minimum words for a verse's text to be considered for Q/A

In [4]:
# --- API Key Setup ---
try:
    from google.colab import userdata
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    if GOOGLE_API_KEY is None:
        print("Warning: GOOGLE_API_KEY not found in Colab userdata. Trying environment variable.")
        GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
except ImportError:
    GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

if not GOOGLE_API_KEY:
    print("Error: GOOGLE_API_KEY not found. Please set it in Colab userdata or as an environment variable.")
    exit()

genai.configure(api_key=GOOGLE_API_KEY)


In [5]:
# --- Gemini Model Configuration ---
generation_config = {
    "temperature": 0.6,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 2048,
    "response_mime_type": "application/json",
}
safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-latest",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

In [6]:
# --- Helper Functions ---
def load_text(filepath):
    """Loads text from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        exit()

# User-provided parsing function
def parse_gita_text_from_file(file_path):
    """
    Parses the Gita text to extract verses and their associated content.
    Each item in the dataset will be a dictionary:
    {"chapter": "chapter_number", "verse": "verse_number", "text": "verse_content"}
    """
    dataset = []
    current_chapter = None
    current_verse = None
    buffer = []
    # Keywords that often appear alone and should not be part of the main text buffer if they are the only content
    standalone_structural_keywords = ["SYNONYMS", "TRANSLATION", "PURPORT"]


    # Skip initial metadata lines
    # Heuristic: assume metadata is at the very start. Find first "Chapter 1" or "TEXT 1"
    start_processing = False

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            stripped = line.strip()

            if not start_processing:
                if re.match(r'^Chapter\s+1', stripped, re.IGNORECASE) or \
                   re.match(r'^TEXT\s+1', stripped, re.IGNORECASE) and current_chapter is None: # Ensure it's the first TEXT 1
                    start_processing = True
                else:
                    continue # Skip metadata

            chapter_match = re.match(r'^Chapter\s+(\d+)', stripped, re.IGNORECASE)
            if chapter_match:
                if buffer and current_chapter is not None and current_verse is not None:
                    text_content = " ".join(buffer).strip()
                    if text_content: # Only add if there's actual text
                        dataset.append({
                            "chapter": current_chapter,
                            "verse": current_verse,
                            "text": text_content
                        })
                current_chapter = chapter_match.group(1)
                current_verse = None # Reset verse when new chapter starts
                buffer = []
                continue

            verse_match = re.match(r'^TEXT\s+(\d+)', stripped, re.IGNORECASE) # Made IGNORECASE more consistent
            if verse_match:
                if buffer and current_chapter is not None and current_verse is not None:
                    text_content = " ".join(buffer).strip()
                    if text_content:
                        dataset.append({
                            "chapter": current_chapter,
                            "verse": current_verse,
                            "text": text_content
                        })
                current_verse = verse_match.group(1)
                buffer = [] # Reset buffer for the new verse text
                # We can choose to include the TEXT line itself or not.
                # If not, we 'continue'. If yes, we let it fall through to buffer.append.
                # For Q/A, probably better to exclude it from the chunk sent to Gemini.
                continue # Skip adding "TEXT X" line to buffer

            # Accumulate verse content only after a chapter and verse context is established
            if current_chapter and current_verse:
                # Avoid adding standalone keywords if they are the only thing on the line
                if stripped and not (stripped in standalone_structural_keywords and len(stripped.split()) == 1):
                     # Skip copyright lines that might appear at the end of chapters or sections
                    if "copyright" in stripped.lower() or "bhaktivedanta book" in stripped.lower():
                        continue
                    if "Thus end the Bhaktivedanta Purports" in stripped: # Stop accumulating for this verse
                        if buffer: # Add remaining buffer for the current verse
                            text_content = " ".join(buffer).strip()
                            if text_content:
                                dataset.append({
                                    "chapter": current_chapter,
                                    "verse": current_verse,
                                    "text": text_content
                                })
                            buffer = []
                            current_verse = None # Ready for next TEXT marker
                        continue
                    buffer.append(stripped)


    # Add the last verse's content if any remains in the buffer
    if buffer and current_chapter is not None and current_verse is not None:
        text_content = " ".join(buffer).strip()
        if text_content:
            dataset.append({
                "chapter": current_chapter,
                "verse": current_verse,
                "text": text_content
            })
    
    # Filter out entries with very short text
    dataset = [item for item in dataset if len(item["text"].split()) >= MIN_VERSE_TEXT_LENGTH_WORDS]
    print(f"Parsed {len(dataset)} verse entries meeting minimum length criteria.")
    return dataset


def generate_qa_from_verse_text(verse_item, retries=3, delay=5):
    """
    Generates a Q/A pair from a verse's text content using Gemini.
    Ensures output is valid JSON. verse_item is a dict with 'chapter', 'verse', 'text'.
    """
    text_chunk = verse_item['text']
    prompt = f"""
    You are an expert in analyzing religious texts. Based ONLY on the following text, which is commentary on a verse from the "Bhagavad-gita As It Is", generate one insightful question and a concise, accurate answer.

    Constraints:
    1. The question MUST be answerable *solely* from the provided text.
    2. The answer MUST be directly derivable and supported by the provided text.
    3. Do NOT use any external knowledge or information not present in this specific text chunk.
    4. Your response MUST be a single JSON object with two keys: "question" and "answer".
    5. Ensure the question is about a significant aspect of the provided text.
    6. The answer should be a direct summary or quote from the text that answers the question.

    Example:
    If the text is: "The soul is eternal and cannot be destroyed by any weapon. It is unborn, ever-existing, and primeval."
    A good JSON output would be:
    {{
      "question": "According to this text, what are some key characteristics of the soul?",
      "answer": "The text states that the soul is eternal, cannot be destroyed by any weapon, is unborn, ever-existing, and primeval."
    }}

    Provided Text (from Bhagavad-gita, Chapter {verse_item['chapter']}, Verse {verse_item['verse']}):
    ---
    {text_chunk}
    ---

    Your JSON Output:
    """

    for attempt in range(retries):
        try:
            response = model.generate_content(prompt)
            raw_json_text = response.text.strip()

            match = re.search(r"```json\s*(\{.*?\})\s*```", raw_json_text, re.DOTALL)
            if match:
                json_str = match.group(1)
            else:
                start_index = raw_json_text.find('{')
                end_index = raw_json_text.rfind('}')
                if start_index != -1 and end_index != -1 and end_index > start_index:
                    json_str = raw_json_text[start_index : end_index + 1]
                else:
                    json_str = raw_json_text

            try:
                qa_pair = json.loads(json_str)
                if "question" in qa_pair and "answer" in qa_pair:
                    if qa_pair["question"].strip() and qa_pair["answer"].strip():
                        return qa_pair
                    else:
                        print(f"Warning: Generated Q/A has empty Q or A for C{verse_item['chapter']}:V{verse_item['verse']}.")
                else:
                    print(f"Warning: JSON missing 'question' or 'answer' for C{verse_item['chapter']}:V{verse_item['verse']}.")
                    print(f"  Raw response part: {json_str[:200]}")
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError (attempt {attempt + 1}) for C{verse_item['chapter']}:V{verse_item['verse']}: {e}")
                print(f"  Problematic JSON string: {json_str[:200]}")

        except Exception as e:
            print(f"API Error (attempt {attempt + 1}/{retries}) for C{verse_item['chapter']}:V{verse_item['verse']}: {e}")

        if attempt < retries - 1:
            print(f"Retrying in {delay * (attempt + 1)} seconds...")
            time.sleep(delay * (attempt + 1))
        else:
            print(f"Failed to generate valid Q/A for C{verse_item['chapter']}:V{verse_item['verse']} after {retries} retries.")
    return None


In [7]:
SLEEP_BETWEEN_API_CALLS = 2

In [8]:
directory_path = "gita_chapters"
text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt') and os.path.isfile(os.path.join(directory_path, f))]
print(text_files)

['chapter_04.txt', 'chapter_10.txt', 'chapter_11.txt', 'chapter_05.txt', 'chapter_13.txt', 'chapter_07.txt', 'chapter_06.txt', 'chapter_12.txt', 'chapter_16.txt', 'chapter_02.txt', 'chapter_03.txt', 'chapter_17.txt', 'chapter_01.txt', 'chapter_15.txt', 'chapter_14.txt', 'chapter_18.txt', 'chapter_08.txt', 'chapter_09.txt']


In [9]:
# INPUT_TEXT_FILE='gita_text.txt'
INPUT_TEXT_FILE='gita_chapters/chapter_02.txt'
OUTPUT_JSONL_FILE = "processed_data/gita_qa_dataset_per_verse_02.jsonl"

In [None]:
# --- Main Script ---
if __name__ == "__main__":
    parsed_verses = parse_gita_text_from_file(INPUT_TEXT_FILE)

    if not parsed_verses:
        print("No verse data parsed. Exiting.")
        exit()

    if MAX_VERSES_TO_PROCESS is not None and MAX_VERSES_TO_PROCESS > 0:
        parsed_verses_to_process = parsed_verses[:MAX_VERSES_TO_PROCESS]
        print(f"Processing only the first {len(parsed_verses_to_process)} verses for testing.")
    else:
        parsed_verses_to_process = parsed_verses
        print(f"Found {len(parsed_verses_to_process)} verses to process.")


    # Clear/Create the output file
    with open(OUTPUT_JSONL_FILE, 'w', encoding='utf-8') as outfile:
        pass # Ensures the file is new or cleared

    generated_qa_count = 0
    print(f"\nStarting Q/A generation for {len(parsed_verses_to_process)} verses...")
    for i, verse_item in enumerate(tqdm(parsed_verses_to_process, desc="Generating Q/A per verse")):
        
        # The API call for Q/A generation happens within this function
        qa_pair = generate_qa_from_verse_text(verse_item) 

        if qa_pair:
            full_qa_entry = {
                "chapter": verse_item['chapter'],
                "verse": verse_item['verse'],
                "context": verse_item['text'], # Optional: include the source text for traceability
                "question": qa_pair['question'],
                "answer": qa_pair['answer']
            }
            
            with open(OUTPUT_JSONL_FILE, 'a', encoding='utf-8') as outfile:
                json.dump(full_qa_entry, outfile, ensure_ascii=False)
                outfile.write('\n')
            generated_qa_count += 1
        
        # --- Sleep between API calls for different verses to respect rate limits ---
        # This sleep is applied AFTER processing each verse (which includes one main API call).
        if i < len(parsed_verses_to_process) - 1: # Don't sleep after the last item
            time.sleep(SLEEP_BETWEEN_API_CALLS)

    print(f"\nGenerated {generated_qa_count} Q/A pairs.")
    print(f"Dataset saved to {OUTPUT_JSONL_FILE}")

Parsed 0 verse entries meeting minimum length criteria.
No verse data parsed. Exiting.
Found 0 verses to process.

Starting Q/A generation for 0 verses...


Generating Q/A per verse: 0it [00:00, ?it/s]


Generated 0 Q/A pairs.
Dataset saved to processed_data/gita_qa_dataset_per_verse_02.jsonl





: 