### 1. Environment Setup & Configuration

In [None]:
import sys, os, json, random
from tqdm.auto import tqdm
from dotenv import load_dotenv

# Read the variables defined in ../.env (the API keys used by the LLM services)
load_dotenv("../.env")

# The internal `src` package lives one directory up, so that directory has to
# be on sys.path before the project imports below can resolve.
project_root = os.path.abspath("../")
sys.path.append(project_root)
from src.services.llm_services import (
    load_config,
    query_broker,
    clean_json_output,
    load_prompts,
    format_prompt,
)
from src.services.data_manager import DataManager

# Global settings and the prompt library shared by every later cell
cfg = load_config("../src/config/config.yaml")
prompts_lib = load_prompts("../src/config/prompts.yaml")

pdf_source = cfg['paths']['pdf_source']
print(f"âœ… Data Factory Initialized. Target PDF: {pdf_source}")

### 2. Ingestion & Chunking Strategy

In [None]:
# Ingestion: pull the cleaned text out of the PDF named in the configuration
pdf_path = cfg['paths']['pdf_source']
raw_text = DataManager.extract_and_clean_pdf(pdf_path)

# Chunking: size, overlap and the single separator all come from the
# `chunking` section of the configuration.
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_cfg = cfg['chunking']
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_cfg['size'],
    chunk_overlap=chunk_cfg['overlap'],
    separators=[chunk_cfg['separator']],
)
chunks = splitter.split_text(raw_text)

# Short report of what was produced
total_chunks = len(chunks)
raw_char_count = len(raw_text)
print(f"ðŸ“Š PDF Processing Complete.")
print(f"   - Total Chunks: {total_chunks}")
print(f"   - Character Count of Raw Text: {raw_char_count}")

### 3. The Generation Loop (Dual-LLM Pipeline)

In [None]:
# Accumulates every accepted question/answer record across all chunks.
all_qa_pairs = []
# (chunk_index, "ExceptionType: message") for every chunk that raised, so that
# skipped chunks can be inspected after the run instead of vanishing silently.
failed_chunks = []
# Upper bound on how many generated questions are answered per chunk.
MAX_PAIRS_PER_CHUNK = 10

print(f"ðŸš€ Starting Data Factory for {len(chunks)} chunks...")

for i, chunk in enumerate(tqdm(chunks, desc="Processing Uber Annual Report")):
    try:
        # Step A: Question Generation (role "llm_a").
        # The prompt comes from the "teacher_questions" template and the
        # broker is asked for JSON output.
        q_prompt = format_prompt("teacher_questions", prompts_lib, chunk_text=chunk)
        raw_q = query_broker(cfg, q_prompt, role="llm_a", format_json=True)

        if not raw_q:
            continue

        questions = json.loads(clean_json_output(raw_q))
        if isinstance(questions, dict):
            # The model may wrap the list in an object; take the first value.
            # An empty object yields None and is skipped by the check below
            # rather than raising IndexError.
            questions = next(iter(questions.values()), None)

        if not isinstance(questions, list):
            continue

        # Step B: Answer Generation (role "llm_b"), one call per question.
        for q in questions[:MAX_PAIRS_PER_CHUNK]:
            a_prompt = format_prompt("student_answers", prompts_lib, chunk_text=chunk, question=q)
            answer = query_broker(cfg, a_prompt, role="llm_b")

            # Drop empty answers and answers containing the refusal marker.
            if answer and "Information not available" not in answer:
                all_qa_pairs.append({
                    "instruction": q,
                    "input": chunk,
                    "output": answer.strip(),
                    "category": "financial_report",
                    "metadata": {
                        # 1-based running id in generation order
                        "global_id": len(all_qa_pairs) + 1,
                        "chunk_index": i,
                        "sequential_order": True
                    }
                })

    except Exception as e:
        # Best-effort per chunk: one bad response must not stop the whole run,
        # but the failure is recorded and reported so that systematic problems
        # (bad credentials, malformed prompts, ...) are visible.
        failure = f"{type(e).__name__}: {e}"
        failed_chunks.append((i, failure))
        # tqdm.write prints without corrupting the active progress bar.
        tqdm.write(f"Skipping chunk {i}: {failure}")
        continue

print(f"âœ… Generation Complete. Total Pairs Created: {len(all_qa_pairs)}")
if failed_chunks:
    print(f"   - Chunks skipped due to errors: {len(failed_chunks)} (details in failed_chunks)")

### 4. Storage & Dataset Splitting

In [None]:
# Randomise the order in place so both output sets draw from the whole document
random.shuffle(all_qa_pairs)

# The leading `split_ratio` share (0.8 when the key is absent) becomes the
# training set; whatever remains is the golden test set.
train_fraction = cfg['generation'].get('split_ratio', 0.8)
split_point = int(len(all_qa_pairs) * train_fraction)
train_slice, test_slice = all_qa_pairs[:split_point], all_qa_pairs[split_point:]

def save_as_jsonl(dataset, file_path):
    """Write ``dataset`` to ``file_path`` in JSONL format (one JSON object per line).

    Args:
        dataset: Iterable of JSON-serialisable entries.
        file_path: Destination path. Missing parent directories are created,
            and an existing file is overwritten.

    Raises:
        TypeError: If an entry cannot be serialised by ``json.dumps``.
        OSError: If the directory or file cannot be created or written.
    """
    # A first run may target an output folder that does not exist yet; create
    # it rather than failing after the whole dataset has been generated.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:  # empty when file_path is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in dataset)

# Write both splits to the locations named in the configuration:
# the training set first (train.jsonl), then the golden test set
# (golden_test_set.jsonl).
for split_rows, path_key in ((train_slice, 'train_data'), (test_slice, 'test_data')):
    save_as_jsonl(split_rows, cfg['paths'][path_key])

# Summary of what was written and where
train_count = len(train_slice)
test_count = len(test_slice)
print(f"ðŸ’¾ Files Saved Successfully.")
print(f"   - Training Data: {train_count} items saved to {cfg['paths']['train_data']}")
print(f"   - Golden Test Set: {test_count} items saved to {cfg['paths']['test_data']}")