In [None]:
import os
import sys
import json
import time
import random
import warnings
warnings.filterwarnings("ignore")
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from datasets import Dataset, DatasetDict

In [None]:
from transformers import(
    AutoTokenizer,
    GPT2Config,
    GPT2LMHeadModel, 
    DataCollatorForLanguageModeling,
    Trainer, 
    TrainingArguments, 
)

In [None]:
# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# CUDA_VISIBLE_DEVICES is only honoured when it is set before the first CUDA
# call made by this process, so pin the device *before* probing with
# torch.cuda.is_available(). Setting it on a CPU-only machine is harmless.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
if torch.cuda.is_available():
    print("Using device: cuda:0")
else:
    print("Using device: cpu")


In [None]:
# Filesystem locations: the directory holding the pre-trained tokenizer, and
# the root directory under which each run gets its own sub-directory.
TOKENIZER_DIR = "./telugu_tokenizer"
BASE_OUTPUT_DIR = "./gpt-telugu"

In [None]:
# Training corpora, read one example per non-blank line. When both are used
# the curriculum file is placed first, followed by the 17M-token file.
CURRICULUM_TXT = "telugu_curriculum.txt"
TOKENS17_TXT = "telugu_17M_tokens.txt"

In [None]:
# Model architecture (GPT-Wee: very small)
N_EMBD = 128
N_LAYER = 2
N_HEAD = 2

In [None]:
# Training hyperparameters consumed by the tokenizer call and TrainingArguments.
CONTEXT_LENGTH = 128  # truncation length for tokenization; also n_positions of the model
PER_DEVICE_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 8  # 4 * 8 = 32 examples per optimizer step per device
NUM_EPOCHS = 12
LEARNING_RATE = 5e-4
WARMUP_STEPS = 1000
# Evaluation, checkpointing and logging all use the same step interval.
EVAL_STEPS = 5000
SAVE_STEPS = 5000
LOGGING_STEPS = 5000

In [None]:
# Memory / data-loading switches. The names mirror the TrainingArguments
# options gradient_checkpointing, dataloader_num_workers and
# dataloader_pin_memory.
# NOTE(review): no cell visible in this notebook reads these three constants;
# confirm whether they are meant to be passed to TrainingArguments.
GRADIENT_CHECKPOINTING = True
DATALOADER_NUM_WORKERS = 0
DATALOADER_PIN_MEMORY = False

In [None]:
# One seed per independent training run; results are aggregated across them.
SEEDS = [42, 123, 456, 789, 1011, 2024, 3035, 4046, 5057, 6068]
# Derived from SEEDS so the run count cannot drift from the seed list.
NUM_RUNS_PER_MODE = len(SEEDS)

In [None]:
def load_txt_file(path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        path: Location of the text file.

    Returns:
        list[str]: Every line with surrounding whitespace stripped, in file
        order; lines that are empty after stripping are dropped.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if os.path.exists(path):
        kept = []
        with open(path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                cleaned = raw_line.strip()
                if cleaned:
                    kept.append(cleaned)
        return kept
    raise FileNotFoundError(f"File not found: {path}")

def load_tokenizer():
    """Load the tokenizer stored in ``TOKENIZER_DIR``.

    Returns:
        The tokenizer produced by ``AutoTokenizer.from_pretrained``. If it
        has no pad token, the EOS token is assigned as the pad token.

    Raises:
        FileNotFoundError: If ``TOKENIZER_DIR`` does not exist.
    """
    if os.path.exists(TOKENIZER_DIR):
        loaded = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
        # Fall back to EOS for padding when the tokenizer defines no pad token.
        if loaded.pad_token is None:
            loaded.pad_token = loaded.eos_token
        return loaded
    raise FileNotFoundError(f"Tokenizer not found: {TOKENIZER_DIR}")

# Load tokenizer once. The module-level `tokenizer` bound here is the one
# train_gpt2_wee reads for tokenization, vocabulary size and special tokens.
tokenizer = load_tokenizer()
print(f"Tokenizer loaded from: {TOKENIZER_DIR}")
print(f"Vocabulary size: {len(tokenizer)}")


In [None]:
def train_gpt2_wee(seed=6068, curriculum_mode=True, run_name=None):
    """Train one small GPT-2 model from scratch and persist its artefacts.

    Writes checkpoints, the final model + tokenizer (``final_model/``), the
    Trainer log history (``training_stats.csv``) and a metadata summary
    (``run_info.json``) under ``BASE_OUTPUT_DIR/<run_name>``.

    Args:
        seed: Seed applied to ``random``, NumPy and PyTorch (CPU and, when
            available, CUDA) before any data handling.
        curriculum_mode: When True the examples keep their file order
            (curriculum file first, then the 17M-token file). When False the
            combined list is shuffled with the seeded ``random`` module.
        run_name: Sub-directory name under ``BASE_OUTPUT_DIR``. Defaults to
            ``"<curriculum|random>_seed_<seed>"``.

    Returns:
        dict: The run metadata that is also written to ``run_info.json``.

    Raises:
        ValueError: If neither corpus file exists or both are empty.
    """
    # Seed every RNG in use so that shuffling and weight init are repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Generate run name
    mode_str = "curriculum" if curriculum_mode else "random"
    if run_name is None:
        run_name = f"{mode_str}_seed_{seed}"

    model_output_dir = os.path.join(BASE_OUTPUT_DIR, run_name)
    os.makedirs(model_output_dir, exist_ok=True)

    print(f"Training: {run_name}")
    print(f"Seed: {seed}, Mode: {mode_str}")

    # Load data; a missing file contributes no examples.
    curriculum_data = load_txt_file(CURRICULUM_TXT) if os.path.exists(CURRICULUM_TXT) else []
    tokens17_data = load_txt_file(TOKENS17_TXT) if os.path.exists(TOKENS17_TXT) else []

    # Combine data based on mode: both modes use the same examples, only the
    # order differs (file order vs. seeded shuffle).
    training_texts = curriculum_data + tokens17_data
    if not training_texts:
        raise ValueError(
            f"No training text found in {CURRICULUM_TXT!r} or {TOKENS17_TXT!r}"
        )
    if not curriculum_mode:
        random.shuffle(training_texts)

    # Create validation split (last 5% of the combined list). In curriculum
    # mode this is therefore the tail of the ordered data.
    split_idx = int(0.95 * len(training_texts))
    train_texts = training_texts[:split_idx]
    val_texts = training_texts[split_idx:]

    # Limit validation size
    if len(val_texts) > 2000:
        val_texts = val_texts[:2000]

    print(f"Training examples: {len(train_texts):,}")
    print(f"Validation examples: {len(val_texts):,}")

    # Create datasets
    raw_datasets = DatasetDict({
        'train': Dataset.from_dict({'text': train_texts}),
        'validation': Dataset.from_dict({'text': val_texts})
    })

    # Tokenize: one example per line, truncated to CONTEXT_LENGTH tokens.
    def tokenize_function(examples):
        outputs = tokenizer(
            examples["text"],
            truncation=True,
            max_length=CONTEXT_LENGTH,
            return_overflowing_tokens=False,
        )
        return {"input_ids": outputs["input_ids"]}

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names
    )

    # Initialize model with fresh (untrained) weights
    config = GPT2Config(
        vocab_size=len(tokenizer),
        n_ctx=CONTEXT_LENGTH,
        n_positions=CONTEXT_LENGTH,
        n_embd=N_EMBD,
        n_layer=N_LAYER,
        n_head=N_HEAD,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    model = GPT2LMHeadModel(config)

    # Data collator (mlm=False selects causal language modelling)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
        evaluation_strategy="steps",
        eval_steps=EVAL_STEPS,
        logging_steps=LOGGING_STEPS,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=0.1,
        warmup_steps=WARMUP_STEPS,
        lr_scheduler_type="cosine",
        learning_rate=LEARNING_RATE,
        save_steps=SAVE_STEPS,
        fp16=torch.cuda.is_available(),
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
    )

    # Train
    start_time = time.time()
    train_result = trainer.train()
    training_time = time.time() - start_time

    # Save model
    final_model_path = os.path.join(model_output_dir, "final_model")
    trainer.save_model(final_model_path)
    tokenizer.save_pretrained(final_model_path)

    # Save training stats
    log_history = trainer.state.log_history
    training_log_path = os.path.join(model_output_dir, "training_stats.csv")
    pd.DataFrame(log_history).to_csv(training_log_path, index=False)

    # Extract final metrics from the most recent evaluation entry in the log.
    # No compute_metrics hook is configured above, so 'eval_perplexity' is
    # only picked up if something else added it to the log; otherwise it
    # stays None here and evaluate_perplexity() can fill it in later.
    final_train_loss = train_result.training_loss
    final_eval_loss = None
    final_eval_perplexity = None

    for entry in reversed(log_history):
        if 'eval_loss' in entry:
            final_eval_loss = entry['eval_loss']
            if 'eval_perplexity' in entry:
                final_eval_perplexity = entry['eval_perplexity']
            break

    # Save run metadata
    run_info = {
        "seed": seed,
        "curriculum_mode": curriculum_mode,
        "run_name": run_name,
        "training_time_seconds": training_time,
        "final_train_loss": final_train_loss,
        "final_eval_loss": final_eval_loss,
        "final_eval_perplexity": final_eval_perplexity,
        "num_train_examples": len(train_texts),
        "num_val_examples": len(val_texts),
        "model_size_million": sum(t.numel() for t in model.parameters()) / 1e6,
        "config": {
            "context_length": CONTEXT_LENGTH,
            "batch_size": PER_DEVICE_BATCH_SIZE,
            "gradient_accumulation": GRADIENT_ACCUMULATION_STEPS,
            "num_epochs": NUM_EPOCHS,
            "learning_rate": LEARNING_RATE,
            "n_embd": N_EMBD,
            "n_layer": N_LAYER,
            "n_head": N_HEAD,
        }
    }

    run_info_path = os.path.join(model_output_dir, "run_info.json")
    with open(run_info_path, "w", encoding="utf-8") as f:
        json.dump(run_info, f, indent=2)

    print(f"\nTraining completed in {training_time/60:.2f} minutes")
    print(f"Final train loss: {final_train_loss:.4f}")
    # Explicit None check so a legitimate loss of 0.0 would still be printed.
    if final_eval_loss is not None:
        print(f"Final eval loss: {final_eval_loss:.4f}")
    print(f"Results saved to: {model_output_dir}\n")

    return run_info


In [None]:
def load_all_run_results():
    """Gather the saved metadata of every completed curriculum run.

    For each seed in ``SEEDS`` this looks for
    ``BASE_OUTPUT_DIR/curriculum_seed_<seed>/run_info.json`` and silently
    skips seeds whose file does not exist.

    Returns:
        pandas.DataFrame: One row per run_info.json found (empty when none
        exist).
    """
    collected = []
    for seed in SEEDS:
        info_file = os.path.join(
            BASE_OUTPUT_DIR, f"curriculum_seed_{seed}", "run_info.json"
        )
        if not os.path.exists(info_file):
            continue
        with open(info_file, "r") as handle:
            collected.append(json.load(handle))
    return pd.DataFrame(collected)

def perform_statistical_analysis(df):
    """Summarise selected run metrics across rows of ``df``.

    The metrics considered are ``final_eval_loss``, ``final_train_loss``,
    ``training_time_seconds`` and, when the column exists,
    ``final_eval_perplexity``. Columns that are absent, or that contain no
    non-null values, are skipped.

    Args:
        df: DataFrame with one row per run.

    Returns:
        dict: Maps each analysed metric name to a dict with the keys
        ``mean``, ``std``, ``sem``, ``min``, ``max``, ``median``,
        ``ci_lower``, ``ci_upper`` (95% Student-t interval around the mean;
        collapses to the mean for a single value), ``n`` and ``values``.
    """
    wanted = ['final_eval_loss', 'final_train_loss', 'training_time_seconds']
    if 'final_eval_perplexity' in df.columns:
        wanted.append('final_eval_perplexity')

    summary = {}
    for name in wanted:
        if name not in df.columns:
            continue

        observed = df[name].dropna()
        count = len(observed)
        if count == 0:
            continue

        # Descriptive statistics; std is the pandas default (sample) estimate.
        centre = observed.mean()
        spread = observed.std()
        std_err = spread / np.sqrt(count)

        # 95% confidence interval needs at least two observations.
        if count > 1:
            bounds = stats.t.interval(0.95, count - 1, loc=centre, scale=std_err)
        else:
            bounds = (centre, centre)

        summary[name] = {
            'mean': centre,
            'std': spread,
            'sem': std_err,
            'min': observed.min(),
            'max': observed.max(),
            'median': observed.median(),
            'ci_lower': bounds[0],
            'ci_upper': bounds[1],
            'n': count,
            'values': observed.tolist(),
        }

    return summary

def print_statistical_summary(analysis_results):
    """Print a per-metric text report of the statistical analysis.

    Args:
        analysis_results: Mapping of metric name to a dict holding at least
            ``mean``, ``std``, ``median``, ``min``, ``max``, ``ci_lower`` and
            ``ci_upper``.
    """
    print("CURRICULUM LEARNING RESULTS SUMMARY")
    for metric_name in analysis_results:
        summary = analysis_results[metric_name]
        heading = metric_name.upper().replace('_', ' ')
        print(f"\n{heading}")
        print(f"  Mean:   {summary['mean']:.4f} ± {summary['std']:.4f}")
        print(f"  Median: {summary['median']:.4f}")
        print(f"  Range:  [{summary['min']:.4f}, {summary['max']:.4f}]")
        print(f"  95% CI: [{summary['ci_lower']:.4f}, {summary['ci_upper']:.4f}]")
    

In [None]:
#SEED_TO_RUN = 6068

#print(f"Running Curriculum Learning Experiment with seed {SEED_TO_RUN}...")
#result = train_gpt2_wee(seed=SEED_TO_RUN, curriculum_mode=True)
#print(f"Completed curriculum learning run with seed {SEED_TO_RUN}")

In [None]:
import math
import shutil

def evaluate_perplexity(run_name):
    """
    Compute validation perplexity for a finished run and record it in run_info.json.

    If run_info.json already holds a non-null ``final_eval_perplexity`` that
    value is returned without loading the model. Otherwise the validation
    texts are rebuilt the same way train_gpt2_wee builds them (last 5% of the
    combined corpus, capped at 2000 examples; for non-curriculum runs the
    corpus is re-shuffled with the run's seed, which reseeds the global
    ``random`` and NumPy generators as a side effect).

    Perplexity is ``exp`` of the mean next-token loss, where each text's loss
    is weighted by its number of predicted tokens (sequence length - 1).
    Texts that tokenize to fewer than two tokens have nothing to predict and
    are skipped.

    Args:
        run_name: Run directory name under BASE_OUTPUT_DIR. Names containing
            "curriculum" are treated as curriculum runs; for any other name
            the seed is parsed from the text after the last underscore.

    Returns:
        float | None: The perplexity (cached or newly computed; it is still
        returned when saving to run_info.json fails), or None when the model
        or run_info.json is missing, no token could be scored, or an
        unexpected error occurred.
    """
    import traceback

    model_path = os.path.join(BASE_OUTPUT_DIR, run_name, "final_model")
    run_info_path = os.path.join(BASE_OUTPUT_DIR, run_name, "run_info.json")

    # First, check if perplexity is already saved
    if os.path.exists(run_info_path):
        with open(run_info_path, "r", encoding="utf-8") as f:
            run_info = json.load(f)
        cached = run_info.get("final_eval_perplexity")
        if cached is not None:
            print(f"✓ Perplexity already calculated for {run_name}: {cached:.4f}")
            return cached

    if not os.path.exists(model_path):
        print(f"✗ Model not found: {model_path}")
        return None

    if not os.path.exists(run_info_path):
        print(f"✗ run_info.json not found: {run_info_path}")
        print("  Cannot save perplexity without run_info.json")
        return None

    print(f"Calculating perplexity for {run_name}...")

    try:
        # Load model
        model = GPT2LMHeadModel.from_pretrained(model_path)
        tokenizer_eval = AutoTokenizer.from_pretrained(model_path)

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            model = model.cuda()
        model.eval()

        # Reconstruct validation data (same as training)
        curriculum_data = load_txt_file(CURRICULUM_TXT) if os.path.exists(CURRICULUM_TXT) else []
        tokens17_data = load_txt_file(TOKENS17_TXT) if os.path.exists(TOKENS17_TXT) else []

        training_texts = curriculum_data + tokens17_data
        if "curriculum" not in run_name:
            seed = int(run_name.split("_")[-1])
            random.seed(seed)
            np.random.seed(seed)
            random.shuffle(training_texts)

        split_idx = int(0.95 * len(training_texts))
        val_data = training_texts[split_idx:]
        if len(val_data) > 2000:
            val_data = val_data[:2000]

        # Calculate perplexity
        total_loss = 0.0
        total_tokens = 0

        with torch.no_grad():
            for text in val_data:
                inputs = tokenizer_eval(text, return_tensors="pt", truncation=True, max_length=CONTEXT_LENGTH)
                input_ids = inputs["input_ids"]

                # With labels=input_ids the model shifts the labels itself and
                # returns the mean loss over the seq_len - 1 next-token
                # targets, so that is the correct weight for this text.
                # A single-token text has no target and would yield a NaN
                # loss that poisons the running total, hence the skip.
                num_targets = input_ids.size(1) - 1
                if num_targets < 1:
                    continue

                if use_cuda:
                    input_ids = input_ids.cuda()

                loss = model(input_ids, labels=input_ids).loss
                total_loss += loss.item() * num_targets
                total_tokens += num_targets

        if total_tokens == 0:
            # Nothing was scored; formatting/saving a None perplexity would
            # only raise later, so report it and stop here.
            print(f"✗ No validation tokens to evaluate for {run_name}")
            return None

        perplexity = math.exp(total_loss / total_tokens)

        # Save to run_info.json with robust error handling.
        # Paths are defined before the try so the except handler can always
        # refer to them, even when the failure happens on the first read.
        print(f"\nSaving perplexity to {run_info_path}...")
        backup_path = run_info_path + ".backup"
        temp_path = run_info_path + ".tmp"
        try:
            # Read existing file
            with open(run_info_path, "r", encoding="utf-8") as f:
                run_info = json.load(f)

            # Update with perplexity
            run_info["final_eval_perplexity"] = perplexity
            run_info["perplexity_eval_tokens"] = total_tokens
            run_info["perplexity_calculated_at"] = time.strftime("%Y-%m-%d %H:%M:%S")

            # Create backup
            shutil.copy2(run_info_path, backup_path)

            # Write to temporary file first
            with open(temp_path, "w", encoding="utf-8") as f:
                json.dump(run_info, f, indent=2, ensure_ascii=False)

            # Replace original file
            shutil.move(temp_path, run_info_path)

            # Verify it was saved correctly
            with open(run_info_path, "r", encoding="utf-8") as f:
                verify_info = json.load(f)

            if "final_eval_perplexity" not in verify_info:
                print(f" Perplexity key not found after saving")
                # Restore backup
                shutil.move(backup_path, run_info_path)
            elif verify_info["final_eval_perplexity"] == perplexity:
                print(f"✓ Perplexity: {perplexity:.4f} (evaluated on {total_tokens:,} tokens)")
                print(f" Successfully saved and verified!")
                # Remove backup if successful
                if os.path.exists(backup_path):
                    os.remove(backup_path)
                return perplexity
            else:
                saved_perp = verify_info["final_eval_perplexity"]
                print(f" Mismatch: Calculated {perplexity:.4f} but saved {saved_perp}")
                # Restore backup
                shutil.move(backup_path, run_info_path)

        except Exception as save_error:
            print(f"Error saving perplexity: {save_error}")
            print(f"  Calculated perplexity: {perplexity:.4f}")
            traceback.print_exc()
            # Try to restore backup if it exists
            if os.path.exists(backup_path):
                shutil.move(backup_path, run_info_path)
                print(f"  Restored backup file")
            # Do not leave a half-written temp file behind
            if os.path.exists(temp_path):
                os.remove(temp_path)

        # The value is still useful to the caller even if it was not saved.
        return perplexity

    except Exception as e:
        print(f"✗ Error calculating perplexity: {e}")
        traceback.print_exc()
        return None

# Evaluate (or fetch the cached) perplexity for a single run, then confirm
# that the value is present in that run's run_info.json.
run_to_check = "curriculum_seed_456"  # Change this to check other runs
print("PERPLEXITY EVALUATION")
perplexity = evaluate_perplexity(run_to_check)

if not perplexity:
    print("\n Failed to calculate perplexity")
else:
    print(f"FINAL RESULT: {run_to_check}")
    print(f"Perplexity: {perplexity:.4f}")

    # Double-check it's in the file
    run_info_path = os.path.join(BASE_OUTPUT_DIR, run_to_check, "run_info.json")
    if os.path.exists(run_info_path):
        with open(run_info_path, "r", encoding="utf-8") as f:
            final_check = json.load(f)
        if "final_eval_perplexity" in final_check:
            print("\n Verified: Perplexity is now in run_info.json")
        else:
            print("\n Warning: Still not in file after save attempt")


In [None]:
# Gather run_info.json from every completed curriculum run into one table.
df_results = load_all_run_results()

if len(df_results) == 0:
    print("No completed runs found.")
else:
    # Display summary
    print("SUMMARY:")
    summary_columns = ['seed', 'final_train_loss', 'final_eval_loss', 'training_time_seconds']
    print(df_results[summary_columns].to_string(index=False))

    # Save combined results
    results_path = os.path.join(BASE_OUTPUT_DIR, "curriculum_runs_summary.csv")
    df_results.to_csv(results_path, index=False)
    print(f"\nResults saved to: {results_path}")


In [None]:
# Run the statistical analysis on the collected runs, print it, and store it
# as JSON next to the run directories.
if len(df_results) > 0:
    analysis_results = perform_statistical_analysis(df_results)
    print_statistical_summary(analysis_results)
    analysis_path = os.path.join(BASE_OUTPUT_DIR, "statistical_analysis.json")

    def convert_to_serializable(obj):
        """Recursively replace NumPy scalars/arrays with JSON-friendly values.

        NumPy integer and floating scalars both become Python floats; arrays
        become lists; dicts and lists are converted element-wise; anything
        else is returned unchanged.
        """
        if isinstance(obj, dict):
            return {key: convert_to_serializable(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [convert_to_serializable(element) for element in obj]
        if isinstance(obj, (np.integer, np.floating)):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return obj

    with open(analysis_path, "w") as f:
        json.dump(convert_to_serializable(analysis_results), f, indent=2)

    print(f"\nStatistical analysis saved to: {analysis_path}")
else:
    print("No results to analyze. Please run training experiments first.")


In [None]:
from transformers import GPT2LMHeadModel, AutoTokenizer, pipeline
import torch
import math

# Load the final model of one specific run (seed 1011) for qualitative tests.
model_path = os.path.join(BASE_OUTPUT_DIR, "curriculum_seed_1011", "final_model")

# The pipeline takes a device index: 0 for the first GPU, -1 for CPU.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-generation", model=model_path, tokenizer=model_path, device=device)

# For minimal pairs, load model separately
# NOTE: this rebinds the module-level `tokenizer` (the one train_gpt2_wee
# reads) to the tokenizer saved with this run.
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if torch.cuda.is_available():
    model = model.cuda()
model.eval()

# 10 test prompts: short Telugu prefixes that the generation loop below
# asks the model to continue.
test_prompts = [
    "నేను పాఠశాలకు",
    "ఆమె పేరు",
    "నాకు నచ్చిన",
    "ఈ రోజు",
    "మా ఇంట్లో",
    "అతను చాలా",
    "తెలుగు భాష",
    "పిల్లలు ఆడుకుంటున్నారు",
    "భారతదేశంలో",
    "ఒక రోజు",
]

# 10 minimal pairs: two closely related sentences per tuple. The test below
# scores both and reports which one has the lower perplexity.
# NOTE(review): nothing here records which member of a pair is the expected
# winner, so the test output is descriptive only.
minimal_pairs = [
    ("అతను వెళ్ళాడు", "అతను వెళ్ళారు"),
    ("ఆమె వచ్చింది", "ఆమె వచ్చాడు"),
    ("నేను తింటున్నాను", "నేను తిన్నాను"),
    ("రాముడు పుస్తకం చదివాడు", "రాముడు పుస్తకాన్ని చదివాడు"),
    ("నేను వెళ్ళాను", "నేను వెళ్ళలేదు"),
    ("నీవు వస్తావా", "నీవు వస్తావు"),
    ("మీరు రండి", "నువ్వు రండి"),
    ("ఇంటికి వెళ్ళాను", "ఇంటిలో వెళ్ళాను"),
    ("తీసుకొని వచ్చాడు", "తీసుకొని వెళ్ళాడు"),
    ("చదువుతున్నాను", "చదివాను"),
]

print("TEXT GENERATION TEST")

# Sampling settings shared by every prompt.
generation_kwargs = dict(
    max_new_tokens=25,
    do_sample=True,
    temperature=1.0,
    repetition_penalty=2.0,
    no_repeat_ngram_size=2,
    top_k=30,
    pad_token_id=pipe.tokenizer.eos_token_id,
)

for prompt in test_prompts:
    output = pipe(prompt, **generation_kwargs)
    # The pipeline returns prompt + continuation; keep only the continuation.
    generated = output[0]['generated_text'][len(prompt):].strip()
    print(f"Prompt: {prompt}")
    print(f"Output: {generated}\n")


print("MINIMAL PAIR TEST")

for i, (sent1, sent2) in enumerate(minimal_pairs):
    # Perplexity of each sentence = exp(mean next-token loss).
    ppls = []
    for sent in (sent1, sent2):
        inputs = tokenizer(sent, return_tensors="pt", truncation=True, max_length=128)
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        with torch.no_grad():
            loss = model(inputs["input_ids"], labels=inputs["input_ids"]).loss.item()
        ppls.append(math.exp(loss))

    lower = '1st' if ppls[0] < ppls[1] else '2nd'
    print(f"Pair {i+1}: '{sent1}' (PPL:{ppls[0]:.1f}) vs '{sent2}' (PPL:{ppls[1]:.1f}) → Lower: {lower}")

In [None]:


# Collect perplexity values from all curriculum runs
results = []

for seed in SEEDS:
    run_name = f"curriculum_seed_{seed}"
    run_info_path = os.path.join(BASE_OUTPUT_DIR, run_name, "run_info.json")

    # Seeds without a finished run are skipped.
    if not os.path.exists(run_info_path):
        continue

    with open(run_info_path, "r", encoding="utf-8") as f:
        run_info = json.load(f)

    results.append({
        'seed': seed,
        # None when perplexity has not been computed for this run yet.
        'perplexity': run_info.get('final_eval_perplexity'),
    })

# Create DataFrame. The columns are named explicitly so the frame keeps both
# columns even when no run was found; otherwise the column selection in the
# display step below raises KeyError on an empty frame.
df = pd.DataFrame(results, columns=['seed', 'perplexity'])

# Save to CSV
csv_path = os.path.join(BASE_OUTPUT_DIR, "curriculum_perplexities.csv")
df.to_csv(csv_path, index=False)
print(f"Saved to: {csv_path}")

# Save to JSON
json_path = os.path.join(BASE_OUTPUT_DIR, "curriculum_perplexities.json")
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Saved to: {json_path}")

# Display
print("\nPerplexity Results:")
print(df[['seed', 'perplexity']].to_string(index=False))

In [None]:
from transformers import GPT2LMHeadModel, AutoTokenizer, pipeline
import torch
import math
import json
import pandas as pd

# Test prompts: short Telugu prefixes continued by each seed's model in the
# loop below (the same ten prompts as the single-model test cell earlier in
# this notebook).
test_prompts = [
    "నేను పాఠశాలకు",
    "ఆమె పేరు",
    "నాకు నచ్చిన",
    "ఈ రోజు",
    "మా ఇంట్లో",
    "అతను చాలా",
    "తెలుగు భాష",
    "పిల్లలు ఆడుకుంటున్నారు",
    "భారతదేశంలో",
    "ఒక రోజు",
]

# Minimal pairs: two closely related sentences per tuple; both are scored
# and the one with the lower perplexity is recorded (the same ten pairs as
# the single-model test cell earlier in this notebook).
minimal_pairs = [
    ("అతను వెళ్ళాడు", "అతను వెళ్ళారు"),
    ("ఆమె వచ్చింది", "ఆమె వచ్చాడు"),
    ("నేను తింటున్నాను", "నేను తిన్నాను"),
    ("రాముడు పుస్తకం చదివాడు", "రాముడు పుస్తకాన్ని చదివాడు"),
    ("నేను వెళ్ళాను", "నేను వెళ్ళలేదు"),
    ("నీవు వస్తావా", "నీవు వస్తావు"),
    ("మీరు రండి", "నువ్వు రండి"),
    ("ఇంటికి వెళ్ళాను", "ఇంటిలో వెళ్ళాను"),
    ("తీసుకొని వచ్చాడు", "తీసుకొని వెళ్ళాడు"),
    ("చదువుతున్నాను", "చదివాను"),
]

# Storage for results: one dict per (seed, prompt) and per (seed, pair).
all_generation_results = []
all_minimal_pair_results = []

# The pipeline takes a device index: 0 for the first GPU, -1 for CPU.
device = 0 if torch.cuda.is_available() else -1

# Test all seeds
for seed in SEEDS:
    run_name = f"curriculum_seed_{seed}"
    model_path = os.path.join(BASE_OUTPUT_DIR, run_name, "final_model")

    if not os.path.exists(model_path):
        print(f"Skipping seed {seed} - model not found")
        continue

    print(f"\n{'='*60}")
    print(f"Testing Seed: {seed}")
    print(f"{'='*60}")

    # Load model. Loop-specific names are used on purpose: the objects are
    # deleted at the end of every iteration, and deleting a variable called
    # `tokenizer` would remove the module-level tokenizer that
    # train_gpt2_wee relies on.
    seed_model = GPT2LMHeadModel.from_pretrained(model_path)
    seed_tokenizer = AutoTokenizer.from_pretrained(model_path)
    if torch.cuda.is_available():
        seed_model = seed_model.cuda()
    seed_model.eval()

    # Text Generation Test
    print("\nTEXT GENERATION:")
    seed_pipe = pipeline("text-generation", model=seed_model, tokenizer=seed_tokenizer, device=device)

    for prompt_idx, prompt in enumerate(test_prompts):
        output = seed_pipe(prompt, max_new_tokens=25, do_sample=True, temperature=1.0,
                           repetition_penalty=2.0, no_repeat_ngram_size=2, top_k=30,
                           pad_token_id=seed_tokenizer.eos_token_id)
        # The pipeline returns prompt + continuation; keep only the continuation.
        generated = output[0]['generated_text'][len(prompt):].strip()

        all_generation_results.append({
            'seed': seed,
            'prompt_id': prompt_idx + 1,
            'prompt': prompt,
            'generated_text': generated
        })

        print(f"  {prompt_idx+1}. Prompt: {prompt}")
        print(f"     Generated: {generated}\n")

    # Minimal Pair Test: perplexity of each sentence = exp(mean next-token loss).
    print("\nMINIMAL PAIRS:")
    for pair_idx, (sent1, sent2) in enumerate(minimal_pairs):
        ppls = []
        for sent in [sent1, sent2]:
            # Truncate with the same context length the model was trained on.
            inputs = seed_tokenizer(sent, return_tensors="pt", truncation=True, max_length=CONTEXT_LENGTH)
            if torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}
            with torch.no_grad():
                loss = seed_model(inputs["input_ids"], labels=inputs["input_ids"]).loss.item()
            ppls.append(math.exp(loss))

        all_minimal_pair_results.append({
            'seed': seed,
            'pair_id': pair_idx + 1,
            'sentence1': sent1,
            'sentence2': sent2,
            'ppl1': ppls[0],
            'ppl2': ppls[1],
            'lower_ppl': 'sentence1' if ppls[0] < ppls[1] else 'sentence2',
            'ppl_difference': abs(ppls[0] - ppls[1])
        })

        print(f"  Pair {pair_idx+1}: PPL1={ppls[0]:.1f}, PPL2={ppls[1]:.1f}, Lower={'1st' if ppls[0]<ppls[1] else '2nd'}")

    # Clean up GPU memory before loading the next seed's model
    del seed_model, seed_tokenizer, seed_pipe
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Persist the collected results (CSV and JSON for each result type) and
# print a short summary.
banner = '=' * 60

print(f"\n{banner}")
print("SAVING RESULTS")
print(f"{banner}")

# Generation results
gen_df = pd.DataFrame(all_generation_results)
gen_csv = os.path.join(BASE_OUTPUT_DIR, "all_seeds_generation_results.csv")
gen_df.to_csv(gen_csv, index=False, encoding='utf-8')
print(f"Generation results saved: {gen_csv}")

gen_json = os.path.join(BASE_OUTPUT_DIR, "all_seeds_generation_results.json")
with open(gen_json, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Telugu text readable in the JSON file.
    json.dump(all_generation_results, f, indent=2, ensure_ascii=False)
print(f"Generation results saved: {gen_json}")

# Minimal pair results
pair_df = pd.DataFrame(all_minimal_pair_results)
pair_csv = os.path.join(BASE_OUTPUT_DIR, "all_seeds_minimal_pairs.csv")
pair_df.to_csv(pair_csv, index=False, encoding='utf-8')
print(f"Minimal pair results saved: {pair_csv}")

pair_json = os.path.join(BASE_OUTPUT_DIR, "all_seeds_minimal_pairs.json")
with open(pair_json, 'w', encoding='utf-8') as f:
    json.dump(all_minimal_pair_results, f, indent=2, ensure_ascii=False)
print(f"Minimal pair results saved: {pair_json}")

# Summary statistics
print(f"\n{banner}")
print("SUMMARY")
print(f"{banner}")
print(f"\nGeneration Results: {len(all_generation_results)} entries")
print(f"Minimal Pair Results: {len(all_minimal_pair_results)} entries")
seeds_tested = sorted({record['seed'] for record in all_generation_results})
print(f"\nSeeds tested: {seeds_tested}")