<a href="https://colab.research.google.com/github/shahedmobydeen/SCFP-benchmark/blob/main/Data_and_Model_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch
import argparse
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction
)
from sklearn.metrics import accuracy_score, precision_recall_f1_score

# --- Placeholder for LLM API Calls ---
def generate_correction_triplet(problem_text: str, api_key: str = None) -> dict:
    """
    Generates a self-correction triplet (initial response, critique, final response)
    for a given problem using an external LLM API.

    *** THIS IS A PLACEHOLDER FUNCTION ***
    You must replace the logic below with your actual API calls to a service like
    OpenAI, Anthropic, Google AI, etc.

    Args:
        problem_text: The input problem or prompt.
        api_key: Your API key for the LLM service.

    Returns:
        A dictionary containing the generated triplet.
    """
    print(f"Generating triplet for: '{problem_text[:50]}...'")
    # 1. First API Call: Generate initial response
    # client = YourLLMClient(api_key=api_key)
    # initial_response = client.generate(prompt=f"Solve this: {problem_text}. Think step-by-step.")
    initial_response = "This is a placeholder initial response, containing a simulated thought process and a likely incorrect answer."

    # 2. Second API Call: Generate self-critique
    # critique_prompt = f"Problem: {problem_text}\nYour Answer: {initial_response}\n\nPlease review your answer for errors. Provide a critique."
    # self_critique = client.generate(prompt=critique_prompt)
    self_critique = "This is a placeholder critique. It points out a simulated flaw in the initial response."

    # 3. Third API Call: Generate final response
    # final_prompt = f"Problem: {problem_text}\nYour Answer: {initial_response}\nYour Critique: {self_critique}\n\nProvide a final, corrected answer."
    # final_response = client.generate(prompt=final_prompt)
    final_response = "This is a placeholder final answer, supposedly corrected based on the critique."

    return {
        "initial_response": initial_response,
        "self_critique": self_critique,
        "final_response": final_response,
    }


def run_data_generation(args):
    """
    Runs the data generation pipeline using source problems.
    """
    print("--- Starting Data Generation ---")
    try:
        source_df = pd.read_csv(args.source_problems_path)
        print(f"Loaded {len(source_df)} source problems.")
    except FileNotFoundError:
        print(f"Error: Source problems file not found at {args.source_problems_path}")
        return

    results = []
    for index, row in source_df.iterrows():
        problem_text = row['problem_text'] # Assuming this column exists
        triplet = generate_correction_triplet(problem_text)
        results.append({
            "problem_id": row.get('problem_id', f"problem_{index}"),
            "source_dataset": row.get('source_dataset', 'unknown'),
            "problem_text": problem_text,
            **triplet
        })

    output_df = pd.DataFrame(results)
    output_df.to_csv(args.output_path, index=False)
    print(f"Successfully generated and saved {len(output_df)} data points to {args.output_path}")


def run_training(args):
    """
    Runs the meta-model training and evaluation pipeline.
    """
    print("--- Starting Meta-Model Training ---")

    # 1. Load and prepare data
    try:
        df = pd.read_csv(args.data_path)
        print(f"Loaded {len(df)} records from {args.data_path}")
    except FileNotFoundError:
        print(f"Error: Dataset not found at {args.data_path}")
        return

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Concatenate inputs
    df['text'] = df['problem_text'] + " [SEP] " + df['initial_response'] + " [SEP] " + df['self_critique']
    df = df.rename(columns={'is_correct': 'label'})

    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True)

    class SCFPDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = SCFPDataset(train_encodings, train_df['label'].tolist())
    val_dataset = SCFPDataset(val_encodings, val_df['label'].tolist())

    # 2. Define metrics
    def compute_metrics(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        precision, recall, f1, _ = precision_recall_f1_score(p.label_ids, preds, average='binary')
        acc = accuracy_score(p.label_ids, preds)
        return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

    # 3. Configure Trainer
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # 4. Run training and evaluation
    print("Starting training...")
    trainer.train()

    print("Evaluating final model...")
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)

    trainer.save_model(f"{args.output_dir}/best_model")
    print(f"Best model saved to {args.output_dir}/best_model")


def main():
    parser = argparse.ArgumentParser(description="SCFP Benchmark Data Generation and Model Training Pipeline")
    parser.add_argument("--mode", type=str, required=True, choices=['generate', 'train'], help="Pipeline mode to run.")

    # Args for both modes
    parser.add_argument("--output_path", type=str, default="output.csv", help="Path to save generated data.")

    # Args for 'generate' mode
    parser.add_argument("--source_problems_path", type=str, help="Path to CSV with source problems for data generation.")

    # Args for 'train' mode
    parser.add_argument("--data_path", type=str, help="Path to the SCFP benchmark CSV file for training.")
    parser.add_argument("--model_name", type=str, default="microsoft/deberta-v3-base", help="Hugging Face model to fine-tune.")
    parser.add_argument("--output_dir", type=str, default="./scfp_model", help="Directory to save trained model.")

    args = parser.parse_args()

    if args.mode == 'generate':
        if not args.source_problems_path:
            parser.error("--source_problems_path is required for 'generate' mode.")
        run_data_generation(args)
    elif args.mode == 'train':
        if not args.data_path:
            parser.error("--data_path is required for 'train' mode.")
        run_training(args)

if __name__ == "__main__":
    # Example Usage:
    #
    # To generate data (after filling in the placeholder function):
    # python pipeline.py --mode generate --source_problems_path ./my_problems.csv --output_path ./generated_scfp_data.csv
    #
    # To train the model:
    # python pipeline.py --mode train --data_path ./benchmark/SCFP_v1.0.csv --output_dir ./my_trained_model
    main()

ImportError: cannot import name 'precision_recall_f1_score' from 'sklearn.metrics' (/usr/local/lib/python3.12/dist-packages/sklearn/metrics/__init__.py)