In [1]:
import os
import json
import glob
import traceback
from datetime import datetime
from text2sql_pipeline import Text2SQLPipeline

# Dataset paths - configure these for your environment.
# Each path can be overridden with an environment variable of the same name;
# the defaults are the original development locations.
BIRD_DATASET_PATH = os.getenv(
    'BIRD_DATASET_PATH',
    '/Users/sinabehnam/Desktop/Projects/Polito/Thesis/MA_text2SQL/DataSampling/src/src_data/bird_subset/stratified_output',
)
SPIDER_DATASET_PATH = os.getenv(
    'SPIDER_DATASET_PATH',
    '/Users/sinabehnam/Desktop/Projects/Polito/Thesis/MA_text2SQL/DataSampling/src/src_data/spider_subset/spider_stratified_output_200',
)

# Credentials are read from the environment so they never end up in the notebook
# or in version control. API_KEY is the key used by the "together_ai" model config.
# NOTE(review): a literal key used to be committed on this line; treat it as
# compromised and rotate it.
API_KEY = os.getenv("TOGETHER_API_KEY", "your_together_api_key_here")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your_openai_api_key_here")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_evaluation_summary_from_directory(base_dir):
    """
    Generate evaluation summary by reading enriched JSON files from directory.

    Every immediate subdirectory of ``base_dir`` is treated as one model. Each
    ``*.json`` file directly inside it counts as one evaluated sample; the
    ``inference_results`` object of the file (when present) supplies the
    ``has_prediction`` flag and, under ``predicted_output``, the
    ``execution_correct`` / ``exact_match`` / ``semantic_equivalent`` flags.

    Accuracies are computed over samples that have a prediction, while the
    prediction rate is computed over all JSON files found (including files
    that could not be read or carry no inference results).

    Args:
        base_dir: Base directory containing model-specific subdirectories with enriched JSON files

    Returns:
        Dictionary with evaluation metrics for each model, keyed by the
        ``model_name`` recorded in the files, or by the subdirectory name when
        no such name is available. Subdirectories without JSON files are omitted.

    Raises:
        OSError: If ``base_dir`` cannot be listed (e.g. it does not exist).
    """
    summary = {}

    # os.listdir returns entries in arbitrary order; sort so that the summary
    # key order and the printed report are reproducible across runs/machines.
    model_dirs = sorted(
        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
    )

    for model_dir in model_dirs:
        model_path = os.path.join(base_dir, model_dir)

        # Escape the directory part so names containing glob metacharacters
        # ('[', '*', '?') are matched literally. Sorting makes the choice of
        # the file that provides `model_info` deterministic.
        json_files = sorted(glob.glob(os.path.join(glob.escape(model_path), '*.json')))

        if not json_files:
            print(f"No JSON files found in {model_path}")
            continue

        # Metrics to calculate
        num_evaluated = len(json_files)
        has_prediction = 0
        execution_correct = 0
        exact_match = 0
        semantic_equivalent = 0
        model_info = None

        # Process each JSON file
        for json_file in json_files:
            try:
                # JSON text is UTF-8; do not depend on the platform default encoding.
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Files without a usable inference_results object still count
                # as evaluated but contribute nothing else.
                inference = data.get('inference_results') if isinstance(data, dict) else None
                if not isinstance(inference, dict):
                    continue

                # Save model info if not already set
                if model_info is None and 'model' in inference:
                    model_info = inference['model']

                # Count metrics
                if inference.get('has_prediction', False):
                    has_prediction += 1

                    # A null / malformed predicted_output means "no flags set"
                    # rather than an error.
                    pred_output = inference.get('predicted_output')
                    if not isinstance(pred_output, dict):
                        pred_output = {}

                    if pred_output.get('execution_correct', False):
                        execution_correct += 1
                    if pred_output.get('exact_match', False):
                        exact_match += 1
                    if pred_output.get('semantic_equivalent', False):
                        semantic_equivalent += 1

            except Exception as e:
                # Best effort: one unreadable file must not abort the whole summary.
                print(f"Error processing {json_file}: {str(e)}")

        # Calculate metrics
        metrics = {
            'num_evaluated': num_evaluated,
            'num_with_prediction': has_prediction,
            'prediction_rate': has_prediction / num_evaluated if num_evaluated > 0 else 0,
            'execution_accuracy': execution_correct / has_prediction if has_prediction > 0 else 0,
            'exact_match_accuracy': exact_match / has_prediction if has_prediction > 0 else 0,
            'semantic_equivalent_accuracy': semantic_equivalent / has_prediction if has_prediction > 0 else 0,
            'model': model_info or {'model_name': model_dir}
        }

        # Get the full model name from the model info; fall back to the
        # directory name when the info is missing or is not a mapping.
        if isinstance(model_info, dict) and 'model_name' in model_info:
            model_name = model_info['model_name']
        else:
            model_name = model_dir

        # Add to summary
        summary[model_name] = metrics

        # Print metrics
        print(f"\nMetrics for model {model_name}:")
        print(f"Total evaluated: {metrics['num_evaluated']}")
        print(f"Prediction rate: {metrics['prediction_rate']:.2f}")
        print(f"Execution accuracy: {metrics['execution_accuracy']:.2f}")
        print(f"Exact match accuracy: {metrics['exact_match_accuracy']:.2f}")
        print(f"Semantic equivalence accuracy: {metrics['semantic_equivalent_accuracy']:.2f}")

    return summary

In [3]:
# Set up output directories for updated JSON files.
# The location can be overridden with the TEXT2SQL_OUTPUT_DIR environment
# variable; the default is the original development path.
OUTPUT_BASE_DIR = os.getenv(
    'TEXT2SQL_OUTPUT_DIR',
    '/Users/sinabehnam/Desktop/Projects/Polito/Thesis/MA_text2SQL/output/pipeline/enriched_output',
)
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

# Define model configurations for testing.
# Each entry is passed as `model_config` to Text2SQLPipeline; "name" is also
# used to derive the per-model output directory.
model_configs = [
    # API-based models
    # {
    #     "type": "together_ai",
    #     "name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
    #     "api_key": API_KEY
    # },
    # Local models - uncomment and configure as needed
    {
        "type": "local",
        "name": "mistralai/Mistral-7B-Instruct-v0.1",
        "device": "auto",
        "max_new_tokens": 512
    },
    # {
    #     "type": "local",
    #     "name": "TheBloke/Llama-2-7B-Chat-GGUF",
    #     "device": "cpu",
    #     "max_new_tokens": 512
    # }
]

In [4]:
# Per-model outcome: summary metrics on success, error details on failure.
all_results = {}

for model_config in model_configs:
    model_name = model_config["name"]
    model_type = model_config["type"]

    try:
        print(f"\nEvaluating {model_type} model: {model_name}")

        # Each model writes into its own directory, named after the final
        # path component of the model identifier.
        model_dir_name = model_name.rsplit('/', 1)[-1]
        model_output_dir = os.path.join(OUTPUT_BASE_DIR, model_dir_name)
        os.makedirs(model_output_dir, exist_ok=True)

        # Build the pipeline for this model configuration and run it.
        pipeline = Text2SQLPipeline(
            bird_path=BIRD_DATASET_PATH,
            spider_path=SPIDER_DATASET_PATH,
            model_config=model_config,
        )
        results = pipeline.run_pipeline(
            num_samples=10,
            save_updated_files=True,
            output_dir=model_output_dir,
        )

        # Keep only the headline numbers (no per-sample details).
        summary_metrics = {
            **{
                key: results[key]
                for key in (
                    'num_evaluated',
                    'num_with_prediction',
                    'prediction_rate',
                    'execution_accuracy',
                    'exact_match_accuracy',
                )
            },
            'semantic_equivalent_accuracy': results.get('semantic_equivalent_accuracy', 0.0),
            'model': results['model'],
        }
        all_results[model_name] = summary_metrics

        # Report the headline numbers.
        print("\nSummary Metrics:")
        print(f"Total evaluated: {summary_metrics['num_evaluated']}")
        for metric_label, metric_key in (
            ("Prediction rate", 'prediction_rate'),
            ("Execution accuracy", 'execution_accuracy'),
            ("Exact match accuracy", 'exact_match_accuracy'),
            ("Semantic equivalence accuracy", 'semantic_equivalent_accuracy'),
        ):
            print(f"{metric_label}: {summary_metrics[metric_key]:.2f}")
        print(f"Updated JSON files saved to: {model_output_dir}")

    except Exception as exc:
        # A failing model must not stop the remaining ones: log the failure
        # and record it in place of the metrics.
        error_traceback = traceback.format_exc()
        print(f"Error processing model {model_name}: {str(exc)}")
        print(f"Traceback: {error_traceback}")

        all_results[model_name] = {
            'error': str(exc),
            'traceback': error_traceback,
            'model': {'model_name': model_name, 'model_type': model_type},
        }


Evaluating local model: mistralai/Mistral-7B-Instruct-v0.1
Total data points: 521
Bird data points: 200
Spider data points: 321
Training data points: 416
Testing data points: 105
Loading model mistralai/Mistral-7B-Instruct-v0.1 on cpu...
Error processing model mistralai/Mistral-7B-Instruct-v0.1: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like mistralai/Mistral-7B-Instruct-v0.1 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
Traceback: Traceback (most recent call last):
  File "/Users/sinabehnam/Desktop/Projects/Polito/Thesis/MA_text2SQL/venv/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "/Users/sinabehnam/Desktop/Projects/Polito/Thesis/MA_text2SQL/venv/lib

In [None]:
# Persist the collected per-model results; the timestamp in the file name
# keeps earlier runs from being overwritten.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_file = os.path.join(OUTPUT_BASE_DIR, f'evaluation_summary_{timestamp}.json')

with open(summary_file, mode='w') as summary_fh:
    json.dump(all_results, summary_fh, indent=2)

print(
    f"\nEvaluation complete. Summary saved to '{summary_file}'",
    f"Individual JSON files with inference results saved to model-specific directories under '{OUTPUT_BASE_DIR}'",
    sep="\n",
)

In [None]:
# Cross-check: rebuild the summary from the enriched JSON files on disk.
print("\nGenerating summary from enriched files...")
directory_summary = generate_evaluation_summary_from_directory(OUTPUT_BASE_DIR)

# Store it next to the pipeline summary, reusing the timestamp of the save step.
summary_from_dir_file = os.path.join(
    OUTPUT_BASE_DIR,
    f'evaluation_summary_from_dir_{timestamp}.json',
)
with open(summary_from_dir_file, mode='w') as dir_summary_fh:
    json.dump(directory_summary, dir_summary_fh, indent=2)

print(f"Summary from directory saved to '{summary_from_dir_file}'")