In [None]:
import json
import os
import re
import tempfile

import pandas as pd
import sagemaker
from datasets import load_dataset
from sagemaker.jumpstart.model import JumpStartModel

**Summary: This notebook demonstrates how to compare two SageMaker models using the Cisco CCNA dataset for question answering evaluation.**

## 1. Load and Prepare Cisco CCNA Dataset

We'll load the Cisco CCNA dataset from HuggingFace and convert it to the format expected by fmeval.

In [None]:
# Fetch the Cisco CCNA dataset and materialise its 'train' split as a DataFrame
dataset = load_dataset("Elfsong/Cisco_CCNA")
df = pd.DataFrame(dataset['train'])

# One print call; the newline separator reproduces the three original lines
print(
    f"Dataset loaded with {len(df)} questions",
    f"\nDataset columns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Convert dataset to fmeval format

# Matches an option label such as "A.", "B)" or "C:" located at the start of
# the text or right after whitespace; the capture group is the label letter.
_CHOICE_LABEL_RE = re.compile(r'(?:^|\s)([A-Z])[.):]\s+')


def _split_choices(choices):
    """
    Return the answer options of one question as an ordered list of strings.

    Supported inputs, tried in this order:
      * a non-string iterable (list, tuple, array): each item is one option;
      * a string with sequential labels ("A. foo B. bar ..."): the text that
        follows each label is one option;
      * a multi-line string: each non-empty line is one option;
      * any other string: whitespace-separated tokens (the legacy behaviour).

    Anything that cannot be iterated (None, NaN, numbers) yields an empty list.
    """
    if choices is None:
        return []

    if not isinstance(choices, str):
        try:
            # Keep empty items so positions still line up with the letters
            return [str(option).strip() for option in choices]
        except TypeError:
            return []

    pieces = _CHOICE_LABEL_RE.split(choices)
    labels = pieces[1::2]
    expected_labels = [chr(ord('A') + offset) for offset in range(len(labels))]
    # Only trust the labels when they read A, B, C, ... without gaps; this
    # avoids mistaking an initial inside the option text for a label.
    if len(labels) >= 2 and labels == expected_labels:
        return [text.strip() for text in pieces[2::2]]

    if '\n' in choices:
        return [line.strip() for line in choices.splitlines() if line.strip()]

    return [token.strip() for token in choices.split(' ') if token.strip()]


def convert_to_fmeval_format(df, max_records=50):
    """
    Convert Cisco CCNA dataset to fmeval QA format
    Expected format: [{"question": "...", "answer": "..."}]

    Args:
        df: DataFrame with 'question_text', 'correct_answer' and 'choices'
            columns.
        max_records: Maximum number of leading rows to convert.

    Returns:
        A list with one {"question", "answer"} dict per converted row. A
        single-letter answer is replaced by the full text of the matching
        option when it can be resolved; otherwise (multi-letter answers such
        as 'BC', unknown letters, unparsable choices) the answer is kept
        as given.
    """
    fmeval_data = []

    for _, row in df.head(max_records).iterrows():
        question = row['question_text']
        raw_answer = row['correct_answer']

        # Non-string answers (e.g. NaN) are passed through instead of crashing
        answer = raw_answer.strip() if isinstance(raw_answer, str) else raw_answer

        if isinstance(answer, str) and len(answer) == 1:
            options = _split_choices(row['choices'])
            index = ord(answer.upper()) - ord('A')
            if 0 <= index < len(options) and options[index]:
                answer = options[index]

        fmeval_data.append({
            "question": question,
            "answer": answer
        })

    return fmeval_data

# Convert only the first 20 records as a quick check of the conversion
qa_data = convert_to_fmeval_format(df, max_records=20)

print(f"Converted {len(qa_data)} question-answer pairs")
print("\nSample data:")
# Preview the first three pairs, numbering them from 1
for number, pair in enumerate(qa_data[:3], start=1):
    print(f"\nQ{number}: {pair['question'][:100]}...")
    print(f"A{number}: {pair['answer']}")

In [None]:
# Save the converted data to a JSONL file that fmeval can read
def save_to_jsonl(data, filename, output_dir="cisco_ccna_data"):
    """
    Save data to JSONL format for fmeval

    Args:
        data: Iterable of JSON-serialisable items; each becomes one line.
        filename: Name of the file to create inside `output_dir`.
        output_dir: Directory that receives the file; created when missing.

    Returns:
        The path of the written file.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)

    # Explicit encoding keeps the output independent of the platform default
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in data)

    return filepath

# Write the converted QA pairs to disk and remember where they went
qa_file_path = save_to_jsonl(data=qa_data, filename="cisco_ccna_qa.jsonl")
print(f"Saved QA data to: {qa_file_path}")

## 2. Configure Your SageMaker Endpoints

Replace the endpoint names and model IDs below with the values for your own deployed SageMaker endpoints.

In [None]:
# Names of the two deployed SageMaker endpoints being compared
ENDPOINT_NAME_1 = "jumpstart-dft-hf-llm-gemma-7b-20250813-123819"
ENDPOINT_NAME_2 = "jumpstart-dft-hf-llm-gemma-7b-20250812-200929"

# Identifiers used to label each model in printed output and result files.
# NOTE(review): these are also passed to JumpStartModelRunner as `model_id`;
# confirm that it accepts them or replace them with the JumpStart model IDs.
MODEL_ID_1 = "gemma-7b-123819"
MODEL_ID_2 = "gemma-7b-200929"

print("Using endpoints:")
for position, (endpoint, model_id) in enumerate(
    ((ENDPOINT_NAME_1, MODEL_ID_1), (ENDPOINT_NAME_2, MODEL_ID_2)), start=1
):
    print(f"{position}. {endpoint} ({model_id})")

In [None]:
# Helper function to test endpoint connectivity and determine output format
def test_endpoint(endpoint_name, test_prompt="What is a router?"):
    """
    Test endpoint connectivity and determine response format
    """
    try:
        predictor = sagemaker.predictor.Predictor(
            endpoint_name=endpoint_name,
            serializer=sagemaker.serializers.JSONSerializer(),
            deserializer=sagemaker.deserializers.JSONDeserializer()
        )
        
        # Test with a simple payload - you may need to adjust this based on your model
        payload = {
            "inputs": test_prompt,
            "parameters": {
                "max_new_tokens": 100,
                "temperature": 0.1
            }
        }
        
        response = predictor.predict(payload)
        print(f"Endpoint {endpoint_name} is working!")
        print(f"Sample response: {str(response)[:200]}...")
        
        # Try to determine output format
        if isinstance(response, list) and len(response) > 0:
            if "generated_text" in response[0]:
                output_format = "[0].generated_text"
            elif "generation" in response[0]:
                output_format = "[0].generation"
            else:
                output_format = "[0]"
        elif isinstance(response, dict):
            if "generated_text" in response:
                output_format = ".generated_text"
            elif "generation" in response:
                output_format = ".generation"
            else:
                output_format = ""
        else:
            output_format = ""
            
        return predictor, output_format
        
    except Exception as e:
        print(f"Error testing endpoint {endpoint_name}: {str(e)}")
        return None, None

# Probe each endpoint once; a failed probe yields (None, None)
predictor_1, output_format_1 = test_endpoint(endpoint_name=ENDPOINT_NAME_1)
predictor_2, output_format_2 = test_endpoint(endpoint_name=ENDPOINT_NAME_2)

## 3. Set up Model Runners for Evaluation

We'll configure the model runners that fmeval will use to interact with your endpoints.

In [None]:
from fmeval.eval_algorithms.qa_accuracy import QAAccuracy, QAAccuracyConfig
from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner

In [None]:
# Configure model runners - adjust content_template based on your model requirements

# Request body template shared by both runners. `$prompt` is the placeholder
# that is replaced with the prompt. Kept on one logical line because a
# single-quoted Python string cannot span several lines.
CONTENT_TEMPLATE = (
    '{"inputs": $prompt, '
    '"parameters": {"max_new_tokens": 100, "temperature": 0.1, "top_p": 0.9}}'
)


def _build_model_runner(endpoint_name, model_id, predictor, output_format):
    """
    Create the JumpStartModelRunner for one endpoint.

    Returns None (after printing a notice) when `predictor` is None, i.e. the
    connectivity check for this endpoint failed, so a dead endpoint does not
    abort the notebook here; the evaluation cells skip such models as well.
    """
    if predictor is None:
        print(f"Skipping model runner for {model_id} - endpoint not available")
        return None

    return JumpStartModelRunner(
        endpoint_name=endpoint_name,
        model_id=model_id,
        model_version="*",
        output=output_format,
        content_template=CONTENT_TEMPLATE,
    )


# Model Runner 1
model_runner_1 = _build_model_runner(
    ENDPOINT_NAME_1, MODEL_ID_1, predictor_1, output_format_1
)

# Model Runner 2
model_runner_2 = _build_model_runner(
    ENDPOINT_NAME_2, MODEL_ID_2, predictor_2, output_format_2
)

if model_runner_1 is not None or model_runner_2 is not None:
    print("Model runners configured successfully!")

## 4. Run the Evaluation

Now we'll run the QA accuracy evaluation on both models using the Cisco CCNA dataset.

In [None]:
# Helper function to configure and run evaluation
def run_eval(model_runner, model_name, dataset_path, num_records=10):
    """
    Configure and run QA evaluation

    Results are cached in cisco_ccna_results/<model_name>.json; when that file
    exists and is readable it is returned without contacting the model.

    Args:
        model_runner: fmeval model runner used to query the model.
        model_name: Label for the model; also the cache file's base name.
        dataset_path: Path/URI of the JSONL file with "question"/"answer" keys.
        num_records: Number of records to evaluate (small default for testing).

    Returns:
        The cached JSON content on a cache hit, the objects returned by
        `evaluate` after a fresh run, or None when the evaluation failed
        (the error is printed, not raised).
    """
    # Configure filepath for results
    results_dir = "cisco_ccna_results"
    os.makedirs(results_dir, exist_ok=True)
    results_path = os.path.join(results_dir, f"{model_name}.json")

    # Load results from file if the eval has already been run
    if os.path.exists(results_path):
        try:
            with open(results_path, 'r') as f:
                results = json.load(f)
            print(f'Results loaded from {results_path}')
            return results
        except json.JSONDecodeError as e:
            # A truncated/corrupt cache must not block a re-run
            print(f"Ignoring unreadable cached results at {results_path}: {str(e)}")

    print(f"Running evaluation for {model_name}...")
    try:
        # Imported here so this cell only needs the names it already had in scope
        from fmeval.data_loaders.data_config import DataConfig

        # The custom dataset is described by a DataConfig handed to evaluate();
        # QAAccuracyConfig itself does not take dataset arguments.
        dataset_config = DataConfig(
            dataset_name="cisco_ccna_qa",
            dataset_uri=dataset_path,
            dataset_mime_type="application/jsonlines",
            model_input_location="question",
            target_output_location="answer",
        )

        # Built only when an evaluation actually runs (not on a cache hit)
        qa_eval = QAAccuracy(QAAccuracyConfig())

        # NOTE(review): no prompt_template is passed, so fmeval's default for
        # custom datasets applies - confirm this for the installed version.
        results = qa_eval.evaluate(
            model=model_runner,
            dataset_config=dataset_config,
            save=True,
            num_records=num_records
        )

        # Serialise before opening the file so a serialisation error cannot
        # leave a half-written cache behind.
        serialized = json.dumps(results, default=lambda c: c.__dict__)
        with open(results_path, 'w') as f:
            f.write(serialized)
        print(f'Results saved to {results_path}')
    except Exception as e:
        print(f"Error running evaluation for {model_name}: {str(e)}")
        return None

    return results

In [None]:
# Evaluate Model 1 unless its endpoint failed the connectivity check
if predictor_1 is None:
    print(f"Skipping {MODEL_ID_1} evaluation - endpoint not available")
    results_model_1 = None
else:
    print(f"\n=== Evaluating {MODEL_ID_1} ===")
    results_model_1 = run_eval(model_runner_1, MODEL_ID_1, qa_file_path)

In [None]:
# Evaluate Model 2 unless its endpoint failed the connectivity check
if predictor_2 is None:
    print(f"Skipping {MODEL_ID_2} evaluation - endpoint not available")
    results_model_2 = None
else:
    print(f"\n=== Evaluating {MODEL_ID_2} ===")
    results_model_2 = run_eval(model_runner_2, MODEL_ID_2, qa_file_path)

## 5. Visualize Results

We'll create a radar plot to compare the performance of both models.

In [None]:
# Install plotting packages if not already available
!pip install -U plotly kaleido

In [None]:
import plotly.express as px
import plotly.io as pio

# Make 'notebook' the default renderer used when figures are shown
pio.renderers.default = "notebook"

In [None]:
# Function to load and format results for visualization
def load_results_for_viz(model_names, results_dir="cisco_ccna_results"):
    """
    Load evaluation results and format for visualization

    Args:
        model_names: Model labels; each is read from <results_dir>/<name>.json.
        results_dir: Directory holding the saved result files.

    Returns:
        DataFrame with columns model, evaluation, dataset, metric, value - one
        row per dataset-level score. The columns are present even when no
        result could be loaded. Missing or unreadable files are reported via
        print and skipped.
    """
    columns = ['model', 'evaluation', 'dataset', 'metric', 'value']
    accuracy_results = []

    for model_name in model_names:
        results_path = os.path.join(results_dir, f"{model_name}.json")

        if not os.path.exists(results_path):
            print(f"Results file not found: {results_path}")
            continue

        try:
            with open(results_path, 'r') as f:
                res = json.load(f)

            for accuracy_eval in res:
                # dataset_scores can be absent or null; treat that as "no
                # scores" instead of failing the whole file.
                for accuracy_scores in accuracy_eval.get("dataset_scores") or []:
                    accuracy_results.append({
                        'model': model_name,
                        'evaluation': 'accuracy',
                        'dataset': accuracy_eval["dataset_name"],
                        'metric': accuracy_scores["name"],
                        'value': accuracy_scores["value"]
                    })
        except Exception as e:
            print(f"Error loading results for {model_name}: {str(e)}")

    return pd.DataFrame(accuracy_results, columns=columns)

# Function to create radar plot
def visualize_cisco_results(results_df, results_dir="cisco_ccna_results"):
    """
    Create radar plot for Cisco CCNA evaluation results

    Args:
        results_df: DataFrame with 'model', 'metric' and 'value' columns.
        results_dir: Directory that receives the exported HTML and PDF files;
            created when missing.

    Returns:
        The plotly figure, or None when `results_df` is empty.
    """
    if results_df.empty:
        print("No results available for visualization")
        return None

    # Create the radar plot: one closed line per model, one spoke per metric
    fig = px.line_polar(
        results_df,
        r='value',
        theta='metric',
        color='model',
        line_close=True,
        title="Model Comparison on Cisco CCNA Dataset"
    )

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )
        ),
        title=dict(font=dict(size=20)),
        margin=dict(l=150, r=0, t=100, b=80)
    )

    fig.show()

    # Save the plot. The HTML export goes first because it has no extra
    # dependency; the PDF export needs a working static-image backend, so a
    # failure there is reported without losing the figure or the HTML file.
    os.makedirs(results_dir, exist_ok=True)
    fig.write_html(os.path.join(results_dir, "cisco_ccna_comparison.html"))
    try:
        fig.write_image(os.path.join(results_dir, "cisco_ccna_comparison.pdf"))
    except Exception as e:
        print(f"Could not export PDF version of the plot: {str(e)}")

    return fig

In [None]:
# Keep only the models whose evaluation produced results, then plot them
available_models = [
    model_id
    for model_id, outcome in (
        (MODEL_ID_1, results_model_1),
        (MODEL_ID_2, results_model_2),
    )
    if outcome is not None
]

if not available_models:
    print("No models were successfully evaluated")
else:
    print(f"Loading results for models: {available_models}")
    results_df = load_results_for_viz(available_models)

    if results_df.empty:
        print("No valid results found for visualization")
    else:
        print("\nResults summary:")
        print(results_df.groupby(['model', 'metric'])['value'].mean())

        # Draw and export the comparison chart
        fig = visualize_cisco_results(results_df)

## 6. Summary and Next Steps

This notebook provides a framework for comparing your SageMaker models using the Cisco CCNA dataset. 

### Key Points:
1. **Dataset**: Uses the Cisco CCNA networking questions dataset
2. **Models**: Compares two of your existing SageMaker endpoints
3. **Evaluation**: Uses fmeval's QA accuracy metrics
4. **Visualization**: Creates radar plots to compare performance

### To customize for your use case:
1. Update the endpoint names and model IDs
2. Adjust the content templates based on your model's expected input format
3. Modify the output format parsing if needed
4. Increase the number of evaluation records once you verify everything works

### Troubleshooting:
- If endpoints fail to connect, verify they're running and accessible
- If evaluation fails, check the content template format matches your model's API
- If model responses are parsed incorrectly, adjust the output format string to match your model's response structure