## Evaluate a language model's performance on a dataset 

I have tried two methods, and both work. Feel free to try them and compare their efficiency.

1. **First Method (Straightforward Approach)**: 
   - This method utilizes a defined function (`get_answer`) that takes structured input and generates predictions from the language model. It is straightforward and easy to understand, making it ideal for users who prefer a clear and direct approach to model evaluation. The function encapsulates the logic for processing input data and invoking the model, ensuring that the evaluation process is organized and maintainable.

2. **Second Method (Creative Approach)**: 
   - The second method employs a lambda function as the `llm_or_chain_factory`. This approach allows for more creativity in how predictions are generated and evaluated. While it can lead to more concise code, it requires careful handling of input and output structures. This method is suitable for users who want to experiment with different ways of processing input data and generating model outputs, providing flexibility in the evaluation process.


In [None]:
import os
from langsmith import Client
from langsmith.evaluation import evaluate
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Environment configuration for Groq and LangSmith. The key values below are
# placeholders and must be replaced with real credentials before running.
os.environ.update(
    {
        "GROQ_API_KEY": "your groq key",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_API_KEY": "your langchain key",  # Update with your API key
    }
)

# Groq-hosted chat model: temperature 0, no explicit token or time limit,
# and up to two retries per request.
llm = ChatGroq(
    model="llama3-groq-8b-8192-tool-use-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
from langsmith import Client
import pandas as pd
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from langchain.smith import RunEvalConfig
import uuid

# Load the CSV file into a DataFrame.
# NOTE: `df` holds only the first 5 rows; the upload below sends the whole
# CSV file at `csv_file_path`, not this truncated frame.
csv_file_path = "Spar.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file_path).head(5)

# Initialize the LangSmith client
client = Client()

# Upload the dataset. The random UUID suffix gives every run a fresh name.
try:
    dataset = client.upload_csv(
        csv_file=csv_file_path,
        input_keys=["Story", "Question", "Candidate_Answers"],  # Adjust based on your CSV structure
        output_keys=["Answer"],
        name=f"spar_human_{uuid.uuid4()}",
    )
except Exception as e:
    print(f"Error uploading dataset: {e}")
    # Re-raise instead of continuing: without a successful upload, `dataset`
    # is either undefined or left over from an earlier notebook run, so the
    # evaluation below would crash or silently target the wrong dataset.
    raise
else:
    print("Dataset uploaded successfully.")

# Define the model
model = ChatGroq(
    model="llama3-groq-8b-8192-tool-use-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Modify get_answer to accept input as a parameter
def get_answer(input_):
    """Ask the module-level ``model`` to answer one multiple-choice example.

    Args:
        input_: Mapping with "Story", "Question" and "Candidate_Answers"
            entries. "Candidate_Answers" is split on ", " into a list.

    Returns:
        The ``content`` attribute of the model's response.
    """
    context = input_["Story"]
    query = input_["Question"]
    # Candidate answers are assumed to be separated by ", ".
    choices = input_["Candidate_Answers"].split(', ')

    # The system prompt comes from Visual_prompt, which must be defined
    # elsewhere before this function is called.
    system_message = SystemMessage(content=Visual_prompt)
    user_prompt = (
        f"Given the context: {context} and question: {query}\n"
        f"Options: {choices}\n"
        "Please select the correct answer."
    )
    human_message = HumanMessage(content=user_prompt)

    reply = model.invoke([system_message, human_message])
    return reply.content

# Score each prediction with the "exact_match" evaluator.
eval_config = RunEvalConfig(evaluators=["exact_match"])

# Run get_answer over the uploaded dataset and grade the outputs.
client.run_on_dataset(
    # Dataset to predict and grade over.
    dataset_name=dataset.name,
    # Function that produces the predicted output for one example.
    llm_or_chain_factory=get_answer,
    # Evaluators used to score the results.
    evaluation=eval_config,
    # Metadata attached to this run (version tag and model name).
    project_metadata={"version": "1.0.0", "model": model.model_name},
)

In [None]:
import pandas as pd
from langsmith import Client
from langchain_groq import ChatGroq
from langchain_core.messages import SystemMessage, HumanMessage
from langchain.smith import RunEvalConfig
import uuid

# Load the CSV file into a DataFrame.
# NOTE: the upload below reads the CSV from `csv_file_path` directly; it does
# not use `df`.
csv_file_path = "Spar.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file_path)

# Initialize the LangSmith client
client = Client()

# Upload the dataset. The random UUID suffix gives every run a fresh name.
try:
    dataset = client.upload_csv(
        csv_file=csv_file_path,
        input_keys=["Story", "Question", "Candidate_Answers"],  # Adjust based on your CSV structure
        output_keys=["Answer"],
        name=f"spar_human_{uuid.uuid4()}",
    )
except Exception as e:
    print(f"Error uploading dataset: {e}")
    # Re-raise instead of continuing: without a successful upload, `dataset`
    # is either undefined or left over from an earlier notebook run, so the
    # evaluation below would crash or silently target the wrong dataset.
    raise
else:
    print("Dataset uploaded successfully.")



def evaluate_model(model):
    """Evaluate ``model`` on the uploaded dataset and collect its answers.

    The function relies on the module-level ``client``, ``dataset`` and
    ``Visual_prompt``. It first asks the model for an answer to every example
    in the dataset, then runs a LangSmith evaluation whose
    ``llm_or_chain_factory`` is a lambda that returns the stored answer for
    the example it is given, so each example is sent to the model only once.

    Args:
        model: Chat model exposing ``invoke(messages)`` and ``model_name``.

    Returns:
        A list of dicts with the keys "model", "question", "predicted" and
        "ground_truth", one per example the model answered successfully.
    """
    results = []
    predictions = {}  # example inputs -> predicted answer text

    def _key(inputs):
        # Identify an example by its three input fields.
        return (inputs["Story"], inputs["Question"], inputs["Candidate_Answers"])

    for example in client.list_examples(dataset_name=dataset.name):
        input_ = example.inputs
        output_ = example.outputs or {}

        story = input_["Story"]
        question = input_["Question"]
        options = input_["Candidate_Answers"].split(', ')  # Assuming options are comma-separated
        ground_truth = output_.get("Answer")

        # Create messages for the current model
        messages = [
            SystemMessage(content=Visual_prompt),  # Ensure Visual_prompt is defined
            HumanMessage(content=f"Given the context: {story} and question: {question}\nOptions: {options}\nPlease select the correct answer.")
        ]

        try:
            # Generate output from the LLM using the messages
            output = model.invoke(messages)
        except Exception as e:
            # Keep going: one failed example should not discard the others.
            print(f"Error invoking model {model.model_name}: {e}")
            continue

        predictions[_key(input_)] = output.content
        results.append({
            "model": model.model_name,
            "question": question,
            "predicted": output.content,
            "ground_truth": ground_truth,
        })

    eval_config = RunEvalConfig(
        evaluators=["exact_match"],  # Use the "exact_match" evaluator
        prediction_key="predicted",  # Key for the model's output
        reference_key="Answer"  # Use the correct key for ground truth
    )

    try:
        # Run the evaluation using the LangSmith client and the dataset.
        # The lambda returns only the prediction for the example it receives;
        # the reference answer comes from the dataset via reference_key. An
        # example whose model call failed above has no stored prediction, so
        # the lookup raises KeyError for that example.
        client.run_on_dataset(
            dataset_name=dataset.name,  # The data to predict and grade over
            llm_or_chain_factory=lambda input_: {"predicted": predictions[_key(input_)]},
            evaluation=eval_config,  # The evaluators to score the results
            project_metadata={"version": "1.0.0", "model": model.model_name},  # Metadata attached to the run
        )
    except Exception as e:
        print(f"Error running evaluation for model {model.model_name}: {e}")

    return results


# Groq chat model to evaluate.
model = ChatGroq(
    model="llama3-groq-8b-8192-tool-use-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# evaluate_model returns a plain list of dicts, and a list has no to_csv();
# wrap the rows in a DataFrame before writing them out.
test = evaluate_model(model)
pd.DataFrame(test).to_csv("test_result.csv")