### Package Requirements

In [None]:
import json
import os
from collections import defaultdict

import jsons
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from openai import OpenAI
import tiktoken

from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
import sacrebleu

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM



In [None]:
# Load variables from a local .env file, then expose the key under the name
# the OpenAI client reads (OPENAI_API_KEY).
load_dotenv()

_api_key = os.getenv('API_KEY')
if _api_key:
    os.environ['OPENAI_API_KEY'] = _api_key
elif not os.getenv('OPENAI_API_KEY'):
    # Assigning None to os.environ raises an opaque TypeError; fail with a
    # message that says what is actually missing instead.
    raise RuntimeError(
        "No API key found: set API_KEY (e.g. in a .env file) or OPENAI_API_KEY."
    )

# Introduction
# Technical Translation: English to Spanish, GPT 4
---

## Description
The USFS AirFire group maintains systems that forecasters use to create Smoke Outlooks, i.e. wildfire smoke forecasts. These are initially written in English. Then they are auto-translated into Spanish with ChatGPT. Finally, the translations are manually edited by a native speaker. The goal of this project is to use our corpus of existing outlooks and translations to train a ChatGPT model for improved translations. A simple user interface will also be needed to allow professional translators to compare translations using the trained and untrained models.

## Goals
We hope to reduce the amount of time staff spend translating outlooks while maintaining high quality translations. If all goes well, we will end up with: 1) a trained model for improved translations; 2) tailored prompts; and 3) a small user interface to test ChatGPT models. If successful, this approach will be used to support translation targets other than Spanish.

# Part 1
---
## Data Preparation
The following data contains historical smoke and fire forecasts that have been translated from English to Spanish. These translations have been verified by a translator.  

**Comment:** you may need to change the paths for where the data is being pulled from.

In [None]:
# Load the two translation tables (location forecasts and collection forecasts).
# Paths are relative to the notebook; adjust them if the CSVs live elsewhere.
location_df = pd.read_csv('./data/Location_Forecast_translations.csv')
collection_df = pd.read_csv('./data/Collection_Forecast_translations.csv')

In [None]:
# Preview the first rows of the location forecast table.
location_df.head()

In [None]:
# Preview the first rows of the collection forecast table.
collection_df.head()

**Comment:** Edits to the translation prompt can be made here.

In [None]:
# System prompt used for every training/validation example and for every
# request made while testing models. It fixes the Spanish wording of the
# air-quality category names via explicit word-translation pairs.
translation_prompt = """
You are an official English to Spanish translator translating air quality forecasts
from English into Spanish. Use the following word-translation pairs when translating:

GOOD => BUENO
MODERATE => MODERADA
USG => IGS
UNHEALTHY => INSALUBRE
VERY UNHEALTHY => MUY INSALUBRE
HAZARDOUS => PELIGROSA
NaN => NaN

Translate the following air quality discussion into Spanish:"""

**Comment:** The following functions are used to load translations from .csv format to a usable .jsonl format. This is required for OpenAI API usage.

The function write_to_jsonl() saves the jsonl files in the current working directory. The names of the training, validation, and testing files can be changed when the function is actually called. 

In [None]:
def _location_texts(row):
    """Return the (english_text, spanish_text) pair built from one location forecast row."""
    english_text = f"""
Forecast Summary: {row['forecast_summary']}
Today's Comment: {row['today_comment']}
Tomorrow's Comment: {row['tomorrow_comment']}
Extended Comment: {row['extended_comment']}"""

    spanish_text = f"""
Resumen del Pronóstico: {row['forecast_summary_es']}
Comentario de Hoy: {row['today_comment_es']}
Comentario de Mañana: {row['tomorrow_comment_es']}
Comentario Extendido: {row['extended_comment_es']}"""

    return english_text, spanish_text


def _location_chat_examples(frame, translation_prompt):
    """Format every row of `frame` as a system/user/assistant chat example."""
    examples = []
    for _, row in frame.iterrows():
        english_text, spanish_text = _location_texts(row)
        examples.append({
            'messages': [
                {'role': 'system', 'content': translation_prompt},
                {'role': 'user', 'content': english_text},
                {'role': 'assistant', 'content': spanish_text}
            ]
        })
    return examples


def prepare_location_forecast(location_data, translation_prompt, random_state=42):
    """
    Prepares training, testing, and validation datasets for location forecast translations.

    The input is split into training (70%), validation (15%), and testing (15%) sets:
    30% is held out first and that hold-out is then halved. Training and validation
    rows become chat examples (system prompt, English user message, Spanish assistant
    message); testing rows are kept as plain English/Spanish text pairs.

    Parameters:
    location_data (DataFrame): A pandas DataFrame with the columns forecast_summary,
        today_comment, tomorrow_comment, extended_comment and their `_es` counterparts.
    translation_prompt (str): The system prompt for guiding the translation model.
    random_state (int, optional): Random seed used for both splits. Default is 42.

    Returns:
    tuple: A tuple containing three lists, in this order:
        - location_training_data (list): Training data formatted as system, user, and assistant messages.
        - location_testing_data (list): Dicts with the keys 'english_text' and 'spanish_text'.
        - location_validation_data (list): Validation data formatted like the training data.
    """
    train_data, temp_data = train_test_split(location_data, test_size=0.3, random_state=random_state)
    validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=random_state)

    location_training_data = _location_chat_examples(train_data, translation_prompt)

    # Testing records carry no prompt: they are fed to the model at evaluation
    # time and compared with the reference translation.
    location_testing_data = []
    for _, row in test_data.iterrows():
        english_text, spanish_text = _location_texts(row)
        location_testing_data.append({
            'english_text': english_text,
            'spanish_text': spanish_text
        })

    location_validation_data = _location_chat_examples(validation_data, translation_prompt)

    return location_training_data, location_testing_data, location_validation_data

In [None]:
def _collection_texts(row):
    """Return the (english_text, spanish_text) pair built from one collection forecast row."""
    english_text = f"""
Fire Statement: {row['fire_statement_json']}
Smoke Statement: {row['smoke_statement_json']}
Special Statement: {row['special_statement_json']}"""

    spanish_text = f"""
Declaración del Incendio: {row['fire_statement_json_es']}
Declaración del Humo: {row['smoke_statement_json_es']}
Declaración Especial: {row['special_statement_json_es']}"""

    return english_text, spanish_text


def _collection_chat_examples(frame, translation_prompt):
    """Format every row of `frame` as a system/user/assistant chat example."""
    examples = []
    for _, row in frame.iterrows():
        english_text, spanish_text = _collection_texts(row)
        examples.append({
            'messages': [
                {'role': 'system', 'content': translation_prompt},
                {'role': 'user', 'content': english_text},
                {'role': 'assistant', 'content': spanish_text}
            ]
        })
    return examples


def prepare_collection_forecast(collection_data, translation_prompt, random_state=42):
    """
    Prepares training, testing, and validation datasets for collection forecast translations.

    The input is split into training (70%), validation (15%), and testing (15%) sets:
    30% is held out first and that hold-out is then halved. Training and validation
    rows become chat examples (system prompt, English user message, Spanish assistant
    message); testing rows are kept as plain English/Spanish text pairs.

    Parameters:
    collection_data (DataFrame): A pandas DataFrame with the columns fire_statement_json,
        smoke_statement_json, special_statement_json and their `_es` counterparts.
    translation_prompt (str): The system prompt for guiding the translation model.
    random_state (int, optional): Random seed used for both splits. Default is 42.

    Returns:
    tuple: A tuple containing three lists, in this order:
        - collection_training_data (list): Training data formatted as system, user, and assistant messages.
        - collection_testing_data (list): Dicts with the keys 'english_text' and 'spanish_text'.
        - collection_validation_data (list): Validation data formatted like the training data.
    """
    train_data, temp_data = train_test_split(collection_data, test_size=0.3, random_state=random_state)
    validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=random_state)

    collection_training_data = _collection_chat_examples(train_data, translation_prompt)

    # Testing records carry no prompt: they are fed to the model at evaluation
    # time and compared with the reference translation.
    collection_testing_data = []
    for _, row in test_data.iterrows():
        english_text, spanish_text = _collection_texts(row)
        collection_testing_data.append({
            'english_text': english_text,
            'spanish_text': spanish_text
        })

    collection_validation_data = _collection_chat_examples(validation_data, translation_prompt)

    return collection_training_data, collection_testing_data, collection_validation_data

In [None]:
def write_to_jsonl(datasets, file_name):
    """
    Serialise several datasets into a single JSON Lines file.

    The datasets are written back to back, one JSON object per line, in the
    order they are given.

    Parameters:
    datasets (list of lists): The datasets to merge; each one is a list of dictionaries.
    file_name (str): Base name of the output file; '.jsonl' is appended.

    Returns:
    str: The path of the file that was written.
    """

    # The file lands in the notebook's working directory; put a directory in
    # front of the name here to save it somewhere else.
    output_path = f'{file_name}.jsonl'
    with open(output_path, 'w', encoding='utf-8') as handle:
        for record in (item for group in datasets for item in group):
            handle.write(json.dumps(record))
            handle.write('\n')
    print('File saved at:', output_path)
    return output_path

In [None]:
# prepare data
# Split each forecast table into train / test / validation lists.
loc_train, loc_test, loc_val = prepare_location_forecast(location_df, translation_prompt)
coll_train, coll_test, coll_val = prepare_collection_forecast(collection_df, translation_prompt)

# Group the location and collection lists by split so each split ends up in one file.
training_datasets = [loc_train, coll_train]
testing_datasets = [loc_test, coll_test]
validation_datasets = [loc_val, coll_val]

# Write one combined .jsonl file per split and keep the returned paths for the later steps.
combined_training_dataset_path = write_to_jsonl(training_datasets, 'combined_training_dataset')
combined_testing_dataset_path = write_to_jsonl(testing_datasets, 'combined_testing_dataset')
combined_validation_dataset_path = write_to_jsonl(validation_datasets, 'combined_validation_dataset')

# Part 2
---
## Data Validation
Now that the training data has been prepared, the validation and cost-estimation procedures ([found here](https://cookbook.openai.com/examples/chat_finetuning_data_prep)) are used to check for format errors, provide basic statistics, and estimate token counts for fine-tuning costs.

In [None]:
def load_dataset(json_file_path):
    """Read a JSON Lines file and return its records as a list, one parsed object per line."""
    records = []
    with open(json_file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            records.append(json.loads(raw_line))

    return records

# Read the three combined .jsonl files back in as lists of dicts.
combined_training_dataset = load_dataset(combined_training_dataset_path)
combined_testing_dataset = load_dataset(combined_testing_dataset_path)
combined_validation_dataset = load_dataset(combined_validation_dataset_path)

# Report the size of each split.
print('Number of training examples:', len(combined_training_dataset))
print('Number of testing examples:', len(combined_testing_dataset))
print('Number of validation examples:', len(combined_validation_dataset))

**Comment:** Testing Data will give an error for missing message list. This is ok.

In [None]:
def check_format(dataset):
    """
    Count chat-format problems in `dataset` and print a summary.

    Each example must be a dict holding a non-empty 'messages' list. Every
    message is checked for the 'role' and 'content' keys, for unknown keys,
    for an unknown role and for missing or non-string content; every example
    must contain at least one assistant message. Prints either the error
    counts per category or "No errors found". Returns nothing.
    """
    known_keys = ("role", "content", "name", "function_call", "weight")
    known_roles = ("system", "user", "assistant", "function")
    error_counts = defaultdict(int)

    for example in dataset:
        if not isinstance(example, dict):
            error_counts["data_type"] += 1
            continue

        conversation = example.get("messages", None)
        if not conversation:
            error_counts["missing_messages_list"] += 1
            continue

        for msg in conversation:
            if not ("role" in msg and "content" in msg):
                error_counts["message_missing_key"] += 1

            if not all(key in known_keys for key in msg):
                error_counts["message_unrecognized_key"] += 1

            if msg.get("role", None) not in known_roles:
                error_counts["unrecognized_role"] += 1

            body = msg.get("content", None)
            call = msg.get("function_call", None)

            # A message needs content or a function call, and content must be a string.
            if not (body or call) or not isinstance(body, str):
                error_counts["missing_content"] += 1

        has_assistant = any(msg.get("role", None) == "assistant" for msg in conversation)
        if not has_assistant:
            error_counts["example_missing_assistant_message"] += 1

    if not error_counts:
        print("No errors found")
        return

    print("Found errors:")
    for category, count in error_counts.items():
        print(f"{category}: {count}")
        
# Run the format check on every split, printing a header before each report.
print('Training Data')
check_format(combined_training_dataset)
# Testing records only hold 'english_text'/'spanish_text', so each one is
# reported as missing_messages_list; that is expected and fine.
print('\nTesting Data')
check_format(combined_testing_dataset)
print('\nValidation Data')
check_format(combined_validation_dataset)

## Token Counting and Cost Utilities

In [None]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """
    Approximate the token count of a chat conversation.

    Each message costs `tokens_per_message` plus the encoded length of every
    value it holds, plus `tokens_per_name` when it has a "name" key. A fixed
    3 tokens are added once per conversation.
    """
    total = 0
    for msg in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in msg.values())
        if "name" in msg:
            total += tokens_per_name
    return total + 3

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the content of all assistant messages."""
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """
    Print summary statistics for a non-empty sequence of numbers.

    Parameters:
    values (sequence of numbers): The observations to summarise.
    name (str): Label printed in the header line.

    Prints min/max, mean/median and the 5th/95th percentiles. Returns nothing.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The line is labelled p5 / p95, so use the 0.05 and 0.95 quantiles
    # (it previously printed the 0.1 and 0.9 quantiles under this label).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [None]:
def warnings_counts_cost(dataset):
    """
    Print warnings, token statistics and a billing estimate for a chat dataset.

    Parameters:
    dataset (list[dict]): Examples that each hold a 'messages' list.

    Prints how many examples lack a system or user message, the distributions
    of message counts and token counts, how many examples exceed the
    per-example token limit, and an estimate of the default epoch count and
    billed tokens. An empty dataset prints a notice and returns early.
    Returns nothing.
    """
    # Per-example token cap used for both the truncation warning and billing.
    MAX_TOKENS_PER_EXAMPLE = 16385

    # Constants for the default n_epochs estimate.
    TARGET_EPOCHS = 3
    MIN_TARGET_EXAMPLES = 100
    MAX_TARGET_EXAMPLES = 25000
    MIN_DEFAULT_EPOCHS = 1
    MAX_DEFAULT_EPOCHS = 25

    # Without this guard an empty dataset fails in min() / integer division below.
    if not dataset:
        print("Dataset is empty: no statistics or cost estimate to report")
        return

    # Warnings and tokens counts
    n_missing_system = 0
    n_missing_user = 0
    n_messages = []
    convo_lens = []
    assistant_message_lens = []

    for ex in dataset:
        messages = ex["messages"]
        if not any(message["role"] == "system" for message in messages):
            n_missing_system += 1
        if not any(message["role"] == "user" for message in messages):
            n_missing_user += 1
        n_messages.append(len(messages))
        convo_lens.append(num_tokens_from_messages(messages))
        assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

    print("Num examples missing system message:", n_missing_system)
    print("Num examples missing user message:", n_missing_user)
    print_distribution(n_messages, "num_messages_per_example")
    print_distribution(convo_lens, "num_total_tokens_per_example")
    print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
    n_too_long = sum(length > MAX_TOKENS_PER_EXAMPLE for length in convo_lens)
    print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

    # Pricing and default n_epochs estimate: small datasets get more epochs,
    # very large ones fewer, both clamped to the default range.
    n_epochs = TARGET_EPOCHS
    n_train_examples = len(dataset)
    if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
        n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
    elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
        n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

    # Tokens beyond the per-example cap are not counted.
    n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
    print(f"\nDataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
    print(f"By default, you'll train for {n_epochs} epochs on this dataset")
    print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")


# Report warnings, token statistics and the billing estimate for the training split.
warnings_counts_cost(combined_training_dataset)

# Part 3
---
## Model Tuning

In [None]:
# Shared OpenAI client used by every upload, fine-tuning and completion call below.
client = OpenAI()

**Comment:** training and validation data both need to be uploaded separately for fine-tuning. The function file_upload() takes the file paths returned earlier as inputs and returns the corresponding file IDs.

In [None]:
def file_upload(dataset_path):
    """
    Uploads a dataset file for fine-tuning using the client API.

    Parameters:
    dataset_path (str): The file path of the dataset to be uploaded.

    Returns:
    str: The ID of the uploaded file, which can be used for fine-tuning.

    Example:
    >>> file_id = file_upload("training_dataset.jsonl")
    Fine-tuning file ID: file-abc123
    """
    # Open the file in a context manager so the handle is closed once the
    # upload call returns (it was previously left open).
    with open(dataset_path, "rb") as dataset_file:
        file_response = client.files.create(
            file=dataset_file,
            purpose="fine-tune"
        )
    file_id = file_response.id
    print(f'Fine-tuning file ID: {file_id}')
    return file_id

In [None]:
# Upload the training and validation files separately and keep their file IDs.
tf_id = file_upload(combined_training_dataset_path)
vf_id = file_upload(combined_validation_dataset_path)

In [None]:
def create_fine_tune_job(model_name, training_file_id, validation_file_id=None, seed=42,
                        batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'):
    """
    Start a supervised fine-tuning job and return its ID.

    Parameters:
    model_name (str): Name of the base model to fine-tune.
    training_file_id (str): ID of the uploaded training file.
    validation_file_id (str, optional): ID of the uploaded validation file (default: None).
    seed (int, optional): Seed passed to the job (default: 42).
    batch_size (str or int, optional): Batch size hyperparameter (default: 'auto').
    learning_rate_multiplier (str or float, optional): Learning rate multiplier (default: 'auto').
    n_epochs (str or int, optional): Number of training epochs (default: 'auto').

    Returns:
    str: The ID of the created fine-tuning job.

    Example:
    >>> job_id = create_fine_tune_job("gpt-3.5-turbo", "file-abc123")
    Fine-tuning job created with ID: job-xyz789
    """
    hyperparameters = {
        'batch_size': batch_size,
        'learning_rate_multiplier': learning_rate_multiplier,
        'n_epochs': n_epochs
    }
    training_method = {
        'type': 'supervised',
        'supervised': {'hyperparameters': hyperparameters}
    }

    job = client.fine_tuning.jobs.create(
        seed=seed,
        training_file=training_file_id,
        validation_file=validation_file_id,
        model=model_name,
        method=training_method
    )
    job_id = job.id

    print(f"Fine-tuning job created with ID: {job_id}")
    return job_id

In [None]:
# Start the fine-tuning job with the uploaded files; all hyperparameters stay at 'auto'.
ftj_id = create_fine_tune_job(model_name='gpt-4o-mini-2024-07-18',
                             training_file_id=tf_id,
                             validation_file_id=vf_id)

**Comment:** If you would like to explore the training and testing results without fully retraining a model, save the fine-tuning job ID and paste it below. This lets you retrieve the model produced by that job and run further tests on it.

In [None]:
# ftj_id = 'ftjob-WoNQoiX3pnfVcLaQsAyz5EAg'

**Comment:** Depending on the size of the dataset, finetuning may take a while! You can check on the progress of the fine tuning job using the line of code below. 

In the training job response, the .data dictionary contains all of the metrics on progress, accuracy, loss, etc. 'total_steps' shows how many training steps are required for the job. When calling the progress report, set the limit to >= 'total_steps'. Once 'step' = 'total_steps', training is complete.

**To-do:** TQDM progress bar.

In [None]:
# Print the job's event log to check on progress; fine-tuning may take a while.
# Training is complete once an event reports 'step' equal to 'total_steps',
# so re-run this cell until then.
print(client.fine_tuning.jobs.list_events(fine_tuning_job_id=ftj_id, limit=2100))

In [None]:
def get_training_metrics(fine_tune_job_id, limit=2100):
    """
    Retrieves and processes fine-tuning job metrics, including training loss, validation loss,
    and token accuracy, from the job's event logs.

    Parameters:
    fine_tune_job_id (str): The ID of the fine-tuning job.
    limit (int, optional): The maximum number of event records to fetch (default: 2100).

    Returns:
    tuple: A tuple containing six lists:
        - model_train_loss (list[float]): Training loss values.
        - model_valid_loss (list[tuple(int, float)]): Validation loss values with steps.
        - model_full_valid_loss (list[tuple(int, float)]): Full validation loss values with steps.
        - model_train_mean_token_accuracy (list[float]): Training mean token accuracy values.
        - model_valid_mean_token_accuracy (list[tuple(int, float)]): Validation mean token accuracy values with steps.
        - model_full_valid_mean_token_accuracy (list[tuple(int, float)]): Full validation mean token accuracy values with steps.

    Both validation-loss lists start with a placeholder point (0, 1) that is not
    taken from the event log.

    Example:
    >>> mtl, mvl, mfvl, mtmta, mvmta, mfvmta = get_training_metrics(fine_tune_job_id=ftj_id, limit=2100)
    """
    # Query the job that was passed in (the global ftj_id was used here before,
    # so the argument was silently ignored).
    fine_tune_response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=fine_tune_job_id, limit=limit)
    # Process the events in the reverse of the returned order
    # (presumably newest-first from the API, so this is oldest-first — confirm).
    event_data = list(reversed(fine_tune_response.data))

    model_train_loss = []
    model_valid_loss = [(0, 1)]
    model_full_valid_loss = [(0, 1)]
    model_train_mean_token_accuracy = []
    model_valid_mean_token_accuracy = []
    model_full_valid_mean_token_accuracy = []

    # Collect each metric from the events that carry it; events without data are skipped.
    for event in event_data:
        metrics = event.data
        if not metrics:
            continue
        if 'train_loss' in metrics:
            model_train_loss.append(metrics['train_loss'])
        if 'valid_loss' in metrics:
            model_valid_loss.append((metrics['step'], metrics['valid_loss']))
        if 'full_valid_loss' in metrics:
            model_full_valid_loss.append((metrics['step'], metrics['full_valid_loss']))
        if 'train_mean_token_accuracy' in metrics:
            model_train_mean_token_accuracy.append(metrics['train_mean_token_accuracy'])
        if 'valid_mean_token_accuracy' in metrics:
            model_valid_mean_token_accuracy.append((metrics['step'], metrics['valid_mean_token_accuracy']))
        if 'full_valid_mean_token_accuracy' in metrics:
            model_full_valid_mean_token_accuracy.append((metrics['step'], metrics['full_valid_mean_token_accuracy']))

    return model_train_loss, model_valid_loss, model_full_valid_loss, model_train_mean_token_accuracy, model_valid_mean_token_accuracy, model_full_valid_mean_token_accuracy

mtl, mvl, mfvl, mtmta, mvmta, mfvmta = get_training_metrics(fine_tune_job_id=ftj_id, limit=2100)

In [None]:
def moving_average(data, window_size=4):
    """
    Smooth a sequence of numbers with a sliding-window mean.

    Parameters:
    data (list[float]): The values to smooth.
    window_size (int, optional): How many consecutive values go into each mean (default: 4).

    Returns:
    list[float]: One mean per full window, rounded to 2 decimals. The result is
    empty when `data` holds fewer than `window_size` values.
    """
    n_windows = len(data) - window_size + 1
    return [
        round(sum(data[start:start + window_size]) / window_size, 2)
        for start in range(n_windows)
    ]

# Smooth the per-step training loss and token accuracy with a 12-value window for plotting.
mtl_smooth = moving_average(mtl, window_size=12)
mtmta_smooth = moving_average(mtmta, window_size=12)

In [None]:
# Training loss: raw values with the smoothed curve on top.
plt.plot(mtl, label='Training Loss', color='b')
plt.plot(mtl_smooth, label='Moving Average (Window=12)', color='r')
plt.title('Model Training Loss')
plt.legend()
plt.show()

# Validation loss: (step, value) pairs are unzipped into x and y series.
x, y = zip(*mfvl)
plt.plot(x, y, label='Full Validation Loss', color='cyan', marker='x')
x, y = zip(*mvl)
plt.plot(x, y, label='Validation Loss', color='orange', marker = '+')
plt.title('Model Validation Loss')
plt.ylim(0,1)
plt.legend()
plt.show()

# Training token accuracy: raw values with the smoothed curve on top.
plt.plot(mtmta, label='Mean Training Accuracy', color='b')
plt.plot(mtmta_smooth, label='Moving Average (Window=12)', color='r')
plt.title('Model Mean Token Accuracy')
plt.legend()
plt.show()

# Validation token accuracy. Unlike the loss lists, these lists have no
# placeholder point, so they are empty when the job logged no validation
# metrics (e.g. it was created without a validation file); unzipping an
# empty list would raise, so empty series are skipped.
if mfvmta:
    x, y = zip(*mfvmta)
    plt.plot(x, y, label='Full Mean Validation Accuracy', color='cyan', marker='x')
if mvmta:
    x, y = zip(*mvmta)
    plt.plot(x, y, label='Mean Validation Accuracy', color='orange', marker = '+')
plt.title('Model Validation Accuracy')
plt.ylim(0,1)
plt.legend()
plt.show()

**Comment:** Fine Tuned Model name is retrieved here! This can be called in the app.

In [None]:
def get_model_name(fine_tune_job_id):
    """Look up a fine-tuning job, print the name of its fine-tuned model and return it."""
    job = client.fine_tuning.jobs.retrieve(fine_tune_job_id)
    model_name = job.fine_tuned_model
    print(model_name)

    return model_name

ft_model_name = get_model_name(ftj_id)

# Part 4
---
## Model Testing

**Comment:** testing data needs to be loaded as well.

In [None]:
def load_testing_data(testing_data_path):
    """
    Loads testing data from a JSONL (JSON Lines) file, where each line contains a separate JSON object.

    Blank lines are skipped.

    Parameters:
    testing_data_path (str): The file path to the testing dataset.

    Returns:
    list[dict]: A list of dictionaries, where each dictionary represents a testing record.

    Example:
    >>> testing_data = load_testing_data("data/testing_dataset.jsonl")
    Loaded 1000 testing records.
    """
    testing_data = []
    # Read as UTF-8 explicitly (the files are written as UTF-8) instead of
    # relying on the platform's default encoding.
    with open(testing_data_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # A stray empty line would otherwise make json.loads raise.
                continue
            testing_data.append(json.loads(line))

    print(f"Loaded {len(testing_data)} testing records.")

    return testing_data

# Load the held-out English/Spanish pairs written earlier to the combined testing file.
testing_data = load_testing_data(combined_testing_dataset_path)

**Comment:** If you would like to see how the trained model is performing, the testing data can be used below. Additionally, you can pass a base model for comparison. Neither of the model names passed to the function below needs to be a fine-tuned model. This is useful if you would like to compare two base models before deciding which one to fine-tune.

In [None]:
def _translate(model, english_text, temperature):
    """Return `model`'s reply to `english_text` under the shared translation prompt."""
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": translation_prompt},
            {"role": "user", "content": english_text}
        ]
    )
    return response.choices[0].message.content


def test_model(testing_data, model_name, base_model_name=None, temperature=0.2):
    """
    Evaluates a fine-tuned language model on a given testing dataset by generating translations
    and, when a base model is given, compares it with the base model's predictions.

    Parameters:
    testing_data (list[dict]): A list of dictionaries, where each dictionary contains:
        - 'english_text' (str): The input English sentence.
        - 'spanish_text' (str): The expected Spanish translation.
    model_name (str): The name of the fine-tuned model to use for inference.
    base_model_name (str, optional): The name of the base model for comparison (default: None).
        When None, no base-model request is made.
    temperature (float, optional): The temperature setting for model responses, controlling randomness (default: 0.2).

    Returns:
    list[dict]: A list of dictionaries, where each dictionary contains:
        - 'input' (str): The original English text.
        - 'expected_output' (str): The expected Spanish translation.
        - 'predicted_output' (str): The model-generated Spanish translation.
        - 'base_predicted_output' (str): The base model's generated translation;
          present only when `base_model_name` is given.

    Example:
    >>> predictions = test_model(testing_data=testing_data, model_name="ft-gpt-4o", base_model_name="gpt-4o-mini")
    """
    predictions = []
    for test_instance in testing_data:
        english_text = test_instance['english_text']
        record = {
            'input': english_text,
            'expected_output': test_instance['spanish_text'],
            'predicted_output': _translate(model_name, english_text, temperature),
        }
        # Only query the base model when one was requested; sending model=None
        # to the API would fail.
        if base_model_name is not None:
            record['base_predicted_output'] = _translate(base_model_name, english_text, temperature)
        predictions.append(record)

    return predictions

predictions = test_model(testing_data=testing_data, model_name=ft_model_name, base_model_name='gpt-4o-mini')

In [None]:
def _meteor_against_reference(hypothesis_key):
    """Score the translations stored under `hypothesis_key` against the expected output of every prediction."""
    return [
        meteor_score(
            [word_tokenize(pred['expected_output'])],  # tokenized reference
            word_tokenize(pred[hypothesis_key])        # tokenized hypothesis
        )
        for pred in predictions
    ]


# One METEOR score per test record, for the fine-tuned model and for the base model.
meteor_scores = _meteor_against_reference('predicted_output')
base_meteor_scores = _meteor_against_reference('base_predicted_output')

**Comment:** Additional (Unsupervised) Scoring is being worked on. Will be implemented as soon as possible.

In [None]:
# Scores in test-record order, both models on the same axes.
plt.plot(meteor_scores, label='Fine Tuned Model')
plt.plot(base_meteor_scores, label='Base Model')
plt.title('Model Meteor Scoring')
plt.ylim(0, 1.2)
plt.legend()
plt.show()

# Each model's scores are sorted independently (highest first) so the overall
# scoring trend of each model is easier to see.
sorted_base_meteor_scores = sorted(base_meteor_scores, reverse=True)
sorted_meteor_scores = sorted(meteor_scores, reverse=True)

plt.plot(sorted_meteor_scores, label='Fine Tuned Model')
plt.plot(sorted_base_meteor_scores, label='Base Model')
plt.title('Model Meteor Scoring (Sorted)')
plt.ylim(0, 1.2)
plt.legend()
plt.show()
