In [3]:
import json
import requests
import os
import time

from datasets import load_dataset
from fireworks.client import Fireworks
from pydantic import BaseModel, Field
from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [45]:
# Make sure you have the FIREWORKS_API_KEY env var set to your account's key!
account_id = 'sdkramer10-5e98cb'
client = Fireworks()

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# This is a datset containing human preferences from the chatbot arena. When a human types a message, they are sent responses from two
# different chatbots. The human then votes on which response they prefer. Throughout this course, I am going to fine-tune a model to predict
# human chatbot preferences. For this week's assignment, I will perform prompt engineering to get a baseline of how well llama3-8b-instruct
# performs at this task before performing fine-tuning
# For more details on the dataset: https://huggingface.co/datasets/lmsys/chatbot_arena_conversations
dataset = load_dataset("lmsys/chatbot_arena_conversations")['train']

In [6]:
# For simplicity, I am only going to look at single-turn chats, where the user declared a winner after a single response from the bot.
examples = [example for example in dataset if example['turn'] == 1]

# The query the user sent to both bots should be exactly the same, so that we are fairly judging the responses. This should be always be
# the case for this dataset. This line just acts as a sanity check.
examples = [example for example in examples if example['conversation_a'][0]['content'] == example['conversation_b'][0]['content']]

# We take different examples for the train/validation/test sets
training_examples = examples[:2000]
validation_examples = examples[-1000:]
test_examples = examples[-2000:-1000]

In [34]:
sys_msg = f'''Choose the better chatbot response between model_a and model_b.

Your response MUST be ONLY the JSON object {{"winner": XXX}}. XXX can only equal "model_a", "model_b", "tie", or "tie (bothbad)".'''

def get_user_msg(example):
    user_query = example['conversation_a'][0]['content']
    model_a_response = example['conversation_a'][1]['content']
    model_b_response = example['conversation_b'][1]['content']
    user_msg = f"""user query: {user_query}

model_a response: {model_a_response}

model_b response: {model_b_response}"""
    return user_msg

# Even though this is a classification task, the chat completions api from Fireworks is the most general and most well developed,
# and performs the best
def create_messages(example):
    user_msg = get_user_msg(example)
    asst_msg = json.dumps({"winner": example['winner']})

    return {"messages": [
        {"role": "system", "content": sys_msg}, 
        {"role": "user", "content": user_msg}, 
        {"role": "assistant", "content": asst_msg}
    ]}

In [35]:
# Converts the training examples to the format expected by Fireworks. See https://readme.fireworks.ai/docs/fine-tuning-models#conversation
def training_examples_to_json(examples):
    json_objs = list()
    for example in examples:  
        msg = create_messages(example)
        json_objs.append(msg)
    
    print(f'Total tokens: {sum([len(tokenizer.tokenize(json.dumps(obj))) for obj in json_objs])}')
    return json_objs

training_json = training_examples_to_json(training_examples)

Total tokens: 1052349


In [36]:
# Writes the data to a file so that it can be uploaded to Fireworks
dataset_file_name = 'chatbot_arena_training_data.jsonl'
dataset_id = 'chatbot-arena-v3'

with open(dataset_file_name, 'w') as f:
    for obj in training_json:
        json.dump(obj, f)
        f.write('\n')

In [38]:
# Follow instructions here to first install the firectil CLI - https://readme.fireworks.ai/docs/fine-tuning-models#installing-firectl
# Then run this command to upload the file to Fireworks
!firectl create dataset {dataset_id} {dataset_file_name}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


3.98 MiB / 3.98 MiB [--------------------------------] 100.00% 2.51 MiB p/s 1.8s


In [53]:
# Creates a training job with the default hyperparameters
# Uncomment out to run (prints my api key to stdout, so commenting it out for the demo).
# !firectl create fine-tuning-job --settings-file chatbot_arena_training_v1.yaml --display-name chatbot-arena-v1 --dataset {dataset_id} 

In [54]:
# Creates a training job with the increased rank, learning rate, and epochs
# Uncomment out to run (prints my api key to stdout, so commenting it out for the demo).
# !firectl create fine-tuning-job --settings-file chatbot_arena_training_v2.yaml --display-name chatbot-arena-v2 --dataset {dataset_id} 

In [51]:
# v1 is the id of the training job with default hyperparameters, v2 is with the increased settings
model_v1_id = 'cc8324868ff04936855cffb392dba3b8'
model_v2_id = '4fe290a74b72458cafe0c9d8881e5d37'

In [68]:
# Wait until the fine-tuning jobs have finished running.
# Uncomment out to run (prints my api key to stdout, so commenting it out for the demo).
# !firectl get fine-tuning-job {model_v1_id}

In [58]:
# Deploy the first model to a serverless endpoint
!firectl deploy {model_v1_id}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [59]:
# Deploy the second model to a serverless endpoint
!firectl deploy {model_v2_id}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [61]:
# Wait until the models are in the "DEPLOYED" state
!firectl list models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


NAME                              CREATE TIME          KIND           CHAT  PUBLIC  STATE     STATUS MESSAGE
4fe290a74b72458cafe0c9d8881e5d37  2024-06-18 19:36:34  HF_PEFT_ADDON  true  false   DEPLOYED  
cc8324868ff04936855cffb392dba3b8  2024-06-18 19:31:50  HF_PEFT_ADDON  true  false   DEPLOYED  

Total size: 2


In [50]:
def get_results(examples, model_id):
    winners = list()
    
    for i, example in enumerate(examples):    
        user_msg = get_user_msg(example)

        response = client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": sys_msg},
                {"role": "user", "content": user_msg},
            ],
            # setting temperature to 0 for this use case, so that responses are as deterministic as possible
            temperature=0, 
        )
        content = response.choices[0].message.content
    
        try:
            winner = json.loads(content.split('\n')[-1])["winner"]
            winners.append((i, winner))
        except:
            print(f"Failed to parse JSON for example {i}.")

    num_correct = sum([1 if winner[1] == examples[winner[0]]['winner'] else 0 for winner in winners])
    return winners, num_correct
        
num_to_eval = 500

In [52]:
# Determine how the base model without any fine-tuning performs
model_id = 'accounts/fireworks/models/llama-v3-8b-instruct'

train_results, train_num_correct = get_results(training_examples[:num_to_eval], model_id)
print(f'Training Set Correct: {train_num_correct}')

validation_results, validation_num_correct = get_results(validation_examples[:num_to_eval], model_id)
print(f'Validation Set Correct: {validation_num_correct}')

Training Set Correct: 302
Validation Set Correct: 251


In [63]:
# Determine how the fine-tuned model performs with the default fine-tuning params
model_id = f'accounts/{account_id}/models/{model_v1_id}'

train_results, train_num_correct = get_results(training_examples[:num_to_eval], model_id)
print(f'Training Set Correct: {train_num_correct}')

validation_results, validation_num_correct = get_results(validation_examples[:num_to_eval], model_id)
print(f'Validation Set Correct: {validation_num_correct}')

Failed to parse JSON for example 127.
Training Set Correct: 336
Validation Set Correct: 281


In [64]:
# Determine how the base model performs with the increases rank, epochs, and learning rate
model_id = f'accounts/{account_id}/models/{model_v2_id}'

train_results, train_num_correct = get_results(training_examples[:num_to_eval], model_id)
print(f'Training Set Correct: {train_num_correct}')

validation_results, validation_num_correct = get_results(validation_examples[:num_to_eval], model_id)
print(f'Validation Set Correct: {validation_num_correct}')

Training Set Correct: 369
Validation Set Correct: 279


In [65]:
# Undeploy the models (shouldn't cost anything extra to leave deployed, but the fireworks documentation is conflicting on this point).
!firectl undeploy {model_v1_id}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [66]:
# Undeploy the models (shouldn't cost anything extra to leave deployed, but the fireworks documentation is conflicting on this point).
!firectl undeploy {model_v2_id}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:
# Double check that the model is now undeployed
!firectl list models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


NAME                              CREATE TIME          KIND           CHAT  PUBLIC  STATE       STATUS MESSAGE
4fe290a74b72458cafe0c9d8881e5d37  2024-06-18 19:36:34  HF_PEFT_ADDON  true  false   UNDEPLOYED  
cc8324868ff04936855cffb392dba3b8  2024-06-18 19:31:50  HF_PEFT_ADDON  true  false   UNDEPLOYED  

Total size: 2
