In [None]:
import os
from datetime import datetime, timezone

import pandas as pd
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

from demo_utils import calculate_stats, collect_responses, display_stats

In [None]:
HF_TOKEN = os.getenv("HF_TOKEN")
!huggingface-cli login --token $HF_TOKEN

# Set up model and tokenizer.
We'll be using a bare-bones [Mistral model](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) for our examples.

In practice, you could pass the path (or Hub ID) of your own checkpoint to the `from_pretrained` function
to load fine-tuned or pretrained weights instead.

In [None]:
# Hub identifier of the checkpoint; the tokenizer cell reuses this name.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Load the weights in bfloat16 and let `device_map="auto"` decide placement.
# `.eval()` hands back the module itself, so it can be chained onto the load
# to switch the model to inference mode.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    attn_implementation="eager",
).eval()

# Last expression of the cell: shows which concrete model class was loaded.
type(model)

In [None]:
# Tokenizer that matches the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# Reuse the end-of-sequence token as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# NOTE(review): presumably makes the tokenizer append EOS when encoding; confirm
# this is what `collect_responses` expects for its prompts.
tokenizer.add_eos_token = True

In [None]:
# Settings for the first evaluation run.
DEBUG = False
instruction_template = "straightforward"

# A single UTC timestamp; later cells reuse `now` so every output file of this
# session carries the same date.
now = datetime.now(timezone.utc)

# Load the toy dataset from disk.
dataset = pd.read_csv(filepath_or_buffer="./blog_toy_dataset.csv")

# Destination for this run's results: template name plus the run date.
output_dataset_path = (
    f"./base-mistral_{instruction_template}_{now.year}-{now.month}-{now.day}.csv"
)

# Run evaluation on our first instruction template

In [None]:
# Query the model for every row of `dataset` using the current template.
# NOTE(review): the notebook's narration says results are appended to `dataset`
# in place; confirm against `demo_utils.collect_responses`.
collect_responses(
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    instruction_template=instruction_template,
    debug=DEBUG,
)

In [None]:
# Summarise the "straightforward" run from the dataset, then render the summary.
# Both helpers come from `demo_utils`; the exact metrics are defined there.
evaluation_stats = calculate_stats(dataset=dataset)
display_stats(stats_dict=evaluation_stats)

In [None]:
# Persist this run's dataset to the dated output path; `index=False` leaves the
# DataFrame index out of the CSV.
dataset.to_csv(output_dataset_path, index=False)

# Run evaluation on the `just_answer` instruction template.
First, we'll reload a clean copy of the dataset, since the previous run appended its results in place.

In practice, the user would want to parallelize these three experiments or run them in separate notebooks.

In [None]:
# Switch to the second template and reload the CSV so this run starts from the
# data on disk rather than the frame used by the previous run.
instruction_template = "just_answer"
dataset = pd.read_csv(filepath_or_buffer="./blog_toy_dataset.csv")

# `now` was captured in the earlier configuration cell, so the date in the file
# name matches the first run.
output_dataset_path = (
    f"./base-mistral_{instruction_template}_{now.year}-{now.month}-{now.day}.csv"
)

In [None]:
# Gather model responses for the `just_answer` template on the freshly loaded data.
collect_responses(
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    instruction_template=instruction_template,
    debug=DEBUG,
)

In [None]:
# Recompute the statistics for the `just_answer` run (overwriting the previous
# `evaluation_stats`) and display them.
evaluation_stats = calculate_stats(dataset=dataset)
display_stats(stats_dict=evaluation_stats)

In [None]:
# Write the `just_answer` results to their own CSV, without the index column.
dataset.to_csv(output_dataset_path, index=False)

# Run evaluation on our final instruction template.
Reload a clean copy of the dataset and start a new evaluation run.

In [None]:
# Final template: select it and read the CSV again for an unmodified frame.
instruction_template = "accounting_related"
dataset = pd.read_csv(filepath_or_buffer="./blog_toy_dataset.csv")

# The file name reuses the session-wide `now` timestamp set in the
# configuration cell.
output_dataset_path = (
    f"./base-mistral_{instruction_template}_{now.year}-{now.month}-{now.day}.csv"
)

In [None]:
# Gather model responses for the `accounting_related` template.
collect_responses(
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    instruction_template=instruction_template,
    debug=DEBUG,
)

In [None]:
# Compute and display the statistics for the `accounting_related` run.
evaluation_stats = calculate_stats(dataset=dataset)
display_stats(stats_dict=evaluation_stats)

In [None]:
# Save the `accounting_related` results to the path built for this template;
# the DataFrame index is not written.
dataset.to_csv(output_dataset_path, index=False)