# Finetune on ScienceQA
Let's use LLM Engine to fine-tune Llama-2 on ScienceQA!

# Data Preparation
Let's load the dataset with the Hugging Face `datasets` library and inspect its features.

In [1]:
from datasets import load_dataset
from smart_open import smart_open
import pandas as pd

# Load the ScienceQA dataset from the Hugging Face Hub (served from the local
# cache when it has already been downloaded).
dataset = load_dataset('derek-thomas/ScienceQA')
# Last expression of the cell: display the schema of the train split.
dataset['train'].features

Using custom data configuration derek-thomas--ScienceQA-ca4903a3b5795914
Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--ScienceQA-ca4903a3b5795914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

{'image': {'bytes': Value(dtype='binary', id=None),
  'path': Value(dtype='string', id=None)},
 'question': Value(dtype='string', id=None),
 'choices': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'answer': Value(dtype='int8', id=None),
 'hint': Value(dtype='string', id=None),
 'task': Value(dtype='string', id=None),
 'grade': Value(dtype='string', id=None),
 'subject': Value(dtype='string', id=None),
 'topic': Value(dtype='string', id=None),
 'category': Value(dtype='string', id=None),
 'skill': Value(dtype='string', id=None),
 'lecture': Value(dtype='string', id=None),
 'solution': Value(dtype='string', id=None)}

Now, let's convert the dataset into the format LLM Engine accepts: a CSV file with `prompt` and `response` columns. Examples with an empty `hint` are dropped.

In [3]:
# Letter labels for answer options, in order: 'A', 'B', ..., 'Z'.
choice_prefixes = [chr(code) for code in range(ord('A'), ord('Z') + 1)]  # A-Z
def format_options(options, choice_prefixes):
    """Render answer options as a single line such as ``(A) foo (B) bar``.

    Options are paired with prefixes positionally; pairing stops at the
    shorter of the two sequences.

    Args:
        options: Sequence of option strings.
        choice_prefixes: Sequence of labels, one per option position.

    Returns:
        The labelled options joined by single spaces.
    """
    labelled = []
    for prefix, option in zip(choice_prefixes, options):
        labelled.append(f'({prefix}) {option}')
    return ' '.join(labelled)

def format_prompt(r, choice_prefixes):
    """Build the prompt text for one example.

    The prompt has four newline-separated sections: the hint as context, the
    question, the labelled options, and a trailing ``Answer:`` cue.

    Args:
        r: Mapping with 'hint', 'question' and 'choices' entries.
        choice_prefixes: Sequence of labels used to tag the choices.

    Returns:
        The prompt string.
    """
    option_text = format_options(r['choices'], choice_prefixes)
    sections = (
        f'Context: {r["hint"]}',
        f'Question: {r["question"]}',
        f'Options:{option_text}',
        'Answer:',
    )
    return '\n'.join(sections)

def format_label(r, choice_prefixes):
    """Return the label of the correct answer for one example.

    Args:
        r: Mapping whose 'answer' entry is an index into ``choice_prefixes``.
        choice_prefixes: Sequence of labels, one per option position.

    Returns:
        The element of ``choice_prefixes`` at the answer index.
    """
    answer_index = r['answer']
    return choice_prefixes[answer_index]

def convert_dataset(ds):
    """Convert a dataset split into a prompt/response DataFrame.

    Examples whose 'hint' is the empty string are skipped. Every remaining
    example contributes one row: its formatted prompt and its answer label.

    Args:
        ds: Iterable of example mappings with 'hint', 'question', 'choices'
            and 'answer' entries.

    Returns:
        A pandas DataFrame with 'prompt' and 'response' columns.
    """
    prompts, labels = [], []
    # Single pass: the split is traversed once and the filter lives in one
    # place, so the two columns cannot drift apart (and one-shot iterables
    # are handled correctly).
    for example in ds:
        if example['hint'] == '':
            continue
        prompts.append(format_prompt(example, choice_prefixes))
        labels.append(format_label(example, choice_prefixes))
    return pd.DataFrame({'prompt': prompts, 'response': labels})

# Destinations for the formatted CSV files; replace the placeholders with real S3 URIs.
train_s3_uri = 's3://...'
val_s3_uri = 's3://...'

# Keep each split in its own variable so the validation frame does not
# overwrite the training frame.
df_train = convert_dataset(dataset['train'])
# Uncomment to upload. index=False keeps the file to just the 'prompt' and
# 'response' columns.
#with smart_open(train_s3_uri, 'wb') as f:
#    df_train.to_csv(f, index=False)

df_val = convert_dataset(dataset['validation'])
#with smart_open(val_s3_uri, 'wb') as f:
#    df_val.to_csv(f, index=False)

# Preview the formatted validation split.
df_val

Unnamed: 0,prompt,response
0,"Context: Sturgeons eat invertebrates, plants, ...",B
1,Context: People can use the engineering-design...,C
2,Context: Figure: Chicago.\nChicago is known as...,B
3,"Context: In a group of cows, some individuals ...",A
4,"Context: Bald eagles eat fish, mammals, and ot...",B
...,...,...
2090,Context: Select the best estimate.\nQuestion: ...,C
2091,Context: Flat-tail horned lizards live in the ...,B
2092,Context: Read the description of a trait.\nTim...,B
2093,Context: Garrett enjoys feeding the squirrels ...,A


# Fine-tune
Now, we can fine-tune the model using LLM Engine.

In [None]:
import os
# Expose the API key through the environment before using the client.
# 'xxx' is a placeholder: substitute a real key and do not commit it.
os.environ['SCALE_API_KEY'] = 'xxx'

from llmengine import FineTune

# Launch a fine-tuning job on the base model, using the training and
# validation CSV files referenced by the S3 URIs defined earlier.
response = FineTune.create(
    model="llama-2-7b",
    training_file=train_s3_uri,
    validation_file=val_s3_uri,
    hyperparameters={
        'lr':2e-4,  # presumably the learning rate -- confirm against the LLM Engine docs
    },
    suffix='science-qa-llama'
)
# Keep the job id so later cells can look the job up.
run_id = response.fine_tune_id

We can poll the job status once a minute until the job completes.

In [6]:
import time  # required by time.sleep below; not imported elsewhere in the notebook

# Poll the fine-tuning job once a minute until it finishes.
while True:
    job_status = FineTune.get(run_id).status
    print(job_status)
    if job_status == 'SUCCESS':
        break
    # Stop instead of polling forever when the job ends without succeeding.
    # NOTE(review): 'FAILURE' and 'CANCELLED' are assumed to be the terminal
    # failure states of the client's job status enum -- confirm the exact names.
    if job_status in ('FAILURE', 'CANCELLED'):
        raise RuntimeError(f'Fine-tune job {run_id} ended with status {job_status}')
    time.sleep(60)

# Name of the resulting model, used for inference in the evaluation section.
ft_model = FineTune.get(run_id).fine_tuned_model

BatchJobStatus.SUCCESS


# Evaluation
Let's evaluate the new fine-tuned model by running inference against it.

In [None]:
import pandas as pd

# Helper function to get outputs for fine-tuned model with retries
def get_output(prompt: str, num_retry: int = 5) -> str:
    """Query the fine-tuned model for ``prompt`` and return its stripped output.

    The request is attempted up to ``num_retry`` times. Any exception raised
    by an attempt is printed and the next attempt is made.

    Args:
        prompt: Prompt text to send to the model.
        num_retry: Maximum number of attempts.

    Returns:
        The generated text with surrounding whitespace removed, or an empty
        string when every attempt failed.
    """
    # Local import: Completion is not imported anywhere else in the notebook.
    from llmengine import Completion

    for _ in range(num_retry):
        try:
            response = Completion.create(
                model=ft_model,  # model name produced by the fine-tuning job
                prompt=prompt,
                max_new_tokens=1,  # the expected response is a single option letter
                temperature=0.01
            )
            return response.output.text.strip()
        except Exception as e:
            # Best effort: report the error and try again.
            print(e)
    return ""

# Read the test data
test = pd.read_csv("<path to science qa validation>")

# One completion request per row; rows whose request failed get "" and count as wrong.
test["prediction"] = test["prompt"].apply(get_output)

# Compute the accuracy outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
accuracy = (test["response"] == test["prediction"]).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")