# Finetune on ScienceQA
Let's use LLM Engine to fine-tune Llama-2 on ScienceQA!

# Data Preparation
Let's load in the dataset using Huggingface and view the features.

In [1]:
from datasets import load_dataset
from smart_open import smart_open
import pandas as pd

dataset = load_dataset('derek-thomas/ScienceQA')
dataset['train'].features

Using custom data configuration derek-thomas--ScienceQA-ca4903a3b5795914
Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--ScienceQA-ca4903a3b5795914/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

{'image': {'bytes': Value(dtype='binary', id=None),
  'path': Value(dtype='string', id=None)},
 'question': Value(dtype='string', id=None),
 'choices': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'answer': Value(dtype='int8', id=None),
 'hint': Value(dtype='string', id=None),
 'task': Value(dtype='string', id=None),
 'grade': Value(dtype='string', id=None),
 'subject': Value(dtype='string', id=None),
 'topic': Value(dtype='string', id=None),
 'category': Value(dtype='string', id=None),
 'skill': Value(dtype='string', id=None),
 'lecture': Value(dtype='string', id=None),
 'solution': Value(dtype='string', id=None)}

Now, let's format the dataset into what's acceptable for LLM Engine - a CSV file with 'prompt' and 'response' columns.

In [2]:
choice_prefixes = [chr(ord('A') + i) for i in range(26)] # A-Z
def format_options(options, choice_prefixes):
    return ' '.join([f'({c}) {o}' for c, o in zip(choice_prefixes, options)])

def format_prompt(r, choice_prefixes):
    options = format_options(r['choices'], choice_prefixes)
    return f'''Context: {r["hint"]}\nQuestion: {r["question"]}\nOptions:{options}\nAnswer:'''

def format_label(r, choice_prefixes):
    return choice_prefixes[r['answer']]

def convert_dataset(ds):
    prompts = [format_prompt(i, choice_prefixes) for i in ds if i['hint'] != '']
    labels = [format_label(i, choice_prefixes) for i in ds if i['hint'] != '']
    df = pd.DataFrame.from_dict({'prompt': prompts, 'response': labels})
    return df

save_to_s3 = False
df_train = convert_dataset(dataset['train'])
if save_to_s3:
    train_url = 's3://...'
    val_url = 's3://...'
    df_train = convert_dataset(dataset['train'])
    with smart_open(train_url, 'wb') as f:
        df_train.to_csv(f)

    df_val = convert_dataset(dataset['validation'])
    with smart_open(val_url, 'wb') as f:
        df_val.to_csv(f)
else:
    # Gists of the already processed datasets
    train_url = 'https://gist.githubusercontent.com/jihan-yin/43f19a86d35bf22fa3551d2806e478ec/raw/91416c09f09d3fca974f81d1f766dd4cadb29789/scienceqa_train.csv'
    val_url = 'https://gist.githubusercontent.com/jihan-yin/43f19a86d35bf22fa3551d2806e478ec/raw/91416c09f09d3fca974f81d1f766dd4cadb29789/scienceqa_val.csv'
    
df_train

Unnamed: 0,prompt,response
0,Context: The passage below describes an experi...,B
1,Context: The passage below describes an experi...,A
2,Context: This passage describes the myotonia c...,A
3,Context: The diagrams below show two pure samp...,C
4,Context: Below is a food web from an ocean eco...,A
...,...,...
6074,Context: The images below show two pairs of ma...,A
6075,Context: Select the better answer.\nQuestion: ...,A
6076,Context: Read the description of a trait.\nHan...,A
6077,Context: The objects are identical except for ...,A


# Fine-tune
Now, we can fine-tune the model using LLM Engine.

In [3]:
import os
os.environ['SCALE_API_KEY'] = 'xxx'

from llmengine import FineTune

response = FineTune.create(
    model="llama-2-7b",
    training_file=train_url,
    validation_file=val_url,
    hyperparameters={
        'lr':2e-4,
    },
    suffix='science-qa-llama'
)
run_id = response.fine_tune_id

We can sleep until the job completes.

In [5]:
import time

while True:
    job_status = FineTune.get(run_id).status
    print(job_status)
    if job_status == 'SUCCESS':
        break
    time.sleep(60)
    
fine_tuned_model = FineTune.get(run_id).fine_tuned_model

BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobStatus.RUNNING
BatchJobSta

# Inference and Evaluation
Let's evaluate the new fine-tuned model by running inference against it.

In [11]:
import pandas as pd
from llmengine import Completion

# Helper function to get outputs for fine-tuned model with retries
def get_output(prompt: str, num_retry: int = 5):
    for _ in range(num_retry):
        try:
            response = Completion.create(
                model=fine_tuned_model, 
                prompt=prompt, 
                max_new_tokens=1, 
                temperature=0.01
            )
            return response.output.text.strip()
        except Exception as e:
            print(e)
    return ""

# Read the test data
test = pd.read_csv(val_url)

test["prediction"] = test["prompt"].apply(get_output)
print(f"Accuracy: {(test['response'] == test['prediction']).mean() * 100:.2f}%")

upstream connect error or disconnect/reset before headers. reset reason: connection failure, transport failure reason: delayed connect error: 111
upstream connect error or disconnect/reset before headers. reset reason: connection failure, transport failure reason: delayed connect error: 111
upstream connect error or disconnect/reset before headers. reset reason: connection failure, transport failure reason: delayed connect error: 111
upstream connect error or disconnect/reset before headers. reset reason: connection failure, transport failure reason: delayed connect error: 111
upstream connect error or disconnect/reset before headers. reset reason: connection failure, transport failure reason: delayed connect error: 111
upstream connect error or disconnect/reset before headers. reset reason: connection failure, transport failure reason: delayed connect error: 111
upstream connect error or disconnect/reset before headers. reset reason: connection failure, transport failure reason: delay