## Chapter 4 Tutorial: Parameter-efficient fine-tuning

Requirements:

1.   A Kaggle account with the kaggle.json credential file uploaded in the Colab working directory
2.   A Huggingface account with an API token generated.
3.   An OpenAI account with an API key.

**You will also have to request access to the Llama-2 family of models.** Please visit https://huggingface.co/meta-llama/Llama-2-7b-hf and request access. Once granted, you can log in with huggingface_hub in the runtime and you will be allowed to download the Llama-2 model.

# Installation and Imports

## Run once after creating run-time

In [None]:
## Install the tutorial dependencies. peft, bitsandbytes, transformers and trl
## are pinned to exact versions; the other packages are left unpinned.
!pip install -q kaggle torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate openai

## Download the Tweetsumm dataset and switch to a fixed commit.
!git clone https://github.com/guyfe/Tweetsumm.git
!git -C Tweetsumm/ checkout 4903b0f20665a59e4b5494abd83d8735893c0333

In [None]:
## tweet_sum_processor.py has an issue where it crashes when < 3 summaries are
## available. add a small fix:
## Read the original module, splice one guard line in, and write the result to
## a separate file (tweet_sum_processor2.py) so the cloned original is untouched.
with open('Tweetsumm/tweet_sum_processor.py', 'r') as file:
    lines = file.readlines()
## The insertion index is hard-coded, so it is only valid for the commit that
## was checked out when the repository was cloned.
lines.insert(79, "            if summ is None: continue\n")
with open('Tweetsumm/tweet_sum_processor2.py', 'w') as file:
    file.writelines(lines)

In [None]:
## Upload your kaggle.json credential file into the home directory,
## then run this cell to place it in the correct location.

kaggle_cred_file_loc = './kaggle.json'

!mkdir ~/.kaggle
!cp $kaggle_cred_file_loc ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

## If the data has not been downloaded, retrieve from kaggle
import os
if not os.path.exists('./twcs'):
  !kaggle datasets download -d thoughtvector/customer-support-on-twitter
  !unzip customer-support-on-twitter.zip

In [None]:
## Run this cell, click on the link that comes up, copy your Huggingface
## API code, and enter it into the prompt box.
## The prompt is skipped when a token file already exists at the path checked
## below. `os` comes from the import in the Kaggle cell above.

from huggingface_hub import notebook_login
if not os.path.exists('/root/.cache/huggingface/token'):
    notebook_login()

In [None]:
## Enter your OpenAI API key, and it will be saved as an env variable that
## stays populated until the runtime is deleted.

## Imported here because this cell runs before the main imports cell.
import getpass
import os

if not os.getenv('OPENAI_API_KEY'):
    ## getpass reads the key without echoing it into the notebook output.
    os.environ['OPENAI_API_KEY'] = getpass.getpass()

## Run every time you restart the session

### Imports

In [None]:
## Data download and manipulation imports
from Tweetsumm.tweet_sum_processor2 import TweetSumProcessor
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login
import numpy as np
import pandas as pd
import json
import os
import ast
import getpass
from tqdm import tqdm

## ML imports
import openai
import transformers
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from trl import SFTTrainer
from peft import LoraConfig

## Plotting imports
import matplotlib.pyplot as plt
%matplotlib inline

### Collect and format data

In [None]:
## Process the Kaggle data with the TweetSumm processor and extract into instruction-based HF datasets.

processor = TweetSumProcessor('./twcs/twcs.csv')

def process_data(file_loc):
    """Load a Tweetsumm split and flatten each dialog into text plus summaries.

    Args:
        file_loc: Path to a Tweetsumm jsonl file. Its lines are handed to the
            module-level ``processor``.

    Returns:
        A pandas DataFrame with one row per dialog and three columns:
        ``id`` (the dialog id), ``convos`` (the dialog rendered as
        "Agent: ..." / "Customer: ..." turns, each followed by a
        space-newline-space separator) and ``summaries`` (a list holding one
        string per abstractive summary).
    """
    ## Load the tweet conversations
    with open(file_loc) as f:
        dialogs = processor.get_dialog_with_summaries(f.readlines())
        all_convos = [dialog.get_json() for dialog in dialogs]

    ## Process into conversations and summaries
    ids, convos, summaries = [], [], []
    for raw_convo in all_convos:
        parsed = json.loads(raw_convo)
        ident = parsed['dialog']['dialog_id']

        turn_lines = []
        for turn in parsed['dialog']['turns']:
            ## Drop shortened t.co links from the turn text
            words = [w for w in ' '.join(turn['sentences']).split()
                     if 'https://t.co' not in w]
            ## Drop a single leading @mention. Checking `words` first avoids the
            ## IndexError the old `sent[0]` lookup raised on a turn that is
            ## empty once its links are removed.
            if words and words[0].startswith('@'):
                words = words[1:]

            speaker = 'Agent: ' if turn['is_agent'] else 'Customer: '
            turn_lines.append(speaker + ' '.join(words) + ' \n ')

        ids.append(ident)
        convos.append(''.join(turn_lines))
        summaries.append([' '.join(summ)
                          for summ in parsed['summaries']['abstractive_summaries']])

    ## Save as dataframe
    return pd.DataFrame({'id': ids, 'convos': convos, 'summaries': summaries})

def create_llama_data(file_loc):
    """Turn a Tweetsumm split into instruction-style prompt rows.

    Each conversation is paired with each of its reference summaries. When the
    path contains 'test_tweetsum', only the first summary of each conversation
    is used.

    Args:
        file_loc: Path to the Tweetsumm jsonl file, passed on to process_data.

    Returns:
        A pandas DataFrame with columns ``id``, ``text`` (question + summary +
        end marker), ``question`` (the prompt alone) and ``answer`` (the
        reference summary).
    """
    ## Fixed instruction that opens every prompt
    instruction = (
        "### Instruction:\n"
        "Read the following conversation between a customer and a customer service agent, and then create a two sentence "
        "summary of the conversation, describing the customer's question and the agent's response.\n\n"
    )

    ## Read in the data and process into conversations
    conversations = process_data(file_loc)

    ## One output row per (conversation, summary) pair, kept as parallel columns
    ids, texts, questions, answers = [], [], [], []
    rows = zip(conversations['id'], conversations['convos'], conversations['summaries'])
    for convo_id, convo, convo_summaries in rows:
        question = f"{instruction}### Conversation: \n{convo}\n\n### Summary: "
        for summary in convo_summaries:
            ids.append(convo_id)
            texts.append(question + summary + ' <END_OF_SECOND_SENTENCE>')
            questions.append(question)
            answers.append(summary)

            ## The test set keeps a single example per prompt.
            if 'test_tweetsum' in file_loc:
                break

    return pd.DataFrame({'id': ids, 'text': texts, 'question': questions, 'answer': answers})

## Build one prompt dataframe per Tweetsumm split.
train_df = create_llama_data('./Tweetsumm/tweet_sum_data_files/final_train_tweetsum.jsonl')
valid_df = create_llama_data('./Tweetsumm/tweet_sum_data_files/final_valid_tweetsum.jsonl')
test_df = create_llama_data('./Tweetsumm/tweet_sum_data_files/final_test_tweetsum.jsonl')

## Wrap the three dataframes in a single DatasetDict keyed by split name.
tweetsum_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'valid': Dataset.from_pandas(valid_df),
    'test': Dataset.from_pandas(test_df)
})

In [None]:
tweetsum_datasets

In [None]:
tweetsum_datasets['test'][1]

### Set up GPT-4 grader

In [None]:
## Log in with your OpenAI credentials if grading responses with GPT.
## The key is read from the OPENAI_API_KEY environment variable.

client = openai.OpenAI(
    api_key=os.getenv('OPENAI_API_KEY'),
)

In [None]:
## Create grading rubric with GPT-4.

def submit_prompt_gpt(prompt, gpt_model):
    """Send a single user message to an OpenAI chat model and return the reply.

    Args:
        prompt: Text sent as the only (user) message of the conversation.
        gpt_model: '3' selects "gpt-3.5-turbo", '4' selects "gpt-4".

    Returns:
        The message content of the first choice in the chat completion.

    Raises:
        ValueError: If gpt_model is neither '3' nor '4'.
    """
    if gpt_model == '3':
        model = "gpt-3.5-turbo"
    elif gpt_model == '4':
        model = "gpt-4"
    else:
        ## This branch used to call sys.exit, but sys is never imported, so it
        ## surfaced as a NameError. Raising also avoids exiting the notebook kernel.
        raise ValueError(f'Invalid gpt model number: {gpt_model!r}')

    ## Uses the module-level OpenAI `client`.
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model
    )
    return chat_completion.choices[0].message.content

def grade_with_gpt(response):
    """Grade one generated response against the three-question rubric with GPT-4.

    The response is split on the '### Conversation:' and '### Summary:' markers
    into a conversation and a summary. GPT-4 is asked to evaluate the summary,
    then to turn that evaluation into a tuple of scores, and finally to
    re-check the two-sentence criterion on its own.

    Args:
        response: Generated text containing the prompt markers.

    Returns:
        A ``(gpt_eval, score)`` pair. On success ``gpt_eval`` is GPT-4's written
        evaluation and ``score`` is a three-item list: the first two entries
        come from the parsed score tuple, and the third is 1 or 0 depending on
        the separate length check. If the response cannot be split into exactly
        one conversation and one summary, the pair is
        ``('failed. score is 0.', '(0, 0, 0)')``. The score is a string in that
        case.
    """
    ## Split the conversation summary into the input and output portions
    try:
        conversation = response.split('### Conversation:')[-1]
        conversation, summary = conversation.split('### Summary:')
    except (AttributeError, TypeError, ValueError):
        ## ValueError: zero or several '### Summary:' markers. AttributeError /
        ## TypeError: the response is not a string. Anything else (for example
        ## KeyboardInterrupt) is no longer swallowed.
        gpt_eval = 'failed. score is 0.'
        gpt_score = '(0, 0, 0)'
        return gpt_eval, gpt_score

    ## Remove the <END_OF_SECOND_SENTENCE> tokens
    summary = summary.split('<END_')[0]

    ## Pass in the base rubric.
    rubric_prompt = f'''I am going to give you a conversation between a Customer and a customer service Agent. Please read the conversation, then read the summary below and answer three questions about the summary:

1) Does the summary of the customer's complaint reasonably match the conversation?
2) Does the summary of the agent's response reasonably match the conversation?
3) Are there exactly two sentences ending in periods in the summary?

Please give you answer to each of these questions. Then, give the summary a score of 1, 2, or 3, depending on how many of the above questions you answered "yes" to.

### Conversation: {conversation}
### Summary: {summary}'''
    gpt_eval = submit_prompt_gpt(rubric_prompt, '4')

    ## Grade the rubric
    gpt_score = submit_prompt_gpt("Please read the following summary and return a three-entry python tuple with "\
    + "the scores given to 1), 2), and 3). Note that 'Yes' is a score of 1, 'No' is a score of 0, "\
    +f"and 'Partially' or similar is a score of 0.5: {gpt_eval}\n\nOutput: ",'4')

    ## Double-check the length score
    gpt_len = submit_prompt_gpt(f'''Read the following TEXT and tell me, yes or no, does it consist of '''\
                             +f'''exactly two sentences ending with periods:\n\n### TEXT:\n{summary}\n\n### Response: ''',
                        '4')

    ## The third rubric score is replaced by the dedicated length check.
    gpt_score = ast.literal_eval(gpt_score)
    length_score = 1 if 'yes' in gpt_len.lower() else 0
    out_score = [gpt_score[0], gpt_score[1], length_score]
    return gpt_eval, out_score


def grading_loop(responses):
    """Grade every response with GPT-4 and collect the results in a DataFrame.

    Prints the per-criterion score sums and the overall total.

    Args:
        responses: Sequence of generated texts, each passed to grade_with_gpt.

    Returns:
        A pandas DataFrame with columns ``summary`` (the response),
        ``evaluation`` (GPT-4's written evaluation) and ``scores`` (the
        per-criterion scores).
    """
    gpt_evals = []
    gpt_scores = []
    ## Wrap the sequence itself: tqdm cannot read a length from enumerate(), so
    ## the previous form showed no total. The index was unused anyway.
    for resp in tqdm(responses):
        gpt_eval, gpt_score = grade_with_gpt(resp)
        gpt_evals.append(gpt_eval)
        gpt_scores.append(gpt_score)

    gpt_results = pd.DataFrame({
        'summary':responses,
        'evaluation':gpt_evals,
        'scores':gpt_scores,
    })

    ## Scores can arrive either as a list or as its string form; round-tripping
    ## through str normalizes both to Python literals.
    gpt_results['scores'] = gpt_results.scores.astype(str).apply(ast.literal_eval)

    ## Build the score matrix once and reuse it for both summaries
    score_matrix = np.array(gpt_results['scores'].tolist())
    print('Scores per criteria: ', score_matrix.sum(axis=0))
    print('Total score: ', score_matrix.sum())
    return gpt_results

# DistilGPT Experiments

## DistilGPT2 -- Base

First, let's look at a smaller LLM. DistilGPT2 is an 84 million parameter model that is tuned with supervision by the 124 million parameter GPT2. It is capable of language generation and responds well to fine-tuning. Generate summaries with no fine-tuning first.

In [None]:
## Define the model
llm_name = 'distilgpt2'

## Create a generator from the base model for inference.
## Generation is placed on 'cuda:0' and capped at 100 new tokens per prompt.
generator_base = pipeline("text-generation",
                     model=llm_name,
                     tokenizer=llm_name,
                     device='cuda:0',
                     max_new_tokens=100)

## Run each test-set question (the prompt without its reference summary)
## through the generator and keep the generated text.
base_responses = []
for theprompt in tqdm(tweetsum_datasets['test']['question']):
    with torch.autocast("cuda"):
        base_output = generator_base(theprompt)
    base_responses.append(base_output[0]['generated_text'])

In [None]:
## Grade the summaries using GPT-4. The grading rubric is:
##
## 1) Is the description of the customer’s question/complaint reasonably accurate?
## 2) Is the description of the agent’s response reasonably accurate?
## 3) Is the summary exactly two sentences in length?
##
## Give one point for each criterion met, for a total of 3 possible points. Return
## both a textual description of the score, and a tuple with 3 entries giving
## 0, 1, or 0.5 (if the model thought it was partially correct).
##
## Only the first 50 responses are graded.

base_gpt_results = grading_loop(base_responses[:50])

## DistilGPT2 -- Fine-tuned
Now fine-tune the model with the training data, and generate summaries of the test sample.

In [None]:
## Define and instantiate the model
llm_name = 'distilgpt2'

model = AutoModelForCausalLM.from_pretrained(llm_name)
tokenizer = AutoTokenizer.from_pretrained(llm_name)
## Register '[PAD]' as the tokenizer's padding token.
## NOTE(review): the model's embeddings are not resized after this call --
## confirm the pad id is never fed to the model.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
## Define training arguments
## Five epochs at batch size 1, evaluating on the validation split after each
## epoch; checkpoints go to ./sft_test_distilgpt2 and nothing is pushed to the hub.
training_args = TrainingArguments(
    output_dir="./sft_test_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.01,
    push_to_hub=False,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1
)

## Instantiate the trl trainer.
## Training uses the 'text' column (prompt plus reference summary). The
## tokenizer created above is not passed to the trainer here.
sft_trainer = SFTTrainer(
    model,
    train_dataset = tweetsum_datasets['train'],
    eval_dataset = tweetsum_datasets['valid'],
    args = training_args,
    dataset_text_field = 'text',
    max_seq_length=1024
)

## Train
sft_trainer.train()

In [None]:
## Create a generator from the tuned model for inference.
## The model is taken straight from the trainer; generation is placed on
## 'cuda:0' and capped at 100 new tokens per prompt.
generator_tuned = pipeline("text-generation",
                     model=sft_trainer.model,
                     tokenizer=tokenizer,
                     device='cuda:0',
                     max_new_tokens=100)

## Run each test-set question through the tuned generator and keep the
## generated text.
tuned_responses = []
for theprompt in tqdm(tweetsum_datasets['test']['question']):
    with torch.autocast("cuda"):
        tuned_output = generator_tuned(theprompt)
    tuned_responses.append(tuned_output[0]['generated_text'])

In [None]:
## Grade the summaries using GPT-4. The grading rubric is:
##
## 1) Is the description of the customer’s question/complaint reasonably accurate?
## 2) Is the description of the agent’s response reasonably accurate?
## 3) Is the summary exactly two sentences in length?
##
## Give one point for each criterion met, for a total of 3 possible points. Return
## both a textual description of the score, and a tuple with 3 entries giving
## 0, 1, or 0.5 (if the model thought it was partially correct).
##
## Only the first 50 responses are graded. These must be the fine-tuned model's
## responses; grading base_responses here would just repeat the baseline score.

tuned_gpt_results = grading_loop(tuned_responses[:50])

## DistilGPT2 -- LoRA/QLoRA-tuned
Now use LoRA or QLoRA to complete the training. One or the other may be activated by changing the `True` bool just before the call to LoraConfig.

In [None]:
## This cell instantiates both LoRA and QLoRA tuning, depending on whether
## the boolean flag below is True or False.

## Define the model name and tokenizer. The model itself is loaded inside the
## branch below, so the weights are loaded only once.

llm_name = 'distilgpt2'

tokenizer = AutoTokenizer.from_pretrained(llm_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

## Define the LoRA or QLoRA parameters. This uses a LoraConfig object which is
## passed to the trl trainer at training time. For QLoRA, you also create a
## BitsAndBytesConfig object which is passed when the base model is instantiated
## so that it can be quantized to 4-bit.

if True: ## LoRA
    ## LoRA parameters
    ## Note these numbers are the default suggested in the trl documentation about fine-tuning with LORA
    ## https://huggingface.co/docs/trl/main/en/lora_tuning_peft
    lora_params = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    ## Load the base model unquantized onto device 0
    model = AutoModelForCausalLM.from_pretrained(
        llm_name,
        device_map={"":0}
    )
    model.config.use_cache = False

else: ## QLoRA
    ## Same LoRA parameters as the branch above
    lora_params = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Quantization parameters: 4-bit NF4 weights, bfloat16 compute, no double quantization
    bnb_params = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    ## Load the base model quantized with the parameters above onto device 0
    model = AutoModelForCausalLM.from_pretrained(
        llm_name,
        quantization_config=bnb_params,
        device_map={"":0}
    )
    model.config.use_cache = False

In [None]:
## Define training arguments
## Checkpoints go to ./qlora_test_distilgpt2 whichever branch (LoRA or QLoRA)
## was selected above. Gradients are accumulated over 2 steps, and both fp16
## and bf16 mixed precision are switched off.
training_args = TrainingArguments(
    output_dir="./qlora_test_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.01,
    push_to_hub=False,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    fp16=False,
    bf16=False,
)

## Instantiate the trl trainer.
## peft_config hands the LoRA parameters to the trainer; the tokenizer with the
## '[PAD]' token is passed explicitly this time.
sft_trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tweetsum_datasets['train'],
    eval_dataset = tweetsum_datasets['valid'],
    args = training_args,
    dataset_text_field = 'text',
    max_seq_length = 1024,
    peft_config=lora_params,
)

# Train
sft_trainer.train()

In [None]:
## Create a generator from the lora-tuned model for inference.
## The model is taken straight from the trainer; generation is placed on
## 'cuda:0' and capped at 100 new tokens per prompt.
generator_lora = pipeline("text-generation",
                     model=sft_trainer.model,
                     tokenizer=tokenizer,
                     device='cuda:0',
                     max_new_tokens=100)

## Generate a summary for each test set conversation
lora_responses = []
for theprompt in tqdm(tweetsum_datasets['test']['question']):
    with torch.autocast("cuda"):
        lora_output = generator_lora(theprompt)
    lora_responses.append(lora_output[0]['generated_text'])

In [None]:
## Grade the summaries using GPT-4. The grading rubric is:
##
## 1) Is the description of the customer’s question/complaint reasonably accurate?
## 2) Is the description of the agent’s response reasonably accurate?
## 3) Is the summary exactly two sentences in length?
##
## Give one point for each criterion met, for a total of 3 possible points. Return
## both a textual description of the score, and a tuple with 3 entries giving
## 0, 1, or 0.5 (if the model thought it was partially correct).
##
## Only the first 50 responses are graded.

lora_gpt_results = grading_loop(lora_responses[:50])

# Llama 2 Experiments
Lacking more data, we want to move to a larger LLM to get a better basis for transfer learning. We select Llama-2-7b-hf for this task, with roughly 7 billion parameters.

**NOTE** Due to Llama-2's size, the run-time should be restarted at the end of each section in this portion of the tutorial to ensure GPU memory is freed up. As of July 2024, an `L4 GPU` instance (or higher memory) is of sufficient size for this portion of the tutorial.

## Llama 2 -- Base

In [None]:
## Create a generator from the base model for inference, loading the weights
## in 16-bit floating point (torch.float16).
llm_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(llm_name)
## Register '[PAD]' as the tokenizer's padding token.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

## Generation is placed on 'cuda:0' and capped at 100 new tokens per prompt.
generator_base = pipeline("text-generation",
                     model=llm_name,
                     tokenizer=tokenizer,
                     torch_dtype=torch.float16,
                     device='cuda:0',
                     max_new_tokens=100)

In [None]:
## Run each test-set question (the prompt without its reference summary)
## through the base generator and keep the generated text.
base_responses = []
for theprompt in tqdm(tweetsum_datasets['test']['question']):
    with torch.autocast("cuda"):
        base_output = generator_base(theprompt)
    base_responses.append(base_output[0]['generated_text'])

In [None]:
## Grade the summaries using GPT-4. The grading rubric is:
##
## 1) Is the description of the customer’s question/complaint reasonably accurate?
## 2) Is the description of the agent’s response reasonably accurate?
## 3) Is the summary exactly two sentences in length?
##
## Give one point for each criterion met, for a total of 3 possible points. Return
## both a textual description of the score, and a tuple with 3 entries giving
## 0, 1, or 0.5 (if the model thought it was partially correct).
##
## Only the first 50 responses are graded.

base_gpt_results = grading_loop(base_responses[:50])

## Llama 2 -- Fine-tuning
Now attempt to fine-tune the Llama-2 model using the same approach as was applied to DistilGPT2.

In [None]:
## Full fine-tuning of Llama-2 with the same trainer setup as DistilGPT2.
## NOTE: the tutorial text after this cell records it failing with a CUDA
## out-of-memory error.
llm_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(llm_name)

## Five epochs at batch size 2, evaluating on the validation split after each epoch.
training_args = TrainingArguments(
    output_dir="./sft_test_llama2",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.01,
    push_to_hub=False,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2
)

## Instantiate the trl trainer on the 'text' column.
sft_trainer = SFTTrainer(
    model,
    train_dataset = tweetsum_datasets['train'],
    eval_dataset = tweetsum_datasets['valid'],
    args = training_args,
    dataset_text_field = 'text',
    max_seq_length=1024
)

## Train
sft_trainer.train()

This cell crashed with a CUDA out-of-memory error, which looked like this:

```
---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
<ipython-input-21-085032d91470> in <cell line: 12>()
     10 )
     11
---> 12 sft_trainer = SFTTrainer(
     13     model,
     14     train_dataset = tweetsum_datasets['train'],

...

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in convert(t)
   1156                 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
   1157                             non_blocking, memory_format=convert_to_format)
-> 1158             return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
   1159
   1160         return self._apply(convert)

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacty of 15.77 GiB of which 24.12 MiB is free. Process 532183 has 15.75 GiB memory in use. Of the allocated memory 14.89 GiB is allocated by PyTorch, and 5.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```

## Llama 2 -- LoRA-tuning
As we do not have the hardware for normal fine-tuning, let us try LoRA fine-tuning.

In [None]:
llm_name = "meta-llama/Llama-2-7b-hf"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(llm_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding instead of adding a new token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix for fp16

## Base model read in 8bit and placed on device 0
base_model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    load_in_8bit=True,
    device_map={"": 0}
)
base_model.config.use_cache = False

In [None]:
# LoRA Config
peft_parameters = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
# One epoch at batch size 2 with a constant learning rate; a checkpoint and a
# log entry are written every 25 steps.
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)

# Trainer
# The tokenizer configured in the previous cell is named `tokenizer`; the
# earlier reference to `llama_tokenizer` was undefined and raised a NameError.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=tweetsum_datasets['train'],
    eval_dataset = tweetsum_datasets['valid'],
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=train_params,
    peft_config=peft_parameters
)

In [None]:
fine_tuning.train()

In [None]:
## Create a generator from the LoRA-tuned model for inference.
## The model is taken straight from the trainer; generation is placed on
## 'cuda:0' and capped at 100 new tokens per prompt.
generator_lora = pipeline("text-generation",
                     model=fine_tuning.model,
                     tokenizer=tokenizer,
                     device='cuda:0',
                     max_new_tokens=100)

## Run each test-set question through the tuned generator and keep the
## generated text.
lora_responses = []
for theprompt in tqdm(tweetsum_datasets['test']['question']):
    with torch.autocast("cuda"):
        lora_output = generator_lora(theprompt)
    lora_responses.append(lora_output[0]['generated_text'])

In [None]:
## Grade the summaries using GPT-4. The grading rubric is:
##
## 1) Is the description of the customer’s question/complaint reasonably accurate?
## 2) Is the description of the agent’s response reasonably accurate?
## 3) Is the summary exactly two sentences in length?
##
## Give one point for each criterion met, for a total of 3 possible points. Return
## both a textual description of the score, and a tuple with 3 entries giving
## 0, 1, or 0.5 (if the model thought it was partially correct).
##
## Only the first 50 responses are graded.

lora_gpt_results = grading_loop(lora_responses[:50])

## Llama 2 -- QLoRA-tuning

In [None]:
llm_name = "meta-llama/Llama-2-7b-hf"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(llm_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding instead of adding a new token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config: 4-bit NF4 weights, float16 compute, no double quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model: loaded with the quantization config above and placed on device 0
base_model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [None]:
# LoRA Config
peft_parameters = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
# One epoch at batch size 2 with a constant learning rate; a checkpoint and a
# log entry are written every 25 steps.
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)

# Trainer
# The tokenizer configured in the previous cell is named `tokenizer`; the
# earlier reference to `llama_tokenizer` was undefined and raised a NameError.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=tweetsum_datasets['train'],
    eval_dataset = tweetsum_datasets['valid'],
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=train_params,
    peft_config=peft_parameters
)

In [None]:
fine_tuning.train()

In [None]:
## Create a generator from the QLoRA-tuned model for inference.
## The model is taken straight from the trainer; generation is placed on
## 'cuda:0' and capped at 100 new tokens per prompt.
generator_qlora = pipeline("text-generation",
                     model=fine_tuning.model,
                     tokenizer=tokenizer,
                     device='cuda:0',
                     max_new_tokens=100)

## Run each test-set question through the QLoRA generator. The loop used to
## call generator_lora, which is undefined after the recommended runtime
## restart and otherwise belongs to the LoRA section's model.
qlora_responses = []
for theprompt in tqdm(tweetsum_datasets['test']['question']):
    with torch.autocast("cuda"):
        qlora_output = generator_qlora(theprompt)
    qlora_responses.append(qlora_output[0]['generated_text'])

In [None]:
## Grade the summaries using GPT-4. The grading rubric is:
##
## 1) Is the description of the customer’s question/complaint reasonably accurate?
## 2) Is the description of the agent’s response reasonably accurate?
## 3) Is the summary exactly two sentences in length?
##
## Give one point for each criterion met, for a total of 3 possible points. Return
## both a textual description of the score, and a tuple with 3 entries giving
## 0, 1, or 0.5 (if the model thought it was partially correct).
##
## Only the first 50 responses are graded.

qlora_gpt_results = grading_loop(qlora_responses[:50])