<a href="https://colab.research.google.com/github/realbenpope/essay_grader/blob/main/Essay_Annotator_Llama_3_1_8b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's train an essay annotator!




In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and the model
# checkpoints used throughout this notebook are read from / written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install the training stack into the Colab runtime (output hidden by %%capture).
# Unsloth is installed from its GitHub repository; the second line installs the
# listed packages with --no-deps, pinning xformers below 0.0.27 and trl below 0.9.0.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets

In [None]:
import random
import re
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import numpy as np
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from unsloth import FastLanguageModel
import torch
from sklearn.model_selection import train_test_split
from google.colab import userdata, runtime

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Model-loading settings; these are passed to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# Alternative starting point (the 4-bit base instruct model), kept for reference:
#load_model = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
# Currently points at a checkpoint directory on Drive (presumably written by an earlier run of this notebook).
load_model = "/content/drive/MyDrive/LLM Projects/Kaggle/EssayGrader/Llama_Annotator_v4/checkpoint-40"


In [None]:
# Read the two prepared CSVs from Drive.
all_data_df = pd.read_csv('/content/drive/MyDrive/LLM Projects/Kaggle/Prepared Data/all_data.csv')

# For the annotated-arguments table, discard rows with any missing value and
# then exact duplicate rows, in a single chained expression.
arguments_df = (
    pd.read_csv('/content/drive/MyDrive/LLM Projects/Kaggle/Prepared Data/scores_arguments.csv')
    .dropna()
    .drop_duplicates()
)

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the holistic score so
# both splits keep the same score distribution.
train_data, eval_data = train_test_split(
    arguments_df,
    test_size=0.2,
    random_state=42,
    stratify=arguments_df['holistic_essay_score'],
)

# Print info about the sampled data (type, size, per-score counts) for each split
for split_label, split_df in (("training", train_data), ("evaluation", eval_data)):
    print(type(split_df))
    print(f"Total {split_label} samples: {len(split_df)}")
    print(split_df['holistic_essay_score'].value_counts().sort_index())

<class 'pandas.core.frame.DataFrame'>
Total training samples: 3245
holistic_essay_score
1.0    430
2.0    988
3.0    843
4.0    387
5.0    412
6.0    185
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Total evaluation samples: 812
holistic_essay_score
1.0    108
2.0    247
3.0    211
4.0     97
5.0    103
6.0     46
Name: count, dtype: int64


## Prompt Templates

Since I have limited training data, I had to get creative. By introducing errors into the samples and asking the LLM to fix them, data the model has seen before becomes new data.

In [None]:
def create_train_prompt(example, scramble_annotations=False, rng=None):
    """Build the instruction prompt for one training essay.

    Three prompt variants exist:

    * ``scramble_annotations=False`` -> "unmarked": the raw essay, to be tagged
      from scratch.
    * ``scramble_annotations=True`` -> with probability 0.3 "partly_marked"
      (a random share of tags is meant to be removed, then some closing tags
      are corrupted); otherwise "fully_marked" (all tags kept, some closing
      tags corrupted).

    The prompt text is emitted without any of the source-code indentation and
    with "effectiveness" spelled correctly. Checkpoints trained on the older,
    indented/misspelled prompt saw slightly different text.

    Args:
        example: Mapping (dict or DataFrame row) providing ``'full_text'`` and,
            when ``scramble_annotations`` is true, ``'annotated_text'``.
        scramble_annotations: Whether to build one of the error-correction
            variants instead of the plain tagging prompt.
        rng: Optional source of randomness exposing ``random()``, ``uniform()``
            and ``choice()`` (for example ``random.Random(seed)``). Defaults to
            the global ``random`` module, as before.

    Returns:
        The complete prompt string, stripped of leading/trailing whitespace.
    """
    rng = random if rng is None else rng

    discourse_types = ["Lead", "Position", "Claim", "Counterclaim", "Rebuttal",
                       "Evidence", "Concluding Statement"]  # TODO: Add "Unannotated"
    effectiveness_ratings = ["Effective", "Adequate", "Ineffective"]

    def introduce_errors(text):
        # Randomly change some discourse types or effectiveness ratings by
        # rewriting closing tags of the form </Type:Effectiveness>.
        def random_change(match):
            if rng.random() < 0.3:  # 30% chance to introduce an error
                new_type = rng.choice(discourse_types)
                new_effectiveness = rng.choice(effectiveness_ratings)
                # Only the closing tag is replaced; the text before it
                # (including any opening tag) is kept as-is.
                return f"{match.group(1)}</{new_type}:{new_effectiveness}>"
            return match.group(0)
        return re.sub(r'(.*?)</.*?:.*?>', random_change, text)

    if scramble_annotations:
        annotated_text = example['annotated_text']
        if rng.random() < 0.3:  # 30% chance for partly_marked
            # Keep 30-70% of annotations, introduce some errors
            keep_prob = rng.uniform(0.3, 0.7)
            # NOTE(review): this pattern only matches <Type ...>text</Type>.
            # If annotated_text uses </Type:Effectiveness> closing tags (the
            # format described in the prompt), it will not match and every
            # annotation is kept - verify against the data.
            partly_annotated = re.sub(
                r'<(\w+)([^>]*)>([^<]*)</\1>',
                lambda m: m.group(0) if rng.random() < keep_prob else m.group(3),
                annotated_text,
            )
            essay_text = introduce_errors(partly_annotated)
            instruction = (
                'Complete the tagging for all discourse segments and correct any errors in existing tags. '
                'Use this format: "<Discourse Type> Text of discourse segment. </Discourse Type:Discourse Effectiveness>"\n'
                f'Essay: {essay_text}'
            )
        else:
            essay_text = introduce_errors(annotated_text)
            instruction = (
                'Review all tagged discourse segments. Identify and correct any errors in discourse type or effectiveness rating.\n'
                f'Essay: {essay_text}'
            )
    else:
        essay_text = example['full_text']
        instruction = (
            'Tag the start and end of discourse segments with an xml-like tag that indicates its Discourse Type and Discourse Effectiveness.\n'
            'Use this format: "<Discourse Type> Text of discourse segment. </Discourse Type:Discourse Effectiveness>"\n'
            '\n'
            'Example essay text: "Online learning is overwhelmingly popular. This study shows that 80% of students prefer online learning."\n'
            'Example annotated text: "Online learning is overwhelmingly popular. <Evidence>This study shows that 80% of students prefer online learning.</Evidence:Adequate>"\n'
            f'Essay: {essay_text}'
        )

    header = (
        'For the essay below, identify these Discourse Types:\n'
        '- Lead: Attention-grabbing introduction pointing to thesis\n'
        '- Position: Main opinion/conclusion\n'
        '- Claim: Supports position\n'
        '- Counterclaim: Opposes position/claim\n'
        '- Rebuttal: Refutes counterclaim\n'
        '- Evidence: Supports claims/counterclaims/rebuttals\n'
        '- Concluding Statement: Restates position\n'
        '\n'
        'Use the effectiveness ratings: Effective, Adequate, or Ineffective.\n'
    )
    # One blank line separates the shared header from the variant-specific part.
    return f"{header}\n{instruction}".strip()

In [None]:
def create_test_prompt(example):
    """Build the evaluation prompt asking the model to tag a raw essay.

    Args:
        example: Mapping (dict or DataFrame row) providing ``'full_text'``.

    Returns:
        The prompt string, stripped of leading/trailing whitespace, ending
        with the essay text itself.
    """
    full_text = example['full_text']
    # The worked example uses the same tag shape as the stated format
    # (<Type> ... </Type:Effectiveness>), and nothing follows the essay text.
    prompt = f"""For the essay below, tag the start and end of discourse segments with an xml-like tag that indicates its Discourse Type and Discourse Effectiveness. Use this format: "<Discourse Type> Text of the discourse </Discourse Type:Discourse Effectiveness>"
Identify these Discourse Types:
- Lead: Attention-grabbing introduction pointing to the thesis
- Position: Main opinion/conclusion
- Claim: Supports position
- Counterclaim: Opposes position/claim
- Rebuttal: Refutes counterclaim
- Evidence: Supports claims/counterclaims/rebuttals
- Concluding Statement: Restates position

Assign a discourse effectiveness of Effective, Adequate, or Ineffective.

Example essay text: Online learning is overwhelmingly popular. This study shows that 80% of students prefer online learning.
Example annotated text: Online learning is overwhelmingly popular. <Evidence>This study shows that 80% of students prefer online learning.</Evidence:Adequate>

Essay:
{full_text}
"""
    return prompt.strip()

In [None]:
# Add a 'prompt' column to each split. `.assign` returns a new frame instead of
# writing into the frames produced by train_test_split, so the cell can be
# re-run safely and does not trigger pandas' SettingWithCopyWarning.
train_data = train_data.assign(
    prompt=train_data.apply(lambda row: create_train_prompt(row, scramble_annotations=False), axis=1)
)
# Positional lookup: after the shuffle, row label 0 is not guaranteed to be in
# the training split, so `['prompt'][0]` could raise KeyError.
print(train_data['prompt'].iloc[0])

eval_data = eval_data.assign(
    prompt=eval_data.apply(lambda row: create_test_prompt(row), axis=1)
)
print(eval_data['prompt'].iloc[0])

For the essay below, identify these Discourse Types:
- Lead: Attention-grabbing introduction pointing to thesis
- Position: Main opinion/conclusion
- Claim: Supports position
- Counterclaim: Opposes position/claim
- Rebuttal: Refutes counterclaim
- Evidence: Supports claims/counterclaims/rebuttals
- Concluding Statement: Restates position

Use the effctiveness ratings: Effective, Adequate, or Ineffective.

        Tag the start and end of discourse segments with an xml-like tag that indicates its Discourse Type and Discourse Effectiveness.
        Use this format: "<Discourse Type> Text of discourse segment. </Discourse Type:Discourse Effectiveness>"

        Example essay text: "Online learning is overwhelmingly popular. This study shows that 80% of students prefer online learning."
Example annotated text: "Online learning is overwhelmingly popular. <Evidence>This study shows that 80% of students prefer online learning.</Evidence:Adequate>"
        Essay: Phones

Modern humans today a

In [None]:
def prepare_sharegpt_dataset(df):
    """Convert a prompt/annotation DataFrame into a ShareGPT-style dataset.

    Each row becomes a two-turn conversation: the human turn carries the
    row's 'prompt' and the model turn carries its 'annotated_text'. The model
    turn is labelled "gpt", the ShareGPT role name that Unsloth's ShareGPT
    role mapping translates to the assistant role.

    Args:
        df: DataFrame with 'prompt', 'annotated_text' and 'essay_id' columns.

    Returns:
        A `datasets.Dataset` with 'id' (the DataFrame index label),
        'essay_id' and 'conversations' features.
    """
    def create_sharegpt_conversation(index, row):
        conversation = [
            {"from": "human", "value": row['prompt']},
            {"from": "gpt", "value": row['annotated_text']},
        ]
        return {
            "id": index,
            "essay_id": row['essay_id'],
            "conversations": conversation
        }

    data = [create_sharegpt_conversation(i, row) for i, row in df.iterrows()]

    return Dataset.from_list(data)

In [None]:
from datasets import Dataset

def prepare_alpaca_dataset(df):
    """Turn a prompt/annotation DataFrame into an Alpaca-format dataset.

    Args:
        df: DataFrame with 'essay_id', 'prompt' and 'annotated_text' columns.

    Returns:
        A `datasets.Dataset` whose records hold 'id' (the DataFrame index
        label), 'essay_id', 'instruction' (the prompt), an empty 'input'
        and 'output' (the annotated text).
    """
    records = []
    for row_label, row in df.iterrows():
        records.append({
            "id": row_label,
            "essay_id": row['essay_id'],
            "instruction": row['prompt'],
            # Alpaca's optional 'input' field is unused here and left empty.
            "input": "",
            "output": row['annotated_text'],
        })
    return Dataset.from_list(records)

In [None]:
# Convert both dataframes to Alpaca-format datasets
# (prepare_sharegpt_dataset is the ShareGPT-format alternative).
train_dataset, eval_dataset = (
    prepare_alpaca_dataset(split_df) for split_df in (train_data, eval_data)
)
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['id', 'essay_id', 'instruction', 'input', 'output'],
    num_rows: 3245
})
Dataset({
    features: ['id', 'essay_id', 'instruction', 'input', 'output'],
    num_rows: 812
})


In [None]:


# Load the model and tokenizer named by `load_model` through Unsloth, using the
# sequence length, dtype and 4-bit settings from the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = load_model,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = userdata.get('huggingface')  # Hugging Face token read from the Colab secret named 'huggingface'
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters on the attention (q/k/v/o) and
# MLP (gate/up/down) projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,  # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # LoRA scaling factor
    lora_dropout = 0, # 0 is optimized
    bias = "none",    # "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from unsloth.chat_templates import get_chat_template

# Configure the tokenizer with Unsloth's "alpaca" chat template.
# NOTE(review): the training strings below are assembled by hand in
# formatting_prompts_func rather than via the tokenizer's chat template —
# confirm this call is still needed.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "alpaca",
)
# Need to add EOS_TOKEN to the end of the Assistant portion.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(example):
    """Render a batch of instruction/output pairs into training strings.

    Args:
        example: Batched mapping with parallel 'instruction' and 'output' lists.

    Returns:
        ``{"text": [...]}`` with one string per pair, laid out as
        ``<instruction_template> <instruction>\\n<response_template> <output> <EOS_TOKEN>``.
        The two templates and EOS_TOKEN are read from module scope at call time.
    """
    return {
        "text": [
            f"{instruction_template} {prompt_text}\n{response_template} {completion} {EOS_TOKEN}"
            for prompt_text, completion in zip(example['instruction'], example['output'])
        ]
    }


# Markers that introduce the two turns in every rendered training string;
# formatting_prompts_func reads them when it builds the text.
instruction_template = "### Human:"
response_template = "### Assistant:"

# Data collator to train only on completions
# (the same two markers are handed to the collator so it can locate each turn).
collator = DataCollatorForCompletionOnlyLM(
    instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)

# Replace every original column of both splits with the single rendered 'text' column.
train_dataset, eval_dataset = (
    split.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, eval_dataset)
)

# Inspect the resulting schema and the first record of each split.
print(train_dataset.features)
print(train_dataset[0])
print(eval_dataset[0])

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Map:   0%|          | 0/812 [00:00<?, ? examples/s]

{'text': Value(dtype='string', id=None)}
{'text': '### Human: For the essay below, identify these Discourse Types:\n- Lead: Attention-grabbing introduction pointing to thesis\n- Position: Main opinion/conclusion\n- Claim: Supports position\n- Counterclaim: Opposes position/claim\n- Rebuttal: Refutes counterclaim\n- Evidence: Supports claims/counterclaims/rebuttals\n- Concluding Statement: Restates position\n\nUse the effctiveness ratings: Effective, Adequate, or Ineffective.\n\n        Tag the start and end of discourse segments with an xml-like tag that indicates its Discourse Type and Discourse Effectiveness.\n        Use this format: "<Discourse Type> Text of discourse segment. </Discourse Type:Discourse Effectiveness>"\n\n        Example essay text: "Online learning is overwhelmingly popular. This study shows that 80% of students prefer online learning."\nExample annotated text: "Online learning is overwhelmingly popular. <Evidence>This study shows that 80% of students prefer onlin

In [None]:
# Sanity check: show the dataset schema and the first formatted training record.
print(train_dataset.features)
print(train_dataset[0])

{'text': Value(dtype='string', id=None)}
{'text': '### Human: For the essay below, identify these Discourse Types:\n- Lead: Attention-grabbing introduction pointing to thesis\n- Position: Main opinion/conclusion\n- Claim: Supports position\n- Counterclaim: Opposes position/claim\n- Rebuttal: Refutes counterclaim\n- Evidence: Supports claims/counterclaims/rebuttals\n- Concluding Statement: Restates position\n\nUse the effctiveness ratings: Effective, Adequate, or Ineffective.\n\n        Tag the start and end of discourse segments with an xml-like tag that indicates its Discourse Type and Discourse Effectiveness.\n        Use this format: "<Discourse Type> Text of discourse segment. </Discourse Type:Discourse Effectiveness>"\n\n        Example essay text: "Online learning is overwhelmingly popular. This study shows that 80% of students prefer online learning."\nExample annotated text: "Online learning is overwhelmingly popular. <Evidence>This study shows that 80% of students prefer onlin

<a name="Train"></a>
### Train the model
My first training run looked good: the training loss reached a low point and the eval loss kept falling as well, albeit more slowly. Then I ran an inference pass and found that the model had learned to repeat my prompt. D'oh! 🤡 I need to re-train on completions only.

In [None]:
# EarlyStoppingCallback is not imported anywhere else in the notebook, so
# import it here to keep this cell runnable on a fresh kernel.
from transformers import EarlyStoppingCallback

# Stop training when the monitored metric stops improving.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of consecutive evaluations with no improvement after which training will be stopped
    early_stopping_threshold=0.01  # Minimum change to qualify as an improvement
)

In [None]:
# Training hyper-parameters and output location.
batch_size = 4
gradient_steps = 4
eval_batch_size = 2
eval_steps = 20
train_epochs = 1
save_directory = "/content/drive/MyDrive/LLM Projects/Kaggle/EssayGrader/Llama_Annotator_v4"

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the rendered 'text' column, with loss restricted to
# completions by `collator` and early stopping driven by eval loss.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field = 'text',
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    data_collator=collator,
    callbacks=[early_stopping_callback],

    args = TrainingArguments(
        per_device_train_batch_size = batch_size,
        gradient_accumulation_steps = gradient_steps,
        warmup_ratio = 0.05,
        num_train_epochs = train_epochs,
        learning_rate = 2e-4,
        # Use bf16 where the GPU supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_dir=save_directory,
        logging_steps=1,
        report_to="tensorboard",
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = save_directory,

        # Evaluation settings
        fp16_full_eval = True,
        per_device_eval_batch_size = eval_batch_size,
        eval_strategy = "steps",
        eval_steps = eval_steps,
        greater_is_better=False,

        # Checkpoint on the same cadence as evaluation: load_best_model_at_end
        # needs a checkpoint for every evaluated step, so tie the two together
        # instead of hard-coding the interval twice.
        save_strategy="steps",
        save_steps=eval_steps,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    ),
)

Map (num_proc=2):   0%|          | 0/3245 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/812 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
# Record the GPU's total memory and the memory already reserved before
# training, both in GiB rounded to 3 decimals; the post-training report
# subtracts `start_gpu_memory` to estimate the training overhead.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
6.457 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

# Save the final adapter and tokenizer to the Drive folder held in
# `save_directory`. Passing the string literal "save_directory" instead would
# write to a folder of that name on the ephemeral Colab disk, which is lost
# when the runtime is released at the end of this cell.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

# Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Release the Colab runtime once everything has been written to Drive.
runtime.unassign()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,245 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 203
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss,Validation Loss
20,0.0475,0.083913
40,0.0389,0.068433


Step,Training Loss,Validation Loss
20,0.0475,0.083913
40,0.0389,0.068433
60,0.0411,0.074783


KeyboardInterrupt: 

In [None]:
# Show final memory and time stats
# (reads `trainer_stats`, `start_gpu_memory` and `max_memory` from earlier cells)
used_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
train_seconds = trainer_stats.metrics['train_runtime']
print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Release the Colab runtime.
runtime.unassign()

<a name="Inference"></a>
### Inference
Now we'll run the model and get some samples to review.

In [None]:
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
# Reload the fine-tuned weights for generation.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = load_model, # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    )
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Configure the tokenizer the same way as in the training section ("alpaca"),
# so training and inference share one tokenizer setup.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "alpaca",
)

EOS_TOKEN = tokenizer.eos_token

# The training section replaced every column of `eval_dataset` with a single
# 'text' column, but inference needs 'essay_id', 'instruction' and 'output'.
# Rebuild the evaluation split from the dataframe instead of re-mapping it:
# a batched `Dataset.map` function must return a dict, so mapping a function
# that returns a plain list (as this cell previously did) fails.
eval_dataset = prepare_alpaca_dataset(eval_data)

In [None]:
import torch
from tqdm import tqdm
from unsloth import FastLanguageModel
# Put the model into Unsloth's inference mode and capture the tokenizer's
# end-of-sequence token before running generation.
FastLanguageModel.for_inference(model)
EOS_TOKEN = tokenizer.eos_token
def run_inference(model, tokenizer, eval_dataset, max_new_tokens=2048, num_samples=10,
                  instruction_template="### Human:", response_template="### Assistant:"):
    """Generate annotations for the first `num_samples` evaluation essays.

    Each instruction is wrapped in the same turn markers used to build the
    training strings (``<instruction_template> <instruction>\\n<response_template>``)
    so the model is asked to continue from the start of the assistant turn.
    Only the newly generated tokens are decoded, so the response never
    contains an echo of the prompt.

    Args:
        model: Causal LM exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer matching `model`.
        eval_dataset: Dataset with 'essay_id', 'instruction' and 'output' columns.
        max_new_tokens: Generation budget per essay.
        num_samples: Number of leading records to run (capped at the dataset size).
        instruction_template: Marker placed before the instruction.
        response_template: Marker placed before the model's answer.

    Returns:
        A list of dicts with 'essay_id', 'instruction', 'human_annotated'
        and 'model_output'.
    """
    model.eval()
    results = []

    # Take only the first num_samples from the dataset
    sample_dataset = eval_dataset.select(range(min(num_samples, len(eval_dataset))))

    for item in tqdm(sample_dataset):
        instruction = item['instruction']
        prompt = f"{instruction_template} {instruction}\n{response_template}"

        # NOTE(review): truncation presumably cuts from the end of the prompt,
        # so an over-long essay would lose the response marker — confirm the
        # evaluation prompts fit within max_length.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                num_return_sequences=1
            )

        # Extract only the assistant's response: drop the prompt tokens that
        # generate() returns in front of the continuation.
        assistant_response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        results.append({
            'essay_id': item['essay_id'],
            'instruction': instruction,
            'human_annotated': item['output'],
            'model_output': assistant_response
        })

    return results

# Run inference on 10 samples
inference_results = run_inference(model, tokenizer, eval_dataset, num_samples=10)

# Convert results to DataFrame
eval_results_df = pd.DataFrame(inference_results)

# Save to CSV
# (written to Drive, without the DataFrame index, for side-by-side review of
# human and model annotations)
eval_results_df.to_csv('/content/drive/MyDrive/LLM Projects/Kaggle/EssayGrader/Llama_Annotator_v4/v4_eval_results_comparison.csv', index=False)

100%|██████████| 10/10 [25:34<00:00, 153.49s/it]
