In [15]:
# !pip install -q transformers[torch]
# !pip install -q datasets
# !pip install -q accelerate -U
# !pip install -q py7zr
# !pip install -q evaluate nltk rouge_score
# !pip install peft
# !pip install -q -U bitsandbytes

# Load Evaluator

In this section, we initialize the evaluation metrics for the language model we plan to fine-tune. Specifically, we use the `evaluate` library, part of the Hugging Face ecosystem, which is designed for evaluating and comparing the performance of models across a wide range of NLP tasks.

1. `bleu_scorer = evaluate.load('bleu')`: This line loads the BLEU (Bilingual Evaluation Understudy) scorer from the `evaluate` library. BLEU is a widely used metric for evaluating the quality of text which has been machine-translated from one natural language to another. It works by comparing the machine-generated text to one or more reference texts (typically human-generated) and computes a score indicating how similar they are, based on the presence of the same words and phrases. BLEU is particularly popular in tasks like machine translation but is also used in other contexts like text summarization.

2. `rouge_scorer = evaluate.load('rouge')`: This line loads the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) scorer. ROUGE is another popular evaluation metric used primarily in summarization tasks. Unlike BLEU, which is precision-oriented, ROUGE focuses on recall, meaning it measures how well the generated summaries cover the content present in the reference summaries. It compares the overlap of n-grams, word sequences, and word pairs between the computer-generated output and the reference texts.

These metrics can be applied after fine-tuning to compare the model's generated text against reference text, for tasks such as translation or summarization. They are not part of the training loss; they score the generated output. Using these evaluation metrics allows us to quantitatively assess the quality of the generated text and make informed decisions about the model's performance and potential improvements.


In [1]:
import evaluate

# Load the two text-overlap scorers by name through the `evaluate` library:
# BLEU first, then ROUGE.
bleu_scorer = evaluate.load(path="bleu")
rouge_scorer = evaluate.load(path="rouge")

In [None]:
import torch
import torch.nn as nn
import math
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd

# Base checkpoint; the same name is reused later when loading the tokenizers.
model_name = "facebook/opt-2.7b"

# 4-bit NF4 quantization with double quantization enabled and bfloat16 as the
# compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter hyper-parameters for a causal-LM task, configured for training
# (inference_mode=False).
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Load the quantized base model and wrap it with the LoRA adapters, then report
# how many parameters remain trainable.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config),
    peft_config,
)
model.print_trainable_parameters()

# Load the dataset

In [18]:
from datasets import load_dataset
from torch.utils.data import DataLoader

# Read the training corpus and the assignment (test) file as DataFrames.
dataset = pd.read_csv(filepath_or_buffer="training_corpus.csv")
test_dataset = pd.read_csv(filepath_or_buffer="assignments/6595203.csv")

# To experiment on a subset, slice the frames here, for example the first half:
# dataset = dataset[:len(dataset) // 2]
# test_dataset = test_dataset[:len(test_dataset) // 2]

class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset of (context, target) pairs used for fine-tuning.

    Args:
        dataset: Column container indexable by 'context' and 'target'
            (e.g. a pandas DataFrame or a dict of sequences). Both columns
            must have the same length.
        split: Tag copied into every returned item (e.g. 'train').
    """

    def __init__(self, dataset, split):
        # Materialize the columns as plain lists so that __getitem__ is purely
        # positional. Indexing a pandas Series with an int is label-based, which
        # raises KeyError (or returns the wrong row) once the frame has been
        # sliced, filtered or shuffled and no longer has a 0..n-1 index.
        self.input_x = list(dataset['context'])
        self.target = list(dataset['target'])
        self.split = split
        assert len(self.input_x) == len(self.target)  # columns must be aligned

    def __getitem__(self, idx):
        """Return the idx-th example as a dict with 'input', 'target' and 'split'."""
        return {
            'input': self.input_x[idx],
            'target': self.target[idx],
            'split': self.split,
        }

    def __len__(self):
        """Number of examples."""
        return len(self.input_x)

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset of contexts only, used for inference.

    Args:
        dataset: Column container indexable by 'context' (e.g. a pandas
            DataFrame or a dict of sequences).
        split: Tag copied into every returned item (e.g. 'test').
    """

    def __init__(self, dataset, split):
        # Store the column as a plain list so that __getitem__ is positional.
        # Integer indexing on a pandas Series is label-based and breaks on any
        # frame whose index is not 0..n-1 (after slicing, filtering, etc.).
        self.input_x = list(dataset['context'])
        self.split = split

    def __getitem__(self, idx):
        """Return the idx-th example as a dict with 'input' and 'split'."""
        return {
            'input': self.input_x[idx],
            'split': self.split,
        }

    def __len__(self):
        """Number of examples."""
        return len(self.input_x)
    

# Wrap the raw frames in map-style datasets. The split tag ('train' / 'test')
# travels with every item, and the collator reads it to decide between training
# and inference tokenization.
# The original notes give the full sizes as roughly 87.9K training rows and
# 3.61K validation rows. To work on a smaller sample, slice the DataFrame
# before wrapping it, e.g. TrainDataset(dataset[:1000], 'train').
train_dataset = TrainDataset(dataset=dataset, split='train')
test_dataset = TestDataset(dataset=test_dataset, split='test')



# Tokenizer

In [19]:
import transformers

# Upper bound, in tokens, applied when the collator tokenizes a sequence.
MAX_TOKEN_LENGTH = 128


def _load_tokenizer(side):
    """Load the tokenizer for `model_name`, padding and truncating on `side`."""
    loaded = transformers.AutoTokenizer.from_pretrained(model_name)
    loaded.padding_side = side
    loaded.truncation_side = side
    return loaded


# Two independently loaded tokenizers for the same checkpoint:
# `tokenizer` pads/truncates on the left, `rtokenizer` on the right.
tokenizer = _load_tokenizer('left')
rtokenizer = _load_tokenizer('right')

def data_collator_customized(features, return_tensors="pt"):
    """Collate dataset items into a tokenized batch for training or generation.

    Args:
        features: Non-empty list of item dicts. Every item has 'input' and
            'split'; training items additionally have 'target'. The split of
            the first item decides the mode for the whole batch.
        return_tensors: Kept for interface compatibility; PyTorch tensors are
            always produced.

    Returns:
        For a 'test' batch, the tokenized prompts only (no labels), padded and
        truncated on the left so each prompt still ends with the prediction cue.
        For any other split, a dict holding the tokenized prompt+target
        sequences plus 'labels', where padded positions are set to -100.
    """
    for_inference = (features[0]['split'] == 'test')
    input_text = [item['input'] for item in features]
    bos_token = tokenizer.bos_token
    eos_token = tokenizer.eos_token

    if for_inference:
        # NOTE(review): this prompt ends with 'Prediction: ' (trailing space)
        # while the training template below has no space after the colon.
        # Confirm the training targets start with a space, otherwise the two
        # formats differ.
        prompts = [f'{bos_token}Context: {i}. Prediction: ' for i in input_text]
        # Use the left-padding / left-truncating tokenizer for generation:
        # a decoder-only model continues from the last position, so padding must
        # not sit between the prompt and the generated tokens, and truncating
        # on the right would cut off the trailing 'Prediction: ' cue for long
        # contexts. Left truncation may drop the leading BOS on long inputs.
        return tokenizer(prompts, add_special_tokens=False, return_tensors='pt',
                         padding=True, truncation=True,
                         max_length=MAX_TOKEN_LENGTH)

    target_text = [item['target'] for item in features]
    sequences = [f'{bos_token}Context: {i}. Prediction:{t}{eos_token}'
                 for i, t in zip(input_text, target_text)]

    # NOTE(review): right truncation drops the target (and EOS) whenever the
    # context alone exceeds MAX_TOKEN_LENGTH tokens - confirm this is acceptable.
    lm_input = rtokenizer(sequences, add_special_tokens=False, return_tensors='pt',
                          padding='max_length', truncation=True,
                          max_length=MAX_TOKEN_LENGTH)

    # Labels must be an independent tensor (clone), otherwise masking them would
    # also overwrite input_ids. Padded positions get -100 so that the loss
    # ignores them instead of training the model to emit pad tokens.
    labels = lm_input['input_ids'].clone()
    labels[lm_input['attention_mask'] == 0] = -100

    return {**lm_input, 'labels': labels}

# Trainer

In [None]:
batch_size = 1

# Default decoding settings attached to the training arguments.
generation_config = transformers.GenerationConfig(
    max_length=5,
    num_beams=1,
)

training_args = transformers.Seq2SeqTrainingArguments(
    output_dir='model_output',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    num_train_epochs=1.0,
    learning_rate=0.0001,
    warmup_steps=0,
    bf16=False,  # switch to True if the GPU supports bfloat16
    fp16=True,   # FP16 is used because bf16 is disabled above
    logging_steps=1,
    report_to=['none'],
    # The custom collator needs the raw 'input' / 'target' / 'split' fields,
    # so the trainer must not strip them.
    remove_unused_columns=False,
    predict_with_generate=True,
    generation_config=generation_config,
)

trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator_customized,
)

trainer.train()

# Evaluation

In [None]:
# Run prediction over the test set; max_new_tokens=96 is passed through as a
# generation argument.
eval_result = trainer.predict(test_dataset, max_new_tokens=96)
# Despite the name, `logits` presumably holds generated token ids rather than
# raw logits, because the trainer was built with predict_with_generate=True.
logits = eval_result.predictions
# Overwrite every -100 entry with the EOS id so the array can be decoded
# (presumably -100 is the filler used when predictions of different lengths are
# padded together - TODO confirm). This writes into eval_result.predictions
# itself, since `logits` refers to the same array.
logits[logits == -100] = tokenizer.eos_token_id

In [None]:
# Decode every generated sequence back to text (special tokens are kept so the
# EOS marker can be used to cut the completion below).
predicted_text = tokenizer.batch_decode(logits)

keyword = 'Prediction:'
results = []       # one entry per decoded sequence, in test-set order
broken_count = 0   # sequences in which the 'Prediction:' cue was not found

for decoded in predicted_text:
    decoded = decoded.replace(tokenizer.pad_token, '')
    decoded = decoded.replace('<pad>', '')

    # Split at the first occurrence of the cue; `found` is empty when absent.
    before, found, completion = decoded.partition(keyword)
    if found:
        # Keep only the text up to the first EOS marker, if there is one.
        completion = completion.split(tokenizer.eos_token, 1)[0]
        results.append(completion)
    else:
        broken_count += 1
        # Keep a placeholder so `results` stays aligned one-to-one with the
        # test rows. Dropping the entry would shift every later prediction
        # onto the wrong row and make the column assignment fail on a length
        # mismatch.
        results.append('')

results
# broken_count


In [None]:
# Count the predictions that are exactly the empty string.
empty = results.count("")

# Drop a single leading space from each prediction; empty strings pass through.
new_results = [text[1:] if text.startswith(' ') else text for text in results]

# Echo each prediction, marking the ones that were empty to begin with.
for original, cleaned in zip(results, new_results):
    print("empty" if original == "" else cleaned)

empty

In [38]:
# Re-read the assignment CSV into a fresh DataFrame that will receive the predictions.
results_dataset = pd.read_csv("assignments/6595203.csv")

In [None]:
# Attach the cleaned predictions as a new 'prediction' column.
# `new_results` is expected to hold exactly one entry per row of
# `results_dataset`, in the same order.
results_dataset['prediction'] = new_results

In [None]:
# Display the frame to check the new 'prediction' column before saving.
results_dataset

In [None]:
# Write the predictions next to the original columns, without the row index.
# `index` is documented as a boolean, so pass False explicitly instead of
# relying on None being falsy.
results_dataset.to_csv('6595203.csv', index=False)