<h1 style="text-align: center;">T5 fine-tuning</h1>

# Libraries

In [1]:
import os, sys
from google.colab import drive
drive.mount('/content/drive')
colab_path = '/content/notebooks'
os.symlink('/content/drive/My Drive/Colab Notebooks', colab_path)
sys.path.insert(0,colab_path)

Mounted at /content/drive


In [2]:
!pip install -q -U transformers
!pip install -q -U datasets
!pip install -q tensorboard
!pip install -q sentencepiece
!pip install -q accelerate
!pip install -q evaluate
!pip install -q rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m112.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)

# Dataset

This data has been annotated manually.

In [None]:
#Import chunk dataframe
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Final Project/Field Project/Highlighted Papers/ParaChunk_with_n_chunk.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Paragraph,Chunk,File_name,n_chunks
0,0,"Fast fashion, inexpensive and widely available...","Fast fashion, inexpensive and widely available...",The global environmental injustice of fast fas...,4
1,1,"In this paper, we posit that negative external...","In this paper, we posit that negative external...",The global environmental injustice of fast fas...,4
2,2,Fast fashion is a term used to describe the re...,Fast fashion is a term used to describe the re...,The global environmental injustice of fast fas...,3
3,3,"Globally, 80 billion pieces of new clothing ar...","Globally, 80 billion pieces of new clothing ar...",The global environmental injustice of fast fas...,2
4,4,The global health costs associated with the pr...,The hazardous working conditions that attracte...,The global environmental injustice of fast fas...,3


In [None]:
# Train, Validation and test split
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=42, shuffle=True)
print(X_train.shape, X_test.shape, X_val.shape)

(427, 5) (134, 5) (107, 5)


In [None]:

columns_to_drop = ['File_name', 'n_chunks', 'Unnamed: 0']

In [None]:
from datasets import Dataset, DatasetDict

# pandas to Hugging Face Dataset
dataset_train = Dataset.from_pandas(X_train.drop(columns_to_drop,axis=1).reset_index(drop=True))
dataset_valid = Dataset.from_pandas(X_val.drop(columns_to_drop,axis=1).reset_index(drop=True))
dataset_test = Dataset.from_pandas(X_test.drop(columns_to_drop,axis=1).reset_index(drop=True))

print(dataset_train)
print(dataset_valid)
print(dataset_test)

Dataset({
    features: ['Paragraph', 'Chunk'],
    num_rows: 427
})
Dataset({
    features: ['Paragraph', 'Chunk'],
    num_rows: 107
})
Dataset({
    features: ['Paragraph', 'Chunk'],
    num_rows: 134
})


We tried to find out how the input data is distributed to find the optimal input length.

In [None]:
def find_longest_length(dataset):
    max_length = 0
    counter_50 = 0
    counter_100 = 0
    counter_250 = 0
    counter_500 = 0
    for text in dataset:
        corpus = [word for word in text.split()]
        if len(corpus) > 50:
            counter_50 += 1
        if len(corpus) > 100:
            counter_100 += 1
        if len(corpus) > 250:
            counter_250 += 1
        if len(corpus) > 500:
            counter_500 += 1
        if len(corpus) > max_length:
            max_length = len(corpus)
    return max_length, counter_50, counter_100, counter_250, counter_500

In [None]:
longest_article_length, counter_50, counter_100, counter_250, counter_500 = find_longest_length(dataset_train['Paragraph'])
print(f"longest article length: {longest_article_length} words")
print(f"article larger than 50: {counter_50}")
print(f"article larger than 100: {counter_100}")
print(f"article larger than 250: {counter_250}")
print(f"article larger than 500: {counter_500}")

longest article length: 590 words
article larger than 50: 371
article larger than 100: 267
article larger than 250: 38
article larger than 500: 3


In [None]:
def find_avg_sentence_length (dataset):
	sentence_lengths = []
	for text in dataset:
		corpus = [
			word for word in text.split()
		]
		sentence_lengths.append(len(corpus))
	return sum(sentence_lengths)/len(sentence_lengths)

avg_article_length = find_avg_sentence_length (dataset_train['Paragraph'])
print (avg_article_length)
avg_summary_length = find_avg_sentence_length(dataset_train['Chunk'])
print (avg_summary_length)

137.79859484777518
54.82903981264637


# Model Configuration

In [None]:
MODEL = 'google-t5/t5-base' #model name
BATCH_SIZE = 4
EPOCHS = 10
OUT_DIR = '/content/drive/MyDrive/T5' #output directory
MAX_LENGTH = 256 # based on previous cells

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL) #use T5 tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
#convert text data into model inputs and targets
def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['Paragraph']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Chunk']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/427 [00:00<?, ? examples/s]



Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [None]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

222,903,552 total parameters.
222,903,552 training parameters.


In [None]:
#Rouge for evaluation
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
#possible memory lack
def preprocess_logits_for_metrics(logits, labels):

    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [None]:
#Training arguments
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01, #to prevent overfitting
    logging_dir=OUT_DIR,
    logging_steps=10, # number of update steps between two logs
    eval_strategy='steps',
    eval_steps=200, # number of steps between evaluations
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=2e-4,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

# Training and result

In [None]:
history = trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.1775,0.304277,0.7499,0.7008,0.7393,78.729
400,0.1783,0.259375,0.7547,0.7057,0.7442,78.8411
600,0.0998,0.281668,0.7524,0.7048,0.7406,78.8411
800,0.0485,0.312441,0.7521,0.703,0.7409,78.8411
1000,0.009,0.332124,0.751,0.7026,0.7398,78.8411


In [None]:
tokenizer.save_pretrained(OUT_DIR)

('/content/drive/MyDrive/T5/tokenizer_config.json',
 '/content/drive/MyDrive/T5/special_tokens_map.json',
 '/content/drive/MyDrive/T5/spiece.model',
 '/content/drive/MyDrive/T5/added_tokens.json')

In [None]:
!zip -r {OUT_DIR} {OUT_DIR}

  adding: content/drive/MyDrive/T5/ (stored 0%)
  adding: content/drive/MyDrive/T5/events.out.tfevents.1720351094.5aad49ab0a73.2594.0 (deflated 62%)
  adding: content/drive/MyDrive/T5/events.out.tfevents.1720352445.503a7510f6f3.943.0 (deflated 68%)
  adding: content/drive/MyDrive/T5/checkpoint-963/ (stored 0%)
  adding: content/drive/MyDrive/T5/checkpoint-963/config.json (deflated 62%)
  adding: content/drive/MyDrive/T5/checkpoint-963/generation_config.json (deflated 29%)
  adding: content/drive/MyDrive/T5/checkpoint-963/model.safetensors (deflated 8%)
  adding: content/drive/MyDrive/T5/checkpoint-963/training_args.bin (deflated 51%)
  adding: content/drive/MyDrive/T5/checkpoint-963/optimizer.pt


zip error: Interrupted (aborting)


In [None]:
def summarize_text(text):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text, #Add a prefix to the text (to specify task for T5)
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length'
    )

    # summary generation
    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=5,
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def summarize_text(text, model, tokenizer, device):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text, #Add a prefix to the text (to specify task for T5)
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length'
    ).to(device)

    # summary generation
    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=5,
        early_stopping=True
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
summaries = []
for paragraph in dataset_test['Paragraph']:
    summary = summarize_text(paragraph, model, tokenizer, device)
    summaries.append(summary)

#print the summaries
for summary in summaries:
    print(summary)
    print('-' * 75)

This world- switch constitutes an enactor accessible sub-world in which Richards ‘receives’ the intelligence via a telegram
---------------------------------------------------------------------------
In addition to direct transmission between animals and humans via contact or via food, resistant strains and resistance genes may also spread into the environment
---------------------------------------------------------------------------
The in silico method used here allowed for the assessment of different parameters for commonly used ITS primers, including the length amplicons generated, taxonomic biases, and the consequences of primer mismatches.
---------------------------------------------------------------------------
Outliers in BPD, HC, AC, or FL were removed from the data. Generalized Additive Models for Location, Scale and Shape was applied to construct the growth curves for all four fetal routine
---------------------------------------------------------------------------
the us

In [None]:
dataset_result = pd.DataFrame(dataset_test)

In [None]:
dataset_result['result'] = summaries

In [None]:
dataset_result

Unnamed: 0,Paragraph,Chunk,result
0,After the news is broken there is a world-swit...,After the news is broken there is a world-swit...,This world- switch constitutes an enactor acce...
1,"Recently, gene sequencing has revealed that th...","Recently, gene sequencing has revealed that th...",In addition to direct transmission between ani...
2,The in silico method used here allowed for the...,The in silico method used here allowed for the...,The in silico method used here allowed for the...
3,"Outliers in BPD, HC, AC, or FL were removed fr...","Outliers in BPD, HC, AC, or FL were removed fr...","Outliers in BPD, HC, AC, or FL were removed fr..."
4,§1 describes the three kinds of fictionalism i...,\n,the usual considerations motivating fictionali...
...,...,...,...
129,where ο is the composition operator. We can no...,\n,
130,Our antenatal growth curves are unique in that...,Longitudinal growth charts were constructed fo...,Our antenatal growth curves are unique in that...
131,Because the Koopman operator calculated with l...,the predicted state is an approximation of the...,Because the Koopman operator calculated with l...
132,Looking at the song’s reception through a musi...,\n,Looking at the song’s reception through a musi...


In [None]:
#Save as .CSV
dataset_result.to_csv('/content/drive/MyDrive/Final Project/Field Project/Highlighted Papers/T5/summarized_dataset.csv', index=False)

In [None]:
dataset_result.to_csv('/content/drive/MyDrive/summarized_dataset.csv', index=False)

In [None]:
from google.colab import files

files.download('/content/drive/MyDrive/Final Project/Field Project/Highlighted Papers/T5/summarized_dataset.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
dataset_result.head()

Unnamed: 0,Paragraph,Chunk,result
0,After the news is broken there is a world-swit...,After the news is broken there is a world-swit...,This world- switch constitutes an enactor acce...
1,"Recently, gene sequencing has revealed that th...","Recently, gene sequencing has revealed that th...",In addition to direct transmission between ani...
2,The in silico method used here allowed for the...,The in silico method used here allowed for the...,The in silico method used here allowed for the...
3,"Outliers in BPD, HC, AC, or FL were removed fr...","Outliers in BPD, HC, AC, or FL were removed fr...","Outliers in BPD, HC, AC, or FL were removed fr..."
4,§1 describes the three kinds of fictionalism i...,\n,the usual considerations motivating fictionali...


In [None]:
#Rouge for evaluation
from rouge_score import rouge_scorer

def calculate_rouge_scores(hypotheses, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for hyp, ref in zip(hypotheses, references):
        score = scorer.score(hyp, ref)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)

    avg_scores = {metric: sum(values) / len(values) for metric, values in scores.items()}
    return avg_scores


In [None]:
#Define data and lable
hypotheses = dataset_result['result'].tolist()
references = dataset_result['Chunk'].tolist()


In [None]:
rouge_scores = calculate_rouge_scores(hypotheses, references)

print("ROUGE-1: {:.4f}".format(rouge_scores['rouge1']))
print("ROUGE-2: {:.4f}".format(rouge_scores['rouge2']))
print("ROUGE-L: {:.4f}".format(rouge_scores['rougeL']))


ROUGE-1: 0.3601
ROUGE-2: 0.2891
ROUGE-L: 0.3251


In [3]:
!pip freeze > requirements.txt