In [1]:
!pip install datasets transformers nltk

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel 

In [2]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [3]:
!pip install accelerate -U



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import transformers
from datasets import load_dataset, load_metric

In [6]:
data = load_dataset("csv",data_files="/content/drive/MyDrive/DevRev/data.csv")
data


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 264
    })
})

In [7]:
datasets_train_test = data["train"].train_test_split(test_size=72)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=50)

data["train"] = datasets_train_validation["train"]
data["validation"] = datasets_train_validation["test"]
data["test"] = datasets_train_test["test"]
data

DatasetDict({
    train: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 142
    })
    validation: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 50
    })
    test: Dataset({
        features: ['tools', 'question', 'answer'],
        num_rows: 72
    })
})

In [8]:
from transformers import AutoTokenizer


In [9]:
model_checkpoint = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
prefix = "question: "
max_input_length = 512
max_target_length = 256

prompt = '\n\n'

def preprocess_data(examples):
  inputs = [ tools + prompt + prefix + question for tools, question in zip(examples["tools"],examples["question"])]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["answer"], max_length=max_target_length,
                       truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [11]:
tokenized_datasets = data.map(preprocess_data, batched=True)
tokenized_datasets

Map:   0%|          | 0/142 [00:00<?, ? examples/s]



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tools', 'question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 142
    })
    validation: Dataset({
        features: ['tools', 'question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['tools', 'question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 72
    })
})

In [12]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [13]:
batch_size = 8
model_name = "t5-base"
model_dir = f"/content/drive/MyDrive/DevRev/model/{model_name}"
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [15]:
import numpy as np
import nltk
nltk.download('punkt')

metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [16]:
# Function that returns an untrained model to be trained
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: ignored

In [None]:
trainer.save_model()

In [None]:
model_name = "t5-base"
model_dir = f"/content/drive/MyDrive/DevRev/model/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

In [None]:
import torch

# get test split
test_tokenized_dataset = tokenized_datasets["test"]

# pad texts to the same length
def preprocess_test(examples):
  inputs = [ tools + prompt + prefix + question for tools, question in zip(examples["tools"],examples["question"])]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                           padding="max_length")
  return model_inputs

test_tokenized_dataset = test_tokenized_dataset.map(preprocess_test, batched=True)

# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=18)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
  predictions = model.generate(**batch)
  all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(test_tokenized_dataset["answer"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
print(predictions_labels)
compute_metrics(predictions_labels)

Map:   0%|          | 0/72 [00:00<?, ? examples/s]



[[tensor([   0,  784,    2,  121,  291, 1744, 4128,  121,   10,  784,    2,  121,
         291, 1744, 4128,  121,   10,  784,    2,  121]), tensor([   0,  784,    2,  121,  291, 1744, 4128,  121,   10,  784,    2,  121,
         291, 1744, 4128,  121,   10,  784,    2,  121]), tensor([   0,  784,    2,  121,  291, 1744, 4128,  121,   10,  784,    2,  121,
         291, 1744, 4128,  121,   10,  784,    2,  121]), tensor([   0,  784,    2,  121,  291, 1744, 4128,  121,   10,  784,    2,  121,
         291, 1744, 4128,  121,   10,  784,    2,  121]), tensor([   0,  784,    2,  121,  291, 1744, 4128,  121,   10,  784,    2,  121,
         291, 1744, 4128,  121,   10,  784,    2,  121]), tensor([   0,  784,    2,  121,  291, 1744, 4128,  121,   10,  784,    2,  121,
         291, 1744, 4128,  121,   10,  784,    2,  121]), tensor([   0,  784,    2,  121,  291, 1744, 4128,  121,   10,  784,    2,  121,
         291, 1744, 4128,  121,   10,  784,    2,  121]), tensor([   0,  784,    2,  121, 

{'rouge1': 17.7044,
 'rouge2': 9.1333,
 'rougeL': 17.6645,
 'rougeLsum': 17.5983,
 'gen_len': 19.0}

In [None]:
decoded_output = tokenizer.batch_decode(all_predictions[0], skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)

["arguments": ["arguments": ["
