In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm.auto import tqdm
import nltk
import evaluate
import numpy as np
import pandas as pd
import torch
import json

2024-07-10 00:38:08.493874: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 00:38:08.524093: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 00:38:08.524144: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 00:38:08.544140: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# MODEL_NAME = "google/flan-t5-base"
MODEL_NAME = "openai-community/gpt2"
CACHE_DIR = "/scratch/adyansh/cache"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.

In [3]:
TRAIN_PATH = "../../Data/ADR-data/data_train.jsonl"
VAL_PATH = "../../Data/ADR-data/data_val.jsonl"
TEST_PATH = "../../Data/ADR-data/data_test.jsonl"

train = pd.read_json(TRAIN_PATH, lines=True).sample(frac=0.1)
val = pd.read_json(VAL_PATH, lines=True).sample(frac=0.1)
test = pd.read_json(TEST_PATH, lines=True).sample(frac=0.1)

In [4]:
len(train), len(val), len(test)

(295, 98, 98)

In [5]:
def preprocess_function(data):
    inputs = data["Context"]
    targets = data["Decision"]
    model_inputs = tokenizer(inputs.tolist(), padding="max_length", truncation=True)

    labels = tokenizer(text_target=targets.tolist(), padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
tokenized_train = preprocess_function(train)
tokenized_val = preprocess_function(val)
tokenized_test = preprocess_function(test)

In [7]:
nltk.download("punkt", quiet=True)
rouge = evaluate.load("rouge")
bleu = evaluate.load('bleu', cache_dir=CACHE_DIR)
meteor = evaluate.load('meteor', cache_dir=CACHE_DIR)
bertscore = evaluate.load("bertscore", cache_dir=CACHE_DIR)

[nltk_data] Downloading package wordnet to /home2/adyansh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home2/adyansh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home2/adyansh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   # labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   # decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   # decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
   result = {}

   result["rouge"] = rouge.compute(predictions=decoded_preds, references=decoded_labels)
   result['bleu'] = bleu.compute(predictions=decoded_preds,references= decoded_labels)
   result['meteor'] = meteor.compute(predictions=decoded_preds,references= decoded_labels)
   result['bertscore'] = bertscore.compute(predictions=decoded_preds,references= decoded_labels, lang='en', batch_size=64)
   
   cols = ['precision', 'recall', 'f1']

   for c in cols:
      result['bertscore'][c] = pd.Series(result['bertscore'][c]).mean()

   return result

In [9]:
class FineTuningDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = FineTuningDataset(tokenized_train, tokenized_train["labels"])
val_dataset = FineTuningDataset(tokenized_val, tokenized_val["labels"])
test_dataset = FineTuningDataset(tokenized_test, tokenized_test["labels"])

In [14]:
# L_RATE = 1e-3
BATCH_SIZE = 1
# PER_DEVICE_EVAL_BATCH = 4
# WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 4
NUM_EPOCHS = 20

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir='/scratch/adyansh/results',
   evaluation_strategy="steps",
   logging_dir='./logs/',
   logging_steps=1,
   log_level='info',
   # auto_find_batch_size=True,
   # learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   # per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   # weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [15]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=train_dataset,
   eval_dataset=val_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)



In [None]:
trainer.evaluate(test_dataset)

In [16]:
training_data = trainer.train() 

***** Running training *****
  Num examples = 295
  Num Epochs = 20
  Instantaneous batch size per device = 1
  Training with DataParallel so batch size has been adjusted to: 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1,480
  Number of trainable parameters = 247,577,856


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 98
  Batch size = 32


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 10.75 GiB of which 28.50 MiB is free. Including non-PyTorch memory, this process has 10.72 GiB memory in use. Of the allocated memory 8.97 GiB is allocated by PyTorch, and 939.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [15]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.7042543888092041,
 'eval_rouge': {'rouge1': 0.22862053444283675,
  'rouge2': 0.1194193450653849,
  'rougeL': 0.210150231409782,
  'rougeLsum': 0.21005370971760545},
 'eval_bleu': {'bleu': 0.0004432112980397053,
  'precisions': [0.5721078889928282,
   0.3328830730265935,
   0.25179524949364757,
   0.22340317845935823],
  'brevity_penalty': 0.0013776350717715257,
  'length_ratio': 0.13179768008137183,
  'translation_length': 12828,
  'reference_length': 97331},
 'eval_meteor': {'meteor': 0.15956976739084702},
 'eval_bertscore': {'precision': 0.8858638638639984,
  'recall': 0.8245361891185764,
  'f1': 0.8534995245909278,
  'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.40.0)'},
 'eval_runtime': 74.9283,
 'eval_samples_per_second': 13.119,
 'eval_steps_per_second': 0.414,
 'epoch': 10.0}