In [1]:
!nvidia-smi

Sun Jun 30 15:20:08 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 556.12                 Driver Version: 556.12         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650      WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   43C    P8              2W /   50W |     244MiB /   4096MiB |     18%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

# import nltk
# nltk.download('punkt')
# from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader



from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [3]:
import pytorch_lightning as pl

In [4]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [5]:
data = pd.read_csv("../data/raw_data/rawdat/AFG/quadruple.txt", sep='\t', names = ["source","relation","target","date"])
data.head()

Unnamed: 0,source,relation,target,date
0,Armed Gang (Afghanistan),Carry out suicide bombing,United States,2010-01-01
1,Central Intelligence Agency,Make statement,Taliban,2010-01-01
2,Taliban,Make statement,Attacker (Afghanistan),2010-01-01
3,Citizen (Afghanistan),Demonstrate or rally,Unspecified Actor,2010-01-01
4,Armed Gang (Afghanistan),Carry out suicide bombing,Central Intelligence Agency,2010-01-01


In [6]:
text =pd.read_csv("../data/raw_data/rawdat/AFG/text.txt", sep='\t', names = ["text"])
text.head()

Unnamed: 0,text
0,"According to a foreign news agency, a suicide ..."
1,The CIA said a Taliban bomber on Wednesday man...
2,The Taliban claimed responsibility for the att...
3,The case in Kunar has already prompted Afghans...
4,The CIA base attacked by a suicide bomber in A...


In [7]:
data['source'] = data['source'].str.strip()
data['relation'] = data['relation'].str.strip()
data['target'] = data['target'].str.strip()
text['text'] = text['text'].str.strip()

In [8]:
result = pd.concat([data,text],axis=1)
result.head()

Unnamed: 0,source,relation,target,date,text
0,Armed Gang (Afghanistan),Carry out suicide bombing,United States,2010-01-01,"According to a foreign news agency, a suicide ..."
1,Central Intelligence Agency,Make statement,Taliban,2010-01-01,The CIA said a Taliban bomber on Wednesday man...
2,Taliban,Make statement,Attacker (Afghanistan),2010-01-01,The Taliban claimed responsibility for the att...
3,Citizen (Afghanistan),Demonstrate or rally,Unspecified Actor,2010-01-01,The case in Kunar has already prompted Afghans...
4,Armed Gang (Afghanistan),Carry out suicide bombing,Central Intelligence Agency,2010-01-01,The CIA base attacked by a suicide bomber in A...


In [9]:
result.relation.value_counts()

relation
Make statement                                   53958
Use conventional military force                  33243
Consult                                          26085
Use unconventional violence                      15225
Make an appeal or request                        14141
                                                 ...  
Forgive                                              1
Ease political dissent                               1
Reject request or demand for political reform        1
Reject mediation                                     1
Investigate war crimes                               1
Name: count, Length: 218, dtype: int64

In [10]:
result.count()

source      279859
relation    279859
target      279859
date        279859
text        279859
dtype: int64

In [11]:
# result.to_csv("../data/AFG_data_quintuples/train.csv")

In [12]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
result.relation.unique()

array(['Carry out suicide bombing', 'Make statement',
       'Demonstrate or rally', "Meet at a 'third' location",
       'Employ aerial weapons', 'Acknowledge or claim responsibility',
       'Engage in negotiation', 'Make an appeal or request',
       'Use conventional military force', 'Consult',
       'Threaten with military force',
       'Mobilize or increase armed forces',
       'Abduct, hijack, or take hostage', 'Use unconventional violence',
       'Make a visit', 'Host a visit',
       'Appeal for diplomatic cooperation (such as policy support)',
       'Provide aid',
       'Conduct suicide, car, or other non-military bombing',
       'Investigate', 'Accuse', 'Criticize or denounce', 'Reject',
       'Arrest, detain, or charge with legal action', 'Praise or endorse',
       'Demand', 'fight with artillery and tanks', 'Deny responsibility',
       'Use as human shield', 'Accuse of human rights abuses',
       'fight with small arms and light weapons', 'Cooperate militarily',

In [14]:
relations = result.relation.unique()
for r in relations[:10]:
    print(r," ",tokenizer.encode(r))

Carry out suicide bombing   [11274, 63, 91, 12259, 6417, 53, 1]
Make statement   [1796, 2493, 1]
Demonstrate or rally   [15782, 29, 7, 17, 2206, 42, 13980, 1]
Meet at a 'third' location   [12325, 44, 3, 9, 3, 31, 14965, 31, 1128, 1]
Employ aerial weapons   [19631, 22142, 7749, 1]
Acknowledge or claim responsibility   [4292, 20542, 13553, 42, 1988, 3263, 1]
Engage in negotiation   [27246, 15, 16, 21862, 1]
Make an appeal or request   [1796, 46, 3958, 42, 1690, 1]
Use conventional military force   [2048, 7450, 2716, 2054, 1]
Consult   [11540, 17, 1]


In [15]:
class EventDataset(Dataset):
    def __init__(self, tokenizer, data_dir, type_path,  max_len=512):
        self.path = os.path.join(data_dir, type_path + '.csv')
        self.data_column = "text"
        self.class_column = "relation"
        self.data = pd.read_csv(self.path,names=["source","relation","target","date","text"])
        # self.data.drop_duplicates(inplace=True)
        relation_ids = pd.read_csv('../data/raw_data/rawdat/AFG/relation2id.txt', sep='\t', names=['relation','id'])
        relation_id_maps = pd.Series(relation_ids.id.values, index=relation_ids.relation).to_dict()
        self.data['relation'] = self.data['relation'].map(relation_id_maps).astype('str')
        self.data.head()
        
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
    
        self._build()
  
    def __len__(self):
        return len(self.inputs)
  
    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()
        
        src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze
        
        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
  
    def _build(self):
        for idx in range(len(self.data)):
            input_, target = self.data.loc[idx, self.data_column], self.data.loc[idx, self.class_column]      
            
            input_ = input_ + ' '
            target = target + " "
            
            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
              [input_], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
              [target], pad_to_max_length=True, return_tensors="pt"
            )
            
            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [None]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        # print(hparams)
        self.save_hyperparameters(hparams)
        
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
  
    def is_logger(self):
        return self.trainer.proc_rank <= 0
  
    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            lm_labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )
        loss = outputs[0]
        
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}
  
    def on_train_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss": loss}
  
    def on_validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]
  
    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        if self.trainer.use_tpu:
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
        
        return tqdm_dict

    def train_dataloader(self):
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4,persistent_workers=True)
        t_total = (
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4,persistent_workers=True)

In [18]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
        # Log results
        for key in sorted(metrics):
            if key not in ["log", "progress_bar"]:
                logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")

        if pl_module.is_logger():
            metrics = trainer.callback_metrics

        # Log and save results to file
        output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))
                    writer.write("{} = {}\n".format(key, str(metrics[key])))

In [19]:
args_dict = dict(
    data_dir="../data/AFG_data_quintuples/", # path for data files
    output_dir="../data/outputs/", # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=2,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

In [20]:
# dataset = EventDataset(tokenizer, '../data/AFG_data_quintuples/', 'val', 512)
# len(dataset)

In [21]:
# # data = dataset[1550]
# for i in range(20):
#     data = dataset[i]
#     # if tokenizer.decode(data['target_ids'])=='</s>':
#     # print(tokenizer.decode(data['source_ids']))
#     print(tokenizer.decode(data['target_ids']))

In [22]:
args_dict.update({'data_dir': '../data/AFG_data_quintuples', 'output_dir': '../data/outputs', 'num_train_epochs':2})
args = argparse.Namespace(**args_dict)
print(args_dict)

{'data_dir': '../data/AFG_data_quintuples', 'output_dir': '../data/outputs', 'model_name_or_path': 't5-base', 'tokenizer_name_or_path': 't5-base', 'max_seq_length': 512, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 8, 'eval_batch_size': 8, 'num_train_epochs': 2, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 42}


In [23]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=5
)
early_stopping_callback = pl.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=True,
    mode='min'
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    devices=args.n_gpu,  # Use devices instead of gpus
    accelerator='gpu' if args.n_gpu > 0 else 'cpu',  # Use accelerator
    max_epochs=args.num_train_epochs,
    # early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    # amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    # checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback(),early_stopping_callback,checkpoint_callback],
)

In [24]:
def get_dataset(tokenizer, type_path, args):
    return EventDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path,  max_len=args.max_seq_length)

In [25]:
model = T5FineTuner(args)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
trainer = pl.Trainer(**train_params)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
C:\Users\desai\anaconda3\envs\cuda_test\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [None]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M  | eval
------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: |                                                                               | 0/? [00:00<…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
