In [1]:
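# Colab setup: uncomment on a fresh runtime to install the notebook's dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install -q sentencepiece
# %pip install -q transformers
# %pip install -q "rich[jupyter]"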

In [2]:
# Mount Google Drive inside the Colab runtime; the dataset CSVs and all output
# paths used by this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Locations of the train / test / validation CSV splits on the mounted Drive.
csv_path_train = '/content/drive/MyDrive/data/Final_Dataset/pattext_train.csv'
csv_path_test = '/content/drive/MyDrive/data/Final_Dataset/pattext_test.csv'
csv_path_val = '/content/drive/MyDrive/data/Final_Dataset/pattext_validate.csv'

# Read every split into its own dataframe (same order: train, test, validation).
df_train, df_test, df_val = (
    pd.read_csv(split_path)
    for split_path in (csv_path_train, csv_path_test, csv_path_val)
)

In [4]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os
import torch
import torch.nn as nn

# Importing the T5 modules from huggingface/transformers
from transformers import T5TokenizerFast, T5ForConditionalGeneration, LongT5ForConditionalGeneration

from rich.table import Column, Table
from rich import box
from rich.console import Console

# Module-level rich console used for all notebook logging. record=True keeps
# what is printed/logged in memory so it can be exported later with
# console.save_text(...).
console=Console(record=True)

def display_df(df):
  """Print the first two columns of a dataframe as an ASCII table.

  Args:
    df: pandas DataFrame; column 0 is shown under "source_text" and
      column 1 under "target_text". Any further columns are ignored.
  """
  # A local Console() is created here (no record=True), so this table is
  # printed but does not go through the module-level recording console.
  console = Console()
  table = Table(
      Column("source_text", justify="center"),
      Column("target_text", justify="center"),
      title="Sample Data",
      pad_edge=False,
      box=box.ASCII,
  )

  for row in df.values.tolist():
    # rich can only render strings/renderables: cast explicitly so NaN or
    # numeric cells do not make add_row raise.
    table.add_row(str(row[0]), str(row[1]))

  console.print(table)

# Shared "Training Status" table; rows are appended to it while training runs.
training_logger = Table(
    *(Column(header, justify="center") for header in ("Epoch", "Steps", "Loss")),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)


In [5]:
# Setting up the device for GPU usage: tensors and the model are moved to
# 'cuda' when a CUDA device is available, otherwise everything runs on 'cpu'.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [6]:
class YourDataSetClass(Dataset):
  """
  Map-style dataset that turns one dataframe row into tokenized tensors.

  Each item is a (source, target) text pair taken from two dataframe columns,
  whitespace-normalised, tokenized, and padded/truncated to a fixed length so
  that a DataLoader can stack items into batches.

  Args:
    dataframe: pandas DataFrame holding the text columns.
    tokenizer: callable tokenizer (HuggingFace style) that accepts a list of
      texts plus max_length / truncation / padding / return_tensors and
      returns a mapping with 'input_ids' and 'attention_mask' tensors.
    source_len: fixed token length of the encoded source text.
    target_len: fixed token length of the encoded target text.
    source_text: name of the column containing the model input.
    target_text: name of the column containing the expected output.
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    """Number of (source, target) pairs."""
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one text and return (input_ids, attention_mask), each of length max_length."""
    # str() guards against non-string cells (e.g. NaN); split/join collapses
    # every run of whitespace into a single space.
    text = ' '.join(str(text).split())
    encoded = self.tokenizer(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='pt',
    )
    # squeeze(0) removes only the batch dimension; a bare squeeze() would also
    # drop the sequence dimension when max_length == 1.
    return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

  def __getitem__(self, index):
    """Return the tokenized tensors for the row at *position* ``index``."""
    # .iloc makes the lookup positional, so dataframes whose index is not
    # 0..n-1 (filtered, shuffled or split frames) work as well.
    source_ids, source_mask = self._encode(self.source_text.iloc[index], self.source_len)
    target_ids, _ = self._encode(self.target_text.iloc[index], self.summ_len)

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids': target_ids.to(dtype=torch.long),
        # Same content as 'target_ids'; key kept for existing consumers.
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [7]:
def train(epoch, tokenizer, model, device, loader, optimizer):
  """
  Run one training epoch over ``loader``.

  Args:
    epoch: epoch number, used only for the progress table.
    tokenizer: tokenizer whose ``pad_token_id`` marks padding in the targets.
    model: module called as ``model(input_ids=..., attention_mask=..., labels=...)``
      whose first output is the loss.
    device: device the batch tensors are moved to.
    loader: iterable of batches with 'source_ids', 'source_mask' and 'target_ids'.
    optimizer: optimizer stepping the model parameters.
  """

  model.train()
  for step, batch in enumerate(loader):
    ids = batch['source_ids'].to(device, dtype=torch.long)
    mask = batch['source_mask'].to(device, dtype=torch.long)

    # clone() so the in-place masking below never touches the batch tensor
    # (``.to`` returns the same tensor when no conversion is needed).
    lm_labels = batch['target_ids'].to(device, dtype=torch.long).clone()
    # -100 is ignored by the cross-entropy loss, so padding is not trained on.
    lm_labels[lm_labels == tokenizer.pad_token_id] = -100

    # Only ``labels`` is passed: T5-style models then build decoder_input_ids
    # by shifting the labels right and prepending the decoder start token.
    # Feeding target[:, :-1] by hand skips that start token, so the first
    # target token is never learned and training diverges from generate().
    outputs = model(input_ids=ids, attention_mask=mask, labels=lm_labels)
    loss = outputs[0]

    if step % 100 == 0:
      # .item() logs a plain number instead of the tensor repr.
      training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
      console.print(training_logger)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [8]:
def validate(epoch, tokenizer, model, device, loader, max_length=1024):
  """
  Generate predictions for every batch in ``loader``.

  Args:
    epoch: unused; kept for signature compatibility with callers.
    tokenizer: tokenizer used to decode generated and reference token ids.
    model: hybrid model exposing ``longt5``, ``bilstm`` and ``linear``.
    device: device the batch tensors are moved to.
    loader: iterable of batches with 'source_ids', 'source_mask' and 'target_ids'.
    max_length: maximum number of tokens to generate per example.

  Returns:
    (predictions, actuals): two lists of decoded strings, in loader order.
  """
  # Local import keeps this function self-contained; transformers is already
  # a dependency of this notebook.
  from transformers.modeling_outputs import BaseModelOutput

  model.eval()
  predictions = []
  actuals = []
  with torch.no_grad():
    for step, batch in enumerate(loader):
      y = batch['target_ids'].to(device, dtype=torch.long)
      ids = batch['source_ids'].to(device, dtype=torch.long)
      mask = batch['source_mask'].to(device, dtype=torch.long)

      # Rebuild the encoder states exactly as the hybrid forward pass does
      # (LongT5 encoder -> BiLSTM -> linear projection). Calling
      # model.longt5.generate on the raw inputs alone would re-encode with the
      # plain LongT5 encoder and silently bypass the trained BiLSTM layers.
      encoder_states = model.longt5.encoder(input_ids=ids, attention_mask=mask).last_hidden_state
      lstm_states, _ = model.bilstm(encoder_states)
      encoder_outputs = BaseModelOutput(last_hidden_state=model.linear(lstm_states))

      generated_ids = model.longt5.generate(
          input_ids=ids,
          attention_mask=mask,
          encoder_outputs=encoder_outputs,  # supplied, so generate() does not re-run the encoder
          max_length=max_length,
          num_beams=2,
          repetition_penalty=2.5,
          length_penalty=1.0,
          early_stopping=True
      )
      preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
      target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]
      if step % 100 == 0:
        console.print(f'Completed {step}')

      predictions.extend(preds)
      actuals.extend(target)
  return predictions, actuals

In [9]:
class HybridLongT5BiLSTM(nn.Module):
    """LongT5 whose encoder states are refined by a BiLSTM before decoding.

    Pipeline: LongT5 encoder -> bidirectional LSTM -> linear projection back to
    the LongT5 model dimension -> LongT5 decoder.

    Args:
        longt5_model_name: HuggingFace model id / path for LongT5 and its tokenizer.
        hidden_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        dropout_rate: dropout between LSTM layers (only applied when
            num_layers > 1). Defaults to 0.0 so the model can be rebuilt for
            inference without repeating the training value.
    """

    def __init__(self, longt5_model_name, hidden_dim, num_layers, dropout_rate=0.0):
        super().__init__()
        self.longt5 = LongT5ForConditionalGeneration.from_pretrained(longt5_model_name)
        self.tokenizer = T5TokenizerFast.from_pretrained(longt5_model_name)

        self.encoder_dim = self.longt5.config.d_model  # Dimension of LongT5 embeddings
        self.bilstm = nn.LSTM(input_size=self.encoder_dim,
                              hidden_size=hidden_dim,
                              num_layers=num_layers,
                              batch_first=True,
                              bidirectional=True,
                              # nn.LSTM warns when dropout is set on a single-layer LSTM
                              dropout=dropout_rate if num_layers > 1 else 0)

        # Linear layer to match the LSTM output dimensions with the LongT5 input dimensions
        self.linear = nn.Linear(hidden_dim * 2, self.encoder_dim)  # hidden_dim * 2 because of bidirectional

    def encode(self, input_ids, attention_mask):
        """Return the BiLSTM-refined encoder states, projected to ``encoder_dim``."""
        encoder_outputs = self.longt5.encoder(input_ids=input_ids, attention_mask=attention_mask)
        bilstm_output, _ = self.bilstm(encoder_outputs.last_hidden_state)
        return self.linear(bilstm_output)

    def forward(self, input_ids, attention_mask, decoder_input_ids=None, labels=None):
        """Run the full model; returns the LongT5 output (loss first when ``labels`` is given)."""
        refined_states = self.encode(input_ids, attention_mask)

        # encoder_outputs is supplied, so LongT5 decodes from the refined
        # states instead of running its own encoder again.
        outputs = self.longt5(input_ids=input_ids,
                              attention_mask=attention_mask,
                              encoder_outputs=(refined_states,),
                              decoder_input_ids=decoder_input_ids,
                              labels=labels)

        return outputs

    @torch.no_grad()
    def generate(self, input_ids, attention_mask, **generate_kwargs):
        """Generate token ids by decoding from the BiLSTM-refined encoder states.

        Use this instead of ``self.longt5.generate(input_ids=...)``, which
        re-encodes with the plain LongT5 encoder and therefore skips the
        BiLSTM and projection layers that the decoder was trained with.

        Args:
            input_ids: source token ids.
            attention_mask: source attention mask.
            **generate_kwargs: forwarded to ``LongT5ForConditionalGeneration.generate``
                (e.g. max_length, num_beams).
        """
        # Local import keeps the class self-contained; transformers is already
        # a dependency of this notebook.
        from transformers.modeling_outputs import BaseModelOutput

        encoder_outputs = BaseModelOutput(last_hidden_state=self.encode(input_ids, attention_mask))
        return self.longt5.generate(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    encoder_outputs=encoder_outputs,
                                    **generate_kwargs)

In [10]:
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/", val_dataframe=None):
    """
    Fine-tune the hybrid LongT5 + BiLSTM model, save its weights and write predictions.

    Args:
        dataframe: training DataFrame; must contain the two text columns.
        source_text: name of the column with the model input.
        target_text: name of the column with the expected output.
        model_params: dict providing "SEED", "MODEL", "HIDDEN_DIM", "NUM_LAYERS",
            "DROPOUT_RATE", "TRAIN_BATCH_SIZE", "VALID_BATCH_SIZE", "TRAIN_EPOCHS",
            "VAL_EPOCHS", "LEARNING_RATE", "MAX_SOURCE_TEXT_LENGTH" and
            "MAX_TARGET_TEXT_LENGTH".
        output_dir: directory receiving model_files/model_state.pt,
            predictions.csv and logs.txt.
        val_dataframe: DataFrame to generate predictions for after training.
            When omitted, the notebook-level ``df_test`` is used, as before.
    """
    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"]) # pytorch random seed
    np.random.seed(model_params["SEED"]) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5TokenizerFast.from_pretrained(model_params["MODEL"])

    # Keep only the two text columns of the dataframe that was passed in.
    dataframe = dataframe[[source_text,target_text]]
    display_df(dataframe.head(2))

    # Train on the ``dataframe`` argument (it used to be ignored in favour of
    # the global df_train). Indices are reset so the dataset's integer lookups
    # are valid even if the caller passes a filtered or shuffled frame.
    train_dataset = dataframe.reset_index(drop=True)
    if val_dataframe is None:
        # Backward-compatible fallback to the split loaded at notebook level.
        val_dataframe = df_test
    val_dataset = val_dataframe[[source_text, target_text]].reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"VALIDATION Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
    val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)

    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': model_params["TRAIN_BATCH_SIZE"],
        'shuffle': True,
        'num_workers': 0
    }

    val_params = {
        'batch_size': model_params["VALID_BATCH_SIZE"],
        'shuffle': False,  # keep row order so predictions line up with the input frame
        'num_workers': 0
    }

    # Creation of Dataloaders for training and validation
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the model
    model = HybridLongT5BiLSTM(model_params["MODEL"], model_params["HIDDEN_DIM"], model_params["NUM_LAYERS"], model_params["DROPOUT_RATE"])
    model = model.to(device)

    # Defining the optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=model_params["LEARNING_RATE"])

    # Training loop
    console.log(f'[Initiating Fine Tuning]...\n')
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log(f"[Saving Model]...\n")
    # Saving the model state dictionary (makedirs also creates output_dir itself)
    path = os.path.join(output_dir, "model_files")
    os.makedirs(path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(path, "model_state.pt"))

    # Generating predictions for the validation frame
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'pred_claims': predictions, 'claims': actuals})
        # Each validation epoch overwrites the same predictions.csv.
        final_df.to_csv(os.path.join(output_dir, 'predictions.csv'))

    console.save_text(os.path.join(output_dir, 'logs.txt'))

    console.log(f"[Validation Completed.]\n")
    console.print(f"""[Model] Model state saved @ {os.path.join(output_dir, "model_files", "model_state.pt")}\n""")
    console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir, 'predictions.csv')}\n""")
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir, 'logs.txt')}\n""")


In [11]:
# Hyper-parameters shared by the training and inference cells.
model_params = {
    # pretrained LongT5 checkpoint
    "MODEL": "google/longt5-tglobal-base",
    # BiLSTM head: hidden size per direction, number of stacked layers, dropout
    "HIDDEN_DIM": 768,
    "NUM_LAYERS": 2,
    "DROPOUT_RATE": 0.1,
    # batch sizes for training and validation
    "TRAIN_BATCH_SIZE": 1,
    "VALID_BATCH_SIZE": 1,
    # number of training and validation epochs
    "TRAIN_EPOCHS": 3,
    "VAL_EPOCHS": 1,
    # optimizer learning rate
    "LEARNING_RATE": 1e-4,
    # maximum token lengths of source and target text
    "MAX_SOURCE_TEXT_LENGTH": 2048,
    "MAX_TARGET_TEXT_LENGTH": 1024,
    # seed for reproducibility
    "SEED": 42,
}

In [12]:
# Fine-tune on the training split; artifacts are written below output_dir.
T5Trainer(
    dataframe=df_train,
    source_text="detail",
    target_text="claims",
    model_params=model_params,
    output_dir="/content/drive/MyDrive/data",
)

Output hidden; open in https://colab.research.google.com to view.

In [13]:


# Reload the fine-tuned hybrid model and generate claims for the test split.
# NOTE(review): T5Trainer writes its weights to <output_dir>/model_files;
# confirm that this directory holds the checkpoint you intend to evaluate.
model_path = "/content/drive/MyDrive/data/model_files_T5_512"
model = HybridLongT5BiLSTM(
    model_params["MODEL"],
    model_params["HIDDEN_DIM"],
    model_params["NUM_LAYERS"],
    model_params["DROPOUT_RATE"],  # required constructor argument that was missing
)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime too.
model.load_state_dict(torch.load(os.path.join(model_path, "model_state.pt"), map_location=device))
model = model.to(device)
model.eval()

# Load the tokenizer
tokenizer = T5TokenizerFast.from_pretrained(model_params["MODEL"])

# Prepare the evaluation data (needs the 'detail' and 'claims' columns)
val_dataset = df_test
val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text="detail", target_text="claims")
val_params = {
    'batch_size': model_params["VALID_BATCH_SIZE"],
    'shuffle': False,  # keep row order so predictions line up with val_dataset
    'num_workers': 0
    }

val_loader = DataLoader(val_set, **val_params)

# Generate predictions with the shared validate() helper instead of a
# copy-pasted generation loop (same generation settings).
predictions, _ = validate(0, tokenizer, model, device, val_loader)

# Save predictions to CSV. They were generated from df_test, so they are
# attached to a copy of that frame; assigning them to df_val paired them with
# the wrong rows (or raised on a length mismatch).
assert len(predictions) == len(val_dataset), "expected one prediction per input row"
df_test_predictions = val_dataset.assign(**{'Generated Claims': predictions})
df_test_predictions.to_csv("/content/drive/MyDrive/data/test_predictions_hybridnew.csv", index=False)


'\n\n# Load the model\nmodel_path = "/content/drive/MyDrive/data/model_files_Hybrid_2048"\nmodel = HybridLongT5BiLSTM(model_params["MODEL"], model_params["HIDDEN_DIM"], model_params["NUM_LAYERS"])\nmodel.load_state_dict(torch.load(os.path.join(model_path, "model_state.pt")))\nmodel = model.to(device)\nmodel.eval()\n\n# Load the tokenizer\ntokenizer = T5TokenizerFast.from_pretrained(model_params["MODEL"])\n\n# Prepare the new data (assuming you have a DataFrame new_data with columns \'detail\' and \'claims\')\nval_dataset = df_test\nval_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text="detail", target_text="claims")\nval_params = {\n    \'batch_size\': model_params["VALID_BATCH_SIZE"],\n    \'shuffle\': False,\n    \'num_workers\': 0\n    }\n\nval_loader = DataLoader(val_set, **val_params)\n\n# Generate predictions using the modified validate function\npredictions = []\nwith torch.no_grad():\n    f