<a href="https://colab.research.google.com/github/satani99/byt5_small_gec/blob/main/byt5_small_gec_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle



In [None]:
from google.colab import files

# Prompt for kaggle.json. The result is bound to a name instead of being left
# as the cell's last expression, so the uploaded file contents (the API token)
# are not echoed into the saved cell output.
uploaded = files.upload()



In [4]:
!mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [5]:
 # Restrict the copied token file to owner read/write only (mode 600).
 ! chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Download the `nikhilsatani/c4-200m-1m` dataset archive with the Kaggle CLI.
! kaggle datasets download nikhilsatani/c4-200m-1m

Downloading c4-200m-1m.zip to /content
 99% 80.0M/81.1M [00:06<00:00, 21.9MB/s]
100% 81.1M/81.1M [00:06<00:00, 13.4MB/s]


In [7]:
!unzip /content/c4-200m-1m.zip

Archive:  /content/c4-200m-1m.zip
  inflating: c4_200m_1M.csv          


In [1]:
import pandas as pd 

path = "/content/c4_200m_1M.csv"

# Parse only the first 100,000 rows instead of loading the whole CSV and then
# slicing it. This also makes `df` a standalone frame rather than a slice of a
# larger one, so later column assignments do not write into a view.
df = pd.read_csv(path, nrows=100000)

len(df)

100000

In [2]:
!pip install sentencepiece
!pip install transformers
!pip install torch 
!pip install rich[jupyter]

import os
import numpy as np 
import pandas as pd
import torch 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import T5ForConditionalGeneration, AutoTokenizer

from rich.table import Column, Table
from rich import box 
from rich.console import Console 

# Shared console; record=True keeps what is printed so it can be exported
# later with console.save_text().
console = Console(record=True)

def display_df(df):
  """Print the first two columns of ``df`` as an ASCII table titled "Sample Data".

  Args:
    df: DataFrame whose first column is shown under "input" and whose second
      column is shown under "output". Every row of ``df`` is rendered, so pass
      a small slice (e.g. ``df.head(2)``).
  """
  # Local Console created without record=True: the table is printed through
  # this instance rather than through the module-level recording console.
  console = Console()
  table = Table(
      Column("input", justify="center"),
      Column("output", justify="center"),
      title="Sample Data",
      pad_edge=False,
      box=box.ASCII,
  )

  for row in df.values.tolist():
    # Table.add_row only accepts renderables such as str; stringify so that
    # NaN or numeric cells do not raise.
    table.add_row(str(row[0]), str(row[1]))

  console.print(table)

# Module-level table that train() appends an (epoch, step, loss) row to on
# every 10th step and then re-prints.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
from torch import cuda 
device = 'cuda' if cuda.is_available() else 'cpu'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
class YourDatasetClass(Dataset):
  """Map-style dataset that tokenizes (source, target) text pairs from a DataFrame.

  Args:
    dataframe: frame holding the text columns.
    tokenizer: tokenizer exposing ``batch_encode_plus``.
    source_len: maximum (padded/truncated) token length of the source text.
    target_len: maximum (padded/truncated) token length of the target text.
    source_text: name of the column holding the model input.
    target_text: name of the column holding the expected output.
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):

    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    # Honour the column names supplied by the caller instead of hard-coding
    # 'input' / 'output'.
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    """Return the number of text pairs."""
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one string, padded/truncated to exactly ``max_length`` tokens."""
    # padding="max_length" already requests fixed-length padding; the
    # deprecated pad_to_max_length flag that used to accompany it is dropped.
    return self.tokenizer.batch_encode_plus(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

  def __getitem__(self, index):
    """Return the encoded pair at position ``index`` as a dict of long tensors.

    Keys: ``source_ids``, ``source_mask``, ``target_ids`` and ``target_ids_y``
    (the last two hold the same target token ids).
    """
    # Positional lookup: samplers hand out 0..len-1, which only coincides with
    # label lookup when the frame's index has been reset.
    source_text = str(self.source_text.iloc[index])
    target_text = str(self.target_text.iloc[index])

    # Collapse every run of whitespace into a single space.
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())

    source = self._encode(source_text, self.source_len)
    target = self._encode(target_text, self.summ_len)

    # Drop only the leading batch dimension of size 1; a bare squeeze() would
    # also collapse the sequence dimension when max_length is 1.
    source_ids = source["input_ids"].squeeze(0)
    source_mask = source["attention_mask"].squeeze(0)
    target_ids = target["input_ids"].squeeze(0)

    return {
        "source_ids": source_ids.to(dtype=torch.long),
        "source_mask": source_mask.to(dtype=torch.long),
        "target_ids": target_ids.to(dtype=torch.long),
        "target_ids_y": target_ids.to(dtype=torch.long),
      }


In [4]:
def train(epoch, tokenizer, model, device, loader, optimizer):
  """Run one fine-tuning pass over ``loader``.

  Args:
    epoch: epoch number, used only for the progress table.
    tokenizer: tokenizer whose ``pad_token_id`` marks padding in the targets.
    model: sequence-to-sequence model returning the loss as its first output
      when called with ``labels``.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``source_ids``, ``source_mask`` and
      ``target_ids`` tensors.
    optimizer: optimizer stepping the model parameters.
  """
  model.train()
  for step, batch in enumerate(loader):
    ids = batch["source_ids"].to(device, dtype=torch.long)
    mask = batch["source_mask"].to(device, dtype=torch.long)

    # Use the complete target sequence as labels and let the model build the
    # decoder inputs by shifting them right behind its decoder start token.
    # Feeding y[:, :-1] / y[:, 1:] by hand skipped the start token and never
    # trained the first target token, which does not match generate().
    # clone() keeps the masking below from mutating the batch tensor.
    labels = batch["target_ids"].to(device, dtype=torch.long).clone()
    # -100 is ignored by the loss, so padding does not contribute.
    labels[labels == tokenizer.pad_token_id] = -100

    outputs = model(
        input_ids=ids,
        attention_mask=mask,
        labels=labels,
      )

    loss = outputs[0]

    if step % 10 == 0:
      # Log the scalar value rather than the tensor repr (device, grad_fn, ...).
      training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
      console.print(training_logger)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [5]:
def validate(epoch, tokenizer, model, device, loader, max_length=150):
  """Generate predictions for every batch in ``loader``.

  Args:
    epoch: validation epoch number (not used in the body).
    tokenizer: tokenizer used to decode generated and reference ids.
    model: model exposing ``generate``.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``source_ids``, ``source_mask`` and
      ``target_ids`` tensors.
    max_length: maximum generated sequence length in tokens. Defaults to the
      previous hard-coded 150; for a byte-level tokenizer this caps outputs at
      roughly 150 bytes, so raise it for longer targets.

  Returns:
    ``(predictions, actuals)``: two parallel lists of decoded strings.
  """
  model.eval()
  # The list was originally bound as `predicitons`, so the extend() below
  # raised NameError on the first batch.
  predictions = []
  actuals = []
  with torch.no_grad():
    for step, batch in enumerate(loader):
      y = batch['target_ids'].to(device, dtype=torch.long)
      ids = batch['source_ids'].to(device, dtype=torch.long)
      mask = batch['source_mask'].to(device, dtype=torch.long)

      generated_ids = model.generate(
          input_ids=ids,
          attention_mask=mask,
          max_length=max_length,
          num_beams=2,
          repetition_penalty=2.5,
          length_penalty=1.0,
          early_stopping=True,
      )

      preds = [
          tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
          for g in generated_ids
      ]
      target = [
          tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
          for t in y
      ]

      if step % 10 == 0:
        console.print(f"Completed {step}")

      predictions.extend(preds)
      actuals.extend(target)
  return predictions, actuals

In [6]:
def Byt5_small(
    dataframe, source_text, target_text, model_params, output_dir="./outputs/"
):
  """Fine-tune the checkpoint named in ``model_params`` and validate it.

  Steps: seed torch/numpy, load tokenizer and model, split ``dataframe``
  80/20 into train and validation sets, train for ``TRAIN_EPOCHS``, save the
  model and tokenizer, then generate predictions on the validation set and
  write them to CSV together with the recorded console log.

  Args:
    dataframe: frame containing the ``source_text`` and ``target_text`` columns.
    source_text: name of the input column.
    target_text: name of the expected-output column.
    model_params: dict with keys ``MODEL``, ``SEED``, ``TRAIN_BATCH_SIZE``,
      ``VALID_BATCH_SIZE``, ``TRAIN_EPOCHS``, ``VAL_EPOCHS``, ``LEARNING_RATE``,
      ``MAX_SOURCE_TEXT_LENGTH`` and ``MAX_TARGET_TEXT_LENGTH``.
    output_dir: directory receiving ``model_files/``, ``prediction.csv`` and
      ``logs.txt``.
  """
  torch.manual_seed(model_params["SEED"])
  np.random.seed(model_params["SEED"])
  torch.backends.cudnn.deterministic = True

  # Resolve every artefact path once so the status messages below always
  # report the location that was actually written.
  os.makedirs(output_dir, exist_ok=True)
  model_path = os.path.join(output_dir, "model_files")
  predictions_path = os.path.join(output_dir, "prediction.csv")
  logs_path = os.path.join(output_dir, "logs.txt")

  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])

  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)

  console.log("[Data]: Reading data...\n")

  dataframe = dataframe[[source_text, target_text]]
  display_df(dataframe.head(2))

  # 80% of the rows (sampled with the configured seed) train the model; the
  # remaining rows form the validation set. Both get a fresh 0..n-1 index.
  train_size = 0.8
  train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
  val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
  train_dataset = train_dataset.reset_index(drop=True)

  console.print(f"FULL Dataset: {dataframe.shape}")
  console.print(f"TRAIN Dataset: {train_dataset.shape}")
  console.print(f"TEST Dataset: {val_dataset.shape}\n")

  training_set = YourDatasetClass(
      train_dataset,
      tokenizer,
      model_params["MAX_SOURCE_TEXT_LENGTH"],
      model_params["MAX_TARGET_TEXT_LENGTH"],
      source_text,
      target_text,
  )

  val_set = YourDatasetClass(
      val_dataset,
      tokenizer,
      model_params["MAX_SOURCE_TEXT_LENGTH"],
      model_params["MAX_TARGET_TEXT_LENGTH"],
      source_text,
      target_text,
  )

  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      "shuffle": True,
      "num_workers": 0,
  }

  val_params = {
      "batch_size": model_params["VALID_BATCH_SIZE"],
      "shuffle": False,
      "num_workers": 0,
  }

  training_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(val_set, **val_params)

  optimizer = torch.optim.Adam(
      params=model.parameters(), lr=model_params["LEARNING_RATE"]
  )

  console.log("[Initiating Fine Tuning]...\n")

  for epoch in range(model_params["TRAIN_EPOCHS"]):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

  console.log("[Saving Model]...\n")

  model.save_pretrained(model_path)
  tokenizer.save_pretrained(model_path)

  console.log("[Initiating Validation]...\n")
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
    # Each validation epoch overwrites the same CSV.
    final_df.to_csv(predictions_path)

  console.log("[Validation Completed.]\n")
  console.print(f"[Model] Model saved @ {model_path}\n")
  # This message used to name 'predictions.csv' while the file on disk is
  # 'prediction.csv'; it now reports the real path.
  console.print(
      f"[Validation] Generation on Validation data saved @ {predictions_path}\n"
  )

  # Export the recorded log only after the final status lines, so they are
  # part of logs.txt as well.
  console.save_text(logs_path)
  console.print(f"[Logs] Logs saved @ {logs_path}\n")



In [15]:
# Model selection and hyper-parameters consumed by Byt5_small().
model_params = {
    'MODEL': "google/byt5-small",  # checkpoint name passed to from_pretrained()
    "TRAIN_BATCH_SIZE": 3,  # DataLoader batch size for the training set
    "VALID_BATCH_SIZE": 3,  # DataLoader batch size for the validation set
    "TRAIN_EPOCHS": 3,  # number of train() passes over the training loader
    "VAL_EPOCHS": 1,  # number of validate() passes over the validation loader
    "LEARNING_RATE": 1e-4,  # learning rate given to torch.optim.Adam
    "MAX_SOURCE_TEXT_LENGTH": 512,  # tokenizer max_length for the input text
    "MAX_TARGET_TEXT_LENGTH": 512,  # tokenizer max_length for the target text
    "SEED": 42,  # seeds torch, numpy and the train/validation sampling
}

In [16]:
# Prepend the task prefix only to rows that do not already carry it, so that
# re-running this cell does not stack "correction: correction: ..." onto the
# inputs.
TASK_PREFIX = "correction: "
needs_prefix = ~df["input"].str.startswith(TASK_PREFIX, na=False)
df.loc[needs_prefix, "input"] = TASK_PREFIX + df.loc[needs_prefix, "input"]

Byt5_small(
    dataframe=df,
    source_text="input",
    target_text="output",
    model_params=model_params,
    output_dir="/content/outputs",
)

RuntimeError: ignored