In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import json
from typing import List, Dict, Any
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset

In [3]:
class DatasetProcessor:
    def __init__(self, dataset_paths: Dict[str, str]):
        """Initialize with paths for multiple datasets."""
        self.dataset_paths = dataset_paths
        self.data = {}

    def load_datasets(self):
        """Load multiple datasets from JSON files."""
        for name, path in self.dataset_paths.items():
            with open(path, 'r', encoding='utf-8') as f:
                self.data[name] = [json.loads(line) for line in f]

    def preprocess(self, output_paths: Dict[str, str]):
        """Perform full preprocessing for all datasets and save results."""
        self.load_datasets()
        # Assuming alignment and other preprocessing are done earlier
        for name, path in output_paths.items():
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(self.data[name], f, ensure_ascii=False, indent=2)

In [4]:
class CustomDataset(Dataset):
    def __init__(self, data: List[Dict[str, Any]], tokenizer: T5Tokenizer, max_length: int = 512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        input_text = sample.get("input_text", "")
        target_text = sample.get("target_text", "")

        inputs = self.tokenizer(input_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        targets = self.tokenizer(target_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": targets["input_ids"].squeeze(0),
        }

In [5]:
class EventExtractionModel:
    def __init__(self, model_name: str):
        """Initialize the T5 model and tokenizer."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def fine_tune(self, train_data: List[Dict[str, Any]], val_data: List[Dict[str, Any]], epochs: int = 3, learning_rate: float = 1e-4, batch_size: int = 8):
        """Fine-tune the T5 model on the training data."""
        train_dataset = CustomDataset(train_data, self.tokenizer)
        val_dataset = CustomDataset(val_data, self.tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)
        self.model.train()

        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")
            total_loss = 0

            for batch in train_loader:
                optimizer.zero_grad()
                outputs = self.model(
                    input_ids=batch["input_ids"].to(self.model.device),
                    attention_mask=batch["attention_mask"].to(self.model.device),
                    labels=batch["labels"].to(self.model.device),
                )
                loss = outputs.loss
                total_loss += loss.item()

                loss.backward()
                optimizer.step()

            avg_loss = total_loss / len(train_loader)
            print(f"Training Loss: {avg_loss}")

            # Validation step
            self.evaluate(val_loader)

    def evaluate(self, val_loader: DataLoader):
        """Evaluate the model on validation data."""
        self.model.eval()
        total_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                outputs = self.model(
                    input_ids=batch["input_ids"].to(self.model.device),
                    attention_mask=batch["attention_mask"].to(self.model.device),
                    labels=batch["labels"].to(self.model.device),
                )
                total_loss += outputs.loss.item()

        avg_loss = total_loss / len(val_loader)
        print(f"Validation Loss: {avg_loss}")
        self.model.train()

In [6]:
dataset_paths = {
        "full": "/content/full.json",
        "test": "/content/test.json",
        "train": "/content/train.json",
        "dev": "/content/dev.json"
}

output_paths = {
        "full": "/content/processed_full.json",
        "test": "/content/processed_test.json",
        "train": "/content/processed_train.json",
        "dev": "/content/processed_dev.json"
}

processor = DatasetProcessor(dataset_paths)
processor.preprocess(output_paths)

In [7]:
# Load processed data
with open(output_paths["train"], 'r', encoding='utf-8') as f:
        train_data = json.load(f)
with open(output_paths["dev"], 'r', encoding='utf-8') as f:
        val_data = json.load(f)

In [8]:
# Model training
model = EventExtractionModel("t5-small")
model.fine_tune(train_data, val_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch 1/3


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Training Loss: 0.42849795195673196
Validation Loss: 0.03540017083287239
Epoch 2/3
Training Loss: 0.017584529054954976
Validation Loss: 0.0013040894409641623
Epoch 3/3
Training Loss: 0.002448578042830187
Validation Loss: 8.063568384386599e-05


In [11]:
class EventExtractionModel:
    def __init__(self, model_name: str):
        """Initialize the T5 model and tokenizer."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def fine_tune(self, train_data: List[Dict[str, Any]], val_data: List[Dict[str, Any]], epochs: int = 3, learning_rate: float = 1e-4, batch_size: int = 8):
        """Fine-tune the T5 model on the training data."""
        train_dataset = CustomDataset(train_data, self.tokenizer)
        val_dataset = CustomDataset(val_data, self.tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)
        self.model.train()

        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")
            total_loss = 0

            for batch in train_loader:
                optimizer.zero_grad()
                outputs = self.model(
                    input_ids=batch["input_ids"].to(self.model.device),
                    attention_mask=batch["attention_mask"].to(self.model.device),
                    labels=batch["labels"].to(self.model.device),
                )
                loss = outputs.loss
                total_loss += loss.item()

                loss.backward()
                optimizer.step()

            avg_loss = total_loss / len(train_loader)
            print(f"Training Loss: {avg_loss}")

            # Validation step
            self.evaluate(val_loader)

    def evaluate(self, val_loader: DataLoader):
        """Evaluate the model on validation data."""
        self.model.eval()
        total_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                outputs = self.model(
                    input_ids=batch["input_ids"].to(self.model.device),
                    attention_mask=batch["attention_mask"].to(self.model.device),
                    labels=batch["labels"].to(self.model.device),
                )
                total_loss += outputs.loss.item()

        avg_loss = total_loss / len(val_loader)
        print(f"Validation Loss: {avg_loss}")
        self.model.train()

    def save_model(self, save_path: str):
        """Save the model and tokenizer to a specified path."""
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
        print(f"Model and tokenizer saved to {save_path}.")

In [12]:
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Define a directory to save the model
save_directory = "/content/t5_event_extraction_model"

# Ensure the directory exists
os.makedirs(save_directory, exist_ok=True)

# Save the model and tokenizer
model.model.save_pretrained(save_directory)
model.tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to /content/t5_event_extraction_model
