In [1]:
!pip install torch transformers pandas datasets rouge-score evaluate

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ba1f6c130faa0efe7cbc5e0e17bb6d095be194b528eef2cb6de85bb45457ad93
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.3 rouge-score-0.1.2


In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer

# Read the preprocessed training and validation splits from the Kaggle input mount
train_file_path = "/kaggle/input/input-data/train_600.csv"
val_file_path = "/kaggle/input/input-data/val_600.csv"

# Both CSVs are parsed the same way, training split first
train_df, val_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, val_file_path)
)

# Ensure all values are strings and handle missing data
def preprocess_dataframe(df):
    """Return a copy of ``df`` whose 'Text' and 'Summary' columns hold plain strings.

    Missing values are replaced with an empty string *before* the string cast.
    A bare ``astype(str)`` turns NaN/None into the literal words "nan"/"None",
    which would then be tokenized as if they were real article or summary text.

    Args:
        df: DataFrame that contains 'Text' and 'Summary' columns.

    Returns:
        A new DataFrame. The caller's frame is not modified, so re-running the
        calling cell always starts from the same input.

    Raises:
        KeyError: if either column is absent.
    """
    cleaned = df.copy()
    for column in ('Text', 'Summary'):
        values = cleaned[column]
        # Go through object dtype so '' can stand in for a missing value
        # regardless of the column's original dtype (float, object, string...).
        cleaned[column] = values.astype(object).where(values.notna(), '').astype(str)
    return cleaned

# Normalise both splits with the helper defined above, training split first
train_df, val_df = (preprocess_dataframe(frame) for frame in (train_df, val_df))

# Pick the compute device: GPU when torch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
class T5PointerGenerator(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained ``T5ForConditionalGeneration``.

    Despite the name, no pointer/copy mechanism is implemented in this class:
    ``forward`` and ``generate`` only delegate to the wrapped model. The
    matching tokenizer is loaded alongside it and exposed as ``self.tokenizer``.
    """

    def __init__(self, model_name):
        """Load the pretrained model and tokenizer identified by ``model_name``."""
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None, decoder_input_ids=None):
        """Run the wrapped model and return its output object unchanged.

        Hidden states are always requested. Attention maps are requested only
        when ``labels`` is None, as in the original two-branch implementation.

        Args:
            input_ids: encoder token ids.
            attention_mask: encoder attention mask.
            labels: optional target token ids; forwarded to the model when given.
            decoder_input_ids: optional explicit decoder inputs. Per the
                Transformers documentation, T5 derives decoder inputs from
                ``labels``; without ``labels`` the caller has to supply them,
                which the previous signature gave no way to do. Omitted from
                the call when None, so existing callers see the same call.

        Returns:
            Whatever ``self.model(...)`` returns.
        """
        model_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "output_hidden_states": True,
        }
        if labels is not None:
            model_kwargs["labels"] = labels
        else:
            model_kwargs["output_attentions"] = True
        if decoder_input_ids is not None:
            model_kwargs["decoder_input_ids"] = decoder_input_ids
        return self.model(**model_kwargs)

    def generate(self, *args, **kwargs):
        """
        Forward the generate call to the underlying T5 model's generate method.
        """
        return self.model.generate(*args, **kwargs)


In [6]:
class SummarizationDataset(Dataset):
    """Dataset over a DataFrame with 'Text' and 'Summary' columns, tokenized on access.

    Each item is a tuple ``(input_ids, attention_mask, labels)``: the first two
    come from tokenizing 'Text', the last is the token ids of 'Summary'.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=700, max_target_length=200):
        """Store the frame, the tokenizer and the two padding/truncation lengths."""
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def _encode(self, string, limit):
        """Tokenize ``string`` to ``limit`` tokens: padded to max_length, truncated, as "pt" tensors."""
        return self.tokenizer(
            string,
            max_length=limit,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (by position) and return its three tensors."""
        row = self.data.iloc[idx]
        text, summary = row['Text'], row['Summary']

        source = self._encode(text, self.max_input_length)
        target = self._encode(summary, self.max_target_length)

        # squeeze() drops size-1 dimensions; presumably the tokenizer returns
        # a leading batch axis of 1 here - TODO confirm.
        return (
            source["input_ids"].squeeze(),
            source["attention_mask"].squeeze(),
            target["input_ids"].squeeze(),
        )

In [6]:
model_name = "t5-small"

# Seed torch's global RNG so the shuffled batch order (and any other torch
# randomness during training) is repeatable after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# The wrapper already loads the tokenizer for `model_name`; reuse that instance
# instead of instantiating a second, identical one.
model = T5PointerGenerator(model_name=model_name).to(device)
tokenizer = model.tokenizer

train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)

# Only the training batches are shuffled; validation keeps file order
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# Label positions equal to the pad token id are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``. The
    labels are passed to the model, but only its ``logits`` are used: the loss
    is computed by ``criterion`` on the flattened logits and labels.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        src_ids, src_mask, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        logits = model(src_ids, src_mask, labels=targets).logits

        # Flatten all but the last axis so every position is one classification row
        last_dim = logits.size(-1)
        loss = criterion(logits.view(-1, last_dim), targets.view(-1))

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)


In [10]:
def validate_model(model, dataloader, tokenizer, device, max_length=200, num_beams=4):
    """Generate a summary for every batch and decode predictions and references to text.

    Args:
        model: object exposing ``eval()`` and ``generate(...)``.
        dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
        tokenizer: object exposing ``batch_decode``; ``labels`` must hold ids it
            can decode.
        device: device the encoder inputs are moved to before generation.
        max_length: ``max_length`` passed to ``generate`` (default 200, the
            previously hard-coded value).
        num_beams: ``num_beams`` passed to ``generate`` (default 4, the
            previously hard-coded value).

    Returns:
        ``(predictions, references)``: two lists of decoded strings in
        dataloader order.
    """
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            generated_ids = model.generate(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
            )

            predictions.extend(
                tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            )
            # Labels are only turned back into text, so they are decoded where
            # they are instead of being copied to `device` first.
            references.extend(
                tokenizer.batch_decode(labels, skip_special_tokens=True)
            )

    return predictions, references


In [11]:
def calculate_rouge(predictions, references):
    """Average ROUGE-1/2/L F-measures over paired predictions and references.

    Args:
        predictions: sequence of generated summaries.
        references: sequence of reference summaries, aligned with ``predictions``.

    Returns:
        Dict with keys 'rouge1', 'rouge2', 'rougeL' holding the mean F-measure.
        All three are 0.0 when there is nothing to score (previously this
        raised ZeroDivisionError).

    Raises:
        ValueError: if the two sequences differ in length. ``zip`` would
            otherwise drop the unmatched tail while the average still divided
            by ``len(predictions)``, silently skewing the scores.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')

    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )
    if len(predictions) == 0:
        return {name: 0.0 for name in metric_names}

    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    totals = dict.fromkeys(metric_names, 0.0)

    for pred, ref in zip(predictions, references):
        # The scorer is called as score(reference, prediction)
        scores = scorer.score(ref, pred)
        for name in metric_names:
            totals[name] += scores[name].fmeasure

    return {name: total / len(predictions) for name, total in totals.items()}


In [10]:
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}")

# Validation runs once, after the last epoch
predictions, references = validate_model(model, val_loader, tokenizer, device)
rouge_scores = calculate_rouge(predictions, references)

print(f"Validation ROUGE Scores: {rouge_scores}")

# Save the trained model
# NOTE(review): save_pretrained() is documented to write into a directory, so
# the ".pth" suffix names a folder, not a torch checkpoint file. The path is
# kept as-is so anything that already reads it keeps working.
model_save_path = "/kaggle/working/t5_pointer_generator_model.pth"
# Plain local variable: the path used to be stored as an ad-hoc attribute on
# the nn.Module, which has nothing to do with the model itself.
tokenizer_save_path = "/kaggle/working/t5_tokenizer"

model.model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to {model_save_path}")

Epoch 1/5, Loss: 3.0435
Epoch 3/5, Loss: 2.5955
Epoch 4/5, Loss: 2.5078
Epoch 5/5, Loss: 2.4379
Validation ROUGE Scores: {'rouge1': 0.4458189776248656, 'rouge2': 0.1842469108144673, 'rougeL': 0.27792086658622395}
Model saved to /kaggle/working/t5_pointer_generator_model.pth


In [7]:
train_file_path = "/kaggle/input/input-300/train_300.csv"
val_file_path = "/kaggle/input/input-300/val_300.csv"

# Re-bind the global frames to the "input-300" files, training split first
train_df, val_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, val_file_path)
)

# Ensure all values are strings and handle missing data
def preprocess_dataframe(df):
    """Return a copy of ``df`` whose 'Text' and 'Summary' columns hold plain strings.

    NOTE(review): this cell re-defines a helper that the first data-loading
    cell already defines; whichever cell runs last wins, so keep the two
    definitions identical or drop one of them.

    Missing values are replaced with an empty string *before* the string cast.
    A bare ``astype(str)`` turns NaN/None into the literal words "nan"/"None",
    which would then be tokenized as if they were real article or summary text.

    Args:
        df: DataFrame that contains 'Text' and 'Summary' columns.

    Returns:
        A new DataFrame; the caller's frame is not modified.

    Raises:
        KeyError: if either column is absent.
    """
    cleaned = df.copy()
    for column in ('Text', 'Summary'):
        values = cleaned[column]
        # Go through object dtype so '' can stand in for a missing value
        # regardless of the column's original dtype.
        cleaned[column] = values.astype(object).where(values.notna(), '').astype(str)
    return cleaned

# Normalise both freshly loaded splits with the helper, training split first
train_df, val_df = (preprocess_dataframe(frame) for frame in (train_df, val_df))

In [8]:
model_name = "t5-small"

# Seed torch's global RNG so the shuffled batch order (and any other torch
# randomness during training) is repeatable after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# A fresh wrapper is built for this run; it already loads the tokenizer for
# `model_name`, so reuse that instance instead of instantiating a second one.
model = T5PointerGenerator(model_name=model_name).to(device)
tokenizer = model.tokenizer

train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)

# Only the training batches are shuffled; validation keeps file order
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# Label positions equal to the pad token id are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}")

# Validation runs once, after the last epoch
predictions, references = validate_model(model, val_loader, tokenizer, device)
rouge_scores = calculate_rouge(predictions, references)

print(f"Validation ROUGE Scores: {rouge_scores}")

# Save the trained model
# NOTE(review): save_pretrained() is documented to write into a directory, so
# the ".pth" suffix names a folder, not a torch checkpoint file. The path is
# kept as-is so anything that already reads it keeps working.
model_save_path = "/kaggle/working/t5_pointer_generator_model_300.pth"
# Plain local variable: the path used to be stored as an ad-hoc attribute on
# the nn.Module, which has nothing to do with the model itself.
tokenizer_save_path = "/kaggle/working/t5_tokenizer_300"

model.model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to {model_save_path}")

Epoch 1/5, Loss: 3.4305
Epoch 2/5, Loss: 3.1189
Epoch 3/5, Loss: 2.9836
Epoch 5/5, Loss: 2.8197
Validation ROUGE Scores: {'rouge1': 0.41712182652945134, 'rouge2': 0.15597624896170764, 'rougeL': 0.2572208205406291}
Model saved to /kaggle/working/t5_pointer_generator_model_300.pth
