<h1>Dataset Download</h1>

In [None]:
!pip install datasets transformers

In [79]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/991.5 kB ? eta -:--:--
   -- ------------------------------------ 61.4/991.5 kB 465.5 kB/s eta 0:00:02
   ---- --------------------------------- 122.9/991.5 kB 798.9 kB/s eta 0:00:02
   ------- -------------------------------- 194.6/991.5 kB 1.1 MB/s eta 0:00:01
   ------------- -------------------------- 337.9/991.5 kB 1.4 MB/s eta 0:00:01
   -------------------- ------------------- 501.8/991.5 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------  983.0/991.5 kB 3.0 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 2.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader, TensorDataset

# Select the compute device: the CUDA GPU when torch reports one is available,
# otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import os
from datasets import load_dataset, DatasetDict
# Download SQuAD once (or reuse the copy already cached in `data_dir`) and show its splits.
data_dir = "squad_data"
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(data_dir, exist_ok=True)
dataset = load_dataset("squad", cache_dir=data_dir)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler

# Prefer the GPU when CUDA is available; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the "squad" dataset through `datasets.load_dataset`
dataset = load_dataset("squad")

# Pretrained 't5-base' tokenizer and conditional-generation model;
# the model is placed on `device` right after loading
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Define custom dataset class
class QADataset(Dataset):
    """Question-answering examples tokenized for a T5-style seq2seq model.

    Every example is turned into the prompt
    ``"question: <question>  context: <context>"`` and paired with the first
    reference answer as the target text. All examples are tokenized eagerly
    at construction time and kept as CPU tensors; moving a batch to the
    training device is left to the caller.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` tensors of shape ``(1, length)``. If it
            exposes ``pad_token_id``, padded label positions are masked.
        dataset: Iterable of examples with ``"context"``, ``"question"`` and
            ``"answers"`` (a mapping whose ``"text"`` list holds at least
            one answer).
        max_length: Padded/truncated length of the encoded prompt.
        target_max_length: Padded/truncated length of the encoded answer.
            Defaults to ``max_length`` so label shapes are unchanged unless
            the caller opts into a shorter length.
    """

    # Label value the model's cross-entropy loss ignores.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, dataset, max_length=512, target_max_length=None):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length = max_length
        self.target_max_length = max_length if target_max_length is None else target_max_length
        self.inputs = []
        self.targets = []
        self._build()

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens (pad or truncate)."""
        return self.tokenizer(
            text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
        )

    def _build(self):
        """Tokenize every example and store the encodings in ``inputs`` / ``targets``."""
        for example in self.dataset:
            context = example["context"]
            question = example["question"]
            # Only the first reference answer is used as the training target.
            answer = example["answers"]["text"][0]

            input_text = f"question: {question}  context: {context}"

            # Encodings are kept wherever the tokenizer created them instead of being
            # pushed to the accelerator one example at a time: holding the whole
            # dataset on the GPU wastes device memory and ties the class to a global.
            self.inputs.append(self._encode(input_text, self.max_length))
            self.targets.append(self._encode(answer, self.target_max_length))

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as 1-D tensors."""
        # squeeze(0) drops only the leading batch axis added by return_tensors="pt".
        # clone() keeps the stored target encoding untouched by the masking below.
        labels = self.targets[index]['input_ids'].squeeze(0).clone()

        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is not None:
            # Padding must not contribute to the loss; without this the loss is
            # dominated by pad positions of the fixed-length target.
            labels[labels == pad_token_id] = self.IGNORE_INDEX

        return {
            'input_ids': self.inputs[index]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[index]['attention_mask'].squeeze(0),
            'labels': labels
        }

# Tokenize both SQuAD splits (train first, then validation) into QADataset objects
train_dataset, val_dataset = (
    QADataset(tokenizer, dataset[split_name]) for split_name in ("train", "validation")
)

# Batches of 4 for both loaders; only the training loader shuffles
val_dataloader = DataLoader(val_dataset, batch_size=4)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Training loop with mixed precision.
# torch.cuda.amp only applies to CUDA, so autocast and loss scaling are switched
# off when the notebook fell back to the CPU; the loop then runs in full precision.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)

num_epochs = 3
log_every = 100  # print the training loss every `log_every` steps instead of on every batch
for epoch in range(num_epochs):
    model.train()
    steps_per_epoch = len(train_dataloader)
    for step, batch in enumerate(train_dataloader, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # With the scaler disabled these calls reduce to a plain backward/step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Throttled logging: one line per batch would flood the notebook output.
        if step % log_every == 0 or step == steps_per_epoch:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step}/{steps_per_epoch}], Loss: {loss.item()}')

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the validation loader is empty.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_val_loss}')

# Save the trained model
model.save_pretrained("t5_qa_model")
tokenizer.save_pretrained("t5_qa_tokenizer")



