In [None]:
import kagglehub

# Download the latest version of the newspaper text-summarization
# (CNN/DailyMail) dataset via kagglehub and keep the returned location in `path`.
path = kagglehub.dataset_download("gowrishankarp/newspaper-text-summarization-cnn-dailymail")

print("Path to dataset files:", path)

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import numpy as np
from tqdm import tqdm
import wandb  # Optional for logging
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and the LM-head model from the same pretrained checkpoint name.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token, on both the tokenizer
# and the model config, so the two agree on the pad id.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id



In [4]:
# Display the model architecture. A bare `model.forward()` call supplies no
# inputs and raises, whereas the recorded output of this cell is the module
# summary, which is what evaluating `model` produces.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [1]:


class CustomTextDataset(Dataset):
    """Sliding-window next-token dataset built from a single text file.

    The file is tokenized once into a flat list of ids. Item ``i`` is the
    window of ``block_size`` ids starting at offset ``i``, returned as an
    input sequence and the same sequence advanced by one token.

    NOTE: the targets are already shifted relative to the inputs, so a consumer
    that performs its own label shift should not be handed ``y`` as labels.
    """

    def __init__(self, file_path, tokenizer, block_size=128):
        """Read ``file_path`` as UTF-8 and tokenize its full contents.

        Args:
            file_path: Path of the text file to load.
            tokenizer: Object whose ``encode(text)`` returns a sequence of ids.
            block_size: Number of tokens per window.
        """
        self.tokenizer = tokenizer
        self.block_size = block_size

        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        # Token ids for the complete file, kept as one flat sequence.
        self.examples = tokenizer.encode(raw_text)

    def __len__(self):
        """Return ``len(tokens) - block_size``, never less than zero."""
        n_windows = len(self.examples) - self.block_size
        return n_windows if n_windows > 0 else 0

    def __getitem__(self, i):
        """Return ``(x, y)`` long tensors for the window starting at ``i``.

        ``x`` is the window without its last token and ``y`` the window without
        its first token, so ``y[k]`` is the token that follows ``x[k]``.
        """
        window = self.examples[i:i + self.block_size]
        inputs, targets = window[:-1], window[1:]
        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

def create_dataloader(file_path, tokenizer, block_size=128, batch_size=8, shuffle=True):
    """Wrap a ``CustomTextDataset`` built from ``file_path`` in a ``DataLoader``.

    Args:
        file_path: Text file handed to ``CustomTextDataset``.
        tokenizer: Tokenizer handed to ``CustomTextDataset``.
        block_size: Window length in tokens.
        batch_size: Number of windows per batch.
        shuffle: Whether the loader shuffles the windows.

    Returns:
        A ``DataLoader`` over the dataset.
    """
    return DataLoader(
        CustomTextDataset(file_path, tokenizer, block_size),
        batch_size=batch_size,
        shuffle=shuffle,
    )

class GPT2Trainer:
    """Fine-tuning loop for a causal language model.

    Provides linear warmup followed by linear decay of the learning rate,
    gradient accumulation, gradient-norm clipping, optional validation after
    every epoch and optional Weights & Biases logging.

    Two batch layouts are accepted by ``train_step`` and ``evaluate``:

    * ``(input_ids, labels)`` pair in which ``labels[t]`` is the token that
      follows ``input_ids[t]`` (the layout ``CustomTextDataset`` yields). The
      loss is computed directly on the logits with ``self.criterion``; the
      targets are NOT passed to the model as ``labels=`` because Hugging Face
      causal-LM heads shift labels internally, which would shift them twice.
    * ``dict`` of keyword tensors (e.g. ``input_ids``, ``attention_mask``,
      ``labels``) with labels aligned to ``input_ids``. The dict is forwarded
      to the model as-is and the model's own ``.loss`` is used.
    """

    def __init__(
        self,
        model,
        train_dataloader,
        val_dataloader=None,
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=1000,
        max_steps=None,
        gradient_accumulation_steps=1,
        max_grad_norm=1.0,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        use_wandb=False
    ):
        """Set up the model, optimizer and schedule parameters.

        Args:
            model: Module returning an object with ``.logits`` (and ``.loss``
                when called with ``labels``).
            train_dataloader: Sized iterable of training batches.
            val_dataloader: Optional iterable of validation batches.
            learning_rate: Peak learning rate reached at the end of warmup.
            weight_decay: Decay applied to all parameters except biases and
                LayerNorm parameters.
            warmup_steps: Optimizer updates over which the rate ramps up.
            max_steps: Update count at which the rate reaches zero. When
                ``None`` it is derived from the loader length and, once
                ``train`` is called, from the number of epochs.
            gradient_accumulation_steps: Batches per optimizer update.
            max_grad_norm: Clipping threshold for the global gradient norm.
            device: Device the model and batches are moved to.
            use_wandb: Whether to initialise and log to Weights & Biases.

        Raises:
            ValueError: If ``gradient_accumulation_steps`` is smaller than 1.
        """
        if gradient_accumulation_steps < 1:
            raise ValueError("gradient_accumulation_steps must be >= 1")

        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.device = device
        self.use_wandb = use_wandb

        # Training parameters
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.max_grad_norm = max_grad_norm

        # Without an explicit max_steps, start from one epoch's worth of
        # updates; train() widens this to cover all requested epochs.
        self._auto_max_steps = max_steps is None
        self.max_steps = max_steps if max_steps is not None else \
            self._updates_per_epoch()

        # Initialize optimizer
        self.optimizer = self._create_optimizer()

        # Loss function (ignores targets equal to -100 by default)
        self.criterion = CrossEntropyLoss()

        if use_wandb:
            wandb.init(project="gpt2-finetuning")

    def _updates_per_epoch(self):
        """Return the number of optimizer updates in one pass over the data.

        Rounded up because a trailing partial accumulation window is applied
        as an update as well.
        """
        return -(-len(self.train_dataloader) // self.gradient_accumulation_steps)

    def _create_optimizer(self):
        """Build AdamW with weight decay disabled for biases and LayerNorm.

        LayerNorm parameters are found by module type rather than by name,
        because GPT-2 names its norms ``ln_1`` / ``ln_2`` / ``ln_f`` and a
        ``'LayerNorm.weight'`` substring test would never match them.
        """
        norm_param_ids = {
            id(param)
            for module in self.model.modules()
            if isinstance(module, torch.nn.LayerNorm)
            for param in module.parameters(recurse=False)
        }
        name_markers = ('bias', 'LayerNorm.weight')

        decay_params, no_decay_params = [], []
        for name, param in self.model.named_parameters():
            exempt = id(param) in norm_param_ids or \
                any(marker in name for marker in name_markers)
            (no_decay_params if exempt else decay_params).append(param)

        optimizer_grouped_parameters = [
            {'params': decay_params, 'weight_decay': self.weight_decay},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]
        return AdamW(optimizer_grouped_parameters, lr=self.learning_rate)

    def _get_learning_rate(self, step):
        """Return the learning rate for optimizer update number ``step``.

        The rate rises linearly from 0 to ``learning_rate`` over
        ``warmup_steps`` updates, then falls linearly to 0 at ``max_steps``.
        It is clamped so it never becomes negative once ``step`` passes
        ``max_steps``.
        """
        if step < self.warmup_steps:
            return self.learning_rate * (step / self.warmup_steps)
        decay_span = max(1, self.max_steps - self.warmup_steps)
        remaining = max(0, self.max_steps - step)
        return self.learning_rate * min(1.0, remaining / decay_span)

    def _compute_loss(self, batch):
        """Run the model on ``batch`` and return the unscaled loss tensor.

        See the class docstring for the two supported batch layouts.
        """
        if isinstance(batch, dict):
            tensors = {key: value.to(self.device) for key, value in batch.items()}
            return self.model(**tensors).loss

        input_ids, labels = [b.to(self.device) for b in batch]
        logits = self.model(input_ids).logits
        # Targets are already aligned with the logits position-by-position.
        return self.criterion(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1)
        )

    def train_step(self, batch):
        """Accumulate gradients for one batch and return its unscaled loss.

        The loss is divided by ``gradient_accumulation_steps`` before the
        backward pass so a full window sums to a mean gradient.
        """
        loss = self._compute_loss(batch) / self.gradient_accumulation_steps

        # Backward pass
        loss.backward()

        return loss.item() * self.gradient_accumulation_steps

    def evaluate(self):
        """Return the mean loss over ``val_dataloader`` (NaN if it is empty).

        Puts the model into eval mode; ``train`` switches it back at the start
        of the next epoch.
        """
        self.model.eval()
        total_loss = 0.0
        total_steps = 0

        with torch.no_grad():
            for batch in tqdm(self.val_dataloader, desc="Evaluating"):
                total_loss += self._compute_loss(batch).item()
                total_steps += 1

        if total_steps == 0:
            return float('nan')
        return total_loss / total_steps

    def train(self, num_epochs):
        """Train for ``num_epochs`` passes over ``train_dataloader``.

        After each epoch the model is validated (when a validation loader was
        given), the best-scoring weights are written to ``best_model.pt`` and a
        checkpoint is written to ``checkpoint_epoch_{epoch}.pt``.
        """
        if self._auto_max_steps:
            # Spread the decay over the whole run instead of a single epoch.
            self.max_steps = self._updates_per_epoch() * num_epochs

        global_step = 0
        best_val_loss = float('inf')
        batches_per_epoch = len(self.train_dataloader)

        for epoch in range(num_epochs):
            self.model.train()
            accumulation_loss = 0.0
            accumulated_batches = 0

            with tqdm(total=batches_per_epoch, desc=f"Epoch {epoch+1}") as pbar:
                for step, batch in enumerate(self.train_dataloader):
                    # Perform training step
                    loss = self.train_step(batch)
                    accumulation_loss += loss
                    accumulated_batches += 1

                    # Update when the window is full, and also on the final
                    # batch so leftover gradients are applied rather than
                    # carried into the next epoch. A partial window still uses
                    # the 1/gradient_accumulation_steps scaling, so that
                    # update is proportionally smaller.
                    window_full = (step + 1) % self.gradient_accumulation_steps == 0
                    last_batch = (step + 1) == batches_per_epoch
                    if window_full or last_batch:
                        # Clip gradients
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(),
                            self.max_grad_norm
                        )

                        # Update learning rate
                        curr_lr = self._get_learning_rate(global_step)
                        for param_group in self.optimizer.param_groups:
                            param_group['lr'] = curr_lr

                        # Optimizer step
                        self.optimizer.step()
                        self.optimizer.zero_grad()

                        # Log the mean (not the sum) of the window's losses
                        if self.use_wandb:
                            wandb.log({
                                'loss': accumulation_loss / accumulated_batches,
                                'learning_rate': curr_lr,
                                'global_step': global_step
                            })

                        accumulation_loss = 0.0
                        accumulated_batches = 0
                        global_step += 1

                    pbar.update(1)
                    pbar.set_postfix({'loss': loss})

            # Evaluate on validation set
            if self.val_dataloader is not None:
                val_loss = self.evaluate()
                print(f"Validation loss: {val_loss:.4f}")

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    # Save best model
                    torch.save(self.model.state_dict(), 'best_model.pt')

                if self.use_wandb:
                    wandb.log({'val_loss': val_loss, 'epoch': epoch})

            # Save checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'global_step': global_step,
            }, f'checkpoint_epoch_{epoch}.pt')

def main():
    """Fine-tune the pretrained ``gpt2`` checkpoint on plain-text files.

    Loads tokenizer and model, builds the training and validation loaders from
    ``path/to/train.txt`` and ``path/to/val.txt``, and runs three epochs with
    Weights & Biases logging enabled.
    """
    checkpoint = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    model = GPT2LMHeadModel.from_pretrained(checkpoint)

    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    # Both loaders share window and batch size; only validation keeps its order.
    loader_options = dict(block_size=128, batch_size=8)
    train_dataloader = create_dataloader(
        'path/to/train.txt', tokenizer, **loader_options
    )
    val_dataloader = create_dataloader(
        'path/to/val.txt', tokenizer, shuffle=False, **loader_options
    )

    trainer = GPT2Trainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        learning_rate=5e-5,
        warmup_steps=1000,
        gradient_accumulation_steps=4,
        use_wandb=True,  # Set to False if not using wandb
    )
    trainer.train(num_epochs=3)


if __name__ == '__main__':
    main()

  from .autonotebook import tqdm as notebook_tqdm
2024-10-23 17:03:20.933708: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-23 17:03:20.933772: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-23 17:03:20.950806: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-23 17:03:21.121555: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
libarrow_python.so: cannot open shared object file: No such file or directory

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
import json
import pandas as pd
from typing import Dict, List, Union

class SummarizationDataset(Dataset):
    """Article/summary pairs encoded for causal-LM fine-tuning.

    Each example is laid out as
    ``<|startoftext|> article <|sep|> summary <|endoftext|>``. Labels are
    aligned with ``input_ids`` and set to -100 everywhere except on the
    summary tokens and the end token, so only the summary is supervised.

    NOTE: constructing the dataset registers extra special tokens on the
    tokenizer that is passed in. A model used with it needs its embedding
    matrix resized to ``len(tokenizer)`` afterwards.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer: "GPT2Tokenizer",
        max_article_length: int = 512,
        max_summary_length: int = 128,
        file_type: str = 'json'
    ):
        """
        Initialize the summarization dataset.

        Args:
            data_path: Path to the data file (json or csv)
            tokenizer: Tokenizer to use for encoding
            max_article_length: Maximum number of article tokens kept
            max_summary_length: Maximum number of summary tokens kept
            file_type: Type of data file ('json' or 'csv'). A path ending in
                ``.csv`` is always read as CSV, so callers that relied on the
                default while passing a CSV file keep working.

        Raises:
            ValueError: If the file type is unsupported or a JSON file does
                not contain a list of records.
        """
        self.tokenizer = tokenizer
        self.max_article_length = max_article_length
        self.max_summary_length = max_summary_length

        self.data = self._load_records(data_path, file_type)

        # Add special tokens for summarization
        special_tokens = {
            'sep_token': '<|sep|>',
            'pad_token': '<|pad|>',
            'bos_token': '<|startoftext|>',
            'eos_token': '<|endoftext|>'
        }

        # Add special tokens to tokenizer
        self.tokenizer.add_special_tokens({
            'additional_special_tokens': list(special_tokens.values())
        })

        # Get special token ids
        self.sep_token_id = self.tokenizer.convert_tokens_to_ids('<|sep|>')
        self.pad_token_id = self.tokenizer.convert_tokens_to_ids('<|pad|>')
        self.bos_token_id = self.tokenizer.convert_tokens_to_ids('<|startoftext|>')
        self.eos_token_id = self.tokenizer.convert_tokens_to_ids('<|endoftext|>')

    @staticmethod
    def _load_records(data_path: str, file_type: str) -> List[dict]:
        """Read ``data_path`` into a list of per-example dicts."""
        is_csv_path = str(data_path).lower().endswith('.csv')
        fmt = 'csv' if is_csv_path else str(file_type).lower()

        if fmt == 'csv':
            return pd.read_csv(data_path).to_dict('records')
        if fmt == 'json':
            with open(data_path, 'r', encoding='utf-8') as handle:
                records = json.load(handle)
            if not isinstance(records, list):
                raise ValueError(
                    "JSON data must be a list of records, got "
                    f"{type(records).__name__}"
                )
            return records
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'json' or 'csv'"
        )

    @staticmethod
    def _first_text(item: dict, keys) -> str:
        """Return the first string value found under ``keys``, else ``''``.

        Non-string values (such as the NaN pandas stores for an empty CSV
        cell) are skipped so they never reach the tokenizer.
        """
        for key in keys:
            value = item.get(key)
            if isinstance(value, str):
                return value
        return ''

    def __len__(self) -> int:
        return len(self.data)

    def prepare_input(self, article: str, summary: str) -> Dict[str, List[int]]:
        """
        Tokenize and format one article/summary pair.

        Returns:
            Dict of plain Python lists: ``input_ids``, ``attention_mask``
            (all ones; padding is added later at batch time) and ``labels``.
        """
        # Format: <|startoftext|> article <|sep|> summary <|endoftext|>
        # Tokenize article and summary separately so each gets its own limit
        article_tokens = self.tokenizer.encode(
            article,
            max_length=self.max_article_length,
            truncation=True,
            add_special_tokens=False
        )

        summary_tokens = self.tokenizer.encode(
            summary,
            max_length=self.max_summary_length,
            truncation=True,
            add_special_tokens=False
        )

        # Combine tokens with special tokens
        input_ids = (
            [self.bos_token_id] +  # Start token
            article_tokens +
            [self.sep_token_id] +  # Separator
            summary_tokens +
            [self.eos_token_id]    # End token
        )

        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * len(input_ids)

        # Create labels for training
        # -100 is the ignore index for CrossEntropyLoss
        labels = (
            [-100] * (len(article_tokens) + 2) +  # +2 for bos and sep tokens
            summary_tokens +
            [self.eos_token_id]
        )

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return example ``idx`` as a dict of 1-D long tensors."""
        item = self.data[idx]

        # Adjust these field names based on your data structure.
        # NOTE(review): 'summary' as the alternative key is an assumption; the
        # previous fallback repeated 'highlights' and so could never apply.
        article = self._first_text(item, ('article', 'body'))
        summary = self._first_text(item, ('highlights', 'summary'))

        # Prepare the input
        encoded = self.prepare_input(article, summary)

        # Convert to tensors
        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(encoded['labels'], dtype=torch.long)
        }

def create_dataloader(
    dataset: "SummarizationDataset",
    batch_size: int = 8,
    shuffle: bool = True
) -> DataLoader:
    """
    Create a dataloader with padding collation.

    Every field of a batch is right-padded to the longest sequence in that
    batch: ``input_ids`` with ``dataset.pad_token_id``, ``attention_mask``
    with 0 and ``labels`` with -100 (so padded positions are ignored by the
    loss).

    NOTE: this notebook also defines a file-based ``create_dataloader`` in an
    earlier cell; running this cell rebinds the name to this version.

    Args:
        dataset: Map-style dataset whose items are dicts of 1-D tensors keyed
            ``input_ids`` / ``attention_mask`` / ``labels`` and which exposes
            ``pad_token_id``.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples.

    Returns:
        A ``DataLoader`` yielding dicts of ``(batch, max_len)`` tensors.
    """
    def collate_fn(batch):
        # Fill value per field; read at collate time from the dataset.
        fill_values = {
            'input_ids': dataset.pad_token_id,
            'attention_mask': 0,
            'labels': -100,
        }
        return {
            key: torch.nn.utils.rnn.pad_sequence(
                [item[key] for item in batch],
                batch_first=True,
                padding_value=fill
            )
            for key, fill in fill_values.items()
        }

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn
    )

# Example usage
def demonstrate_usage():
    """Encode the training CSV, draw one batch of two and print its first item.

    Prints the decoded input sequence, then the decoded label sequence with
    the ignored (-100) positions removed.
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    dataset = SummarizationDataset(
        '/ssd_scratch/cvit/souvik/cnn_dailymail/train.csv',
        tokenizer,
        max_article_length=128,
        max_summary_length=32
    )

    # Pull a single batch from a fresh loader.
    first_batch = next(iter(create_dataloader(dataset, batch_size=2)))
    first_inputs = first_batch['input_ids'][0]
    first_labels = first_batch['labels'][0]

    print("Input text:")
    print(tokenizer.decode(first_inputs))

    print("\nLabels (showing only summary part):")
    supervised_tokens = [token for token in first_labels if token != -100]
    print(tokenizer.decode(supervised_tokens))


if __name__ == "__main__":
    demonstrate_usage()

Input text:
<|startoftext|> (CNN) -- Everything looked perfectly normal as the plane prepared to land. I fastened my seat belt and looked through the window, watching the toy cars on the highway below gradually grow to life-size, and the airport runway rapidly -- too rapidly, perhaps -- move almost within arms' reach. The plane touched down, barreled ahead on the tarmac for a couple of seconds, and suddenly raised its nose and roared back up to the sky pressing us against our seat backs while climbing at a frighteningly steep angle. What happened, we wondered wordlessly. What's coming next? What came next was an anxious circling back to <|sep|> Frida Ghitis once on flight that made aborted landing, then landed safely.
She says fatalities rare but air travel draws primal fear that dangerous travel modes don<|endoftext|>

Labels (showing only summary part):
Frida Ghitis once on flight that made aborted landing, then landed safely.
She says fatalities rare but air travel draws primal fear

In [1]:
import pandas as pd
# Load the full training CSV into `df`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('/ssd_scratch/cvit/souvik/cnn_dailymail/train.csv')

In [4]:
# Three consecutive, non-overlapping slices of the first 30,000 rows, taken in
# file order (no shuffling): 21,000 / 6,000 / 3,000 rows respectively.
# NOTE(review): the names do not say which split is which; the cell that writes
# them to disk uses df1 -> train_small, df2 -> val_small, df3 -> test_small.
df1 = df[:21000]
df2 = df[21000:27000]
df3 = df[27000:30000]

In [5]:
# Write the three subsets as train/val/test CSV files; index=False keeps the
# DataFrame index out of the written files.
df1.to_csv("/ssd_scratch/cvit/souvik/cnn_dailymail/train_small.csv",index=False)
df2.to_csv("/ssd_scratch/cvit/souvik/cnn_dailymail/val_small.csv",index=False)
df3.to_csv("/ssd_scratch/cvit/souvik/cnn_dailymail/test_small.csv",index = False)

In [6]:
# Sanity check: the slice 27000:30000 should contain 3000 rows.
df3.shape

(3000, 3)

In [8]:
# (rows, columns) of the full training table.
df.shape

(287113, 3)

In [7]:
# Peek at the record layout (keys: id / article / highlights) on a few rows
# only; converting and displaying the whole table floods the cell output.
df.head(3).to_dict('records')

[{'id': '0001d1afc246a7964130f43ae940af6bc6c57f01',
  'article': "By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through c

In [7]:
from transformers import GPT2Tokenizer
# Fresh pretrained 'gpt2' tokenizer for the special-token experiments in the following cells.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Special tokens for the summarization format. The keys are descriptive labels;
# the cell that registers these tokens passes only the values to the tokenizer.
# '[summarize]' is an extra task-marker token in addition to sep/pad/bos/eos.
special_tokens = {
            'sep_token': '<|sep|>',
            'pad_token': '<|pad|>',
            'bos_token': '<|startoftext|>',
            'eos_token': '<|endoftext|>',
            'summary_token' : '[summarize]'
        }

In [12]:
# Register every value of `special_tokens` as an additional special token.
# NOTE(review): the recorded output is 0, which presumably means the tokens
# were already registered by an earlier run of this cell — confirm on a fresh kernel.
tokenizer.add_special_tokens({'additional_special_tokens': list(special_tokens.values())})

0

In [13]:
# Look up the vocabulary id assigned to the '[summarize]' token.
tokenizer.convert_tokens_to_ids('[summarize]')

50260

In [14]:
from transformers import GPT2LMHeadModel
# Pretrained 'gpt2' model with LM head; its embeddings are resized to the
# extended tokenizer in a separate cell.
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')



In [19]:
# Display the model architecture.
# NOTE(review): the recorded output shows 50261 embedding rows, so this cell was
# run after the resize_token_embeddings cell even though it appears before it;
# on a top-to-bottom run it would show the unresized vocabulary size.
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50261, bias=False)
)

In [22]:
# NOTE(review): this references the bound method without calling it, so the
# cell shows the method's repr (which embeds the model summary) rather than
# iterating the modules — confirm whether `gpt2.modules()` was intended.
gpt2.modules

<bound method Module.modules of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50261, bias=False)
)>

In [24]:
# List the dotted name of every sub-module; the module objects themselves are not needed.
for module_name, _module in gpt2.named_modules():
    print(module_name)


transformer
transformer.wte
transformer.wpe
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.attn.attn_dropout
transformer.h.0.attn.resid_dropout
transformer.h.0.ln_2
transformer.h.0.mlp
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.0.mlp.act
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.attn.attn_dropout
transformer.h.1.attn.resid_dropout
transformer.h.1.ln_2
transformer.h.1.mlp
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.1.mlp.act
transformer.h.1.mlp.dropout
transformer.h.2
transformer.h.2.ln_1
transformer.h.2.attn
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.attn.attn_dropout
transformer.h.2.attn.resid_dropout
transformer.h.2.ln_2
transformer.h.2.mlp
transformer.h.2.mlp.c_fc
transformer.h.2.mlp

In [16]:
# Resize the model's token embeddings to the tokenizer's current vocabulary
# size so the newly added special-token ids have embedding rows.
gpt2.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [17]:
# Take a detached copy of the input-embedding row for '[summarize]'; later
# changes to the model weights will not be reflected in this copy.
summarize_token_id = tokenizer.convert_tokens_to_ids('[summarize]')
summarize_embedding = gpt2.transformer.wte.weight[summarize_token_id].clone().detach()

In [18]:
# One embedding vector: expect a single dimension equal to the model's hidden size.
summarize_embedding.shape

torch.Size([768])