In [16]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [17]:
# Load the news dataset and keep only the two columns used downstream:
# "ctext" (fed to the model as input) and "text" (used as the target summary).
# Rows missing either value are discarded.
df = pd.read_csv("news_summary.csv", encoding='latin1')
df = df.loc[:, ["ctext", "text"]].dropna()

# Work on at most 1000 rows; the fixed seed keeps the subset repeatable.
sample_size = min(1000, len(df))
df = df.sample(n=sample_size, random_state=42)

In [18]:
# Preview the first five sampled rows (five is also head()'s default).
print(df.head(n=5))

                                                  ctext  \
4047  Washington, Mar 27 (PTI) A 38-year-old woman i...   
2731  The tea stall in Guajarat's Vadnagar where Pri...   
1565  Continuing a yearly tradition, outgoing US Pre...   
419   The Supreme Court on Thursday rejected a petit...   
4269  Captain Amarinder Singh, the Congress's Chief ...   

                                                   text  
4047  A woman, who was arrested twice last week for ...  
2731  A tea stall in Gujarat's Vadnagar railway stat...  
1565  US President Barack Obama has declared January...  
419   The Supreme Court on Thursday rejected a petit...  
4269  Congress leader Captain Amarinder Singh will t...  


In [19]:
# Hold out 10% of the sampled rows for validation; the fixed seed keeps the
# split repeatable across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [20]:
# Load the pretrained t5-small tokenizer and sequence-to-sequence model.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prefer the GPU when one is visible; later cells move batches to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
    model = model.to(device)
except RuntimeError as e:
    if device.type != "cuda":
        # Already targeting the CPU: there is nothing to fall back to, so
        # surface the original error instead of retrying the same call.
        raise
    # Fall back to the CPU and report the actual cause rather than assuming
    # the failure was an out-of-memory error.
    print(f"Could not move model to GPU ({e}). Using CPU instead.")
    device = torch.device("cpu")
    model = model.to(device)

In [None]:
max_input_length = 512
max_target_length = 150


def encode_split(frame):
    """Tokenize one data split into model-ready tensors.

    Args:
        frame: DataFrame with a "ctext" column (source articles) and a
            "text" column (reference summaries).

    Returns:
        dict with "input_ids" and "attention_mask" for the prefixed source
        texts (padded/truncated to ``max_input_length``) and "labels" for
        the summaries (padded/truncated to ``max_target_length``), with
        padding positions replaced by -100.
    """
    sources = ["summarize: " + article for article in frame["ctext"].tolist()]
    summaries = frame["text"].tolist()

    source_enc = tokenizer(sources, truncation=True, padding="max_length",
                           max_length=max_input_length, return_tensors="pt")
    label_ids = tokenizer(summaries, truncation=True, padding="max_length",
                          max_length=max_target_length, return_tensors="pt")["input_ids"]

    # Replace padding ids with -100, the label value the model's loss skips
    # (see the `labels` argument in the T5ForConditionalGeneration docs).
    label_ids = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    return {
        "input_ids": source_enc["input_ids"],
        "attention_mask": source_enc["attention_mask"],
        "labels": label_ids,
    }


# Both splits go through the same helper so their preprocessing cannot drift.
train_encodings = encode_split(train_df)
val_encodings = encode_split(val_df)


class SummDataset(Dataset):
    """Map-style dataset over pre-tokenized summarization tensors.

    ``encodings`` must provide "input_ids", "attention_mask" and "labels";
    each one is indexed along its first dimension to produce one example.
    """

    # Keys read from ``encodings`` and returned, in this order, per example.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, encodings):
        # Keep references to the three tensors; nothing is copied here.
        self.input_ids, self.attention_mask, self.labels = (
            encodings[field] for field in self._FIELDS
        )

    def __len__(self):
        # The example count is the leading dimension of ``input_ids``.
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        # One example: the idx-th entry of every stored tensor.
        return {field: getattr(self, field)[idx] for field in self._FIELDS}

train_dataset = SummDataset(train_encodings)
val_dataset = SummDataset(val_encodings)

# Seed the shuffle so the training order is repeatable from run to run,
# using the same seed (42) as the data sampling and train/val split.
shuffle_rng = torch.Generator()
shuffle_rng.manual_seed(42)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True,
                          generator=shuffle_rng)  # batch_size=1 to avoid OOM
val_loader = DataLoader(val_dataset, batch_size=1)


In [22]:
# AdamW over every model parameter, with a fixed learning rate.
learning_rate = 3e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [None]:
num_epochs = 1

# Clear any gradients left behind by an earlier, interrupted run of this cell
# (stopped after backward() but before zero_grad()), so they cannot leak into
# the first optimizer step of this run.
optimizer.zero_grad()

for epoch in range(num_epochs):
    # Enter training mode at the start of every epoch, so the loop stays
    # correct even if the model was switched to eval mode in between.
    model.train()
    total_loss = 0.0
    for step, batch in enumerate(train_loader):
        # Move the batch onto the same device as the model.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model return the training loss.
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        # Report the running mean loss every 100 steps.
        if (step + 1) % 100 == 0:
            avg = total_loss / (step + 1)
            print(f"Epoch {epoch+1} Step {step+1} Avg Loss {avg:.4f}")

    avg_epoch_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} finished — Average Loss: {avg_epoch_loss:.4f}")

Epoch 1 Step 100 Avg Loss 2.1660
Epoch 1 Step 200 Avg Loss 2.1766
Epoch 1 Step 300 Avg Loss 2.1212
Epoch 1 Step 400 Avg Loss 2.1249
Epoch 1 Step 500 Avg Loss 2.0872
Epoch 1 Step 600 Avg Loss 2.0783
Epoch 1 Step 700 Avg Loss 2.0720
Epoch 1 Step 800 Avg Loss 2.0718
Epoch 1 Step 900 Avg Loss 2.0601
Epoch 1 finished — Average Loss: 2.0601


In [24]:
# Generate summaries for up to five validation rows and print each one next
# to its source excerpt and reference summary.
model.eval()

num_samples = min(5, len(val_df))
# Decoding settings shared by every sample.
generation_kwargs = dict(
    max_length=150,
    min_length=30,
    num_beams=2,
    early_stopping=True,
)

with torch.no_grad():
    for i in range(num_samples):
        sample = val_df.iloc[i]
        text = sample["ctext"]

        # Same "summarize: " prefix and input truncation as in training.
        encoded = tokenizer("summarize: " + text, return_tensors="pt",
                            truncation=True, max_length=max_input_length)
        input_ids = encoded.input_ids.to(device)

        summary_ids = model.generate(input_ids, **generation_kwargs)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        print("\n--- SAMPLE", i+1, "---")
        print("Original (first 300 chars):", text[:300])
        print("\nTarget:", sample["text"])
        print("\nPredicted:", summary)
        print("-" * 80)


--- SAMPLE 1 ---
Original (first 300 chars): 24 Kawariyas, 6 ITBP and Police personnel were injured in stone pelting by a mob during the 'Kanwar Yatra' in Bareilly. It is alleged that the attack happened after the yatris played loud music while passing through a Muslim-dominated locality. The police had to resort to a lathi charge to disperse 

Target: As many as 24 kanwarias, 6 ITBP and Police personnel were recently injured in stone pelting by a mob during the 'Kanwar Yatra' in Bareilly, Uttar Pradesh. It is alleged that the attack happened after the yatris played loud music while passing through a Muslim-dominated locality. The police had to resort to a lathi charge to disperse the crowd.

Predicted: 24 Kawariyas, 6 ITBP and Police personnel were injured in stone pelting by a mob during the 'Kanwar Yatra' in Bareilly. It is alleged that the attack happened after the yatris played loud music while passing through a Muslim-dominated locality. The police had to resort to a lathi charg

In [25]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./t5_summarizer_model_manual")
print("\nModel saved to ./t5_summarizer_model_manual")


Model saved to ./t5_summarizer_model_manual
