In [13]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW

In [None]:
df = pd.read_csv("news_summary.csv", encoding='latin1') 
df = df[['text', 'ctext']].dropna().reset_index(drop=True)

# Convert columns to string to avoid tokenizer errors
df['text'] = df['text'].astype(str)
df['ctext'] = df['ctext'].astype(str)

In [16]:
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
max_input_length = 512
max_target_length = 150

input_ids_list = []
attention_mask_list = []
labels_list = []

for i in range(len(df)):
    text = "summarize: " + df['text'][i]
    summary = df['ctext'][i]

    input_enc = tokenizer(text, max_length=max_input_length, truncation=True,
                          padding="max_length", return_tensors="pt")
    target_enc = tokenizer(summary, max_length=max_target_length, truncation=True,
                           padding="max_length", return_tensors="pt")

    input_ids_list.append(input_enc['input_ids'])
    attention_mask_list.append(input_enc['attention_mask'])
    labels_list.append(target_enc['input_ids'])

input_ids = torch.vstack(input_ids_list).to(device)
attention_mask = torch.vstack(attention_mask_list).to(device)
labels = torch.vstack(labels_list).to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 1  # start with 1 for testing; increase later
batch_size = 4

model.train()
for epoch in range(epochs):
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]

        outputs = model(input_ids=batch_input_ids,
                        attention_mask=batch_attention_mask,
                        labels=batch_labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("Loss:", loss.item())


Loss: 8.425366401672363
Loss: 7.950828552246094
Loss: 7.9670491218566895
Loss: 8.088427543640137
Loss: 6.992940425872803
Loss: 6.688812732696533
Loss: 6.465960502624512
Loss: 6.038684368133545
Loss: 6.001637935638428
Loss: 5.575855731964111
Loss: 5.0890350341796875
Loss: 4.689815998077393
Loss: 4.315896034240723
Loss: 4.225680828094482
Loss: 4.207030296325684
Loss: 3.6357429027557373
Loss: 3.965045690536499
Loss: 3.681044816970825
Loss: 3.8308866024017334
Loss: 3.174499750137329
Loss: 3.260751485824585
Loss: 3.708585500717163
Loss: 3.263336181640625
Loss: 4.174873352050781
Loss: 3.427185535430908
Loss: 3.3809382915496826
Loss: 3.428065299987793
Loss: 3.5400032997131348
Loss: 3.2546932697296143
Loss: 4.436769485473633
Loss: 3.1971943378448486
Loss: 3.3803763389587402
Loss: 3.5365967750549316
Loss: 3.6054186820983887
Loss: 2.9426610469818115
Loss: 2.9455809593200684
Loss: 3.298663854598999
Loss: 3.0589547157287598
Loss: 2.8676438331604004
Loss: 3.3505539894104004
Loss: 3.2869319915771484

KeyboardInterrupt: 

In [18]:
model.save_pretrained("./t5_summarization_model")
tokenizer.save_pretrained("./t5_summarization_model")

print("✅ Training complete and model saved!")

✅ Training complete and model saved!
