In [6]:
# Install Library
%pip install torch==2.2.2 pandas==2.2.2 torchtext==0.17.2 scikit-learn==1.4.2 tqdm==4.66.2 nltk==3.8.1

# Import Library
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu
import time
nltk.download('punkt')

# Load Dataset
df = pd.read_csv('scraped_news_technology.csv')
df = df.dropna(subset=['content'])
df = df[df['content'].str.strip() != '']
df = df.head(1000)
df['title'] = df['content'].apply(lambda x: x.split('.')[0])

# Build Vocab
def yield_tokens(texts):
    for text in texts:
        yield text.lower().split()
vocab = build_vocab_from_iterator(yield_tokens(df['content'].tolist() + df['title'].tolist()), specials=["<pad>", "<sos>", "<eos>"])
vocab.set_default_index(vocab['<pad>'])
pad_idx = vocab['<pad>']
vocab_size = len(vocab)

# Dataset & Loader
class NewsDataset(Dataset):
    def __init__(self, contents, titles):
        self.contents = contents
        self.titles = titles
    def __len__(self):
        return len(self.contents)
    def __getitem__(self, idx):
        content = self.contents[idx].lower().split()
        title = self.titles[idx].lower().split()
        content_idx = torch.tensor(vocab(content), dtype=torch.long)
        title_idx = torch.tensor([vocab['<sos>']] + vocab(title) + [vocab['<eos>']], dtype=torch.long)
        return content_idx, title_idx

def collate_fn(batch):
    contents, titles = zip(*batch)
    contents_pad = pad_sequence(contents, batch_first=True, padding_value=pad_idx)
    titles_pad = pad_sequence(titles, batch_first=True, padding_value=pad_idx)
    return contents_pad, titles_pad

contents = df['content'].tolist()
titles = df['title'].tolist()
train_contents, val_contents, train_titles, val_titles = train_test_split(contents, titles, test_size=0.2)
train_dataset = NewsDataset(train_contents, train_titles)
val_dataset = NewsDataset(val_contents, val_titles)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ================ Define Model ================

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True)
    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, vocab_size)
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(DEVICE)

        hidden, cell = self.encoder(src)

        input = trg[:,0]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:,t] = output
            top1 = output.argmax(1)
            input = trg[:,t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

# Attention Model
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[1]
        hidden = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(encoder_outputs.size(0),1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)
        return torch.softmax(attention, dim=1)

class AttentionSeq2Seq(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hid_dim, batch_first=True)
        self.decoder = nn.LSTMCell(emb_dim + hid_dim, hid_dim)
        self.attention = Attention(hid_dim)
        self.fc_out = nn.Linear(hid_dim, vocab_size)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        embedded_src = self.embedding(src)
        encoder_outputs, (hidden, cell) = self.encoder(embedded_src)

        batch_size = src.size(0)
        trg_len = trg.size(1)
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(DEVICE)

        input = trg[:,0]
        decoder_hidden, decoder_cell = hidden[-1], cell[-1]

        for t in range(1, trg_len):
            embedded_input = self.embedding(input)
            attn_weights = self.attention(decoder_hidden.unsqueeze(0), encoder_outputs)
            context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
            decoder_input = torch.cat((embedded_input, context), dim=1)
            decoder_hidden, decoder_cell = self.decoder(decoder_input, (decoder_hidden, decoder_cell))
            output = self.fc_out(decoder_hidden)
            outputs[:,t] = output
            top1 = output.argmax(1)
            input = trg[:,t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs

# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, nhead, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.transformer = nn.Transformer(emb_dim, nhead, num_layers, num_layers)
        self.fc_out = nn.Linear(emb_dim, vocab_size)

    def forward(self, src, trg):
        src_emb = self.embedding(src).permute(1,0,2)
        trg_emb = self.embedding(trg).permute(1,0,2)
        outputs = self.transformer(src_emb, trg_emb)
        output = self.fc_out(outputs)
        return output.permute(1,0,2)

# ================ Training Function ================
def train(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for src, trg in tqdm(loader):
        src, trg = src.to(DEVICE), trg.to(DEVICE)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[:,1:].reshape(-1, output_dim)
        trg = trg[:,1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

# ================ Evaluation ================
def generate(model, loader, max_len=20):
    model.eval()
    examples = []
    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(DEVICE), trg.to(DEVICE)
            outputs = model(src, trg, teacher_forcing_ratio=0.0) if isinstance(model, (Seq2Seq, AttentionSeq2Seq)) else model(src, trg)
            output = outputs.argmax(-1)
            for idx in range(src.size(0)):
                pred_tokens = [vocab.get_itos()[i] for i in output[idx].tolist() if i not in [pad_idx, vocab['<sos>'], vocab['<eos>']]]
                true_tokens = [vocab.get_itos()[i] for i in trg[idx].tolist() if i not in [pad_idx, vocab['<sos>'], vocab['<eos>']]]
                examples.append((" ".join(pred_tokens), " ".join(true_tokens)))
    return examples

# ================ Run All ================
models = {
    'Basic LSTM': Seq2Seq(Encoder(vocab_size, 256, 512), Decoder(vocab_size, 256, 512)),
    'Attention LSTM': AttentionSeq2Seq(vocab_size, 256, 512),
    'Transformer': TransformerModel(vocab_size, 256, 8, 3)
}

results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model = model.to(DEVICE)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    start = time.time()
    train(model, train_loader, optimizer, criterion)
    end = time.time()
    samples = generate(model, val_loader)
    bleu_scores = [sentence_bleu([true.split()], pred.split()) for pred, true in samples]
    results[name] = {'time': end-start, 'bleu': sum(bleu_scores)/len(bleu_scores), 'samples': samples[:5]}

# Show Results
for name, res in results.items():
    print(f"\n{name}")
    print(f"Training Time: {res['time']:.2f} sec")
    print(f"Average BLEU Score: {res['bleu']:.4f}")
    for pred, true in res['samples']:
        print(f"\nPredicted: {pred}\nGround Truth: {true}")


Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training Basic LSTM...


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

100%|██████████| 13/13 [00:13<00:00,  1.04s/it]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  In COLING 2004.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  In COLING 2004.
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  In COLING 2004.


Training Attention LSTM...


100%|██████████| 13/13 [01:11<00:00,  5.48s/it]


Training Transformer...


100%|██████████| 13/13 [00:08<00:00,  1.52it/s]



Basic LSTM
Training Time: 13.51 sec
Average BLEU Score: 0.0000

Predicted: the the of
Ground Truth: (image credit: android central) this is android central's news weekly, your go-to source for a concise roundup of the week's most significant tech stories

Predicted: the the of the
Ground Truth: mechanical keycaps are one of the key components of mechanical keyboards, as they correspond to the alphanumeric characters they represent

Predicted: the the of
Ground Truth: editor’s note keeping dark circles and puffiness around the eye at bay is an easy way to keep your overall face looking younger and more vibrant

Predicted: the the of
Ground Truth: after stepping through a floo chamber and teleporting in a puff of green smoke to a vast and stunning re-creation of the ministry of magic, i found myself boarding an elevator

Predicted: the the of
Ground Truth: game pass subscribers can dive into a whimsical and magical world inspired by southern folklore now in south of midnight

Attention 