In [1]:
import spacy

import torch
from torch import nn
import torch.nn.functional as F

from torchtext.datasets import TranslationDataset
from torchtext.data import Field, BucketIterator, TabularDataset
from torch import optim

import time
import random

# Run on the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:

# Pre-processing options common to both text fields: spaCy tokenisation
# (English), lower-casing, and explicit start/end-of-sequence markers.
_shared_field_options = {
    'tokenize': 'spacy',
    'tokenizer_language': 'en',
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'lower': True,
}

# Source side (articles). include_lengths=True is set only on this field so
# that the sequence lengths are available alongside the padded batch.
ARTICLE = Field(include_lengths=True, **_shared_field_options)

# Target side (summaries / headlines); no lengths requested here.
SUMMARY = Field(**_shared_field_options)

In [None]:
# Load the three CSV splits from ./data. The `fields` list pairs each dataset
# attribute name with the Field that processes it: `text` goes through ARTICLE
# and `headline` goes through SUMMARY.
# NOTE(review): `skip_header` is not passed, so if the CSV files start with a
# header row it would presumably be read as a regular example - confirm
# against the data files.
train_data, valid_data, test_data = TabularDataset.splits(
    path="./data",
    format='csv',
    train='train.csv',
    validation='val.csv',
    test='test.csv',
    fields=[
        ("text", ARTICLE),
        ('headline', SUMMARY),
    ],
)

In [None]:
# Build each field's vocabulary from the training split only (validation and
# test data are not passed in), using a minimum token frequency of 2.
for _field in (ARTICLE, SUMMARY):
    _field.build_vocab(train_data, min_freq=2)

In [None]:
# Number of examples per batch, shared by all three iterators.
BATCH_SIZE = 64


def _article_length(example):
    """Sort key for batching: the number of tokens in the example's article text."""
    return len(example.text)


# One iterator per split. Examples are ordered by article length inside each
# batch (sort_within_batch=True with the key above), and batches are created
# directly on `device`.
train_loader, valid_loader, test_loader = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=_article_length,
    sort_within_batch=True,
    device=device,
)

In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder that also produces the decoder's initial state.

    Token ids are embedded, passed through dropout, packed so the GRU skips
    padded positions, and encoded by a single-layer bidirectional GRU. The
    final forward and backward hidden states are concatenated and projected
    through a linear layer + tanh to the decoder's hidden size.

    Args:
        vocab: Number of entries in the embedding table (vocabulary size).
        embeding_dim: Size of each embedding vector (spelling kept for
            backward compatibility with existing callers).
        encoder_hidden_dim: Hidden size of the GRU, per direction.
        decoder_hidden_dim: Size of the returned hidden state.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab, embeding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab, embeding_dim)
        self.rnn = nn.GRU(embeding_dim, encoder_hidden_dim, bidirectional=True)
        # The two directions are concatenated, hence the doubled input width.
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, text, text_len):
        """Encode a padded batch of token ids.

        Args:
            text: Integer tensor of token ids, laid out ``(seq_len, batch)``
                (the GRU is built with the default ``batch_first=False``).
            text_len: True (unpadded) length of every sequence in the batch,
                as a 1-D tensor on any device or a list of ints. The batch
                does not need to be sorted by length.

        Returns:
            A tuple ``(outputs, hidden)`` where ``outputs`` has shape
            ``(max(text_len), batch, 2 * encoder_hidden_dim)`` with zeros at
            padded positions, and ``hidden`` has shape
            ``(batch, decoder_hidden_dim)``.
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence requires the lengths on the CPU (enforced since
        # PyTorch 1.7), while the data iterator places them on the model's
        # device. Normalising here also keeps plain Python lists working.
        lengths = torch.as_tensor(text_len, dtype=torch.int64, device='cpu')

        # enforce_sorted=False lets PyTorch sort by length internally and
        # restore the caller's batch order in both outputs and hidden state,
        # so batches that are not pre-sorted no longer raise.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, enforce_sorted=False
        )
        packed_outputs, hidden = self.rnn(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        # hidden[-2] is the last layer's forward direction and hidden[-1] its
        # backward direction; join them along the feature axis and project.
        final_states = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(final_states))

        return outputs, hidden