In [1]:
import torch.nn as nn
import torch

In [2]:
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, embed_size=512):
        super().__init__(vocab_size, embed_size, padding_idx=0)

In [3]:
from fastai.text.all import URLs, untar_data, get_text_files

In [4]:
path = untar_data(URLs.IMDB)

In [6]:
files = get_text_files(path, folders = ['train', 'test', 'unsup'])

In [7]:
txt = files[0].open().read(); txt[:75]

'"Loose Change" is a thought-provoking little documentary that draws attenti'

In [5]:
def train_one_epoch(inputs, targets, model, optim, criterion, zero_grad=False):
    if zero_grad:
        optim.zero_grad()

    outputs = model(inputs)
    loss = criterion(outputs, targets)

    loss.backward()
    optim.step()
    
    return loss.item()

In [6]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class TextDataset(Dataset):
    """ Transform CSV dataset intp Pytorch dataset """
    def __init__(self, dataframe=None, csv_file=None, root_dir=None, text_col=None, label_col=None):
        if dataframe is None and csv_file is None:
            raise ValueError("For initializing the dataset, you need to either pass a dataframe or a file name.")
        if dataframe is not None and (text_col is None or label_col is None):
            raise ValueError("You need to pass `text_col` and `label_col` when passing a dataframe.")
        if csv_file:
            self.df = pd.read_csv(csv_file)
        if dataframe is not None:
            self.df = dataframe
        self.root_dir = root_dir
        self.text_col = text_col
        self.label_col = label_col

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx  = idx.tolist()
        return self.df.loc[idx, self.text_col], self.df.loc[idx, self.label_col]
        

In [7]:
def train(dls, model, optim, criterion, epochs=10):
    loss = 0.0
    for epoch in epochs:
        for inputs, targets in dls:
            loss = train_one_epoch(inputs, targets, model, optim, criterion, zero_grad=True)
        print("Epoch: {}\t loss: {}\t")

In [8]:
# M476FCVSPS

In [9]:
embedding_model = TokenEmbedding(vocab_size=1000, embed_size=512)
learning_rate = 1e-5

optimizer = torch.optim.SGD(embedding_model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()


In [79]:
import datasets

In [80]:
imdb_ds = datasets.load_dataset('imdb')

Reusing dataset imdb (/home/re1372/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [81]:
train_df = imdb_ds['train'].data.to_pandas()
test_df = imdb_ds['test'].data.to_pandas()

print('Train: {:,}\tTest: {:,}'.format(train_df.shape[0], test_df.shape[0]))
train_df.head(2)

Train: 25,000	Test: 25,000


Unnamed: 0,label,text
0,1,"Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as ""Teachers"". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is ""Teachers"". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... ..."
1,1,"Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home,..."


In [82]:
train_ds = TextDataset(train_df, text_col="text", label_col='label')
test_ds = TextDataset(test_df, text_col="text", label_col='label')

In [83]:
train_ds[0]

('Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!',
 1)

In [100]:
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=23)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False, num_workers=23)

In [101]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [102]:
tokenizer = get_tokenizer('basic_english')

In [103]:
vocab = build_vocab_from_iterator(map(tokenizer, iter(train_df.text)))

25000lines [00:01, 13296.30lines/s]


In [104]:
def data_process(raw_text_iter):
    data = [torch.tensor([vocab[token] for token in tokenizer(item)]) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel()>0, data)))

In [41]:
data_process(iter(train_df.text[:10])).shape

torch.Size([2719])

## Process Datasets

In [105]:
train_processed = data_process(iter(train_df.text))
test_processed = data_process(iter(test_df.text))

print('Train Data: {:,}'.format(train_processed.shape[0]))
print('Test Data: {:,}'.format(test_processed.shape[0]))

Train Data: 6,767,187
Test Data: 6,615,916


# Alternative Dataset

In [59]:
from torchtext.utils import download_from_url, extract_archive
import io

url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'
test_filepath, valid_filepath, train_filepath = extract_archive(download_from_url(url))
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer,
                                      iter(io.open(train_filepath,encoding="utf8"))
                                     ))


36718lines [00:00, 51660.29lines/s]


In [60]:
train_processed = data_process(iter(io.open(train_filepath, encoding="utf8")))
test_processed = data_process(iter(io.open(valid_filepath, encoding="utf8")))

# Batchify

In [106]:
def batchify(data, batch_size, device='cpu'):
    # Devide the dataset into batches with the size of `batch_size`
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit
    data = data.narrow(0, 0, nbatch*batch_size)
    # Evenly divide the data across the batches
    data = data.view(batch_size, -1).t().contiguous()
    return data.to(device)

In [107]:
batch_size = 40#96
eval_batch_size = 24 #96
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)
train_processed = batchify(train_processed, batch_size, device=device)
test_processed = batchify(test_processed, batch_size, device=device)

Device:  cuda


In [108]:
print('Number of Batches: {:,} |\tTraining length of each batch: {:,}'.format(train_processed.shape[1], train_processed.shape[0]))

Number of Batches: 40 |	Training length of each batch: 169,179


In [109]:
def get_batch(source, i, bptt):
    seq_len = min(bptt, max(len(source)-1-i, 0))
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [110]:
get_batch(train_processed, 105734, 35)[0].shape

torch.Size([35, 40])

In [111]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F 

class TransformerCustomModel(nn.Module):
    def __init__(self, ntoken, ninput, nhead, nhidd, nlayer, dropout=0.5):
        super().__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer

        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninput, dropout)
        encoder_layers = TransformerEncoderLayer(ninput, nhead, nhidd, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayer)
        self.encoder = nn.Embedding(ntoken, ninput)
        self.ninput = ninput
        self.decoder = nn.Linear(ninput, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, size):
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0,1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask==1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.ninput)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [112]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0).transpose(0,1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * self.pe[:x.size(0), :]
        return self.dropout(x)

In [113]:
ntoken = len(vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhidd = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayer = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerCustomModel(ntoken, emsize, nhead, nhidd, nlayer, dropout).to(device)

In [114]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train(train_data, model, criterion, optimizer, bptt, device, ):
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in enumerate(range(0, train_processed.size(0) - 1, bptt)):
        data, targets = get_batch(train_processed, i, bptt)
        optimizer.zero_grad()
        if data.size(0) != bptt:
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntoken), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source, bptt):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    src_mask = eval_model.generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i, batch_size)
            if data.size(0) != bptt:
                src_mask = eval_model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntoken)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [115]:
bptt = 35 #96
best_val_loss = float("inf")
epochs = 20 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(train_processed.cuda(), model.cuda(), criterion, optimizer, bptt, device)
    val_loss = evaluate(model, test_processed, bptt)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

| epoch   1 |   200/ 4833 batches | lr 5.00 | ms/batch 25.93 | loss  8.18 | ppl  3582.69
| epoch   1 |   400/ 4833 batches | lr 5.00 | ms/batch 25.70 | loss  6.76 | ppl   860.42
| epoch   1 |   600/ 4833 batches | lr 5.00 | ms/batch 25.68 | loss  6.43 | ppl   622.30
| epoch   1 |   800/ 4833 batches | lr 5.00 | ms/batch 25.68 | loss  6.31 | ppl   548.56
| epoch   1 |  1000/ 4833 batches | lr 5.00 | ms/batch 25.68 | loss  6.20 | ppl   490.62
| epoch   1 |  1200/ 4833 batches | lr 5.00 | ms/batch 25.61 | loss  6.05 | ppl   424.06
| epoch   1 |  1400/ 4833 batches | lr 5.00 | ms/batch 25.61 | loss  6.00 | ppl   402.23
| epoch   1 |  1600/ 4833 batches | lr 5.00 | ms/batch 25.58 | loss  5.97 | ppl   391.25
| epoch   1 |  1800/ 4833 batches | lr 5.00 | ms/batch 25.57 | loss  5.92 | ppl   371.03
| epoch   1 |  2000/ 4833 batches | lr 5.00 | ms/batch 25.54 | loss  5.83 | ppl   338.85
| epoch   1 |  2200/ 4833 batches | lr 5.00 | ms/batch 25.62 | loss  5.81 | ppl   333.34
| epoch   1 |  2400/ 

# Inferencing

In [32]:
def process_sample(text):
    tokens = tokenizer(text)
    token_num = torch.tensor([vocab[tok] for tok in tokens]).cuda()
    
    src_mask = model.generate_square_subsequent_mask(len(tokens)).to(device)
    with torch.no_grad():
        outputs = model(token_num, src_mask)
    preds = outputs.view(-1, ntoken).argmax(1).tolist()
    
    return text + " " + " ".join([vocab.itos[tok_idx] for tok_idx in preds])

In [35]:
# sample_text = train_df[0][3]#train_df.text[2][:100]#"The idea of having Martin Scorsese in this movie"
# [vocab.itos[item] for item in process_sample(sample_text).argmax(dim=1).tolist()]

In [36]:
sample_text = "The idea of having a good movie"

In [37]:
process_sample(sample_text)

'The idea of having a good movie first , the to little . . first , the seen little . . movie . the to movie . . film . the a little . . movie . the to movie . . movie . a been little . . movie . the to movie . .'