# LSTM Language Models

Importing all the required libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, math
from tqdm import tqdm
from datasets import Dataset, DatasetDict
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


Checking for available CUDA machine 

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. Load data


### Compiling Harry Potter dataset:
I have compiled the text rich dataset related to Harry Potter from the wiki fandom (https://harrypotter.fandom.com/wiki/Main_Page) site. To extract all the text from the site, I have used two open source tools from github to scrape the text data:

- ScrapeFandom (https://github.com/JOHW85/ScrapeFandom) : To downland and dump all the pages related to harry potter in the wiki fandom site into XML files
- WikiTextExtractor (https://github.com/JOHW85/wikiextractor) : To extract text from the dumpted XML file and export it into json file

A custom python script was used to clean up the json from any metadata, and breakdown the paragraphs into sentencnes and formatted necessary to load the data from datasets library. The final result is saved at file `harrypotter.json`. The json file simply contains the list of the extracted sentences.

First lets load the data from the json file into pandas dataframe



In [5]:
json_file_path = 'harrypotter.json'

# Read the JSON file
with open(json_file_path, 'r', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

# Convert the data into a pandas DataFrame
df = pd.DataFrame(data)
df = df.reset_index(drop=True)

Now, lets split the datasets into three parts: train, test and validaton

In [6]:
train_df = df.sample(frac=0.8, random_state=42)
temp_df = df.drop(train_df.index)
validation_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.drop(validation_df.index)

Convert the split df into datasets dictonaries

In [7]:
# Convert DataFrames to datasets
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset,
})

# Display information about the created DatasetDict
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', '__index_level_0__'],
        num_rows: 67030
    })
    validation: Dataset({
        features: ['text', '__index_level_0__'],
        num_rows: 8379
    })
    test: Dataset({
        features: ['text', '__index_level_0__'],
        num_rows: 8379
    })
})


## 2. Preprocessing

### Tokenizing

We will use tokenizer from torch text to tokenize our text from the dataset

In [8]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}

tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/67030 [00:00<?, ? examples/s]

Map: 100%|██████████| 67030/67030 [00:03<00:00, 20473.15 examples/s]
Map: 100%|██████████| 8379/8379 [00:00<00:00, 20052.31 examples/s]
Map: 100%|██████████| 8379/8379 [00:00<00:00, 19947.66 examples/s]


In [11]:
print(tokenized_dataset['train'][120]['tokens'])

['when', 'jacob', "'", 's', 'sibling', 'followed', 'a', 'trail', 'to', 'track', 'down', 'jacob', ',', 'penny', 'in', 'the', 'library', 'commented', 'she', 'saw', 'someone', 'looking', 'like', 'him', 'in', 'there', '.']


### Numericalizing

We will tell torchtext to add any word that has occurred at least three times in the dataset to the vocabulary because otherwise it would be too big.  Also we shall make sure to add `unk` and `eos`.

In [12]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [13]:
print(len(vocab))

25082


In [14]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', 'the', ',', '.', 'of', 'to', 'and', 'a', 'in']


## 3. Prepare the batch loader

### Prepare data
 

In [15]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [16]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [17]:
train_data.shape

torch.Size([128, 26113])

## 4. Modeling

In [19]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim

        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)

        self.init_weights()

    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)
        output = self.dropout(output)
        prediction =self.fc(output)
       
        return prediction, hidden

## 5. Training

In [20]:
vocab_size = len(vocab)
emb_dim = 1024
hid_dim = 1024 
num_layers = 2
dropout_rate = 0.65
lr = 1e-3

In [21]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 68,186,618 trainable parameters


In [22]:
def get_batch(data, seq_len, idx):
    src    = data[:, idx:idx+seq_len]
    target = data[:, idx+1:idx+seq_len+1]
    return src, target

In [23]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):

    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]

    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)

    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx)
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)
        target = target.reshape(-1)
        loss = criterion(prediction, target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [24]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

Here we will be using a `ReduceLROnPlateau` learning scheduler which decreases the learning rate by a factor, if the loss don't improve by a certain epoch.

In [None]:
n_epochs = 20
seq_len  = 50
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion,
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size,
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')



	Train Perplexity: 446.488
	Valid Perplexity: 173.587




	Train Perplexity: 163.011
	Valid Perplexity: 107.957




	Train Perplexity: 118.585
	Valid Perplexity: 86.974




	Train Perplexity: 98.112
	Valid Perplexity: 75.793




	Train Perplexity: 85.448
	Valid Perplexity: 68.268




	Train Perplexity: 76.523
	Valid Perplexity: 63.031




	Train Perplexity: 69.909
	Valid Perplexity: 59.027




	Train Perplexity: 64.673
	Valid Perplexity: 56.184




	Train Perplexity: 60.542
	Valid Perplexity: 53.982




	Train Perplexity: 57.150
	Valid Perplexity: 52.287




	Train Perplexity: 54.352
	Valid Perplexity: 50.900




	Train Perplexity: 51.890
	Valid Perplexity: 49.745




	Train Perplexity: 49.869
	Valid Perplexity: 48.868




	Train Perplexity: 48.071
	Valid Perplexity: 47.954




	Train Perplexity: 46.489
	Valid Perplexity: 47.260




	Train Perplexity: 45.105
	Valid Perplexity: 46.760




	Train Perplexity: 43.791
	Valid Perplexity: 46.364




	Train Perplexity: 42.682
	Valid Perplexity: 45.893




	Train Perplexity: 41.675
	Valid Perplexity: 45.553




	Train Perplexity: 40.748
	Valid Perplexity: 45.254


## 6. Testing

In [None]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 46.358


## 7. Real-world inference

Here we take the prompt, tokenize, encode and feed it into the model to get the predictions.  We then apply softmax while specifying that we want the output due to the last word in the sequence which represents the prediction for the next word.  We divide the logits by a temperature value to alter the model’s confidence by adjusting the softmax probability distribution.

Once we have the Softmax distribution, we randomly sample it to make our prediction on the next word. If we get <unk> then we give that another try.  Once we get <eos> we stop predicting.
    
We decode the prediction back to strings last lines.

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)

            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab

            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)
            prediction = torch.multinomial(probs, num_samples=1).item()

            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [None]:
prompt = 'Harry Potter did '
max_seq_len = 30
seed = 0

temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer,
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter did not believe that he was the heir of slytherin .

0.7
harry potter did not believe that he was the heir of slytherin . the title was inherited by his maternal grandfather , sirius black . it was said that he was a friend

0.75
harry potter did not believe that sirius black was a muggle , and did not spoil the boy , as the fact that he was the only one who was loyal to the

0.8
harry potter did not believe that sirius black was a muggle , and did not spoil the boy , as the fact that he was the only one who was loyal to the

1.0
harry potter did not believe that neville ' s parents had dark abilities after it , as he incorrectly believed the harm of many of the children that lord voldemort called .

