<a href="https://colab.research.google.com/github/sangeetsaurabh/PyTorch_Keras_Experiment/blob/master/Text_Number_Prediction/Text_Number_using_PyTorch_NLP_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Predict the next number in the sequence

Given a set of numbers, goal of the model is to predict next number in the sequence. 

For example, model can be given input like - eight thousand one , eight thousand two , eight thousand three , eight thousand four , eight thousand five , eight thousand six , eight thousand seven , eight thousand eight , eight thousand nine , eight thousand ten , eight thousand eleven , eight thousand twelve....

Model will predict next number given the one input. Model in this notebook predicts 21st word given 20 words like above (last step prediction).



### Prepare the Data

In [0]:
#### Install the right version of PyTorch
!pip install torchtext==0.6.0
import torchtext
print(torchtext.__version__)

0.6.0


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#### Setting up path to import important data preparation Python module
import sys
import os
sys.path.append('/content/drive/My Drive/Colab Notebooks/torch_pipe/')

In [0]:
os.getcwd()

'/content'

In [0]:
#### Importing all the appropriate Torch Module
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from Util.human_language_modeling import *
from torch.utils.data import DataLoader ### Custom dataloader to load the data
import torch.nn.functional as F
import time
import logging

In [0]:
import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='mylog.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [0]:
BATCH_SIZE = 64 ## defining the batch size
bptt = 20 ## back propogration through LSTM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
NLP = spacy.load('en_core_web_sm')
def tokenizer(comment):
    comment = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;\.]", " ", 
        str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\!+", "!", comment)
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    cleanr = re.compile('<.*?>')
    cleanr = re.compile('<>')
    comment = re.sub(cleanr, '', comment)
    #if (len(comment) > MAX_CHARS):
    #   comment = comment[:MAX_CHARS]
    return[x.text for x in NLP.tokenizer(comment) if x.text != " "]

In [0]:
tokenizer("I am king \n ,fdds  , king")

In [0]:
### A simplie python function to show text given an array of vectors
def show_text(input_vector):
    separator = ' '
    txt = separator.join([vocab.itos[i] for i in input_vector])
    return txt

#### Download the train and validation data

In [0]:
tokenizer = get_tokenizer("spacy")
train_dataset_old, valid_dataset_old = HumanNumbers(root='data',data_select=('train', 'valid'))
vocab = train_dataset_old.get_vocab()

<function tokenizer at 0x7f09ae72a048>


8001lines [00:00, 18687.71lines/s]


In [0]:
tokenizer = get_tokenizer("spacy")
train_dataset, valid_dataset = HumanNumbers(root='data',bptt=bptt,batch_size=BATCH_SIZE,data_select=('train', 'valid'))
vocab = train_dataset.get_vocab()




51200
51200
torch.Size([51200, 20])
torch.Size([51200, 20])
14080
14080
torch.Size([14080, 20])
torch.Size([14080, 20])


In [0]:
13056/64

204.0

#### Analyze the data

In [0]:
print (f"Train size: {len(train_dataset)}")
print (f"Validation size: {len(valid_dataset)}")

Train size: 51200
Validation size: 14080


In [0]:
print (f"The number of unique words in train dataset is {len(set(train_dataset_old.raw_data))}. ")
print (f"The number of unique words in Valid dataset is {len(set(valid_dataset_old.raw_data))}. ")

The number of unique words in train dataset is 32. 
The number of unique words in Valid dataset is 32. 


##### Analysis on Vocab

In [0]:
### See the all the tokens in the vocab
vocab.itos

['<unk>',
 '<pad>',
 '\n',
 'hundred',
 'thousand',
 'five',
 'four',
 'one',
 'seven',
 'six',
 'three',
 'two',
 'eight',
 'nine',
 'eighty',
 'fifty',
 'forty',
 'ninety',
 'seventy',
 'sixty',
 'thirty',
 'twenty',
 'eighteen',
 'eleven',
 'fifteen',
 'fourteen',
 'nineteen',
 'seventeen',
 'sixteen',
 'ten',
 'thirteen',
 'twelve',
 'xxBOF',
 'xxEOF']

In [0]:
len(vocab.itos)

In [0]:
#### See the frequency of each of the tokens in the train
vocab.freqs

#### See the texts in input and label data

In [0]:
train_dataset.input_data[1].numpy()

In [0]:
train_dataset.label_data[1].numpy()

In [0]:
show_text(train_dataset.input_data[1].numpy())

In [0]:
show_text(train_dataset.label_data[1].numpy())

In [0]:
show_text(train_dataset.input_data[-2].numpy())

In [0]:
show_text(train_dataset.label_data[-2].numpy())

### Implement a fully connected neural network (Model 10 & 11)

Given 20 words, this model predicts 21st word.

In [0]:
#### A simple Linear network that processes the word in a sequence
class Model0(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)  # green arrow
        self.h_h = nn.Linear(nh,nh)     # brown arrow
        self.h_o = nn.Linear(nh,nv)     # blue arrow
        self.bn = nn.BatchNorm1d(nh)
        
    def forward(self, x):
        h = self.bn(F.relu(self.i_h(x[:,0])))
        #print(x.shape[1])
        if x.shape[1]>1:
            h = h + self.i_h(x[:,1])
            h = self.bn(F.relu(self.h_h(h)))
        if x.shape[1]>2:
            h = h + self.i_h(x[:,2])
            h = self.bn(F.relu(self.h_h(h)))
        #print(self.h_o(h))
        #print(self.h_o(h).shape)
        return self.h_o(h)

In [0]:
#### A simple Linear network that processes the word in a sequence
class Model1(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)  # green arrow
        self.h_h = nn.Linear(nh,nh)     # brown arrow
        self.h_o = nn.Linear(nh,nv)     # blue arrow
        self.bn = nn.BatchNorm1d(nh)
        
    def forward(self, x):
        h = torch.zeros(x.shape[0], nh).to(device=x.device)
        for i in range(x.shape[1]):
            h = h + self.i_h(x[:,i])
            h = self.bn(F.relu(self.h_h(h)))
        return self.h_o(h)

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
nh = BATCH_SIZE
nh

64

In [0]:
model = Model1().to(device)
model

Model1(
  (i_h): Embedding(34, 64)
  (h_h): Linear(in_features=64, out_features=64, bias=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
#### Function to return a batch of data
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

### Function to train the model
def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(text.shape)
        #print(cls.shape)
        output = model(text)
        #print(output.shape)
        loss = criterion(output, cls[:,-1])
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls[:,-1]).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

### Function to test the model using validation data
def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            loss = criterion(output, cls[:,-1])
            loss += loss.item()
            acc += (output.argmax(1) == cls[:,-1]).sum().item()

    return loss / len(data_), acc / len(data_)

In [0]:
#### Train the model
from torch.utils.data.dataset import random_split
N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 9 seconds
	Loss: 0.0396(train)	|	Acc: 35.8%(train)
	Loss: 0.0003(valid)	|	Acc: 46.5%(valid)
Epoch: 2  | time in 0 minutes, 9 seconds
	Loss: 0.0261(train)	|	Acc: 47.5%(train)
	Loss: 0.0003(valid)	|	Acc: 47.3%(valid)
Epoch: 3  | time in 0 minutes, 9 seconds
	Loss: 0.0229(train)	|	Acc: 51.3%(train)
	Loss: 0.0002(valid)	|	Acc: 48.8%(valid)
Epoch: 4  | time in 0 minutes, 9 seconds
	Loss: 0.0213(train)	|	Acc: 54.2%(train)
	Loss: 0.0002(valid)	|	Acc: 49.9%(valid)
Epoch: 5  | time in 0 minutes, 8 seconds
	Loss: 0.0202(train)	|	Acc: 56.6%(train)
	Loss: 0.0002(valid)	|	Acc: 50.5%(valid)
Epoch: 6  | time in 0 minutes, 9 seconds
	Loss: 0.0193(train)	|	Acc: 58.3%(train)
	Loss: 0.0002(valid)	|	Acc: 51.1%(valid)
Epoch: 7  | time in 0 minutes, 9 seconds
	Loss: 0.0186(train)	|	Acc: 59.8%(train)
	Loss: 0.0002(valid)	|	Acc: 51.8%(valid)
Epoch: 8  | time in 0 minutes, 8 seconds
	Loss: 0.0180(train)	|	Acc: 61.2%(train)
	Loss: 0.0002(valid)	|	Acc: 52.6%(valid)
Epoch: 9  | time

Results are not bad considering it's a simple custom sequential network.

In [0]:
torch.save(model.state_dict(),f='model11.pt')

#### Create the test data to test out models

In [0]:
test_data = valid_dataset.input_data[0:64]
test_label = valid_dataset.label_data[0:64]

In [0]:
test_data

tensor([[32,  2, 12,  ...,  2, 12,  4],
        [ 2, 12,  4,  ..., 12,  4,  5],
        [12,  4,  7,  ...,  4,  5,  2],
        ...,
        [ 2, 12,  4,  ..., 12,  4, 21],
        [12,  4, 28,  ...,  4, 21,  2],
        [ 4, 28,  2,  ..., 21,  2, 12]])

In [0]:
show_text(test_data[20])

'five \n eight thousand six \n eight thousand seven \n eight thousand eight \n eight thousand nine \n eight thousand'

#### Model 1 testing

In [0]:
model = Model1().to(device)
model.load_state_dict(torch.load('model11.pt'))
model.eval()

Model1(
  (i_h): Embedding(34, 64)
  (h_h): Linear(in_features=64, out_features=64, bias=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
output = model(test_data.to(device))
output.shape

torch.Size([64, 34])

In [0]:
import numpy as np
test_predictions = np.argmax(output.cpu().detach().numpy(),axis=1)
test_predictions

array([13,  2, 10,  4, 10,  3,  5,  4, 13,  2,  6,  3, 13,  2,  5,  4, 13,
        2, 10,  3, 13,  2,  7,  4, 10,  2, 10,  4, 13,  2,  6,  4,  9,  2,
       10,  4, 10,  3, 10,  4, 10,  2,  7,  3,  9,  2,  6,  4, 13,  2, 10,
        4, 10,  2, 10,  3, 13,  2, 12,  3,  9,  7, 10,  3])

In [0]:
print (f"Prediction for 20th sentence is '{vocab.itos[test_predictions[20]]}'")
#print(repr(show_text(output.argmax(20))))

Prediction for 20th sentence is 'nine'


In [0]:
show_text(test_data[20])

'five \n eight thousand six \n eight thousand seven \n eight thousand eight \n eight thousand nine \n eight thousand'

### Multi fully connected model (Model 2)

Before, we were just predicting the last word in a line of text.  Given 20 tokens, what is token 21?  That approach was throwing away a lot of data.  Why not predict token 2 from token 1, then predict token 3, then predict token 4, and so on?  We will modify our model to do this.

In [0]:
class Model2(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.h_h = nn.Linear(nh,nh)
        self.h_o = nn.Linear(nh,nv)
        self.bn = nn.BatchNorm1d(nh)
        
    def forward(self, x):
        h = torch.zeros(x.shape[0], nh).to(device=x.device)
        res = []
        for i in range(x.shape[1]):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
            res.append(self.h_o(self.bn(h)))
        return torch.stack(res, dim=1)

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
nh = BATCH_SIZE
nh

64

In [0]:
model = Model2().to(device)
model

Model2(
  (i_h): Embedding(34, 64)
  (h_h): Linear(in_features=64, out_features=64, bias=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
#### Function to generate batch
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

#### Train the model
def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        output = model(text)
        #print(output.shape)
        #print(cls.shape)
        for i in range(output.shape[1]):
            loss = criterion(output[:,i,:], cls[:,i])
            train_loss += loss.item()
            train_acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    scheduler.step()

    return train_loss / (len(sub_train_)*bptt), train_acc / (len(sub_train_)*bptt)

#### Test the model while training
def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            for i in range(output.shape[1]):
                loss = criterion(output[:,i,:], cls[:,i])
                loss += loss.item()
                acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()

    return loss / (len(data_)*bptt), acc / (len(data_)*bptt)

In [0]:
from torch.utils.data.dataset import random_split
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 15 seconds
	Loss: 0.0375(train)	|	Acc: 36.8%(train)
	Loss: 0.0000(valid)	|	Acc: 40.7%(valid)
Epoch: 2  | time in 0 minutes, 15 seconds
	Loss: 0.0263(train)	|	Acc: 45.6%(train)
	Loss: 0.0000(valid)	|	Acc: 37.4%(valid)
Epoch: 3  | time in 0 minutes, 15 seconds
	Loss: 0.0246(train)	|	Acc: 45.9%(train)
	Loss: 0.0000(valid)	|	Acc: 31.8%(valid)
Epoch: 4  | time in 0 minutes, 15 seconds
	Loss: 0.0240(train)	|	Acc: 45.9%(train)
	Loss: 0.0000(valid)	|	Acc: 30.9%(valid)
Epoch: 5  | time in 0 minutes, 15 seconds
	Loss: 0.0237(train)	|	Acc: 46.0%(train)
	Loss: 0.0000(valid)	|	Acc: 30.9%(valid)
Epoch: 6  | time in 0 minutes, 15 seconds
	Loss: 0.0236(train)	|	Acc: 46.1%(train)
	Loss: 0.0000(valid)	|	Acc: 30.6%(valid)
Epoch: 7  | time in 0 minutes, 15 seconds
	Loss: 0.0235(train)	|	Acc: 46.0%(train)
	Loss: 0.0000(valid)	|	Acc: 30.6%(valid)
Epoch: 8  | time in 0 minutes, 15 seconds
	Loss: 0.0234(train)	|	Acc: 46.1%(train)
	Loss: 0.0000(valid)	|	Acc: 30.5%(valid)
Epoch: 9

In [0]:
torch.save(model.state_dict(),"./model2.pt")

#### Model 2 testing

In [0]:
model = Model2().to(device)
model.load_state_dict(torch.load('model2.pt'))
model.eval()

Model2(
  (i_h): Embedding(34, 64)
  (h_h): Linear(in_features=64, out_features=64, bias=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
output = model(test_data.to(device))
output.shape

torch.Size([64, 20, 34])

In [0]:
import numpy as np
test_predictions = np.argmax(output.cpu().detach().numpy(),axis=2)
test_predictions

array([[ 4, 10,  3, ..., 11,  3, 10],
       [10,  3,  8, ...,  3, 10,  3],
       [ 2, 10,  3, ..., 10,  3,  6],
       ...,
       [10,  3,  8, ...,  3, 10,  8],
       [ 2, 10,  2, ..., 10,  8,  6],
       [10,  2,  6, ...,  8,  6,  3]])

In [0]:
test_predictions[20]

array([ 4, 10,  3, 10,  3,  6,  3, 10,  3,  5,  3, 10,  3,  6,  3,  8,  3,
        5,  3, 10])

In [0]:
#### Comparing the label and predictions
for i,word_idx in  enumerate(test_predictions[20]):
  print (f'Label: {repr(tokenizer(show_text(test_label[20]))[i])} ---> Prediction: {repr(vocab.itos[word_idx])} ')

Label: '\n ' ---> Prediction: 'thousand' 
Label: 'eight' ---> Prediction: 'three' 
Label: 'thousand' ---> Prediction: 'hundred' 
Label: 'six' ---> Prediction: 'three' 
Label: '\n ' ---> Prediction: 'hundred' 
Label: 'eight' ---> Prediction: 'four' 
Label: 'thousand' ---> Prediction: 'hundred' 
Label: 'seven' ---> Prediction: 'three' 
Label: '\n ' ---> Prediction: 'hundred' 
Label: 'eight' ---> Prediction: 'five' 
Label: 'thousand' ---> Prediction: 'hundred' 
Label: 'eight' ---> Prediction: 'three' 
Label: '\n ' ---> Prediction: 'hundred' 
Label: 'eight' ---> Prediction: 'four' 
Label: 'thousand' ---> Prediction: 'hundred' 
Label: 'nine' ---> Prediction: 'seven' 
Label: '\n ' ---> Prediction: 'hundred' 
Label: 'eight' ---> Prediction: 'five' 
Label: 'thousand' ---> Prediction: 'hundred' 
Label: 'ten' ---> Prediction: 'three' 


Note that our accuracy is worse now, because we are doing a harder task.  When we predict word k (k<20), we have less history to help us then when we were only predicting word 71.

### Multi fully connected model that maintains State (Model 3)

To address this issue, let's keep the hidden state from the previous line of text, so we are not starting over again on each new line of text.

In [0]:
class Model3(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.h_h = nn.Linear(nh,nh)
        self.h_o = nn.Linear(nh,nv)
        self.bn = nn.BatchNorm1d(nh)
        self.h = torch.zeros(BATCH_SIZE, nh).to(device)
        
    def forward(self, x):
        res = []
        h = self.h
        for i in range(x.shape[1]):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
            res.append(self.bn(h))
        self.h = h.detach()
        res = torch.stack(res, dim=1)
        res = self.h_o(res)
        return res

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
nh = BATCH_SIZE
nh

64

In [0]:
model = Model3().to(device)
model

Model3(
  (i_h): Embedding(34, 64)
  (h_h): Linear(in_features=64, out_features=64, bias=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
### Function to generate the batch dataset
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch,drop_last=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(text.shape)
        output = model(text)
        #print(output.shape)
        #print(cls.shape)
        for i in range(output.shape[1]):
            loss = criterion(output[:,i,:], cls[:,i])
            train_loss += loss.item()
            train_acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    #scheduler.step()

    return train_loss / (len(sub_train_)*bptt), train_acc / (len(sub_train_)*bptt)

def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch,drop_last=True)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            for i in range(output.shape[1]):
                loss = criterion(output[:,i,:], cls[:,i])
                loss += loss.item()
                acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()

    return loss / (len(data_)*bptt), acc / (len(data_)*bptt)

In [0]:
from torch.utils.data.dataset import random_split
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 13 seconds
	Loss: 0.0256(train)	|	Acc: 46.9%(train)
	Loss: 0.0000(valid)	|	Acc: 44.2%(valid)
Epoch: 2  | time in 0 minutes, 13 seconds
	Loss: 0.0305(train)	|	Acc: 52.3%(train)
	Loss: 0.0000(valid)	|	Acc: 45.6%(valid)
Epoch: 3  | time in 0 minutes, 13 seconds
	Loss: 0.0368(train)	|	Acc: 56.0%(train)
	Loss: 0.0000(valid)	|	Acc: 48.4%(valid)
Epoch: 4  | time in 0 minutes, 13 seconds
	Loss: 0.0424(train)	|	Acc: 58.8%(train)
	Loss: 0.0000(valid)	|	Acc: 49.4%(valid)
Epoch: 5  | time in 0 minutes, 13 seconds
	Loss: 0.0473(train)	|	Acc: 60.6%(train)
	Loss: 0.0000(valid)	|	Acc: 49.1%(valid)
Epoch: 6  | time in 0 minutes, 13 seconds
	Loss: 0.0512(train)	|	Acc: 62.0%(train)
	Loss: 0.0000(valid)	|	Acc: 49.7%(valid)
Epoch: 7  | time in 0 minutes, 13 seconds
	Loss: 0.0536(train)	|	Acc: 62.7%(train)
	Loss: 0.0000(valid)	|	Acc: 51.5%(valid)
Epoch: 8  | time in 0 minutes, 13 seconds
	Loss: 0.0569(train)	|	Acc: 63.6%(train)
	Loss: 0.0000(valid)	|	Acc: 49.6%(valid)
Epoch: 9

In [0]:
torch.save(model.state_dict(),"./model3.pt")

#### Model 3 testing

In [0]:
model = Model3().to(device)
model.load_state_dict(torch.load('model3.pt'))
model.eval()

Model3(
  (i_h): Embedding(34, 64)
  (h_h): Linear(in_features=64, out_features=64, bias=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
output = model(test_data.to(device))
output.shape

torch.Size([64, 20, 34])

In [0]:
import numpy as np
test_predictions = np.argmax(output.cpu().detach().numpy(),axis=2)
test_predictions

array([[ 2,  8,  4, ...,  9,  4,  9],
       [ 9,  4,  8, ...,  4,  9,  2],
       [ 1,  6,  3, ...,  9,  2,  9],
       ...,
       [ 9,  4,  8, ...,  4, 21,  2],
       [ 1,  6,  2, ..., 21,  2,  9],
       [32,  3, 16, ...,  2,  9,  4]])

In [0]:
test_predictions[20]

array([ 2,  9,  4,  8,  3, 16,  4, 16,  3,  9,  4, 18,  2,  9,  4, 12,  2,
        5,  4, 29])

In [0]:
#### Comparing the label and predictions
for i,word_idx in  enumerate(test_predictions[20]):
  print (f'Label: {repr(tokenizer(show_text(test_label[20]))[i])} ---> Prediction: {repr(vocab.itos[word_idx])} ')

Label: '\n ' ---> Prediction: '\n' 
Label: 'eight' ---> Prediction: 'six' 
Label: 'thousand' ---> Prediction: 'thousand' 
Label: 'six' ---> Prediction: 'seven' 
Label: '\n ' ---> Prediction: 'hundred' 
Label: 'eight' ---> Prediction: 'forty' 
Label: 'thousand' ---> Prediction: 'thousand' 
Label: 'seven' ---> Prediction: 'forty' 
Label: '\n ' ---> Prediction: 'hundred' 
Label: 'eight' ---> Prediction: 'six' 
Label: 'thousand' ---> Prediction: 'thousand' 
Label: 'eight' ---> Prediction: 'seventy' 
Label: '\n ' ---> Prediction: '\n' 
Label: 'eight' ---> Prediction: 'six' 
Label: 'thousand' ---> Prediction: 'thousand' 
Label: 'nine' ---> Prediction: 'eight' 
Label: '\n ' ---> Prediction: '\n' 
Label: 'eight' ---> Prediction: 'five' 
Label: 'thousand' ---> Prediction: 'thousand' 
Label: 'ten' ---> Prediction: 'ten' 


Accuracy is still not good. Let's try RNN.

### RNN (Model 4)

In [0]:
class BatchNorm1dFlat(nn.BatchNorm1d):
    "`nn.BatchNorm1d`, but first flattens leading dimensions"
    def forward(self, x):
        if x.dim()==2: return super().forward(x)
        *f,l = x.shape
        x = x.contiguous().view(-1,l)
        return super().forward(x).view(*f,l)

In [0]:
class Model4(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.RNN(nh,nh, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        self.h = torch.zeros(1, BATCH_SIZE, nh).to(device)
        
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(self.bn(res))

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
nh = BATCH_SIZE
nh

64

In [0]:
model = Model4().to(device)
model

Model4(
  (i_h): Embedding(34, 64)
  (rnn): RNN(64, 64, batch_first=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1dFlat(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=False,
                      collate_fn=generate_batch,drop_last=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(text.shape)
        output = model(text)
        #print(output.shape)
        #print(cls.shape)
        for i in range(output.shape[1]):
            loss = criterion(output[:,i,:], cls[:,i])
            train_loss += loss.item()
            train_acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    scheduler.step()

    return train_loss / (len(sub_train_)*bptt), train_acc / (len(sub_train_)*bptt)

def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch,drop_last=True)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            for i in range(output.shape[1]):
                loss = criterion(output[:,i,:], cls[:,i])
                loss += loss.item()
                acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()

    return loss / (len(data_)*bptt), acc / (len(data_)*bptt)

In [0]:
from torch.utils.data.dataset import random_split
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 7 seconds
	Loss: 0.0224(train)	|	Acc: 57.8%(train)
	Loss: 0.0000(valid)	|	Acc: 45.5%(valid)
Epoch: 2  | time in 0 minutes, 7 seconds
	Loss: 0.0187(train)	|	Acc: 61.8%(train)
	Loss: 0.0000(valid)	|	Acc: 41.1%(valid)
Epoch: 3  | time in 0 minutes, 7 seconds
	Loss: 0.0175(train)	|	Acc: 63.6%(train)
	Loss: 0.0000(valid)	|	Acc: 39.5%(valid)
Epoch: 4  | time in 0 minutes, 7 seconds
	Loss: 0.0171(train)	|	Acc: 66.2%(train)
	Loss: 0.0000(valid)	|	Acc: 43.2%(valid)
Epoch: 5  | time in 0 minutes, 7 seconds
	Loss: 0.0160(train)	|	Acc: 70.9%(train)
	Loss: 0.0000(valid)	|	Acc: 46.1%(valid)
Epoch: 6  | time in 0 minutes, 7 seconds
	Loss: 0.0155(train)	|	Acc: 73.4%(train)
	Loss: 0.0000(valid)	|	Acc: 46.8%(valid)
Epoch: 7  | time in 0 minutes, 7 seconds
	Loss: 0.0149(train)	|	Acc: 76.1%(train)
	Loss: 0.0000(valid)	|	Acc: 44.6%(valid)
Epoch: 8  | time in 0 minutes, 7 seconds
	Loss: 0.0144(train)	|	Acc: 76.9%(train)
	Loss: 0.0000(valid)	|	Acc: 51.8%(valid)
Epoch: 9  | time

In [0]:
torch.save(model.state_dict(),"./model4.pt")

Performance is better, but still not good enough. Let's try GRU.

### GRU (Model 5)

This is testing GRU for multiple steps predictions. So given the 20 words, it makes 20 predictions.

In [0]:
class BatchNorm1dFlat(nn.BatchNorm1d):
    "`nn.BatchNorm1d`, but first flattens leading dimensions"
    def forward(self, x):
        if x.dim()==2: return super().forward(x)
        *f,l = x.shape
        x = x.contiguous().view(-1,l)
        return super().forward(x).view(*f,l)

In [0]:
class Model5(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.GRU(nh, nh, 2, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        self.h = torch.zeros(2, BATCH_SIZE, nh)
        
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(self.bn(res))

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
nh = BATCH_SIZE
nh

64

In [0]:
model = Model5().to(device)
model

In [0]:
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch,drop_last=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(text.shape)
        output = model(text)
        #print(output.shape)
        #print(cls.shape)
        for i in range(output.shape[1]):
            loss = criterion(output[:,i,:], cls[:,i])
            train_loss += loss.item()
            train_acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    scheduler.step()

    return train_loss / (len(sub_train_)*bptt), train_acc / (len(sub_train_)*bptt)

def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch,drop_last=True)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            for i in range(output.shape[1]):
                loss = criterion(output[:,i,:], cls[:,i])
                loss += loss.item()
                acc += (output[:,i,:].argmax(1) == cls[:,i]).sum().item()

    return loss / (len(data_)*bptt), acc / (len(data_)*bptt)

In [0]:
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 32 seconds
	Loss: 0.0280(train)	|	Acc: 68.7%(train)
	Loss: 0.0000(valid)	|	Acc: 56.2%(valid)
Epoch: 2  | time in 0 minutes, 33 seconds
	Loss: 0.0391(train)	|	Acc: 76.7%(train)
	Loss: 0.0000(valid)	|	Acc: 54.9%(valid)
Epoch: 3  | time in 0 minutes, 32 seconds
	Loss: 0.0444(train)	|	Acc: 77.3%(train)
	Loss: 0.0000(valid)	|	Acc: 58.3%(valid)
Epoch: 4  | time in 0 minutes, 31 seconds
	Loss: 0.0483(train)	|	Acc: 76.8%(train)
	Loss: 0.0000(valid)	|	Acc: 56.6%(valid)
Epoch: 5  | time in 0 minutes, 31 seconds
	Loss: 0.0479(train)	|	Acc: 77.2%(train)
	Loss: 0.0000(valid)	|	Acc: 57.5%(valid)
Epoch: 6  | time in 0 minutes, 31 seconds
	Loss: 0.0492(train)	|	Acc: 77.4%(train)
	Loss: 0.0000(valid)	|	Acc: 57.5%(valid)
Epoch: 7  | time in 0 minutes, 31 seconds
	Loss: 0.0507(train)	|	Acc: 77.4%(train)
	Loss: 0.0000(valid)	|	Acc: 56.7%(valid)
Epoch: 8  | time in 0 minutes, 31 seconds
	Loss: 0.0502(train)	|	Acc: 78.0%(train)
	Loss: 0.0000(valid)	|	Acc: 56.3%(valid)
Epoch: 9

In [0]:
torch.save(model.state_dict(),"./model5.pt")

### GRU (Model 6)

In [0]:
class BatchNorm1dFlat(nn.BatchNorm1d):
    "`nn.BatchNorm1d`, but first flattens leading dimensions"
    def forward(self, x):
        if x.dim()==2: return super().forward(x)
        *f,l = x.shape
        x = x.contiguous().view(-1,l)
        return super().forward(x).view(*f,l)

In [0]:
class Model6(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.GRU(nh, nh, 2, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        self.h = torch.zeros(2, BATCH_SIZE, nh)
        
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        #self.h = h.detach()
        return self.h_o(self.bn(res))

In [0]:
bptt=20
tokenizer = get_tokenizer("spacy")
train_dataset, valid_dataset = HumanNumbers(root='data',bptt=bptt,data_select=('train', 'valid'))
vocab = train_dataset.get_vocab()

3790lines [00:00, 37890.73lines/s]

<function tokenizer at 0x7f1d0bd07268>


8001lines [00:00, 36544.06lines/s]


51840
51840
torch.Size([51840, 30])
torch.Size([51840, 30])
13440
13440
torch.Size([13440, 30])
torch.Size([13440, 30])


In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
nh = BATCH_SIZE
nh

64

In [0]:
model = Model6().to(device)
model

Model6(
  (i_h): Embedding(34, 64)
  (rnn): GRU(64, 64, num_layers=2, batch_first=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1dFlat(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch,drop_last=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(text.shape)
        output = model(text)
        #print(output.shape)
        #print(cls.shape)
        #for i in range(output.shape[1]):
        loss = criterion(output[:,-1,:], cls[:,-1])
        train_loss += loss.item()
        train_acc += (output[:,-1,:].argmax(1) == cls[:,-1]).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch,drop_last=True)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            #for i in range(output.shape[1]):
            loss = criterion(output[:,-1,:], cls[:,-1])
            loss += loss.item()
            acc += (output[:,-1,:].argmax(1) == cls[:,-1]).sum().item()

    return loss / len(data_), acc / len(data_)

In [0]:
from torch.utils.data.dataset import random_split
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 42 seconds
	Loss: 0.0048(train)	|	Acc: 90.8%(train)
	Loss: 0.0003(valid)	|	Acc: 76.7%(valid)
Epoch: 2  | time in 0 minutes, 41 seconds
	Loss: 0.0012(train)	|	Acc: 97.8%(train)
	Loss: 0.0004(valid)	|	Acc: 75.8%(valid)
Epoch: 3  | time in 0 minutes, 41 seconds
	Loss: 0.0008(train)	|	Acc: 98.7%(train)
	Loss: 0.0003(valid)	|	Acc: 78.3%(valid)
Epoch: 4  | time in 0 minutes, 40 seconds
	Loss: 0.0006(train)	|	Acc: 99.0%(train)
	Loss: 0.0005(valid)	|	Acc: 74.2%(valid)
Epoch: 5  | time in 0 minutes, 41 seconds
	Loss: 0.0004(train)	|	Acc: 99.3%(train)
	Loss: 0.0004(valid)	|	Acc: 78.2%(valid)
Epoch: 6  | time in 0 minutes, 40 seconds
	Loss: 0.0004(train)	|	Acc: 99.4%(train)
	Loss: 0.0003(valid)	|	Acc: 76.4%(valid)
Epoch: 7  | time in 0 minutes, 41 seconds
	Loss: 0.0004(train)	|	Acc: 99.4%(train)
	Loss: 0.0004(valid)	|	Acc: 77.7%(valid)
Epoch: 8  | time in 0 minutes, 40 seconds
	Loss: 0.0003(train)	|	Acc: 99.5%(train)
	Loss: 0.0005(valid)	|	Acc: 74.0%(valid)
Epoch: 9

In [0]:
torch.save(model.state_dict(),"./model6.pt")

### LSTM (Model 7)

In [0]:
class BatchNorm1dFlat(nn.BatchNorm1d):
    "`nn.BatchNorm1d`, but first flattens leading dimensions"
    def forward(self, x):
        if x.dim()==2: return super().forward(x)
        *f,l = x.shape
        x = x.contiguous().view(-1,l)
        return super().forward(x).view(*f,l)

In [0]:
class Model7(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(nv,nh)
        self.rnn = nn.LSTM(nh, nh, 2, batch_first=True)
        self.h_o = nn.Linear(nh,nv)
        self.bn = BatchNorm1dFlat(nh)
        self.h = torch.zeros(2, BATCH_SIZE, nh)
        self.c = torch.zeros(2, BATCH_SIZE, nh)
        
        
    def forward(self, x):
        res,(h,c) = self.rnn(self.i_h(x), (self.h,self.c))
        #self.h = h.detach()
        return self.h_o(self.bn(res))

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
nh = BATCH_SIZE
nh

64

In [0]:
model = Model7().to(device)
model

Model7(
  (i_h): Embedding(34, 64)
  (rnn): LSTM(64, 64, num_layers=2, batch_first=True)
  (h_o): Linear(in_features=64, out_features=34, bias=True)
  (bn): BatchNorm1dFlat(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch,drop_last=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(text.shape)
        output = model(text)
        #print(output.shape)
        #print(cls.shape)
        #for i in range(output.shape[1]):
        loss = criterion(output[:,-1,:], cls[:,-1])
        train_loss += loss.item()
        train_acc += (output[:,-1,:].argmax(1) == cls[:,-1]).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch,drop_last=True)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            #for i in range(output.shape[1]):
            loss = criterion(output[:,-1,:], cls[:,-1])
            loss += loss.item()
            acc += (output[:,-1,:].argmax(1) == cls[:,-1]).sum().item()

    return loss / len(data_), acc / len(data_)

In [0]:
from torch.utils.data.dataset import random_split
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 40 seconds
	Loss: 0.0046(train)	|	Acc: 90.8%(train)
	Loss: 0.0004(valid)	|	Acc: 78.4%(valid)
Epoch: 2  | time in 0 minutes, 40 seconds
	Loss: 0.0002(train)	|	Acc: 99.6%(train)
	Loss: 0.0004(valid)	|	Acc: 83.8%(valid)
Epoch: 3  | time in 0 minutes, 40 seconds
	Loss: 0.0001(train)	|	Acc: 99.8%(train)
	Loss: 0.0004(valid)	|	Acc: 82.4%(valid)
Epoch: 4  | time in 0 minutes, 40 seconds
	Loss: 0.0001(train)	|	Acc: 99.8%(train)
	Loss: 0.0005(valid)	|	Acc: 82.1%(valid)
Epoch: 5  | time in 0 minutes, 40 seconds
	Loss: 0.0001(train)	|	Acc: 99.8%(train)
	Loss: 0.0005(valid)	|	Acc: 83.5%(valid)
Epoch: 6  | time in 0 minutes, 41 seconds
	Loss: 0.0000(train)	|	Acc: 99.9%(train)
	Loss: 0.0005(valid)	|	Acc: 83.9%(valid)
Epoch: 7  | time in 0 minutes, 40 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0005(valid)	|	Acc: 83.7%(valid)
Epoch: 8  | time in 0 minutes, 39 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0004(valid)	|	Acc: 83.6%(valid)
Epoch:

KeyboardInterrupt: ignored

In [0]:
torch.save(model.state_dict(),"./model7.pt")

In [0]:
for i in model.parameters():
    print(i.shape)

torch.Size([34, 64])
torch.Size([256, 64])
torch.Size([256, 64])
torch.Size([256])
torch.Size([256])
torch.Size([256, 64])
torch.Size([256, 64])
torch.Size([256])
torch.Size([256])
torch.Size([34, 64])
torch.Size([34])
torch.Size([64])
torch.Size([64])


## Try Bidrectional GRU and LSTM

Given the pattern of the numbers, bidirectional LSTM and GRU may work best for this situation

In [0]:
#### Download the data
bptt=20
tokenizer = get_tokenizer("spacy")
train_dataset, valid_dataset = HumanNumbers(root='data',bptt=bptt,data_select=('train', 'valid'))
vocab = train_dataset.get_vocab()

3488lines [00:00, 34871.47lines/s]

<function tokenizer at 0x7f09ae72a048>


8001lines [00:00, 34403.68lines/s]


51200
51200
torch.Size([51200, 20])
torch.Size([51200, 20])
14080
14080
torch.Size([14080, 20])
torch.Size([14080, 20])


### Bidirectional GRU (Model 8)

This is testing GRU for multiple steps predictions. So given the 20 words, it makes 20 predictions.

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
BATCH_SIZE = 64
nh = 64
nh

64

In [0]:
class BatchNorm1dFlat(nn.BatchNorm1d):
    "`nn.BatchNorm1d`, but first flattens leading dimensions"
    def forward(self, x):
        if x.dim()==2: return super().forward(x)
        *f,l = x.shape
        x = x.contiguous().view(-1,l)
        return super().forward(x).view(*f,l)

In [0]:
class Model8(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(num_embeddings=nv,embedding_dim = nh)
        self.rnn = nn.GRU(nh, nh, 2, batch_first=True,bidirectional=True)
        self.h_o = nn.Linear(2*nh,nv)
        self.bn = BatchNorm1dFlat(2*nh)
        self.h = torch.zeros(2*2, BATCH_SIZE, nh)
        
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        return self.h_o(self.bn(res))

In [0]:
model = Model8().to(device)
model

Model8(
  (i_h): Embedding(34, 64)
  (rnn): GRU(64, 64, num_layers=2, batch_first=True, bidirectional=True)
  (h_o): Linear(in_features=128, out_features=34, bias=True)
  (bn): BatchNorm1dFlat(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch,drop_last=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(cls.shape)
        #print(text.shape)
        output = model(text)
        output_dim = output.shape[-1]
        output = output.view(-1, output_dim)
        cls = cls.view(-1)
        #print(output.shape)
        #print(cls.shape)
        
        loss = criterion(output, cls)
        train_loss += loss.item()
        #print((output.argmax(1) == cls).sum().item())
        train_acc += (output.argmax(1) == cls).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    scheduler.step()

    return train_loss / (len(sub_train_)*bptt), train_acc / (len(sub_train_)*bptt)

def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch,drop_last=True)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            output_dim = output.shape[-1]
            output = output.view(-1, output_dim)
            cls = cls.view(-1)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / (len(data_)*bptt), acc / (len(data_)*bptt)

In [0]:
N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 99.1%(train)
	Loss: 0.0000(valid)	|	Acc: 89.4%(valid)
Epoch: 2  | time in 0 minutes, 50 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 89.9%(valid)
Epoch: 3  | time in 0 minutes, 50 seconds
	Loss: 0.0000(train)	|	Acc: 99.9%(train)
	Loss: 0.0000(valid)	|	Acc: 86.6%(valid)
Epoch: 4  | time in 0 minutes, 50 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 90.8%(valid)
Epoch: 5  | time in 0 minutes, 50 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 91.8%(valid)
Epoch: 6  | time in 0 minutes, 52 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 93.6%(valid)
Epoch: 7  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 93.9%(valid)
Epoch: 8  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 94.0%(valid)
Ep

In [0]:
torch.save(model.state_dict(),"./model8.pt")

### Bidirectional LSTM (Model 9)

This is testing LSTM for multiple steps predictions. So given the 20 words, it makes 20 predictions.

In [0]:
nv = len(vocab.itos)
nv

34

In [0]:
BATCH_SIZE = 64
nh = 64
nh

64

In [0]:
class BatchNorm1dFlat(nn.BatchNorm1d):
    "`nn.BatchNorm1d`, but first flattens leading dimensions"
    def forward(self, x):
        if x.dim()==2: return super().forward(x)
        *f,l = x.shape
        x = x.contiguous().view(-1,l)
        return super().forward(x).view(*f,l)

In [0]:
class Model9(nn.Module):
    def __init__(self):
        super().__init__()
        self.i_h = nn.Embedding(num_embeddings=nv,embedding_dim = nh)
        self.rnn = nn.LSTM(nh, nh, 2, batch_first=True,bidirectional=True)
        self.h_o = nn.Linear(2*nh,nv)
        self.bn = BatchNorm1dFlat(2*nh)
        self.h = torch.zeros(2*2, BATCH_SIZE, nh)
        self.c = torch.zeros(2*2, BATCH_SIZE, nh)
        
    def forward(self, x):
        res,(h,c) = self.rnn(self.i_h(x), (self.h,self.c))
        return self.h_o(self.bn(res))

In [0]:
model = Model9().to(device)
model

Model9(
  (i_h): Embedding(34, 64)
  (rnn): LSTM(64, 64, num_layers=2, batch_first=True, bidirectional=True)
  (h_o): Linear(in_features=128, out_features=34, bias=True)
  (bn): BatchNorm1dFlat(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [0]:
def generate_batch(batch):
    #print(len(batch))
    text = []
    label = []
    for entry in batch:
        text.append(entry[0].numpy())
        label.append(entry[1].numpy())
    return torch.tensor(text), torch.tensor(label)

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    #print(len(sub_train_))
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch,drop_last=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        #print(cls.shape)
        #print(text.shape)
        output = model(text)
        output_dim = output.shape[-1]
        output = output.view(-1, output_dim)
        cls = cls.view(-1)
        #print(output.shape)
        #print(cls.shape)
        
        loss = criterion(output, cls)
        train_loss += loss.item()
        #print((output.argmax(1) == cls).sum().item())
        train_acc += (output.argmax(1) == cls).sum().item()
        loss.backward()
        optimizer.step()
        

    # Adjust the learning rate
    scheduler.step()

    return train_loss / (len(sub_train_)*bptt), train_acc / (len(sub_train_)*bptt)

def test_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch,drop_last=True)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            #print(output.shape)
            #print(cls.shape)
            output_dim = output.shape[-1]
            output = output.view(-1, output_dim)
            cls = cls.view(-1)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / (len(data_)*bptt), acc / (len(data_)*bptt)

In [0]:
N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

#### Break test sets into 2 data sets validations and test data set

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(train_dataset)
    valid_loss, valid_acc = test_func(valid_dataset)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 99.9%(train)
	Loss: 0.0000(valid)	|	Acc: 86.8%(valid)
Epoch: 2  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 99.9%(train)
	Loss: 0.0000(valid)	|	Acc: 89.3%(valid)
Epoch: 3  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 99.9%(train)
	Loss: 0.0000(valid)	|	Acc: 91.6%(valid)
Epoch: 4  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 93.8%(valid)
Epoch: 5  | time in 0 minutes, 50 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 95.1%(valid)
Epoch: 6  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 93.4%(valid)
Epoch: 7  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 93.7%(valid)
Epoch: 8  | time in 0 minutes, 51 seconds
	Loss: 0.0000(train)	|	Acc: 100.0%(train)
	Loss: 0.0000(valid)	|	Acc: 93.7%(valid)
Epo

In [0]:
train_loss.argmax(1).shape

torch.Size([1260])

In [0]:
train_acc.shape

torch.Size([1260])

In [0]:
torch.save(model.state_dict(),"./model9.pt")