In [2]:
import sys, re
import numpy as np
import math

###############################################################################

def preprocess(s):
    """Tokenise a line into a list of string tokens.

    Every run of characters that is not an ASCII letter, digit or apostrophe
    is split off from the surrounding words, runs of spaces are collapsed, and
    the result is split on single spaces.  Two '<BOS>' markers are prepended
    so that the first real token has a full two-token left context.

    Parameters:
        s: the raw input line (leading/trailing whitespace is ignored).

    Returns:
        A list starting with ['<BOS>', '<BOS>'] followed by the tokens.
        Note that a blank line produces a single empty-string token after the
        markers, so callers that read files should skip blank lines.
    """
    # Raw strings: '\g<1>' in a normal literal is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions).
    spaced = re.sub(r"([^a-zA-Z0-9']+)", r" \g<1> ", s.strip())
    return ['<BOS>', '<BOS>'] + re.sub(' +', ' ', spaced).strip().split(' ')

###############################################################################

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Hyper-parameters shared by the model definitions and the training/testing cells.
# Length of each word-embedding vector (passed to the models as embedding_dim).
EMBEDDING_DIM = 4
# Number of preceding tokens used as context (passed to the models as context_size).
# NOTE(review): the "#!!!#" marker appears on the lines that encode the context
# width; presumably these must be changed together -- confirm.
CONTEXT_SIZE = 2 #!!!#
# Width of the tanh hidden layer (passed to the models as hidden_dim).
HIDDEN_DIM = 6

# Bigram Neural Network Model
class BigramNNmodel(nn.Module):
    """Feed-forward neural language model over a fixed-size word context.

    The context word indices are embedded, the embeddings are concatenated,
    passed through a tanh hidden layer and projected (without bias) onto the
    vocabulary; the output is a row of log-probabilities per context.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super().__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        # Embedding table, hidden layer and bias-free output projection.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size, bias=False)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary, one row per context."""
        # x': the context embeddings laid side by side in a single row
        flat_width = self.context_size * self.embedding_dim
        context_vec = self.embeddings(inputs).view(-1, flat_width)
        # h = tanh(W_1.x' + b)
        hidden = torch.tanh(self.linear1(context_vec))
        # y = log_softmax(W_2.h), normalised across the vocabulary axis
        logits = self.linear2(hidden)
        return F.log_softmax(logits, dim=1)

# Trigram Neural Network Model
class TrigramNNmodel(nn.Module):
    """Feed-forward neural language model over a fixed-size word context.

    Looks up an embedding for every context index, joins them into one
    vector, applies a tanh hidden layer and a bias-free output layer, and
    returns log-probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super().__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        # Embedding table, then hidden layer, then output projection (no bias).
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size, bias=False)

    def forward(self, inputs):
        """Map context indices to a (rows x vocab) matrix of log-probabilities."""
        joined_dim = self.context_size * self.embedding_dim
        # x': all context embeddings concatenated into one row per example
        stacked = self.embeddings(inputs).view(-1, joined_dim)
        # h = tanh(W_1.x' + b)
        activated = torch.tanh(self.linear1(stacked))
        # W_2.h followed by log_softmax across the vocabulary axis
        scores = self.linear2(activated)
        return F.log_softmax(scores, dim=1)

## Bigram Model

In [5]:
import sys, re
import numpy as np
import math

# from model import *

###############################################################################

# Train the BigramNNmodel on /content/train.txt and save it, together with the
# index -> word mapping, to 'bigram.lm'.

training_samples = []
vocabulary = set(['<UNK>'])

# Read the whole corpus up front; the context manager closes the file handle.
with open("/content/train.txt", "r") as f:
    train = f.read().split('\n')

for line in train:
    # Splitting on '\n' yields an empty entry for blank/trailing lines;
    # tokenising it would add a bogus empty-string "word" to the vocabulary.
    if not line.strip():
        continue
    print(line)
    tokens = preprocess(line)
    vocabulary.update(tokens)
    training_samples.append(tokens)

# Number the words in sorted order so the word -> index mapping is identical
# on every run (iteration order of a set of strings is not stable).
word2idx = {k: v for v, k in enumerate(sorted(vocabulary))}
idx2word = {v: k for k, v in word2idx.items()}

# Slide a window over every sentence: CONTEXT_SIZE tokens of context predict
# the token that follows them.
x_train = []
y_train = []
for tokens in training_samples:
    for i in range(len(tokens) - CONTEXT_SIZE):
        x_train.append([word2idx[tok] for tok in tokens[i:i + CONTEXT_SIZE]])
        y_train.append([word2idx[tokens[i + CONTEXT_SIZE]]])

# int64 explicitly: NLLLoss needs 64-bit class indices and numpy's default
# integer width is platform dependent.
x_train = np.array(x_train, dtype=np.int64)
y_train = np.array(y_train, dtype=np.int64)

###############################################################################

BATCH_SIZE = 1
NUM_EPOCHS = 10

# Each row is [context indices..., target index].
train_set = np.concatenate((x_train, y_train), axis=1)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE)

loss_function = nn.NLLLoss()
model = BigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(NUM_EPOCHS):
    for data_tensor in train_loader:
        context_tensor = data_tensor[:, :CONTEXT_SIZE]
        target_tensor = data_tensor[:, CONTEXT_SIZE]

        model.zero_grad()

        log_probs = model(context_tensor)
        loss = loss_function(log_probs, target_tensor)

        loss.backward()
        optimiser.step()

    # Reports the loss of the final batch of the epoch, not an epoch average.
    print('Epoch:', epoch, 'loss:', float(loss))

torch.save({'model': model.state_dict(), 'vocab': idx2word}, 'bigram.lm')

print('Model saved.')


are you still here ?
where are you ?
are you tired ?
i am tired .
are you in england ?
were you in mexico ?
Epoch: 0 loss: 2.399266004562378
Epoch: 1 loss: 2.140509605407715
Epoch: 2 loss: 1.8559238910675049
Epoch: 3 loss: 1.531304121017456
Epoch: 4 loss: 1.1923426389694214
Epoch: 5 loss: 0.8930347561836243
Epoch: 6 loss: 0.6651957035064697
Epoch: 7 loss: 0.5060691833496094
Epoch: 8 loss: 0.3975469470024109
Epoch: 9 loss: 0.32211777567863464
Model saved.


## Bigram Testing

In [6]:
import sys, re
import numpy as np
import math

# from feedforward.model import *

###############################################################################

# Score every sentence of /content/test.txt with the saved bigram model and
# print its probability, log-probability and tokens.

blob = torch.load('/content/bigram.lm')
idx2word = blob['vocab']
word2idx = {k: v for v, k in idx2word.items()}
vocabulary = set(idx2word.values())

model = BigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
model.load_state_dict(blob['model'])
model.eval()

# Words that never occurred in training are mapped to the reserved '<UNK>'
# entry (always present in vocabularies built by the training cell) instead of
# raising KeyError.
unk_idx = word2idx['<UNK>']

###############################################################################

BATCH_SIZE = 1

with open("/content/test.txt", "r") as f:
    test = f.read().split('\n')

for line in test:
    # Skip blank entries (e.g. the one produced by a trailing newline).
    if not line.strip():
        continue
    tokens = preprocess(line)
    token_ids = [word2idx.get(tok, unk_idx) for tok in tokens]

    # CONTEXT_SIZE tokens of context predict the token that follows them.
    x_test = []
    y_test = []
    for i in range(len(token_ids) - CONTEXT_SIZE):
        x_test.append(token_ids[i:i + CONTEXT_SIZE])
        y_test.append([token_ids[i + CONTEXT_SIZE]])

    # Explicit shape/dtype so the arrays are well-formed 2-D int64 matrices
    # even when a line yields no (context, target) pair.
    x_test = np.array(x_test, dtype=np.int64).reshape(-1, CONTEXT_SIZE)
    y_test = np.array(y_test, dtype=np.int64).reshape(-1, 1)

    test_set = np.concatenate((x_test, y_test), axis=1)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

    # Accumulate in log space: multiplying many small probabilities can
    # underflow to 0.0, and math.log(0.0) raises ValueError.
    total_log_prob = 0.0
    with torch.no_grad():  # inference only, no gradients needed
        for data_tensor in test_loader:
            context_tensor = data_tensor[:, :CONTEXT_SIZE]
            target_tensor = data_tensor[:, CONTEXT_SIZE]
            log_probs = model(context_tensor)
            # Pick the log-probability of the true next word in every row, so
            # this is correct for any BATCH_SIZE.
            true_log_probs = log_probs.gather(1, target_tensor.unsqueeze(1))
            total_log_prob += float(true_log_probs.sum())

    total_prob = math.exp(total_log_prob)
    print('%.6f\t%.6f\t' % (total_prob, total_log_prob), tokens)
    
    # line = sys.stdin.readline()

0.001799	-6.320797	 ['<BOS>', '<BOS>', 'where', 'are', 'you', '?']
0.006107	-5.098277	 ['<BOS>', '<BOS>', 'were', 'you', 'in', 'england', '?']
0.026388	-3.634832	 ['<BOS>', '<BOS>', 'are', 'you', 'in', 'mexico', '?']
0.000013	-11.245216	 ['<BOS>', '<BOS>', 'i', 'am', 'in', 'mexico', '.']
0.000119	-9.039762	 ['<BOS>', '<BOS>', 'are', 'you', 'still', 'in', 'mexico', '?']


## Trigram Model

In [3]:
import sys, re
import numpy as np
import math

# from model import *

###############################################################################

# Train the TrigramNNmodel on /content/train.txt and save it, together with the
# index -> word mapping, to 'trigram.lm'.

training_samples = []
vocabulary = set(['<UNK>'])

# Read the whole corpus up front; the context manager closes the file handle.
with open("/content/train.txt", "r") as f:
    train = f.read().split('\n')

for line in train:
    # Splitting on '\n' yields an empty entry for blank/trailing lines;
    # tokenising it would add a bogus empty-string "word" to the vocabulary.
    if not line.strip():
        continue
    print(line)
    tokens = preprocess(line)
    vocabulary.update(tokens)
    training_samples.append(tokens)

# Number the words in sorted order so the word -> index mapping is identical
# on every run (iteration order of a set of strings is not stable).
word2idx = {k: v for v, k in enumerate(sorted(vocabulary))}
idx2word = {v: k for k, v in word2idx.items()}

# Slide a window over every sentence: CONTEXT_SIZE tokens of context predict
# the token that follows them.
x_train = []
y_train = []
for tokens in training_samples:
    for i in range(len(tokens) - CONTEXT_SIZE):
        x_train.append([word2idx[tok] for tok in tokens[i:i + CONTEXT_SIZE]])
        y_train.append([word2idx[tokens[i + CONTEXT_SIZE]]])

# int64 explicitly: NLLLoss needs 64-bit class indices and numpy's default
# integer width is platform dependent.
x_train = np.array(x_train, dtype=np.int64)
y_train = np.array(y_train, dtype=np.int64)

###############################################################################

BATCH_SIZE = 1
NUM_EPOCHS = 10

# Each row is [context indices..., target index].
train_set = np.concatenate((x_train, y_train), axis=1)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE)

loss_function = nn.NLLLoss()
model = TrigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(NUM_EPOCHS):
    for data_tensor in train_loader:
        context_tensor = data_tensor[:, :CONTEXT_SIZE]
        target_tensor = data_tensor[:, CONTEXT_SIZE]

        model.zero_grad()

        log_probs = model(context_tensor)
        loss = loss_function(log_probs, target_tensor)

        loss.backward()
        optimiser.step()

    # Reports the loss of the final batch of the epoch, not an epoch average.
    print('Epoch:', epoch, 'loss:', float(loss))

torch.save({'model': model.state_dict(), 'vocab': idx2word}, 'trigram.lm')

print('Model saved.')


are you still here ?
where are you ?
are you tired ?
i am tired .
are you in england ?
were you in mexico ?
Epoch: 0 loss: 2.6104564666748047
Epoch: 1 loss: 2.2813572883605957
Epoch: 2 loss: 1.9093862771987915
Epoch: 3 loss: 1.5117557048797607
Epoch: 4 loss: 1.1313836574554443
Epoch: 5 loss: 0.8182763457298279
Epoch: 6 loss: 0.5949320197105408
Epoch: 7 loss: 0.44837528467178345
Epoch: 8 loss: 0.3548448979854584
Epoch: 9 loss: 0.2945851981639862
Model saved.


## Trigram Testing

In [4]:
import sys, re
import numpy as np
import math

# from feedforward.model import *

###############################################################################

# Score every sentence of /content/test.txt with the saved trigram model and
# print its probability, log-probability and tokens.

blob = torch.load('/content/trigram.lm')
idx2word = blob['vocab']
word2idx = {k: v for v, k in idx2word.items()}
vocabulary = set(idx2word.values())

# The 'trigram.lm' checkpoint is written from a TrigramNNmodel, so rebuild that
# class here (the original instantiated BigramNNmodel, which only worked
# because the two classes have identical layers).
model = TrigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
model.load_state_dict(blob['model'])
model.eval()

# Words that never occurred in training are mapped to the reserved '<UNK>'
# entry (always present in vocabularies built by the training cell) instead of
# raising KeyError.
unk_idx = word2idx['<UNK>']

###############################################################################

BATCH_SIZE = 1

with open("/content/test.txt", "r") as f:
    test = f.read().split('\n')

for line in test:
    # Skip blank entries (e.g. the one produced by a trailing newline).
    if not line.strip():
        continue
    tokens = preprocess(line)
    token_ids = [word2idx.get(tok, unk_idx) for tok in tokens]

    # CONTEXT_SIZE tokens of context predict the token that follows them.
    x_test = []
    y_test = []
    for i in range(len(token_ids) - CONTEXT_SIZE):
        x_test.append(token_ids[i:i + CONTEXT_SIZE])
        y_test.append([token_ids[i + CONTEXT_SIZE]])

    # Explicit shape/dtype so the arrays are well-formed 2-D int64 matrices
    # even when a line yields no (context, target) pair.
    x_test = np.array(x_test, dtype=np.int64).reshape(-1, CONTEXT_SIZE)
    y_test = np.array(y_test, dtype=np.int64).reshape(-1, 1)

    test_set = np.concatenate((x_test, y_test), axis=1)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

    # Accumulate in log space: multiplying many small probabilities can
    # underflow to 0.0, and math.log(0.0) raises ValueError.
    total_log_prob = 0.0
    with torch.no_grad():  # inference only, no gradients needed
        for data_tensor in test_loader:
            context_tensor = data_tensor[:, :CONTEXT_SIZE]
            target_tensor = data_tensor[:, CONTEXT_SIZE]
            log_probs = model(context_tensor)
            # Pick the log-probability of the true next word in every row, so
            # this is correct for any BATCH_SIZE.
            true_log_probs = log_probs.gather(1, target_tensor.unsqueeze(1))
            total_log_prob += float(true_log_probs.sum())

    total_prob = math.exp(total_log_prob)
    print('%.6f\t%.6f\t' % (total_prob, total_log_prob), tokens)
    
    # line = sys.stdin.readline()

0.004271	-5.455930	 ['<BOS>', '<BOS>', 'where', 'are', 'you', '?']
0.005562	-5.191857	 ['<BOS>', '<BOS>', 'were', 'you', 'in', 'england', '?']
0.019459	-3.939468	 ['<BOS>', '<BOS>', 'are', 'you', 'in', 'mexico', '?']
0.000005	-12.184215	 ['<BOS>', '<BOS>', 'i', 'am', 'in', 'mexico', '.']
0.000044	-10.040571	 ['<BOS>', '<BOS>', 'are', 'you', 'still', 'in', 'mexico', '?']
