In [1]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
%matplotlib inline

In [2]:
w1 = "Word"
w2 = "Word2"

In [3]:
"""
w1,w2 -> [[0.2,0.2,0.4]
         [0.1,0.2,0.4]]
"""

'\nw1,w2 -> [[0.2,0.2,0.4]\n         [0.1,0.2,0.4]]\n'

In [4]:
# Neural model

# 1) Map all words from vocab to a real vector of size m
# 2) C matrix -> (len(vocab), m)
# 3) all items in C are trainable

# Probability function
# 1) function g(input sequence of feature vectors(words)) -> maps input sequence to a next possible 
# word using a conditional probability distribution
# output from g -> vector whose ith element estimates probability P(w_t|w_t-1)
# g is the neural network

# Combine both g and matrix C
# 1) function f(sequence_of_vectors(words)) -> g(i, C(w_t-1), C(w_t-n+1))

In [5]:
# Neural model in terms of matrix
"""
y = b + Wx + U * tanh(d + Hx)

x = concat of all input sequence feature vectors(words)
b = biases for W
d = biases for H
W = direct representation matrix
H = hidden layer matrix
U = another hidden to output layer matrix

y = (Wx + b) + (U * tanh(d+Hx))
y =  (1,|V|) +   (1, |V|) 
     
goes to two different models, addition = (1,|V|) + (1, |V|) = (1,|V|)
|V| -> length of vocabuluary

then (1,|V|) -> softmax -> probabilities for each word in vocab
"""

'\ny = b + Wx + U * tanh(d + Hx)\n\nx = concat of all input sequence feature vectors(words)\nb = biases for W\nd = biases for H\nW = direct representation matrix\nH = hidden layer matrix\nU = another hidden to output layer matrix\n\ny = (Wx + b) + (U * tanh(d+Hx))\ny =  (1,|V|) +   (1, |V|) \n     \ngoes to two different models, addition = (1,|V|) + (1, |V|) = (1,|V|)\n|V| -> length of vocabuluary\n\nthen (1,|V|) -> softmax -> probabilities for each word in vocab\n'

In [6]:
# Prep dataset

import re

words = []

with open("dataset.txt","r") as file:
    file_content = file.read()
    file_content = re.split('; |, |\*|\n', file_content)
    file_content = re.split(" ", str(file_content))
    words.extend(list(set(file_content)))

In [7]:
words = words[1:]

In [8]:
len(words)

18988

In [9]:
word_to_i = {}

for i in enumerate(words):
    word_to_i[i[1]] = i[0]

In [10]:
i_to_word = {}
for i in enumerate(words):
    i_to_word[i[0]] = i[1]

In [11]:
# X and Y labels

x = []
y = []
for i in range(len(words)-6):
    first_five = words[i:i+5]
    next = words[i+5:i+6]

    x.append(first_five)
    y.append(next[0])
    #print(first_five)
    #print(next)

In [12]:
# Train test split 
train_x = x[:int(len(x)*0.8)]
test_x = x[len(train_x)-1:]

train_y = y[:int(len(y)*0.8)]
test_y = y[len(train_y)-1:]

In [13]:
len(train_x), len(train_y), len(test_x), len(test_y)

(15185, 15185, 3798, 3798)

In [14]:
# Model config
"""
Hidden units: 50
m: 60
n: 5
direct: yes
"""

'\nHidden units: 50\nm: 60\nn: 5\ndirect: yes\n'

In [155]:
# Model 
# multiple sequence of words as input
feature_vector_len = 60
hidden_units = 50
vocab = len(words)
n = 5

hidden_layer = torch.randn(n*feature_vector_len, hidden_units, requires_grad=True)
U = torch.randn(hidden_units, vocab, requires_grad=True)
direct_layer = torch.randn(n*feature_vector_len, vocab, requires_grad=True)
C = torch.randn(vocab, feature_vector_len, requires_grad=True)
d = torch.tensor(1.0,requires_grad=True)
b = torch.tensor(1.0,requires_grad=True)
optimizer = optim.SGD([C, direct_layer, hidden_layer, U, d,b], lr=0.01, momentum=0.9, weight_decay=1e-5)

print("hidden layer: ", hidden_layer.shape)
print("U layer: ", U.shape)
print("Direct representation layer: ", direct_layer.shape)
print("C matrix: ", C.shape)

hidden layer:  torch.Size([300, 50])
U layer:  torch.Size([50, 18988])
Direct representation layer:  torch.Size([300, 18988])
C matrix:  torch.Size([18988, 60])


In [33]:
# Forward pass

tanh = nn.Tanh()
softmax = nn.Softmax(dim=1)
CLE = nn.CrossEntropyLoss()

def get_feature_vector(word):
    index = word_to_i[word]
    return C[index]
    
print("input sequence: ",train_x[0])
print("next word: ",train_y[0])

feature_vectors = torch.stack([get_feature_vector(word) for word in train_x[0]])
feature_vectors = torch.cat(torch.unbind(feature_vectors), dim=0)
feature_vectors = feature_vectors.view(1,-1)
print("feature vectors:", feature_vectors.shape)

label = get_feature_vector(train_y[0])
print("label: ", label.shape)

# Hidden layer

layer_1_output = torch.matmul(feature_vectors, hidden_layer) + d 
print("\nInput @ Hidden layer")
print("layer 1 output:", layer_1_output.shape)

layer_1_output = tanh(layer_1_output)

# Hidden to output layer
layer_2_output = torch.matmul(layer_1_output, U)
print("\n Output from layer 1 @ Output layer")
print("layer 2 output:", layer_2_output.shape)


# Direct representation

direct_output = torch.matmul(feature_vectors, direct_layer) + b
print("\n Input @ Direct rep")
print("Direct rep output:", direct_output.shape)

# Concat
final_output = layer_2_output + direct_output
print("\nFinal output - layer 2 + direct:", final_output.shape)

# Softmax
prob = softmax(final_output)
print("\nsoftmax output:", prob.shape)

answer = torch.argmax(prob)
print("\nprediction:", i_to_word[answer.item()])

# Loss
loss = CLE(final_output, torch.tensor([word_to_i[train_y[0]]]))
print(loss.item())

# Backward pass
loss.backward()
optimizer.step()

input sequence:  ["sudden',", "'“Then", "'colour", "slow',", "'then?"]
next word:  young.
feature vectors: torch.Size([1, 300])
label:  torch.Size([60])

Input @ Hidden layer
layer 1 output: torch.Size([1, 50])

 Output from layer 1 @ Output layer
layer 2 output: torch.Size([1, 18988])

 Input @ Direct rep
Direct rep output: torch.Size([1, 18988])

Final output - layer 2 + direct: torch.Size([1, 18988])

softmax output: torch.Size([1, 18988])

prediction: nail',
81.305908203125


In [156]:
def get_feature_vector(word):
    index = word_to_i[word]
    return C[index]
    
def get_batch(x,y, size):
    
    batch_x = []
    batch_y = []
    
    # Get list of 50 random indexes
    res = random.sample(range(0, len(x)), size)
    
    for i in res:
        # Get feature vectors for all 5 words, combined into one vector
        feature_vectors = torch.stack([get_feature_vector(word) for word in x[i]])
        feature_vectors = torch.cat(torch.unbind(feature_vectors), dim=0)
        #feature_vectors = feature_vectors.view(1,-1) # [1,n*m]
        batch_x.append(feature_vectors)
        batch_y.append(word_to_i[y[i]])
        
    batch_x = torch.stack(batch_x)
    batch_y = torch.tensor(batch_y)
    # Y label doesn't require feature vectors, loss is calculated directly using index
    return batch_x, batch_y

In [185]:
batch_x, batch_y = get_batch(train_x,train_y,len(train_x))

In [186]:
batch_x.shape, batch_y.shape

(torch.Size([15185, 300]), torch.Size([15185]))

In [193]:
epoch = 10
tanh = nn.Tanh()
softmax = nn.Softmax(dim=1)
CLE = nn.CrossEntropyLoss()

for i in range(epoch):

    loss = 0
    batch_x,batch_y = get_batch(train_x,train_y, 50)

    # Hidden layer
    layer_1_output = torch.matmul(batch_x, hidden_layer) + d
    layer_1_output = tanh(layer_1_output)
    
    # Hidden to output layer
    layer_2_output = torch.matmul(layer_1_output, U)

    # Direct representation
    direct_output = torch.matmul(batch_x, direct_layer) + b

    # Concat
    final_output = layer_2_output + direct_output
    #print(final_output.shape)

    # Loss
    loss = CLE(final_output, batch_y)
    print("Loss:",loss.item())
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

Loss: 69.13443756103516
Loss: 68.22632598876953
Loss: 71.65194702148438
Loss: 70.70344543457031
Loss: 76.0293197631836
Loss: 70.72949981689453
Loss: 70.98185729980469
Loss: 74.07401275634766
Loss: 71.87630462646484
Loss: 73.25398254394531


In [184]:
# Inference

def get_predictions(x,y):
    batch_x,batch_y = get_batch(x,y,len(x))
    
    layer_1_output = torch.matmul(batch_x, hidden_layer) + d
    layer_1_output = tanh(layer_1_output)
    
    # Hidden to output layer
    layer_2_output = torch.matmul(layer_1_output, U)

    # Direct representation
    direct_output = torch.matmul(batch_x, direct_layer) + b

    # Concat
    final_output = layer_2_output + direct_output
    #print(final_output.shape)
    
    # Softmax
    prob = softmax(final_output)
    for i in range(prob.shape[0]):
        print("sequence:", x[i])
        print("answer:", y[i])
        answer = torch.argmax(prob[i])
        print("prediction:", i_to_word[answer.item()])
        print("\n")

get_predictions(train_x[:10], train_y[:10])

sequence: ["sudden',", "'“Then", "'colour", "slow',", "'then?"]
answer: young.
prediction: late,”


sequence: ["'“Then", "'colour", "slow',", "'then?", 'young.']
answer: column,',
prediction: handy


sequence: ["'colour", "slow',", "'then?", 'young.', "column,',"]
answer: “Think
prediction: happy


sequence: ["slow',", "'then?", 'young.', "column,',", '“Think']
answer: 'sunburnt
prediction: warm


sequence: ["'then?", 'young.', "column,',", '“Think', "'sunburnt"]
answer: foretold',
prediction: kindly',


sequence: ['young.', "column,',", '“Think', "'sunburnt", "foretold',"]
answer: ha’
prediction: brow


sequence: ["column,',", '“Think', "'sunburnt", "foretold',", 'ha’']
answer: top-hat
prediction: serving


sequence: ['“Think', "'sunburnt", "foretold',", 'ha’', 'top-hat']
answer: me.
prediction: heartily


sequence: ["'sunburnt", "foretold',", 'ha’', 'top-hat', 'me.']
answer: 'unbuttoned
prediction: wisely,”


sequence: ["foretold',", 'ha’', 'top-hat', 'me.', "'unbuttoned"]
answer: se