In [1]:
import numpy as np
import torch

words = []
with open("google-10000-english-no-swears.txt", 'r') as file:
    words = file.read().splitlines()
    
print(words[:10])

['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that']


In [2]:
def getSplitData(words):
    
    xs = []
    ys = []
    
    for word in words:
        word = word.lower()
        word = "{{{"+word+"{"

        for i in range(len(word)-3):
            #print(word[i], word[i+1], word[i+2], "-->", word[i+3])
            
            x = [ord(word[i])-97, ord(word[i+1])-97, ord(word[i+2])-97]
            y = [ord(word[i+3])-97]

            xs.append(x)
            ys.append(y)

    # print(xs)
    # print(ys)
    return (xs, ys)
            
data = getSplitData(words[:10000])
xs = data[0]
ys = data[1]

In [3]:
xs = torch.tensor(xs)
ys = torch.tensor(ys)
print(xs.size())
print(ys.size())

torch.Size([75153, 3])
torch.Size([75153, 1])


In [None]:
# 1. first we vector embbed
# 2. then we do first layer and bias
# 3. then we tanh
# 4. then we do 2nd layer and bias and shrink down to just 27 outputs
# 5. then we softmax 
# 6. then we get loss 

#tunable params: embedding_table, w1, b1, w2, b2

embedding_table = torch.randn((27, 2), requires_grad=True)

W1 = torch.randn((6, 100), requires_grad=True)
B1 = torch.randn((100), requires_grad=True)

W2 = torch.randn((100, 27), requires_grad=True)
B2 = torch.randn((27), requires_grad=True)

params = (W1, B1, W2, B2, embedding_table)


for _ in range(5000):

    embedded_data = embedding_table[xs] #look up  for each xs find embedding
    embedded_data = embedded_data.view(embedded_data.shape[0], 6)

    
    layer1 = embedded_data @ W1 + B1
    tan_layer1 = torch.tanh(layer1)
    #print("Tan layer 1", tan_layer1.size())
    
    
    layer2 = tan_layer1 @ W2 + B2
    #print("Layer 2", layer2.size())
    
    #softmax to do the loss: But remember, this is the same as cross-entropy-loss: which is actually more efficient cuz pytorch implemntaiton
    
    #layer2 is basically the logits 
    counts = layer2.exp()
    #we need to make counts the probs
    probs = counts / counts.sum(1, keepdims=True)
    print("probs", probs.size())
    
    
    #loss function here: we basically want the probs predicted for the actual output to be max. 
    # to start: get the mean of the probs predicted for the actual output
    sum = 0
    for i in range(probs.shape[0]):
        actual_output = ys[i]
        predicted_prob_of_actual_output = probs[i][actual_output]
        sum += predicted_prob_of_actual_output
    
    sum = sum / probs.shape[0]
    
    #now we have sum, higher sum is good. but we need to turn into loss where lower loss is better. so we -negativeloglikelyhood
    neg_log_likelihood = -sum.log()

    
    print("Train #", _ , " Loss: ", neg_log_likelihood.item())

    for param in params:
        param.grad = None
        
    neg_log_likelihood.backward()
    
    for param in params:
        param.data += -.1 * param.grad

probs torch.Size([75153, 27])
Train # 0  Loss:  3.0778920650482178
probs torch.Size([75153, 27])
Train # 1  Loss:  2.947154998779297
probs torch.Size([75153, 27])
Train # 2  Loss:  2.833498954772949
probs torch.Size([75153, 27])
Train # 3  Loss:  2.736640691757202
probs torch.Size([75153, 27])
Train # 4  Loss:  2.6558570861816406
probs torch.Size([75153, 27])
Train # 5  Loss:  2.584667921066284
probs torch.Size([75153, 27])
Train # 6  Loss:  2.5112318992614746
probs torch.Size([75153, 27])
Train # 7  Loss:  2.4323761463165283
probs torch.Size([75153, 27])
Train # 8  Loss:  2.367539882659912
probs torch.Size([75153, 27])
Train # 9  Loss:  2.3268682956695557
probs torch.Size([75153, 27])
Train # 10  Loss:  2.2998104095458984
probs torch.Size([75153, 27])
Train # 11  Loss:  2.2795698642730713
probs torch.Size([75153, 27])
Train # 12  Loss:  2.263108253479004
probs torch.Size([75153, 27])
Train # 13  Loss:  2.248861312866211
probs torch.Size([75153, 27])
Train # 14  Loss:  2.23600292205810