In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl (288 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.5/288.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01

In [1]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/prakhardixit/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [13]:
from nltk.corpus import brown
import torch.nn.functional as F
import torch

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
sentences = brown.sents()

In [5]:
len(sentences)

57340

In [6]:
# Clean each sentence by removing non-alphabetic tokens
cleaned_sentences = [
    [word.lower() for word in sentence if word.isalpha()]
    for sentence in sentences
]

In [7]:
# Let's create stoi and itos
words = list(sorted(set([word for sentence in cleaned_sentences for word in sentence])))

In [14]:
# words is our vocab
stoi = {}
itos = {} 

for index, word in enumerate(words):
    stoi[word] = index + 1
    itos[index + 1] = word

stoi['.'] = 0 
itos[0] = '.'

In [15]:
vocab_size = len(stoi)
block_size = 5

In [16]:
import random

def build_dataset(sentences):  
  X, Y = [], []
  for sentence in sentences:
    if len(sentence) < 5:
        continue

    context = [0] * block_size
    for word in sentence + ['.']:
      ix = stoi[word]
      X.append(context)
      Y.append(ix)
      # print(' '.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix]

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y
    
random.seed(42)
random.shuffle(cleaned_sentences)
n1 = int(0.8*len(cleaned_sentences))
n2 = int(0.9*len(cleaned_sentences))

Xtr, Ytr = build_dataset(cleaned_sentences[:n1])
Xdev, Ydev = build_dataset(cleaned_sentences[n1:n2])
Xte, Yte = build_dataset(cleaned_sentences[n2:])

torch.Size([814703, 5]) torch.Size([814703])
torch.Size([102509, 5]) torch.Size([102509])
torch.Size([99593, 5]) torch.Size([99593])


In [17]:
g = torch.Generator().manual_seed(2147483646)

In [56]:
# Let's build the embeddings 
n_embed = 50
C = torch.randn((vocab_size, n_embed), generator=g)

In [57]:
C.shape

torch.Size([40235, 50])

In [58]:
# Let's look at the shape of a single input. 
# Each input has 5 words, the embedding of each word is 10 dimensional. Hence each input will be 50 dimensional long. block_size * n_embed

In [59]:
# This is the embedding of the first input in Xtr
C[Xtr[0]].shape

torch.Size([5, 50])

In [60]:
# The network will have 2 hidden layers to begin with. Each hidden layer will have 100 neurons. Output layer will have vocab_size number of neurons
# Number of weights per neuron will be equal to n_embed * block_size

In [65]:
# Let's build the embeddings 
n_embed = 50
C = torch.randn((vocab_size, n_embed), generator=g)

n_hidden_1 = 200
W1 = torch.randn((n_embed * block_size, n_hidden_1), generator=g) * 1 /  (n_embed * block_size)**0.5
B1 = torch.randn((1, n_hidden_1), generator=g) * 0.1

n_hidden_2 = 200 
W2 = torch.randn((n_hidden_1, n_hidden_2), generator=g) * 1 / (n_hidden_1 ** 0.5)
B2 = torch.randn((1, n_hidden_2), generator=g) * 0

n_hidden_3 = 200 
W3 = torch.randn((n_hidden_2, n_hidden_3), generator=g) * 1 / (n_hidden_1 ** 0.5)
B3 = torch.randn((1, n_hidden_3), generator=g) * 0

# Output layer 
n_output = vocab_size
W4 = torch.randn((n_hidden_3, n_output), generator=g) *  1 / (n_hidden_2 ** 0.5)
B4 = torch.randn((1, n_output), generator=g)

parameters = [W1, B1, W2, B2, W3, B3, W4, B4]


# Let's set requires_grad to True for all parameters
for p in parameters: 
    p.requires_grad = True

In [66]:
batch_size = 1000

epoch = 5000
for i in range(epoch):
    # forward pass
    ix = torch.randint(0, Xtr.shape[0], (batch_size, ))
    emb = C[Xtr[ix]]
    embeddings = emb.flatten(start_dim=1, end_dim=-1)

    hpreact_1 = embeddings @ W1 + B1
    h_1 = torch.tanh(hpreact_1)

    hpreact_2 = h_1 @ W2  + B2 
    h_2 = torch.tanh(hpreact_2)

    hpreact_3 = h_2 @ W3  + B3 
    h_3 = torch.tanh(hpreact_3)

    logits = h_3 @ W4 + B4 

    loss = F.cross_entropy(logits, Ytr[ix])
    
    # Backward pass
    # update the grad to None
    for p in parameters: 
        p.grad = None
    loss.backward()
    
    e = 0.1 if i < 500 else 0.05
    
    for p in parameters: 
        p.data -= e * p.grad
    if i % 100 == 0:
        print(f"loss at: {i}: {loss.item()}")


loss at: 0: 10.991805076599121
loss at: 100: 8.808565139770508
loss at: 200: 8.181946754455566
loss at: 300: 7.795853137969971
loss at: 400: 7.579355239868164
loss at: 500: 7.590691089630127
loss at: 600: 7.651228427886963
loss at: 700: 7.4881415367126465
loss at: 800: 7.689975738525391
loss at: 900: 7.489752769470215
loss at: 1000: 7.483675003051758
loss at: 1100: 7.383844375610352
loss at: 1200: 7.465594291687012
loss at: 1300: 7.448939323425293
loss at: 1400: 7.558730602264404
loss at: 1500: 7.315492630004883
loss at: 1600: 7.269989490509033
loss at: 1700: 7.366788864135742
loss at: 1800: 7.061119556427002
loss at: 1900: 7.210980415344238
loss at: 2000: 7.247604846954346
loss at: 2100: 7.379008769989014
loss at: 2200: 7.080563068389893
loss at: 2300: 7.298737525939941
loss at: 2400: 7.426915168762207
loss at: 2500: 7.107969284057617
loss at: 2600: 7.3432536125183105
loss at: 2700: 7.238056659698486
loss at: 2800: 7.119639873504639
loss at: 2900: 7.217613697052002
loss at: 3000: 7.08

In [67]:
@torch.no_grad()
def evaluate_loss(split):
    # Select data based on split
    x, y = {
        'train': (Xtr, Ytr),
        'validation': (Xdev, Ydev),
        'test': (Xte, Yte)
    }[split]

    # Forward pass through the model
    emb = C[x]
    embeddings = emb.flatten(start_dim=1, end_dim=-1)
    
    # Hidden layer 1
    hpreact_1 = embeddings @ W1 + B1
    h_1 = torch.tanh(hpreact_1)
    
    # Hidden layer 2
    hpreact_2 = h_1 @ W2 + B2
    h_2 = torch.tanh(hpreact_2)
    
    # Output layer (logits)
    logits = h_2 @ W3 + B3
    
    # Calculate loss
    loss = F.cross_entropy(logits, y)
    return loss

In [29]:
# print(f"Train loss: {evaluate_loss('train')}")
print(f"Validation loss: {evaluate_loss('validation')}")

KeyboardInterrupt: 

In [68]:
import torch
import torch.nn.functional as F

def generate_sentence(start_sequence, max_length=10, period_idx=None):
    # Ensure the start sequence has exactly 5 tokens
    assert len(start_sequence) == 5, "The start sequence must be of length 5"
    
    # Initialize the generated sentence with the start sequence
    generated_sentence = start_sequence[:]

    # Generate words until max_length is reached
    for _ in range(max_length - len(start_sequence)):
        # Prepare input by embedding the current sequence
        input_tensor = torch.tensor(generated_sentence[-5:]).unsqueeze(0)  # Only use the last 5 tokens
        emb = C[input_tensor]
        embeddings = emb.flatten(start_dim=1, end_dim=-1)  # Flatten to match input shape

        # Forward pass through the model
        hpreact_1 = embeddings @ W1 + B1
        h_1 = torch.tanh(hpreact_1)
        
        hpreact_2 = h_1 @ W2 + B2
        h_2 = torch.tanh(hpreact_2)
        
        logits = h_2 @ W3 + B3

        # Get probabilities for the next word
        probs = F.softmax(logits, dim=-1)

        # Sample the next word from the probability distribution
        next_word_idx = torch.multinomial(probs, num_samples=1).item()

        # Append the next word to the generated sentence
        generated_sentence.append(next_word_idx)

        # Break if the generated word is a period
        if next_word_idx == period_idx:
            break

    return generated_sentence

# Example usage
# Assuming `start_sequence` is a list of 5 starting word indices and `period_idx` is the index of "."
start_sequence = [0, 0, 0, 0, 0]  # Replace with actual indices from your vocabulary
period_idx = 0  # Replace with the actual index of "." in your vocabulary

generated_sentence_indices = generate_sentence(start_sequence, period_idx=period_idx)

# Convert indices back to words if you have a `vocab` list or dictionary
print(' '.join(itos[i] for i in generated_sentence_indices))


. . . . . abscissa abolition abated abernathys abyss


In [34]:
emb = C[torch.tensor([0, 0, 0, 0, 0])]
embeddings.s

torch.Size([5, 30])