#  Naive  Models

Here we experiment with some naive models to compare the performance with transformers.

We try:

1) Bigram Model
2) Average embedding over sequence length model
3) Average embedding + final token embedding

The training data is Paradise Lost.

In [1]:
import torch

#check cuda
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
X = torch.rand(5, 5).cuda()
print(X)

True
1
NVIDIA GeForce RTX 3070 Laptop GPU
tensor([[0.1560, 0.5180, 0.2055, 0.3095, 0.0988],
        [0.6077, 0.3621, 0.9612, 0.8193, 0.2984],
        [0.1394, 0.2113, 0.3076, 0.9631, 0.4894],
        [0.7862, 0.2930, 0.4077, 0.2345, 0.9184],
        [0.7753, 0.2656, 0.9476, 0.4827, 0.8621]], device='cuda:0')


### Tokenization

Let's tokenize paradise lost

In [2]:
with open('paradise_lost.txt') as f:
    text = f.read()
print(len(text))   

456316


In [3]:
# train a BPE tokenizer
import sentencepiece as spm
spm.SentencePieceTrainer.train(input='paradise_lost.txt', model_prefix='bpe_model', vocab_size=5000, model_type='bpe')


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: paradise_lost.txt
  input_format: 
  model_prefix: bpe_model
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  dif

In [4]:
# Load the tokenizer model
sp = spm.SentencePieceProcessor()
tokenizer = sp.load('bpe_model.model')

5 active=1152 piece=▁Ten
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=3900 all=19660 active=1177 piece=▁pit
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=5 min_freq=4
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=3920 all=19682 active=1022 piece=elier
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=3940 all=19706 active=1046 piece=▁Cour
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=3960 all=19702 active=1042 piece=▁Seav
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=3980 all=19703 active=1043 piece=▁edge
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=4000 all=19719 active=1059 piece=▁seav
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=5 min_freq=4
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=4020 all=19747 active=1026 piece=Abdiel
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=4040 all=19775 active=1054 piece=orting
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5 size=4

In [5]:
# Tokenize a sentence
example = "This is a test sentence. My name is Stephen. I like to eat pizza. I am a student at the University of Washington. @#$%^&*()_+"
tokens = sp.encode(example, out_type=str)
print("Tokens:", tokens)

# Detokenize
detokenized_text = sp.decode(tokens)
print("Detokenized:", detokenized_text)

Tokens: ['▁This', '▁is', '▁a', '▁t', 'est', '▁sentence', '.', '▁My', '▁name', '▁is', '▁St', 'eph', 'en', '.', '▁I', '▁like', '▁to', '▁eat', '▁p', 'iz', 'z', 'a', '.', '▁I', '▁am', '▁a', '▁stud', 'ent', '▁at', '▁the', '▁Un', 'ivers', 'ity', '▁of', '▁Was', 'h', 'ing', 't', 'on', '.', '▁', '@#$%^', '&', '*()', '_', '+']
Detokenized: This is a test sentence. My name is Stephen. I like to eat pizza. I am a student at the University of Washington. @#$%^&*()_+


In [6]:
## view the vocab
with open("bpe_model.vocab", "r") as vocab_file:
    for line in vocab_file.readlines()[:10]:  # View first 10 tokens
        print(line.strip())

<unk>	0
<s>	0
</s>	0
th	-0
▁th	-1
▁a	-2
nd	-3
in	-4
re	-5
▁s	-6


### BIGRAM Model

In [7]:
tokenized_text = sp.encode(text)
print(len(tokenized_text))

120377


In [8]:
import random
def sample_batch(num_batches = 32, seq_len = 16):
    # Randomly sample a block of text
    X = torch.zeros(num_batches, seq_len, dtype=torch.long)
    Y = torch.zeros(num_batches, seq_len, dtype=torch.long)

    for i in range(num_batches):
        start_idx = random.randint(0, len(tokenized_text) - seq_len -1)
        end_idx = start_idx + seq_len
        X[i] = torch.tensor(tokenized_text[start_idx:end_idx])
        Y[i] = torch.tensor(tokenized_text[start_idx + 1:end_idx +1])
    return X, Y

sample_batch(num_batches=4)

(tensor([[ 642, 1669,  238,   32,  220, 3055,  294, 4954,  551,  504,   46,  166,
          4954,  112,  177,   95],
         [1350, 4954,  120,  188, 1617, 1245,  687, 4954, 1088, 3920,  426, 1794,
           118, 2483, 1490,  590],
         [ 380,  392, 4993, 4947, 1423,  267,   46,    5, 2735,  229,  406,   53,
           445, 2715, 4954, 1021],
         [2353,  473, 4974,   85,  403, 3021, 1491,   21, 2893, 4875, 1459, 4749,
           166, 4954,   53,  208]]),
 tensor([[1669,  238,   32,  220, 3055,  294, 4954,  551,  504,   46,  166, 4954,
           112,  177,   95,  833],
         [4954,  120,  188, 1617, 1245,  687, 4954, 1088, 3920,  426, 1794,  118,
          2483, 1490,  590,   62],
         [ 392, 4993, 4947, 1423,  267,   46,    5, 2735,  229,  406,   53,  445,
          2715, 4954, 1021,  165],
         [ 473, 4974,   85,  403, 3021, 1491,   21, 2893, 4875, 1459, 4749,  166,
          4954,   53,  208,  117]]))

In [9]:
class BigramModel(torch.nn.Module):
    """ takes a sequence of tokens and predicts the next token """
    def __init__(self, vocab_size, embedding_dim=32):
        super(BigramModel, self).__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, X):
        # B: Batch Size, T: Sequence Length, E: Embedding Dimension, V: Vocabulary Size
        # X : B, T
        X = self.embeddings(X) # B, T, E
        X = X[:, -1]  # B, E (only the last token)
        logits = self.linear(X)  # B, V
        return logits 
model = BigramModel(vocab_size=5000)
X,  y = sample_batch(num_batches=4)
pred = model(X)
print(X.shape, y.shape, pred.shape)


torch.Size([4, 16]) torch.Size([4, 16]) torch.Size([4, 5000])


In [10]:
# write a generate function
# model is untrained so will be nonsense
seq_len = 16
def generate(input_text, model, tokenizer, num_output_tokens=100):
    model = model.to("cpu")
    model.eval()
    with torch.no_grad():
        tokens = sp.encode(input_text)
        for i in range(num_output_tokens):
            X = torch.tensor(tokens[-seq_len:]).reshape(1, -1)
            logits = model(X)
            next_token = torch.argmax(logits, dim=1)
            tokens.append(int(next_token))
    return sp.decode(tokens)

generate("This is a test", model, tokenizer, num_output_tokens=50)


'This is a testilellect wanderleworoyalwe Ark m Decree brut After Sex t laid wherefore though interp Sireipe bidapable Ra y Potentouth wee Wouldun Sireipe bidapable Ra y Potentouth wee Wouldun Sireipe bidapable Ra y Potentouth wee Would'

In [33]:
# train the model
import torch.optim as optim
import torch.nn.functional as F

model = BigramModel(vocab_size=5000, embedding_dim=32).to('cuda')
train_tokens  = tokenized_text[:int(len(tokenized_text) * 0.9)]
test_tokens = tokenized_text[int(len(tokenized_text) * 0.9):]


def eval_validation_set(model, seq_len=16, batch_size=32):
    # evaluate the validation set loss
    model.eval()
    
    running_loss = 0.0
    running_n = 0
    batch_element = 0
    X = torch.zeros(batch_size, seq_len, dtype=torch.long)
    Y = torch.zeros(batch_size, seq_len, dtype=torch.long)

    for start_idx in range(0, len(test_tokens) - seq_len - 1, seq_len):
        end_idx = start_idx + seq_len
        X[batch_element] = torch.tensor(tokenized_text[start_idx:end_idx])
        Y[batch_element] = torch.tensor(tokenized_text[start_idx + 1:end_idx +1])
        batch_element += 1
        if batch_element == batch_size:
            batch_element = 0
            X, Y = X.to('cuda'), Y.to('cuda')
            for token_idx in range(seq_len):
                with torch.no_grad():
                    logits = model(X[:, :token_idx + 1])
                    targets = Y[:, token_idx]
                    loss = F.cross_entropy(logits, targets)
                running_loss+=loss.item()
                running_n += 1
    return running_loss / running_n
            

def train_model(model, num_batches=2000, batch_size=32, seq_len=16, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    running_loss = 0.0
    running_n = 0
    for batch_idx in range(num_batches):
        # train step
        model.train()
        optimizer.zero_grad()
        X, y = sample_batch(num_batches=batch_size, seq_len=seq_len)
        X, y = X.to('cuda'), y.to('cuda')
        for token_idx in range(seq_len):
            logits = model(X[:, :token_idx + 1])
            targets = y[:, token_idx]
            loss = F.cross_entropy(logits, targets)
            loss.backward()
            optimizer.step()
            running_loss+=loss.item()
            running_n += 1
        if (batch_idx+1) % (num_batches//10) == 0:
            print(f"Batch {batch_idx+1:>4} Running Average Loss: {running_loss / running_n:.3f}")
            running_n, running_loss = 0, 0.0
            validation_loss = eval_validation_set(model, seq_len=seq_len, batch_size=batch_size)
            print(f"Validation Loss: {validation_loss:.3f}")


train_model(model, num_batches=2000, batch_size=32, seq_len=16)

Batch  200 Running Average Loss: 6.807
Validation Loss: 6.079
Batch  400 Running Average Loss: 5.802
Validation Loss: 5.628
Batch  600 Running Average Loss: 5.489
Validation Loss: 5.422
Batch  800 Running Average Loss: 5.281
Validation Loss: 5.211
Batch 1000 Running Average Loss: 5.148
Validation Loss: 5.125
Batch 1200 Running Average Loss: 5.042
Validation Loss: 5.024
Batch 1400 Running Average Loss: 4.965
Validation Loss: 4.977
Batch 1600 Running Average Loss: 4.900
Validation Loss: 4.914
Batch 1800 Running Average Loss: 4.850
Validation Loss: 4.835
Batch 2000 Running Average Loss: 4.804
Validation Loss: 4.781


In [34]:
# this just repeats itself as only looking at the previous token
generate("This is a test", model, tokenizer, num_output_tokens=50)

'This is a test, and from the ground, and from the ground, and from the ground, and from the ground, and from the ground, and from the ground, and from the ground, and from the ground, and from the ground, and from the ground'

### Average Sequence Model

In [35]:
class AvgramModel(torch.nn.Module):
    """ takes a sequence of tokens and predicts the next token """
    def __init__(self, vocab_size, embedding_dim=32):
        super(AvgramModel, self).__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, X):
        # B: Batch Size, T: Sequence Length, E: Embedding Dimension, V: Vocabulary Size
        # X : B, T
        X = self.embeddings(X) # B, T, E
        X = X.mean(axis=1)  # B, E (average over the sequence)
        logits = self.linear(X)  # B, V
        return logits 
model = AvgramModel(vocab_size=5000)
X,  y = sample_batch(num_batches=4)
pred = model(X)
print(X.shape, y.shape, pred.shape)

torch.Size([4, 16]) torch.Size([4, 16]) torch.Size([4, 5000])


In [36]:
model = AvgramModel(vocab_size=5000, embedding_dim=32).to('cuda')
train_model(model, num_batches=2000, batch_size=32, seq_len=16)


Batch  200 Running Average Loss: 6.996
Validation Loss: 6.715
Batch  400 Running Average Loss: 6.540
Validation Loss: 6.501
Batch  600 Running Average Loss: 6.412
Validation Loss: 6.401
Batch  800 Running Average Loss: 6.329
Validation Loss: 6.298
Batch 1000 Running Average Loss: 6.246
Validation Loss: 6.237
Batch 1200 Running Average Loss: 6.170
Validation Loss: 6.165
Batch 1400 Running Average Loss: 6.128
Validation Loss: 6.118
Batch 1600 Running Average Loss: 6.087
Validation Loss: 6.062
Batch 1800 Running Average Loss: 6.024
Validation Loss: 6.027
Batch 2000 Running Average Loss: 5.991
Validation Loss: 5.983


In [37]:
# doesnt repeat itself but more nonsense
generate("This is a test", model, tokenizer, num_output_tokens=50)

'This is a test,, and and, and the the place, and the of of _ _ _ _PPus__P_Pooo_etsets’,,, and and the, and the _ _ _ _Adam_Adam_'

In [38]:
# average with residual connection
class AvResidualModel(torch.nn.Module):
    """ takes a sequence of tokens and predicts the next token """
    def __init__(self, vocab_size, embedding_dim=32):
        super(AvResidualModel, self).__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, X):
        # B: Batch Size, T: Sequence Length, E: Embedding Dimension, V: Vocabulary Size
        # X : B, T
        X = self.embeddings(X) # B, T, E
        X = X.mean(axis=1) + X[:, -1, :]  # B, E (average + last)
        logits = self.linear(X)  # B, V
        return logits 
model = AvResidualModel(vocab_size=5000)
X,  y = sample_batch(num_batches=4)
pred = model(X)
print(X.shape, y.shape, pred.shape)

torch.Size([4, 16]) torch.Size([4, 16]) torch.Size([4, 5000])


In [39]:
model = AvResidualModel(vocab_size=5000, embedding_dim=32).to('cuda')
train_model(model, num_batches=2000, batch_size=32, seq_len=16)

Batch  200 Running Average Loss: 6.759
Validation Loss: 6.199
Batch  400 Running Average Loss: 5.922
Validation Loss: 5.754
Batch  600 Running Average Loss: 5.618
Validation Loss: 5.516
Batch  800 Running Average Loss: 5.423
Validation Loss: 5.339
Batch 1000 Running Average Loss: 5.272
Validation Loss: 5.206
Batch 1200 Running Average Loss: 5.185
Validation Loss: 5.118
Batch 1400 Running Average Loss: 5.086
Validation Loss: 5.070
Batch 1600 Running Average Loss: 5.029
Validation Loss: 4.983
Batch 1800 Running Average Loss: 4.968
Validation Loss: 4.946
Batch 2000 Running Average Loss: 4.919
Validation Loss: 4.901


In [40]:
# Small improvementr. Ends in repeatition
generate("This is a test", model, tokenizer, num_output_tokens=50)

'This is a test, and the place of Heav’n, and the place of Heav’n, and the Earth, and the _Adam_ and the _Adam_ with the _Raphael_ _Adam_ with the _Adam_ _Adam_ _Adam_'