In [36]:
import os
import sys
import time
import math
import argparse
from dataclasses import dataclass
from typing import List

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Seed PyTorch's global RNG so the random draws made later are repeatable.
torch.manual_seed(5190)

<torch._C.Generator at 0x7f0a446c3310>

In [2]:
# Mount Google Drive into the Colab filesystem; the dataset path used later
# in this notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Locate the training corpus on the mounted Drive and warn if it is absent.
dataset_dir = '/content/drive/My Drive/'
dataset_name = 'training_dataset.txt'
input_file_path = os.path.join(dataset_dir, dataset_name)

dataset_missing = not os.path.exists(input_file_path)
if dataset_missing:
    print('File not found!')


In [9]:
# Load the whole corpus into memory as a single string.
# The encoding is pinned so that the decoded text (and therefore the
# vocabulary built from it) does not depend on the platform's default locale.
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

length of dataset in characters: 52,780


In [10]:
# Build the vocabulary: every distinct character of the corpus, as a sorted list.
chars = sorted(set(data))
vocab_size = len(chars)

print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

all the unique characters: 
 !"'(),-.2;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65


In [11]:
# Integer -> character lookup, following the sorted order of `chars`.
itoc = dict(enumerate(chars))

# Character -> integer lookup: the exact inverse of `itoc`.
ctoi = {ch: idx for idx, ch in itoc.items()}

In [12]:
# encode the dataset
def encode(text):
  """Map a string to the list of integer ids of its characters.

  Every character is looked up in `ctoi`; a character that is missing from
  that mapping raises KeyError.
  """
  ids = []
  for ch in text:
    ids.append(ctoi[ch])
  return ids


# decode the dataset
def decode(rep):
  """Turn a list of integer ids back into the string they represent.

  Every id is looked up in `itoc` and the resulting characters are joined
  in order.
  """
  return ''.join(itoc[idx] for idx in rep)

In [13]:
# Hold out the final 10% of the corpus for validation; train on the first 90%.
n = len(data)
split_at = int(n*0.9)
train_data, val_data = data[:split_at], data[split_at:]

# Encode each split as a 1-D tensor of int64 character ids.
train_ids, val_ids = (
    torch.tensor(encode(part), dtype=torch.long)
    for part in (train_data, val_data)
)

In [29]:
# Context length: the model takes in the previous `block_size` characters and
# predicts the character that follows them.
block_size = 16


def get_batch(split,batch_size):
  """Sample a random batch of (context, next-character) pairs.

  Args:
    split: 'train' to sample from train_ids; any other value samples from
      val_ids.
    batch_size: number of examples in the batch.

  Returns:
    x: tensor of shape (batch_size, block_size); each row holds the ids of
       block_size consecutive characters.
    y: tensor of shape (batch_size,); y[k] is the id of the character that
       immediately follows row k of x.
  """
  ids = train_ids if split=='train' else val_ids
  # Start offsets are drawn so that start + block_size is still a valid index.
  starts = torch.randint(len(ids)-block_size,(batch_size,))
  # Gather all windows with one advanced-indexing call instead of slicing and
  # stacking sample by sample in Python.
  window = torch.arange(block_size)
  x = ids[starts.unsqueeze(1) + window]
  y = ids[starts + block_size]
  return x,y

In [37]:
# Hyperparameters and RNG seed for the MLP language model defined below.
# NOTE(review): these imports repeat the ones in the notebook's first cell.
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed right before the model is constructed.
torch.manual_seed(5190)

# n_embd is passed to the model as the embedding width, n_embd2 as the width
# of its hidden layer.
n_embd = 80
n_embd2 = 256

class MLP_neural_language(nn.Module):
    """
    Character-level MLP language model.

    The previous block_size token ids are each embedded through a lookup
    table, the embeddings are concatenated into a single vector, and a
    two-layer MLP with a tanh non-linearity scores every vocabulary entry as
    the next token.
    Reference:
    Bengio et al. 2003 https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf
    """

    def __init__(self, block_size,vocab_size,n_embd,n_embd2):
        """
        Args:
            block_size: number of context tokens fed to the model.
            vocab_size: number of distinct token ids.
            n_embd: width of each token embedding.
            n_embd2: width of the hidden layer.
        """
        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size

        # token id -> n_embd-dimensional vector
        self.wte = nn.Embedding(vocab_size, n_embd)

        # concatenated context embeddings -> hidden layer -> vocabulary logits
        hidden = nn.Linear(block_size * n_embd, n_embd2)
        head = nn.Linear(n_embd2, vocab_size)
        self.mlp = nn.Sequential(hidden, nn.Tanh(), head)

    def get_block_size(self):
        """Return the context length the model was built with."""
        return self.block_size

    def forward(self, inputs_idx, targets=None):
        """
        Score the next token for each row of context ids.

        Args:
            inputs_idx: integer tensor of token ids; everything after the
                first dimension is flattened into one feature vector.
            targets: optional tensor of ground-truth next-token ids.

        Returns:
            (logits, loss): loss is the cross-entropy against targets, or
            None when no targets are given.
        """
        embedded = self.wte(inputs_idx)
        # concatenate the per-token embeddings of every row
        n_rows = embedded.shape[0]
        flat = embedded.reshape(n_rows, -1)
        logits = self.mlp(flat)

        if targets is None:
            return logits, None

        n_classes = logits.size(-1)
        loss = F.cross_entropy(logits.view(-1, n_classes), targets.view(-1))
        return logits, loss


# Build the model from the hyperparameters chosen above.
m = MLP_neural_language(
    block_size=block_size,
    vocab_size=vocab_size,
    n_embd=n_embd,
    n_embd2=n_embd2,
)

In [38]:
# A simple training loop: every "epoch" is num_batches randomly sampled batches.
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell also runs on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
m = m.to(device)
batch_size = 4096
num_epochs = 30
num_batches = 256
optimizer = torch.optim.Adam(m.parameters(),lr=1e-3)
# NOTE(review): only the training loss is tracked here; val_ids is not
# evaluated anywhere in this loop.
for epoch in range(num_epochs):
  epoch_loss = 0.0
  for step in range(num_batches):
    optimizer.zero_grad(set_to_none=True)

    # sample a batch of data and move it to the model's device
    xb,yb = get_batch('train', batch_size)
    xb = xb.to(device)
    yb = yb.to(device)

    # evaluate the loss
    logits, loss = m(xb, yb)

    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()

  # mean training loss over this epoch's batches
  print(epoch_loss/num_batches)

1.5562942894175649
0.31910447680274956
0.06418634732835926
0.025690279879199807
0.01433028594692587
0.009261041295758332
0.006516140361782163
0.004856060912970861
0.003729683976416709
0.0029881180789743667
0.002427669969620183
0.002013767864809779
0.0017262325422962022
0.0014298174492068938
0.0012763875752170861
0.0011215933318453608
0.0010130664668395184
0.0008864019132488465
0.000802357483621563
0.02412743557624708
0.008847959074500977
0.0012301118856612447
0.0009018886726153141
0.0007987368994690769
0.0006858012859538576
0.0006294185532169649
0.0005732847138801844
0.0005094436909871547
0.00048298538530389123
0.00047452504338707513


In [41]:
# Generation
# Starting from the first block_size characters of the validation set, sample
# `limit` further characters from the trained model, one at a time.

m = m.to('cpu')
m.eval()
limit = 2000
# the first block_size number of characters in val_ids
context = val_ids[:block_size].tolist()
# Copy the seed so that appending generated ids never mutates `context`.
generation = list(context)

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
  for step in range(limit):
    # context is a Python list, but the model expects a tensor of shape
    # (1, block_size)
    context_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(0)

    # run the model's own forward pass; without targets the loss is None
    logits = m(context_tensor)[0]

    # sample the next id from the predicted distribution
    probs = F.softmax(logits, dim=1).squeeze(0)
    next_token = torch.multinomial(probs, 1).item()
    generation.append(next_token)

    # slide the window: keep only the most recent block_size ids
    context = generation[-block_size:]


In [42]:
# Turn the sampled ids back into text and show what the model wrote.
generated_text = decode(generation)
print(generated_text)

PIO

It sounds like Han.

LANDO

(in the gilll. seru!

LANDO

Wh, wha beailes.

Thed Pre't sher.

REETSON

Come in!  Lake a chat ore of this

pithe pederays woll pearing andreq?

HAN

Youh, into That boply nowllly.

Ches I mand that'm  nit hemp,rie.

HAN

Prenger you got het your stoung

You be af, shute

This I din't no we

mige powe the go for beed?

LEIA

(takrend)

Okhan tra ais on!  We tuste quatong the kto

that me to stom the Ragy that ife

here in thit palind got to peat're fou donto the

mige ang the erve.  Monot manter

for the mandread wian whave are tareera

sight meno ght peacr.

LEIA

You po as unt ingat.  We're flint hes itt ourd

inet the fore weread I'm know.

YODA

Sou dout hime to falle can the

fack in are wirle, IM

the sereca dount wo lhit tha gore.

THREEPIO

Well, in the stams fecl is.

HAN

Mome Jes are thing, I mone herm?

ANDO

Com ion, I winl ie tiat in minlt

ave there it lo do the

areatere fore.  now manter.

LEIA

HAN

Year?

YODA

We't s otiy u tan wit 