In [118]:
import torch
import copy
import random
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Hyperparameters
# NOTE(review): of the values below, only batch_size, block_size and pad_token are
# referenced by the cells visible in this part of the notebook; the others are
# presumably consumed by model/training code elsewhere -- confirm there.
batch_size = 32             # number of rows sampled per call in get_batch()
block_size = 23             # split() keeps the first block_size tokens of each sequence as the input
n_embd = 384
n_heads = 6
n_blocks = 6
dropout_ratio = 0.2
lr = 3e-4
max_iters = 5001
eval_interval = 500
eval_iters = 200
pad_token = 57              # id passed to pad_sequences(); presumably the extra slot counted in vocab_size -- confirm
# Prefer the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's global RNG so random draws are repeatable across runs.
torch.manual_seed(1337)

<torch._C.Generator at 0x10d7ac550>

In [7]:
# EDA
# Read the file once inside a context manager so the handle is closed
# deterministically; the encoding is pinned because the names are encoded as
# UTF-8 further down.
with open("Dataset/names.txt", encoding="utf-8") as names_file:
    text = names_file.read()
vocab = sorted(set(text))
vocab_size = len(vocab) + 1     # 1 refers to pad token
print("-"*80)
print("EDA")
print("-"*80)
print(f"Vocabulary : \n{vocab}\n\nVocab size : {vocab_size}\n")

# One name per line; reuse the text already in memory instead of re-reading the file.
data = text.splitlines()
print(f"First ten samples before shuffling : \n{data[:10]}\n")
# Fixed seed so the shuffle order is reproducible.
random.seed(13377)
random.shuffle(data)
print(f"First ten samples after shuffling : \n{data[:10]}\n")
# Index of the longest name; max() returns the earliest index on ties.
max_ix = max(range(len(data)), key=lambda pos: len(data[pos]), default=0)
print(f"Longest input : {data[max_ix]}\t\tLength : {len(data[max_ix])}\n")
print("-"*80)


# Data preprocessing
# Convert the names into their bytes
# NOTE(review): iterating a bytes object yields integer byte values, whereas the
# vocabulary above is built from characters -- confirm the lookup table used by
# the later encoding step is keyed accordingly.
data = [name.encode('utf-8') for name in data]

--------------------------------------------------------------------------------
EDA
--------------------------------------------------------------------------------
Vocabulary : 
['\n', ' ', '!', '.', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~']

Vocab size : 58

First ten samples before shuffling : 
['~Aacharya.', '~Aadanyan.', '~Aadarshan.', '~Aadesh.', '~Aadhavan.', '~Aadhi.', '~Aadhiraiyan.', '~Aadhirayan.', '~Aadhisan.', '~Aadhithan.']

First ten samples after shuffling : 
['!Charithra.', '!Yarakan.', '!Karunkuzhali.', '~Hariesh.', '~Divahar.', '!Racshana.', '!Shanushana.', '!Rithanyaa.', '!Mythura.', '!Shajeena.']

Longest input : ~Jeyachandraramanthanan.		Length : 24

--------------------------------------------------------------------------------


In [None]:
def encode(data):
    """Replace every entry of ``data``, in place, with a tensor of token ids.

    Each symbol obtained by iterating an entry is looked up in the module-level
    ``stoi`` table (defined outside this cell) and the resulting ids are packed
    into a 1-D ``torch.long`` tensor that overwrites the entry.

    Args:
        data: mutable sequence of iterables whose items are valid ``stoi`` keys.

    Returns:
        None -- ``data`` is modified in place.
    """
    for pos in range(len(data)):
        token_ids = [stoi[symbol] for symbol in data[pos]]
        data[pos] = torch.tensor(token_ids, dtype=torch.long)
        
def decode(ix):
    """Turn an iterable of token ids back into a string.

    Every id is looked up in the module-level ``itos`` table (defined outside
    this cell) and the looked-up pieces are concatenated in order.
    """
    return ''.join(itos[token_id] for token_id in ix)

def pad_sequences(data, pad_token, max_length):
    """Right-pad every 1-D tensor in ``data`` to ``max_length``, in place.

    Args:
        data: mutable sequence of 1-D tensors; entries whose length differs
            from ``max_length`` are replaced by a padded copy.
        pad_token: value written into the padding positions.
        max_length: target length of every sequence.

    Returns:
        None -- ``data`` is modified in place; entries that already have
        ``max_length`` elements are left untouched (same object).

    Note:
        Sequences longer than ``max_length`` are not truncated; the resulting
        negative pad size is rejected by PyTorch.
    """
    for i, name in enumerate(data):
        deficit = max_length - len(name)
        if deficit != 0:
            # new_full() inherits dtype and device from the sequence itself, so the
            # concatenation neither promotes the dtype nor mixes CPU/GPU tensors.
            padding = name.new_full((deficit,), pad_token)
            data[i] = torch.cat((name, padding))

def split(data):
    """Build next-token (input, target) matrices and split them 90/10.

    For every sequence the input is its first ``block_size`` tokens
    (``block_size`` is a module-level setting) and the target is the sequence
    with its first token dropped. The first 90% of the rows (rounded down)
    form the training set, the rest the validation set.

    Args:
        data: sequence of equally sized 1-D tensors.

    Returns:
        Tuple ``(xtr, ytr, xval, yval)`` of stacked 2-D tensors.
    """
    cut = int(0.9 * len(data))
    train_rows, val_rows = data[:cut], data[cut:]
    xtr = torch.stack([row[:block_size] for row in train_rows])
    ytr = torch.stack([row[1:] for row in train_rows])
    xval = torch.stack([row[:block_size] for row in val_rows])
    yval = torch.stack([row[1:] for row in val_rows])
    return xtr, ytr, xval, yval

def get_batch(mode):
    """Sample a random mini-batch of aligned (input, target) rows.

    Args:
        mode: ``"train"`` selects the module-level ``xtr``/``ytr`` tensors;
            any other value selects ``xval``/``yval``.

    Returns:
        Tuple ``(xb, yb)`` holding ``batch_size`` rows drawn (with replacement)
        at the same random indices from the chosen input and target tensors.
    """
    inputs, targets = (xtr, ytr) if mode == "train" else (xval, yval)
    picks = torch.randint(len(inputs), (batch_size,))
    return inputs[picks], targets[picks]

# Turn every name into a tensor of token ids (in place).
encode(data)
# split() uses tokens [0:block_size] as the input and tokens [1:] as the target,
# so the two only line up when every padded row has exactly block_size + 1
# tokens. Deriving the padded length from block_size keeps them in sync
# (this evaluates to 24 with block_size = 23).
pad_sequences(data, pad_token, max_length = block_size + 1)
xtr, ytr, xval, yval = split(data)
print(f"Train data size : {len(xtr)}\n")
print(f"Val data size : {len(xval)}\n")
print("-"*80)

In [175]:
def encode():
    """Return the UTF-8 byte values of every string in the module-level ``names``.

    Note: this definition reuses the name of the ``encode(data)`` helper from an
    earlier cell, so running this cell rebinds ``encode`` to this zero-argument
    version.

    Returns:
        A list with one entry per name; each entry is the list of integer byte
        values of that name's UTF-8 encoding.
    """
    return [list(name.encode('utf-8')) for name in names]


def get_counts(names_enc_data):
    """Count how often each adjacent token pair occurs across all sequences.

    Args:
        names_enc_data: iterable of token sequences.

    Returns:
        Dict mapping ``(left, right)`` pairs to their number of occurrences;
        keys appear in order of first occurrence.
    """
    pair_totals = {}
    for sequence in names_enc_data:
        for pos in range(len(sequence) - 1):
            pair = (sequence[pos], sequence[pos + 1])
            if pair in pair_totals:
                pair_totals[pair] += 1
            else:
                pair_totals[pair] = 1
    return pair_totals
#print(sorted(((v,k) for k,v in counts.items()), reverse = True))


def merge(names_enc_modified, max_pair, ix):
    """Replace each non-overlapping occurrence of ``max_pair`` with token ``ix``.

    Sequences are scanned left to right; whenever two consecutive tokens equal
    ``max_pair[0]`` and ``max_pair[1]`` they are collapsed into the single
    token ``ix``.

    Args:
        names_enc_modified: list of token sequences; each entry is overwritten
            with a freshly built list.
        max_pair: two-element pair of tokens to merge.
        ix: token id that replaces the pair.

    Returns:
        The same outer list object that was passed in, after the rewrite.
    """
    for row, sequence in enumerate(names_enc_modified):
        rebuilt = []
        skip_next = False
        for pos, token in enumerate(sequence):
            if skip_next:
                # This token was the right half of a pair merged one step ago.
                skip_next = False
                continue
            has_successor = pos + 1 < len(sequence)
            if has_successor and token == max_pair[0] and sequence[pos + 1] == max_pair[1]:
                rebuilt.append(ix)
                skip_next = True
            else:
                rebuilt.append(token)
        names_enc_modified[row] = rebuilt
    return names_enc_modified


def create_merges(names_enc_copy):
    """Learn byte-pair merges on the given sequences.

    Runs up to ``num_merges`` rounds (``num_merges`` is a module-level setting).
    Each round counts adjacent pairs with ``get_counts``, picks the most
    frequent one, assigns it the next free id (starting at 256) and rewrites
    the sequences with ``merge``. Progress is printed for every merge.

    Training stops early when no adjacent pair is left (every sequence has at
    most one token), instead of failing on ``max()`` of an empty dict.

    Args:
        names_enc_copy: list of token-id lists; rewritten by ``merge``.

    Returns:
        Tuple ``(sequences, merges)`` where ``merges`` maps each merged pair to
        its new token id, in the order the merges were learned.
    """
    merges = {}
    for i in range(num_merges):
        counts = get_counts(names_enc_copy)
        if not counts:
            # Nothing left to merge: fewer pairs exist than requested merges.
            print(f"no adjacent pairs left; stopping after {i} merges")
            break
        ix = 256 + i
        # On ties, max() keeps the pair that was encountered first.
        max_pair = max(counts, key = counts.get)
        print(f"merging pair {max_pair} into {ix}")
        names_enc_copy = merge(names_enc_copy, max_pair, ix)
        merges[max_pair] = ix
    return names_enc_copy, merges


def prepend_start_token_and_append_end_token(names_enc_copy, first_group_size=18268):
    """Wrap every token list in a start and an end token and convert it to a tensor.

    Entries at positions ``< first_group_size`` get the ``~`` start byte (126);
    all later entries get the ``!`` start byte (33). Every entry gets the ``.``
    end byte (46).

    Args:
        names_enc_copy: list of token-id lists; each entry is overwritten in
            place with a 1-D tensor.
        first_group_size: number of leading entries that receive the ``~``
            start token. Defaults to 18268, the boundary previously hard-coded
            here; pass a different value when the dataset layout changes.

    Returns:
        The same list object that was passed in, now holding tensors.
    """
    tilde_start = 126   # list('~'.encode()) == [126]
    bang_start = 33     # list('!'.encode()) == [33]
    end_token = 46      # list('.'.encode()) == [46]
    for i, b_s in enumerate(names_enc_copy):
        start_token = tilde_start if i < first_group_size else bang_start
        names_enc_copy[i] = torch.tensor([start_token] + b_s + [end_token])
    return names_enc_copy

In [176]:
# Load one name per line and drop the first and the last character of each line.
with open('Dataset/names.txt', 'r') as f:
    names = [line[1:-1] for line in f.read().splitlines()]

print("Encoding data...")
names_enc = encode()
# Target vocabulary size; ids below 256 are raw byte values, so the remainder
# is the number of merges to learn.
vocab_size = 276
num_merges = vocab_size - 256
print("Applying BPE..Creating merges...")
# Train on a deep copy so names_enc keeps the untouched byte sequences.
names_enc_copy, merges = create_merges(copy.deepcopy(names_enc))
print("Prepending and appending start and end tokens...")
names_enc_copy = prepend_start_token_and_append_end_token(names_enc_copy)

Encoding data...
Applying BPE..Creating merges...
merging pair (97, 110) into 256
merging pair (116, 104) into 257
merging pair (115, 104) into 258
merging pair (97, 114) into 259
merging pair (105, 110) into 260
merging pair (97, 257) into 261
merging pair (121, 97) into 262
merging pair (101, 101) into 263
merging pair (105, 257) into 264
merging pair (105, 107) into 265
merging pair (114, 97) into 266
merging pair (104, 97) into 267
merging pair (105, 258) into 268
merging pair (101, 110) into 269
merging pair (97, 118) into 270
merging pair (97, 108) into 271
merging pair (260, 105) into 272
merging pair (97, 109) into 273
merging pair (256, 257) into 274
merging pair (97, 115) into 275
Prepending and appending start and end tokens...


In [177]:
data = names_enc_copy
# Position of the longest encoded name; the earliest one wins on ties.
max_ix = max(range(len(data)), key=lambda pos: len(data[pos]), default=0)
print(f"Longest input : {data[max_ix]}\t\tLength : {len(data[max_ix])}\n")

Longest input : tensor([ 33,  84, 104, 105, 114, 117, 110, 105, 266, 105,  99, 104, 115, 101,
        108, 118, 105,  46])		Length : 18



In [143]:
# Inspect the entry at index 18268, the cut-off index at which
# prepend_start_token_and_append_end_token switches start tokens.
names[18268]

'Aaby'

In [147]:
# Byte value of '~', the start token prepended to entries below the cut-off index.
list('~'.encode())

[126]

In [148]:
# Byte value of '!', the start token prepended to entries at or after the cut-off index.
list('!'.encode())

[33]

In [149]:
# Byte value of '.', the end token appended to every encoded name.
list('.'.encode())

[46]