In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from IPython.display import display, Markdown
import tiktoken
import wandb
import os
import random
import pickle
%matplotlib inline

In [22]:
train_files = os.listdir("../1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled")
test_files = os.listdir("../1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled")

len(train_files), len(test_files)

(100, 50)

In [5]:
train_files[0]

'news.en-00041-of-00100'

In [23]:
train_data, test_data = [], []

for train_file in train_files:
    train_file_path = os.path.join("../1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled", train_file)

    with open(train_file_path) as file:
        train_data_tmp = file.read().splitlines()
        train_data.extend(train_data_tmp)

for test_file in test_files:
    test_file_path = os.path.join("../1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled", test_file)

    with open(test_file_path) as file:
        test_data_tmp = file.read().splitlines()
        test_data.extend(test_data_tmp)

len(train_data), len(test_data)

(30607716, 306688)

In [24]:
train_data = [i.strip() for i in train_data]
test_data = [i.strip() for i in test_data]
train_data = [i for i in train_data if len(i) > 0]
test_data = [i for i in test_data if len(i) > 0]
len(train_data), len(test_data)

(30607716, 306688)

In [25]:
# shuffle this headlines and then join them
random.seed(42)
random.shuffle(train_data)
random.shuffle(test_data)

In [None]:
split_ratio = 0.8
split_idx = int(len(train_data) * split_ratio)
val_data = train_data[split_idx:]
train_data = train_data[:split_idx]

len(train_data), len(val_data)

(24486172, 6121544)

In [28]:
train_str = "\n".join(train_data)
val_str = "\n".join(val_data)
test_str = "\n".join(test_data)

len(train_str), len(val_str), len(test_str)

(3350678828, 837563352, 42010908)

In [8]:
tokenizer = tiktoken.encoding_for_model("gpt-2")

In [30]:
train_tokens = tokenizer.encode(train_str)
val_tokens = tokenizer.encode(val_str)
test_tokens = tokenizer.encode(test_str)

len(train_tokens), len(val_tokens), len(test_tokens)

(721763228, 180428249, 9050773)

In [32]:
len(train_tokens) + len(val_tokens) + len(test_tokens)

911242250

In [33]:
with open('../1-billion-word-language-modeling-benchmark-r13output/train.pkl', 'wb') as file:
    pickle.dump(train_tokens, file)

with open('../1-billion-word-language-modeling-benchmark-r13output/val.pkl', 'wb') as file:
    pickle.dump(val_tokens, file)

with open('../1-billion-word-language-modeling-benchmark-r13output/test.pkl', 'wb') as file:
    pickle.dump(test_tokens, file)

In [2]:
# Read it back
with open('../1-billion-word-language-modeling-benchmark-r13output/train.pkl', 'rb') as file:
    train_tokens = pickle.load(file)

with open('../1-billion-word-language-modeling-benchmark-r13output/val.pkl', 'rb') as file:
    val_tokens = pickle.load(file)

with open('../1-billion-word-language-modeling-benchmark-r13output/test.pkl', 'rb') as file:
    test_tokens = pickle.load(file)

len(train_tokens), len(val_tokens), len(test_tokens)


(721763228, 180428249, 9050773)

In [3]:
# create the model

class MHA(nn.Module):
    def __init__(self, emb_dim, block_size, n_heads, head_dim, dropout):
        super().__init__()

        self.n_heads = n_heads
        self.head_dim = head_dim

        # 1st LayerNorm
        self.ln1 = nn.LayerNorm(emb_dim)

        # first Linear to get from emb_dim --> 3 * n_heads*head_dim, to get k,q,v, then proj back to emb_dim
        self.c_proj = nn.Linear(emb_dim, 3 * n_heads * head_dim, bias=False)
        self.proj = nn.Linear(n_heads * head_dim, emb_dim)

        # 2nd LayerNorm
        self.ln2 = nn.LayerNorm(emb_dim)

        # finally thinking layer
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(4 * emb_dim, emb_dim)
        )

        self.dropout1 = nn.Dropout(dropout)

        # finally register the tril matrix
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))

    def forward(self, x):
        # get the shape
        B, T, C = x.shape

        # Layer norm
        ln_x = self.ln1(x)

        # Project and extract k,q,v
        c = self.c_proj(ln_x) # (B,T,C)  --> (B,T,3*nh*H)
        c = c.view(B, T, self.n_heads, 3 * self.head_dim) # (B,T,nh,3*H)
        k, q, v = torch.split(c, self.head_dim, dim=-1) # each of shape B,T,nh,H
        k, q, v = k.transpose(-3, -2), q.transpose(-3, -2), v.transpose(-3, -2) # B, nh, T, H

        # Get the attention weights
        wei = q @ k.transpose(-2, -1) * (self.head_dim**-0.50) # (B,nh,T,H) @ (B,nh,H,T) -> (B,nh,T,T)
        wei = wei.masked_fill(self.mask[:, :, :T, :T] == 0, -float("inf"))
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout1(wei)

        # Apply to v
        act = wei @ v # (B,nh,T,T) @ (B,nh,T,H) -> (B,nh,T,H)
        act = act.transpose(-3, -2) # B,T,nh,H
        act = act.contiguous().view(B, T, self.n_heads * self.head_dim)

        # Transform to emb_dim and skip connection
        act = self.proj(act) # (B, T,C)
        act = x + act

        # Think and skip connections
        ln_act = self.ln2(act)
        out = self.ffn(ln_act) # (B,T,C)
        out = x + out # x shape (B,T,C)

        return out


class NanoGPT(nn.Module):
    def __init__(self, vocab_size, block_size, emb_dim, n_layers, n_heads, head_dim, dropout, device):
        super().__init__()

        # helper variables
        self.block_size = block_size
        self.device = device

        # Embedding lookup table
        self.token_embbeding_table = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding_table = nn.Embedding(block_size, emb_dim)

        # MHA head
        self.MHA = nn.Sequential(*[MHA(emb_dim, block_size, n_heads, head_dim, dropout) for _ in range(n_layers)])

        # Layernorm
        self.ln = nn.LayerNorm(emb_dim)

        # final linear layer
        self.lm_layer = nn.Linear(emb_dim, vocab_size)

        # init weights
        self.apply(self._init_weights)

        print(f"Number of parameters: {sum([p.numel() for p in self.parameters()])}")

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, x, targets=None):
        # x shape (B, T)
        B, T = x.shape

        token_emb = self.token_embbeding_table(x)
        pos_emb = self.position_embedding_table(torch.arange(0, T).to(self.device))
        emb = token_emb + pos_emb

        emb = self.MHA(emb)
        emb = self.ln(emb)
        logits = self.lm_layer(emb) # (B, T, V)

        loss = None

        if targets is not None:
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))

        return logits, loss

    def generate(self, max_tokens=1000):
        with torch.no_grad():
            cur_window, idx_list = torch.LongTensor([[0]]).to(self.device), [0] # (1, 1)

            for i in range(max_tokens):
                cur_window = cur_window[:, -self.block_size:] # (1, B)
                logits, _ = self.forward(cur_window) # (1,B,V)
                probs = torch.softmax(logits, dim=-1).squeeze(dim=0) # (B,V)
                idx = torch.multinomial(probs, num_samples=1, replacement=True)[-1].item()
                cur_window = torch.concat([cur_window, torch.LongTensor([[idx]]).view(1, 1).to(self.device)], dim=-1)
                idx_list.append(idx)

            generated_text = tokenizer.decode(idx_list)

            return generated_text

In [4]:
def get_batch(tokens, block_size, batch_size):
    batch = torch.randint(0, len(tokens)-block_size, (batch_size,)) # B dimension array of random indices
    Xb = torch.stack([torch.LongTensor(tokens[i:i+block_size]) for i in batch], dim=0) # Create (B, T) dimension array
    yb = torch.stack([torch.LongTensor(tokens[i+1:i+block_size+1]) for i in batch], dim=0) # Create (B, T) dimension array
    return Xb, yb

In [5]:
@torch.no_grad()
def compute_loss(tokens, block_size, batch_size, model, device):
    loss_values = []
    for _ in range(100):
        Xb, yb = get_batch(tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        _, loss = model(Xb, yb)
        loss_values.append(loss.item())

    mean_loss = torch.FloatTensor(loss_values).mean().item()
    return mean_loss

In [10]:
def train(train_tokens, val_tokens, model, optimizer, device, block_size, batch_size, n_iters, eval_interval):
    train_lossi, val_lossi = [], []

    for i in range(n_iters):
        model.train()
        Xb, yb = get_batch(train_tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        # forward
        _, loss = model(Xb, yb)

        # set grads to zero
        optimizer.zero_grad(set_to_none=True)

        # do backward
        loss.backward()

        # optimizer step
        optimizer.step()

        if (i % eval_interval == 0) or (i == n_iters - 1):
            model.eval()
            train_loss = compute_loss(train_tokens, block_size, batch_size, model, device)
            val_loss = compute_loss(val_tokens, block_size, batch_size, model, device)

            train_lossi.append(train_loss)
            val_lossi.append(val_loss)
            print(f"Step {i:4d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


             # log metrics to wandb
            # wandb.log({"train_loss": train_loss, "val_loss": val_loss})
        # break

    return train_lossi, val_lossi

In [15]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
n_iters = 5000
eval_interval = n_iters//10
lr = 3e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
emb_dim = 192
n_heads = 6
head_dim = emb_dim // n_heads
n_layers = 3
dropout = 0.2
vocab_size = tokenizer.n_vocab

In [16]:
model = NanoGPT(emb_dim=emb_dim, vocab_size=vocab_size, block_size=block_size, n_heads=n_heads,\
                 n_layers=n_layers, head_dim=head_dim, device=device, dropout=dropout)
model = model.to(device)

Number of parameters: 20706769


In [17]:
optimizer = optim.AdamW(model.parameters(), lr=lr)

In [18]:
train_lossi, val_lossi = train(train_tokens=train_tokens, val_tokens=val_tokens, model=model, optimizer=optimizer,\
      device=device, block_size=block_size, batch_size=batch_size, n_iters=n_iters, eval_interval=eval_interval)

Step    0 | Train Loss: 10.6372 | Val Loss: 10.6377
Step  500 | Train Loss: 6.0246 | Val Loss: 6.0152
Step 1000 | Train Loss: 5.6117 | Val Loss: 5.6094
Step 1500 | Train Loss: 5.3612 | Val Loss: 5.3678
Step 2000 | Train Loss: 5.1962 | Val Loss: 5.2095
Step 2500 | Train Loss: 5.0789 | Val Loss: 5.0804
Step 3000 | Train Loss: 4.9879 | Val Loss: 4.9864
Step 3500 | Train Loss: 4.9144 | Val Loss: 4.9170
Step 4000 | Train Loss: 4.8485 | Val Loss: 4.8548
Step 4500 | Train Loss: 4.8007 | Val Loss: 4.7980
Step 4999 | Train Loss: 4.7613 | Val Loss: 4.7592


In [19]:
model.eval();

In [21]:
txt = "\n\n".join(model.generate().split("\n"))
display(Markdown(txt))

! this year - and there will be .

We 're keen to catch up with scuttles we play here in Sochi with some big games , " said Professor Hudore , vice presidents of the statistical virtueside .

" We 're not quick hit my own first season at 200 the time .

In the meantime , the bank could provide a $ 10 billion increase to a valuable $ 8m-a pound while Barclays had to reduce as a result in trading years on Sept .

The successful seemingly ruined , at the Communist Party 5pp is near the same antiaogical symbol 's " cloud of light-effectively self-ciplined " revolution in flux .

IF Ind .

S Markets is established by CDSAI , which was posted in the Investor Relations online on heart failure at home to Cumbria landters , EC1 rans / enterprise by IBM & needs ( approximately 2 to 4.44 dollars ) , will represent the corporate legislature , including the Government 's state .

The man may be seen as a smart ones from his normal life , arriving for a small to become a senior person .

INMANYT : The ocrano Horse is one of the Fourth Leave Le Davis Cup of display , a " English studio in the Olympic cricket , " and a lion 's personality , an admire plastic Dutch icon , named Linda Sndy 's second -- followed a coin-like walk .

I don 't have the president a piece of there via a complaint by irrigation services that g Hashumco , Jacques Nagija Panhandle Scotkagojans , an Marines divport specializing in art throughout the flight within Watchers .

The reduction of those polled reduce its rating divided through the Annual Report on 718 million euros ( 97,000 REidders listed via a compensation exchange ) with the authorities , WSIFAA ( WHO ) .

Lyllateovich and the Maxim magazine Hand Fort Worth ( A. Zhangande and Morgan Zhaapova compound dispat May 2005 , in June .

" If we put our aspirations at the meeting at the end of this country , the country is China has it , " Mr Roberts said .

But property sales at more public activity were purchase because it had also been be moved towards recession in 2011 by 2015 , according to Iranian-born U.S. Local Court .

Just 3 percent of cases of homesteaders and mighty farmers go under the blink of a little complex .

Edcio , a congressman and Didier Menz pulled out of the arm ?

Joweiness and subscriber waals refusing a hearts by parents alike , the board finds a factor in that , as an explanation after the ceremony were destined for the struggles it had been .

After allowing farewell to Menciers who tried to be one person on their differences -- were part of the book before the transport market , but not anyone in federal remotely-connected dolls by influential veteran designers .

Our practice rates are muted by PGA 's current ratio must be the energy can be sent to Securities and Exchange Commission and the publisher .

They and Basically thieves are forced to buy many authorities .

But aimed at rush to pay overspersed under the inclined enjoyment of electrical conditions and gastroenter sauce orlees , perhaps in fine exchange than two leading whites .

Carbon fell on each day after missed a bench at 18 by taking a 55-43 3-6 in the fourth centuries ago , his first record was 14-yard to win in the final 25 minutes left by 67 .

So far , Posada or Milliter have two other groups above 500,000 more of the energy line of success , it saw its decision hike from the seizure of the rebuilding capacity to the flight collection .

( AP ) - Rick Reynolds announced a baby-in-lawingel on Tuesday as he came defense party Superceived statewide in nine months but to raise her far to the second yard of a row -- not in Thailand .

The latter was in 46 hospital during a year after hearing .

This will hand runs a flash open game by posting the selfish spots .

The US ambassador of the Eye Service ( AA ) allegedly found dead as the preception of a seizureogen of a resettlement tool in developing countries who have a cutcode on the sky. were also available to www.illcations / family insider .

Or worth , more easily cluttered with tired fitness , so why will cause the sound with everyday frol tissue , but by mosterk cats : an economics of lifestyle and love ?

( AP ) - incumbent Steve Smarter caught on the year in questionable brilliant Candne Brown took advantage within the Milwaukee Vikings to the top 11 victory over the season , he puts a three-game winning streak of a nine-game eighth wrist injury and disappeared in the third quarter .

Two lines went on Sunday night and the bond scandal hurt by the San Francisco troops being crushed