# Simple Language Model
---

### 1. **Load** and **Process** news

In [7]:
file_path = '../data/dummy.txt'

def process_raw_news(news):
    news = news\
            .replace('\n', '')\
            .lower()
    return news

with open(file_path, 'r') as file:
    raw_news = process_raw_news(file.read())

news = process_raw_news(raw_news)
news[:100]

'nvidia nvda has solidified its position as a leader in the technology industry, especially within th'

### 2. **Extract** vocabulary

In [8]:
vocab = list(set(news.split(' ')))
vocab_size = len(vocab)
vocab_size, vocab[:10] 

(465,
 ['investors',
  'maintaining',
  'developers',
  'infrastructure',
  'blackwell',
  'prospects',
  'than',
  'intelligence',
  '135%',
  'delivered'])

### 3. **Build** *enconder* and *decoder*

In [9]:
encode = lambda ts: [vocab.index(t) for t in ts.split(' ')]
decode = lambda il: ' '.join([vocab[i] for i in il])

In [10]:
import torch

data = torch.tensor(encode(news), dtype=torch.long)
data[:100]

tensor([235,  54, 179, 205,  23,  24,  65, 429, 232, 241, 285, 164, 313,  92,
        456, 285, 385, 435, 289, 459, 385, 304, 179, 458,  57,  65, 433,  65,
        367, 134, 351, 413,  18, 275,  96, 436, 172, 125, 429, 168, 146, 460,
        355, 285, 394, 242, 285, 305,   7, 452, 185, 381, 251, 179, 222, 172,
        109, 414, 285, 249, 424, 269, 346, 411, 285, 374, 215, 237, 127, 339,
         23, 167, 460, 319, 242, 405, 246, 235, 179, 285, 143, 342, 464, 429,
        113, 146, 430, 456, 285,  13, 250, 172, 148,  23, 266, 241, 372, 443,
        417, 226])

In [14]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [12]:
block_size = 8
train_data[:block_size + 1]

tensor([235,  54, 179, 205,  23,  24,  65, 429, 232])

In [13]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t+1]
    target  = y[t]
    print(f'context:{context} -> target:{target}')

context:tensor([235]) -> target:54
context:tensor([235,  54]) -> target:179
context:tensor([235,  54, 179]) -> target:205
context:tensor([235,  54, 179, 205]) -> target:23
context:tensor([235,  54, 179, 205,  23]) -> target:24
context:tensor([235,  54, 179, 205,  23,  24]) -> target:65
context:tensor([235,  54, 179, 205,  23,  24,  65]) -> target:429
context:tensor([235,  54, 179, 205,  23,  24,  65, 429]) -> target:232


In [22]:
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix   = torch.randint(len(data) - block_size, (batch_size,))
    x    = torch.stack([data[i:i+block_size] for i in ix])
    y    = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train_data')
print('train:')
print(xb)
print('validate:')
print(yb)

train:
tensor([[264, 150, 323,  99, 401, 165,   0, 153],
        [ 60,  59, 449, 417, 298, 241, 145, 398],
        [ 17, 363, 357, 111, 416, 276, 241, 306],
        [417, 298, 241, 145, 398,  42, 231, 401]])
validate:
tensor([[150, 323,  99, 401, 165,   0, 153, 342],
        [ 59, 449, 417, 298, 241, 145, 398,  42],
        [363, 357, 111, 416, 276, 241, 306, 242],
        [298, 241, 145, 398,  42, 231, 401, 239]])


In [29]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        logits = self.token_embedding_table(idx)
        
        B, T, C = logits.shape
        logits = logits.view(B*T, C)

        targets = targets.view(B*T)

        loss  = F.cross_entropy(logits, targets)
        return logits, loss
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)


torch.Size([32, 465])
tensor(6.6312, grad_fn=<NllLossBackward0>)
