In [None]:
# IMPORT LIBRARIES

import torch
import torch.nn as nn
from torch.nn import functional as F
import os

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cpu


# Baseline Bigram Model

## Dataset

In [None]:
# Let's start with a dataset, we will use sp500_earnings_transcripts which contains thousands of real corporate earnings call.

# 1. Install the Hugging Face datasets library
!pip install datasets

from datasets import load_dataset
print("Downloading full S&P 500 earnings transcripts...")

# 2. Load the proper text-based dataset
dataset = load_dataset("Bose345/sp500_earnings_transcripts", split="train")

# 3. Extract the 'content' column (the full raw transcript text)
# Pulling 45 full transcripts for a sizeable dataset
transcripts = dataset['content'][:45]
text = "\n\n--- NEXT EARNINGS CALL ---\n\n".join(transcripts)

# 4. Save it locally as input.txt
with open('input.txt', 'w', encoding='utf-8') as f:
    f.write(text)

file_size_mb = os.path.getsize('input.txt') / (1024 * 1024)
print(f"File saved. Size: {file_size_mb:.2f} MB")
print(f"Length of dataset in characters: {len(text)}")

Downloading full S&P 500 earnings transcripts...
File saved. Size: 2.30 MB
Length of dataset in characters: 2404755


In [None]:
# Load the saved corpus back from disk and preview its first 1000 characters
with open('input.txt', encoding='utf-8') as f:
    text = f.read()
print(text[:1000])

Operator: Good afternoon, and welcome to the Agilent Technologies Fourth Quarter Earnings Conference Call. All lines have been placed on mute to prevent any background noise. After the speakers' remarks, there will be a question-and-answer session. [Operator Instructions] Thank you. And now, I'd like to introduce you to the host for today's conference, Ankur Dhingra, Vice President of Investor Relations. Sir, please go ahead.
Ankur Dhingra: Thank you, and welcome everyone to Agilent's fourth quarter and full-year conference call for fiscal year 2020. With me are Mike McMullen, Agilent's President and CEO; and Bob McMahon, Agilent's Senior Vice President and CFO. Joining in the Q&A after Bob's comments will be: Jacob Thaysen, President of Agilent's Life Sciences & Applied Markets Group; Sam Raha, President of Agilent's Diagnostics and Genomics Group; and Padraig McDonnell, President of Agilent CrossLab Group. This presentation is being webcast live. The news release, Investor presentati

**NOTE:**
 Keeping speaker labels like "Operator" and "Ankur Dhingra" teaches the model the structural flow of a corporate dialogue, making text generation more realistic and contextually aware. A `--- NEXT EARNINGS CALL ---` string separates consecutive calls and acts as an end-of-text marker, so the model can learn that the previous conversation has ended and a brand-new one is beginning.

## Tokenization

In [None]:
# 1. Get all unique characters that occur in this text (The Vocabulary)
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")
print(f"Vocabulary: {''.join(chars)}")

# 2. Create the Tokenizer mapping
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> character


def encode(s):
    """Encoder: convert a string into a list of integer token ids.

    Raises KeyError if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: convert a list of integer token ids back into a string.

    Raises KeyError if `l` contains an id that is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# 3. Test the tokenizer
test_string = "Q3 revenue increased"
encoded_str = encode(test_string)
print(f"\nEncoded '{test_string}': {encoded_str}")
print(f"Decoded back: {decode(encoded_str)}")

Vocabulary size: 94
Vocabulary: 
 !"#$%&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyzÉàè–—‘’“”…⁠

Encoded 'Q3 revenue increased': [45, 19, 1, 74, 61, 78, 61, 70, 77, 61, 1, 65, 70, 59, 74, 61, 57, 75, 61, 60]
Decoded back: Q3 revenue increased


Tokenization simply converts raw text as a string to some sequence of integers according to some vocabulary of possible elements. Here we have 94 possible elements (including spaces). Each unique element is converted to a unique integer.

In [None]:
# Encode the entire corpus and wrap the token ids in a 1-D torch.long tensor: this is the data tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([2404755]) torch.int64
tensor([43, 72, 61, 74, 57, 76, 71, 74, 26,  1, 35, 71, 71, 60,  1, 57, 62, 76,
        61, 74, 70, 71, 71, 70, 12,  1, 57, 70, 60,  1, 79, 61, 68, 59, 71, 69,
        61,  1, 76, 71,  1, 76, 64, 61,  1, 29, 63, 65, 68, 61, 70, 76,  1, 48,
        61, 59, 64, 70, 71, 68, 71, 63, 65, 61, 75,  1, 34, 71, 77, 74, 76, 64,
         1, 45, 77, 57, 74, 76, 61, 74,  1, 33, 57, 74, 70, 65, 70, 63, 75,  1,
        31, 71, 70, 62, 61, 74, 61, 70, 59, 61,  1, 31, 57, 68, 68, 14,  1, 29,
        68, 68,  1, 68, 65, 70, 61, 75,  1, 64, 57, 78, 61,  1, 58, 61, 61, 70,
         1, 72, 68, 57, 59, 61, 60,  1, 71, 70,  1, 69, 77, 76, 61,  1, 76, 71,
         1, 72, 74, 61, 78, 61, 70, 76,  1, 57, 70, 81,  1, 58, 57, 59, 67, 63,
        74, 71, 77, 70, 60,  1, 70, 71, 65, 75, 61, 14,  1, 29, 62, 76, 61, 74,
         1, 76, 64, 61,  1, 75, 72, 61, 57, 67, 61, 74, 75,  8,  1, 74, 61, 69,
        57, 74, 67, 75, 12,  1, 76, 64, 61, 74, 61,  1, 79, 65, 68, 68,  1, 58,
      

## Train/val split

In [None]:
# Train on the first 90% of the tokens and hold out the final 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print(f"Training data length: {len(train_data):,} tokens")
print(f"Validation data length: {len(val_data):,} tokens")

Training data length: 2,164,279 tokens
Validation data length: 240,476 tokens


## Data Loader

We don't feed the Transformer the entire dataset all at once, as that would be computationally expensive. Instead, we sample small chunks and train on them one at a time; the maximum chunk length is the block size (also called the context window). Each chunk lets us train on every context length from 1 all the way up to the block size. For example, if the block size is 8, a chunk of 9 characters contains 8 different predictions to train on.

During generation, once the context reaches the block size, the Transformer starts truncating it, dropping the oldest tokens to accommodate its most recent prediction as part of the new context. It never receives more than block size tokens of input when predicting the next character.

Every time we feed text into a Transformer, we feed many chunks at once: a batch of multiple chunks stacked into a single tensor, for efficiency and parallelization.

How many independent sequences will we process in parallel?
Input size = [batch size, context window]

In [None]:
block_size = 8
x = train_data[:block_size]        # inputs: the first block_size tokens
y = train_data[1:block_size+1]     # targets: the same window shifted one token ahead
# A single chunk packs block_size training examples, one per context length
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context}, the target is {target}.")

When input is tensor([43]), the target is 72.
When input is tensor([43, 72]), the target is 61.
When input is tensor([43, 72, 61]), the target is 74.
When input is tensor([43, 72, 61, 74]), the target is 57.
When input is tensor([43, 72, 61, 74, 57]), the target is 76.
When input is tensor([43, 72, 61, 74, 57, 76]), the target is 71.
When input is tensor([43, 72, 61, 74, 57, 76, 71]), the target is 74.
When input is tensor([43, 72, 61, 74, 57, 76, 71, 74]), the target is 26.


In [None]:
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions

def get_batch(split):
    """Sample a random mini-batch of input windows and their targets.

    Args:
        split: 'train' samples from train_data; any other value samples
            from val_data.

    Returns:
        A pair (x, y) of (batch_size, block_size) tensors, where y holds the
        same windows as x shifted one position ahead.
    """
    source = train_data if split == 'train' else val_data
    # One random window start per sequence in the batch
    ix = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in ix:
        inputs.append(source[i:i+block_size])
        targets.append(source[i+1:i+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the raw input / target tensors
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('---')

# Every row of the batch contains block_size (context -> target) examples
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'When input is {context.tolist()}, the target is {target}')

inputs:
torch.Size([4, 8])
tensor([[76,  1, 81, 71, 77,  1, 69, 57],
        [76, 64, 61, 74, 57, 72, 81,  1],
        [71,  1, 79, 64, 65, 59, 64,  1],
        [ 1, 71, 70,  1, 76, 64, 61, 75]])
targets:
torch.Size([4, 8])
tensor([[ 1, 81, 71, 77,  1, 69, 57, 81],
        [64, 61, 74, 57, 72, 81,  1, 60],
        [ 1, 79, 64, 65, 59, 64,  1, 65],
        [71, 70,  1, 76, 64, 61, 75, 61]])
---
When input is [76], the target is 1
When input is [76, 1], the target is 81
When input is [76, 1, 81], the target is 71
When input is [76, 1, 81, 71], the target is 77
When input is [76, 1, 81, 71, 77], the target is 1
When input is [76, 1, 81, 71, 77, 1], the target is 69
When input is [76, 1, 81, 71, 77, 1, 69], the target is 57
When input is [76, 1, 81, 71, 77, 1, 69, 57], the target is 81
When input is [76], the target is 64
When input is [76, 64], the target is 61
When input is [76, 64, 61], the target is 74
When input is [76, 64, 61, 74], the target is 57
When input is [76, 64, 61, 74, 57],

## Bigram Language Model, loss and generation

A Bigram Language Model creates a token embedding table of shape (vocab_size, vocab_size). Every integer in the input indexes into the embedding table and plucks out the row corresponding to its index. PyTorch arranges these rows into a (B,T,C) tensor, which we interpret as the logits, i.e. the scores for the next character in the sequence.

B = batch, T = context window, C = vocab_size

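We use the negative log-likelihood (cross-entropy) loss between the predictions and the targets. For a model that gives every possible character roughly equal probability, essentially just guessing, we expect a loss of $-\ln(1/94) \approx 4.543$. A freshly initialized model typically scores somewhat worse than this, because its random logits are not exactly uniform.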

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram language model.

    The next-token logits are read directly from a (vocab_size, vocab_size)
    embedding table, indexed by the current token alone.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the scalar cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) targets, so fold the
        # batch and time dimensions together.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.reshape(B*T)  # reshape also accepts non-contiguous targets
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients, so skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) tensor of token ids holding the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # A bigram prediction depends only on the last token, so embed just
            # that token instead of re-embedding the whole growing context.
            logits, _loss = self(idx[:, -1:])  # (B, 1, C)
            # Focus only on last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Sanity check on the untrained model: one forward pass over the sample batch
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

torch.Size([32, 94])
tensor(5.0279, grad_fn=<NllLossBackward0>)


In [None]:
# Sample 100 tokens starting from token id 0 (prints garbage as the model is not trained)
print(decode(m.generate(
    idx=torch.zeros((1, 1), dtype=torch.long),
    max_new_tokens=100,
)[0].tolist()))


R“w((%O”— ;nHMrIg⁠Bk–k–9&YNnsL0)-P
O3IdYG)?N8,0'JEs–kBohR&"Oè2[–Éspr’E1nJ‘oY3al"+‘Lcy1-P+$zGgK1LsLSà


## Training the Bigram Model

In [None]:
# AdamW optimizer over all parameters of the bigram model
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32  # overrides the batch size of 4 used in the data-loader demo
for steps in range(10000):

    # Start every step from clean gradients
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

# Loss of the final training batch only (a single sample, not an average)
print(loss.item())

2.4587090015411377


In [None]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens = 600)[0].tolist())) # Printing something a bit more reasonable as our Bigram Model trains and decreases its loss to 2.46


Ok f w iount nke trtharonicMcowevoreto ARins stomole lels Ch wahex
Micon f cacr jusk bureavee bu ou'sengoull r asorenuce tind idengma maus, a, s e Mas on ld. t, I e benecck: thand Ourithisem, ghinvexcor wer thel. inigu lonnad- t cerulll fonacor ictt wt d t ts f s ande.97. f dariMconever aid We t qut omer.7-Phare Bustminthe harowecey ad asua recy, aren negicincoug cank d l, thandun towiow. age s nuikepu les ithempathron llincty, big An W.
RUn imathe bomof tha g ase thedido ifecs We tour wexce is, cade. a aredoures bllon'sowind s. aly Aneron shingro t bld tlongon s wnok iout abu tesevioberore ac


# Self-Attention

A token should only talk to previous tokens and not future tokens because you are trying to predict the future. We were able to use batch matrix multiply to perform a weighted aggregation in which the weights are specified by a T,T array with torch.tril.

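You can set every position of the weight matrix where the tril matrix is 0 to negative infinity, so that after softmax those positions become 0, whereas the weights in the lower triangle are normalized to sum to 1 in each row (a plain average when all the unmasked scores are equal).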

In [None]:
torch.manual_seed(1337)  # fixed seed so the random demo tensors are reproducible
B,T,C = 4,8,32  # batch, time, channels
x = torch.randn(B,T,C)

# Single head of attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
# Scaled dot-product scores: dividing by sqrt(head_size) keeps the variance of
# the scores independent of head_size, so the softmax does not saturate.
wei = q @ k.transpose(-2, -1) * head_size**-0.5 # (B, T, 16) @ (B, 16, T) = (B, T, T)

# Causal mask: position t may only attend to positions <= t
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)

v = value(x)  # (B, T, 16)
out = wei @ v # (B, T, T) @ (B, T, 16) = (B, T, 16)
assert out.shape == (B, T, head_size)

out.shape

torch.Size([4, 8, 16])

**The need for Self Attention**

We don't want previous tokens to be all uniform. Different tokens should find other tokens more or less interesting and we want it to be data dependent. A vowel should be interested in what previous consonants are and for that information to flow to that vowel.

Self attention solves this by emitting three vectors for each token, a query, key and value.

**Query** = What am I looking for?

**Key** = What do I contain?

A dot product between a query and a key is computed as a measure of 'alignment': we look for keys which align well with, or 'answer', a given query. This means a token places a higher emphasis on other tokens whose keys are well aligned with its own query.

**Attention Weights** = The dot products of Q and K (masked and passed through softmax); they determine "how much" of each token's information is passed along.

**Value** = If you find me relevant, here is how much I will actually tell you.

When we do aggregation, we don't aggregate the tokens exactly. We aggregate a vector V instead of the raw value.

In [None]:
# Attention weights of the first sequence in the batch, shape (T, T).
# Row t holds the weights that position t places on positions 0..t; entries
# above the diagonal are 0 because they were masked to -inf before the softmax.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0865, 0.9135, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6873, 0.3053, 0.0074, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0083, 0.1928, 0.6549, 0.1440, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0893, 0.3930, 0.4423, 0.0285, 0.0469, 0.0000, 0.0000, 0.0000],
        [0.8100, 0.0827, 0.0333, 0.0159, 0.0483, 0.0098, 0.0000, 0.0000],
        [0.0395, 0.0994, 0.0386, 0.5205, 0.2477, 0.0471, 0.0072, 0.0000],
        [0.0095, 0.1311, 0.2246, 0.1646, 0.0944, 0.2340, 0.0520, 0.0898]],
       grad_fn=<SelectBackward0>)

**The need for Feed-Forward Neural Networks**

Tokens looked at each other but didn't really have time to think about what they found from the other tokens. A feed-forward layer is added after the self-attention; it works at the per-token level. Self-attention is the communication; once the tokens have gathered all the data, they need to think about that data individually.

**The need for Residual Connections**

Once a neural network gets deep, it can suffer from optimization issues. This is where we need skip connections (residual connections). With skip connections, computation flows from top to bottom along the residual pathway; you are free to fork off from the pathway, perform some computation, and then project back to the residual pathway via addition.
During backpropagation, addition distributes gradients equally to both of the branches that fed it as inputs. The gradients from the loss hop through every addition node all the way to the input, and also fork off through the residual blocks. We have this gradient superhighway that runs from the supervision all the way to the input, unimpeded. Residual blocks are initialized so that in the beginning they contribute very little to the residual pathway. In the beginning they are basically not there, but as the optimization proceeds they come online over time, which dramatically helps with the optimization.

**The need for Layer Normalization**

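Batch Normalization makes sure that any individual neuron has a unit Gaussian distribution (mean 0, standard deviation 1) across the batch. In LayerNorm you don't normalize the columns (each feature across the batch); you normalize the rows (each example across its features).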