In [68]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [4]:
# --- Batching -------------------------------------------------------------
batch_size = 32         # number of windows sampled per call to get_batch
block_size = 64         # window length; also the side of the causal mask in Head

# --- Model ----------------------------------------------------------------
n_embd = 128            # input feature width of the Head projections
n_head = 4              # presumably heads per attention layer (not used in the visible cells)
n_layer = 4             # presumably number of stacked blocks (not used in the visible cells)
dropout = 0.2           # drop probability handed to nn.Dropout

# --- Training / evaluation schedule (not used in the visible cells) --------
learning_rate = 1e-3
max_iters = 5000
eval_interval = 500
eval_iters = 200

# --- Hardware -------------------------------------------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Loading, Encoding, and Splitting the Data

In [6]:
# Load the merged dataset; only the `content` column is used here.
df = pd.read_csv('./merged_data.csv')

# Build the corpus in one pass: every document is preceded by a single space,
# exactly as the previous row-by-row concatenation produced, but without
# materialising a Series per row via iterrows() or growing the string
# incrementally.
full_str = ''.join(' ' + content for content in df['content'])

# Character-level vocabulary: the sorted set of distinct characters.
chars = sorted(set(full_str))
vocab_size = len(chars)

In [7]:
# Character -> integer id, and the reverse table ("mapping" spelled backwards).
mapping = {ch: idx for idx, ch in enumerate(chars)}
gnippam = dict(enumerate(chars))


def encode(st):
    '''Turn a string into the list of integer ids of its characters.'''
    return [mapping[ch] for ch in st]


def decode(ls):
    '''Turn an iterable of integer ids back into the string they spell.'''
    return ''.join([gnippam[i] for i in ls])

In [8]:
# Encode the whole corpus once as a 1-D int64 tensor of character ids.
data = torch.tensor(encode(full_str), dtype=torch.long)

# Contiguous split: the first 80% of the ids train, the remainder validates.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

In [9]:
def get_batch(split):
    '''Sample a random batch of input windows x and next-step targets y.

    split: 'train' draws from train_data; any other value draws from val_data.

    Returns a pair (x, y) of tensors on `device`, each stacking `batch_size`
    windows of length `block_size`; y is x shifted one position to the right.
    '''
    source = train_data if split == 'train' else val_data

    # Random window start offsets. The exclusive upper bound leaves room for
    # the target window, which starts one position later than the input.
    starts = torch.randint(low=0, high=len(source) - block_size, size=(batch_size,))

    inputs = [source[s:s + block_size] for s in starts]
    targets = [source[s + 1:s + 1 + block_size] for s in starts]

    x = torch.stack(tensors=inputs).to(device)
    y = torch.stack(tensors=targets).to(device)
    return x, y

In [60]:
# get_batch treats every split other than 'train' as validation data, so ask
# for 'val' explicitly instead of a 'test' split that does not exist.
test_tensor = get_batch(split='val')[0]

# Flatten the batch of id windows row by row into a plain list of Python ints.
test_list = test_tensor.flatten().tolist()

In [61]:
# Show the sampled ids as text; consecutive batch rows are concatenated without a separator.
decode(test_list)

" each was about 4%. We are expecting low-single-digit, slightly egory when we get more funding.\nOperator: Thank you. Our next quve for next year is $750 million of COVID testing related revenu comparisons will be chunky this year. Turning to regulatory filt you're seeing tremendous leasing demand, restaurant demand, etarting to flatten but just to catch up on all the other inflatiohis year, demonstrate the strength and diversity of our portfoli the transportation for the long haul. I'm not telling you that VAC. And can you give a sense of sell-through you saw in the fir Carestio, maybe I can give you a little information on the mark envelope math, it sounds like it could be a little bit negativeis firmly on track. Moving now on to Slide 8. The strength of ouDay, we shared with you our plans and commitment for C-Band and think, probably the best way to think about that as we're workinadvantage of rate cases if we need to.  But we've done a really and active oriented franchises. Vans r

# Building Blocks

## 1. Self-Attention Head

In [25]:
class Head(nn.Module):
    '''A single head of causal (masked) self-attention.

    Each position is projected to a query, a key and a value of width
    `head_size`. Query/key dot products, scaled by 1/sqrt(head_size), give
    attention scores; positions to the right of the current one are masked
    out, so a position can only attend to itself and to earlier positions.

    head_size: output width of the key, query and value projections.
    '''

    def __init__(self, head_size):
        super().__init__()
        # Bias-free linear projections from the embedding width to head_size.
        self.key = nn.Linear(in_features=n_embd, out_features=head_size, bias=False)        # what each position offers for matching
        self.query = nn.Linear(in_features=n_embd, out_features=head_size, bias=False)      # what each position is looking for
        self.value = nn.Linear(in_features=n_embd, out_features=head_size, bias=False)      # content that gets aggregated
        # Lower-triangular matrix of ones, registered as a buffer so it is not
        # a trainable parameter but still follows the module across devices.
        self.register_buffer(
            name='tril',
            tensor=torch.tril(input=torch.ones(block_size, block_size))
        )
        # Dropout applied to the attention weights (active in training mode only).
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        '''Apply masked self-attention to x.

        x: tensor of shape (B, T, C) with C equal to n_embd and T at most
           block_size (the mask is sliced to T x T).

        Returns a tensor of shape (B, T, head_size).
        '''
        _, T, _ = x.shape
        k = self.key(x)                                                     # (B, T, head_size)
        q = self.query(x)                                                   # (B, T, head_size)
        v = self.value(x)                                                   # (B, T, head_size)

        # Scaled dot-product scores. The scale is 1/sqrt(d_k) where d_k is the
        # key/query width (head_size), not the input embedding width.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5                 # (B, T, T)
        # Causal mask: future positions get -inf so softmax assigns them zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)                                        # each row sums to 1
        wei = self.dropout(wei)
        out = wei @ v                                                       # weighted sum of values

        return out

In [26]:
# Standalone head with 16-wide key/query/value projections, used by the step-by-step cells below.
head = Head(head_size=16)

In [29]:
# References to the key and query projection weight parameters (not used in the visible cells).
key_weights = head.key.weight
query_weights = head.query.weight

In [44]:
# Toy example of the unsqueeze -> expand pattern applied to a real batch in the next cell.
unsqueeze_test_tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])       # shape (2, 3)
unsqueezed_tensor = unsqueeze_test_tensor.unsqueeze(-1)            # trailing singleton axis -> (2, 3, 1)
expanded_tensor = unsqueezed_tensor.expand(-1, -1, n_embd)         # broadcast view, no copy -> (2, 3, n_embd)
print(*(t.shape for t in (unsqueeze_test_tensor, unsqueezed_tensor, expanded_tensor)))

torch.Size([2, 3]) torch.Size([2, 3, 1]) torch.Size([2, 3, 128])


In [62]:
# Draw a training batch of character ids.
x, y = get_batch('train')
print(x.shape)
# Stand-in for an embedding layer: repeat each id across n_embd features so
# that x has the (B, T, C) layout Head expects.
x = x[..., None].expand(-1, -1, n_embd)
print(x.shape)

torch.Size([32, 64])
torch.Size([32, 64, 128])


In [63]:
# The ids are int64; cast to float32 so they can be fed to the head's nn.Linear layers.
x = x.to(torch.float32)

In [64]:
# Re-run the first half of Head.forward by hand so the intermediates can be inspected.
B, T, C = x.shape
k, q, v = head.key(x), head.query(x), head.value(x)

In [65]:
# Swap the last two axes of the keys so that q @ k_transpose has one score per (query, key) pair.
k_transpose = k.transpose(-1, -2)
print(k.shape, k_transpose.shape)

torch.Size([32, 64, 16]) torch.Size([32, 16, 64])


In [91]:
# Scaled dot-product scores. The scale is 1/sqrt(d_k) where d_k is the
# key/query width (k.shape[-1], i.e. head_size), not the input embedding width C.
wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
# Causal mask: positions to the right of the current one get -inf.
wei = wei.masked_fill(head.tril[:T, :T] == 0, float('-inf'))

In [92]:
# Normalise each row of scores into a probability distribution, then apply the
# head's dropout layer (it is active unless head.eval() has been called).
# NOTE: this cell overwrites `wei`, so re-running it alone applies softmax twice.
wei = head.dropout(F.softmax(wei, dim=-1))
# Weighted sum of the value vectors.
out = torch.matmul(wei, v)

In [94]:
# Detach from autograd and copy to host memory as a NumPy array so matplotlib can draw it.
out_arr = out.detach().cpu().numpy()

In [103]:
def plot_out_arr(idx):
    '''Plot the attention-head output of one batch element as a heatmap.

    `out_arr` holds `wei @ v`, the head's output, not the attention weights.
    For the selected batch element the rows are sequence positions and the
    columns are the head's output features, so the figure is labelled
    accordingly.

    idx: index of the batch element of `out_arr` to display.
    '''
    plt.imshow(out_arr[idx])
    plt.colorbar(label="Output Value")
    plt.title("Attention Head Output Heatmap")
    plt.xlabel("Head Feature Dimension")
    plt.ylabel("Input Sequence Position")
    plt.show()