In [1]:
import numpy as np
import time

import torch
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Sampling indices with `multinomial`.
# The three weights below sum to 1, so each one is the probability of drawing its own index:
# index 0 comes up 10% of the time, index 1 40% of the time and index 2 50% of the time.
probabilites = torch.tensor([0.1, 0.4, 0.5])

# Draw 10 indices with replacement, so the same index may appear more than once.
samples = probabilites.multinomial(num_samples=10, replacement=True)
print(samples)

# The same idea is used later to pick the next character from predicted probabilities.

tensor([0, 2, 0, 1, 1, 1, 2, 1, 1, 2])


In [6]:
# Concatenation: join two 1-D tensors end to end along their only dimension (dim 0).
# `torch.cat` with `dim=` is the canonical PyTorch spelling; `torch.concatenate` and
# `axis=` are NumPy-style aliases that older PyTorch releases do not provide.
first_tensor = torch.tensor([1, 2, 3, 4, 5])
concatenated_tensor = torch.cat(
    [first_tensor, torch.tensor([6, 7, 8, 9, 10])],
    dim=0,
)
print(concatenated_tensor)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])


In [8]:
# Lower-triangular matrix of ones: row i holds ones in columns 0..i and zeros after that.
triangular_lower = torch.ones(8, 8).tril()
print(triangular_lower)

# This pattern is used as a mask over token positions.
# In row i, a 1 marks a position that is already known (up to and including i),
# and a 0 marks a later position that still has to be predicted.
# Moving down the rows, one more position becomes visible at each step.
# This makes sure that a position never interacts with the future!

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])


In [9]:
# Same idea as the lower-triangular case, mirrored:
# ones on and above the main diagonal, zeros below it.
triangular_upper = torch.ones(8, 8).triu()
print(triangular_upper)

tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1.]])


In [12]:
# One of the most important concepts for the bigram model: a mask that is exponentiated.
# Start from zeros and write -inf wherever the lower-triangular matrix is 0,
# i.e. everywhere strictly above the diagonal.
exp_triang_lower = torch.zeros(8, 8).masked_fill(
    torch.ones(8, 8).tril().eq(0),
    float('-inf'),
)
print(exp_triang_lower)

# Exponentiating the mask element-wise recovers the lower-triangular 0/1 pattern:
# e^{-inf} = 0; e^{0} = 1
print(exp_triang_lower.exp())

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])


In [19]:
# Transposing swaps two dimensions of a tensor.
# Exchanging dim 0 and dim 2 turns shape (5, 2, 3) into (3, 2, 5).
zeros_tensor = torch.zeros(5, 2, 3)
transponsed_zeros_tensor = torch.transpose(zeros_tensor, 0, 2)
print(transponsed_zeros_tensor.shape)

torch.Size([3, 2, 5])


In [20]:
# An important function: `torch.stack` joins equally shaped tensors along a NEW dimension.
# It is needed for parallel processing, where several blocks of the same fixed
# length (number of tokens) are stacked together.
tensor_1, tensor_2, tensor_3 = (
    torch.tensor([1, 2, 3]),
    torch.tensor([4, 5, 6]),
    torch.tensor([7, 8, 9]),
)

# Three tensors of shape (3,) become one tensor of shape (3, 3), one input per row.
stacked_tensor = torch.stack([tensor_1, tensor_2, tensor_3], dim=0)
print(stacked_tensor)

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])


In [29]:
# Simple neural network: one bias-free linear layer followed by a softmax activation.
import torch.nn as nn
import torch.nn.functional as F

# The layer's weights are floating point, so the input is built from floats as well.
input_tensor = torch.tensor([2.0, 3.0, 4.0])

# in_features has to match the input length (3); out_features can be chosen freely.
linear_layer = nn.Linear(3, 4, bias=False)
output_tensor = linear_layer(input_tensor)
print(output_tensor)

# Keep the raw output and apply the activation separately:
# softmax over dim 0 rescales the 4 values into probabilities that sum to 1.
out_after_act = F.softmax(output_tensor, dim=0)
print(out_after_act)

tensor([-1.3292, -0.2261, -1.2684,  1.5387], grad_fn=<SqueezeBackward3>)
tensor([0.0441, 0.1329, 0.0469, 0.7761], grad_fn=<SoftmaxBackward0>)


In [10]:
# Using nn.Embedding: a lookup table that maps each integer id to a learnable vector.
import torch.nn as nn

vocab_size = 4000
# Each element of the vocabulary is represented by a vector in a 5-dimensional space.
embedding_dim = 5

# Initialise the embedding layer: one row of size embedding_dim per vocabulary entry.
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Example input: 2 sequences of 4 integer ids each (ids must be 64-bit integers here).
input_data = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.long)

# Look up the vector of every id in the input.
embedded_data = embedding(input_data)

# Resulting shape is (2, 4, 5):
# 2 is the batch size, 4 is the sequence length, and 5 is the embedding dimension.
print(embedded_data.shape)
print(embedded_data)


torch.Size([2, 4, 5])
tensor([[[-0.4159, -0.0625,  0.8476, -0.1073, -1.0979],
         [ 1.3862,  0.3097, -0.5616,  0.5898,  0.7091],
         [ 0.2522,  0.9657,  0.3001,  2.0171,  0.0416],
         [ 0.1790, -1.0683, -0.8860, -0.1840,  0.4192]],

        [[-0.8543,  0.9594,  0.5434,  0.6422,  0.3213],
         [ 1.7646,  1.2693,  0.0756, -2.1041,  1.0057],
         [ 1.3222,  1.1394,  2.1730, -1.0531,  0.4475],
         [-0.1500, -0.7864,  0.1142,  0.9328, -2.0393]]],
       grad_fn=<EmbeddingBackward0>)


In [11]:
# Matrix multiplication: a (3, 2) matrix times a (2, 3) matrix gives a (3, 3) matrix.
a = torch.tensor([
    [1, 2],
    [3, 4],
    [5, 6],
])
b = torch.tensor([
    [7, 8, 9],
    [10, 11, 12],
])

# The `@` operator and `torch.matmul` compute the same product, so both prints match.
print(a @ b)
print(torch.matmul(a, b))

tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])
tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])


In [15]:
# Random integers in [0, 10) arranged as 2 matrices of shape (3, 3).
matrix_a = torch.randint(0, 10, (2, 3, 3))
print(matrix_a)

# NOTE: the three dimensions can be read as B, T, C = matrix_a.shape;
# matrix_a.view(B*T, C) would then merge the first two dimensions (left unused here).

# Keep only the last row (index -1 along dim 1) of every matrix -> shape (2, 3).
print(matrix_a[:, -1])

tensor([[[2, 8, 6],
         [8, 5, 4],
         [4, 8, 2]],

        [[6, 0, 3],
         [6, 8, 6],
         [9, 7, 7]]])
tensor([[4, 8, 2],
        [9, 7, 7]])
