In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import time
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

mps


In [2]:
%%time
start_time = time.time()
# matrix operations here
zeros = torch.zeros(1, 1)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}")

0.00126505
CPU times: user 655 μs, sys: 835 μs, total: 1.49 ms
Wall time: 1.91 ms


using the time library to record execution time
%%time to record the time taken for the cell to execute

In [4]:
torch_rand1 = torch.rand(100, 100, 100, 100).to(device)
torch_rand2 = torch.rand(100, 100, 100, 100).to(device)
np_rand1 = torch.rand(100, 100, 100, 100)
np_rand2 = torch.rand(100, 100, 100, 100)

start_time = time.time()

rand = (torch_rand1 @ torch_rand2)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"MPS time => {elapsed_time:.8f}")


start_time = time.time()

rand = np.multiply(np_rand1, np_rand2)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"CPU time => {elapsed_time:.8f}")

MPS time => 0.04584384
CPU time => 0.09187579


In [7]:
# embeddings, torch.stack, torch.multinomial, torch.tril, torch.triu, input.T / input.transpose, nn.Linear, torch.cat, F.softmax (show all the examples of functions/methods with pytorch docs)


# Define a probability tensor
probabilities = torch.tensor([0.1, 0.9])
# 10% or 0.1 => 0, 90% or 0.9 => 1. each probability points to the index of the probability in the tensor
# Draw 5 samples from the multinomial distribution
samples = torch.multinomial(probabilities, num_samples=10, replacement=True)
print(samples)
# Each value in this tensor represents the probability of selecting a corresponding index.


tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [8]:
tensor = torch.tensor([1, 2, 3, 4])
out = torch.cat((tensor, torch.tensor([5])), dim=0)
out

tensor([1, 2, 3, 4, 5])

In [9]:
out = torch.tril(torch.ones(5, 5))
out

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [7]:
out = torch.triu(torch.ones(5, 5))
out

tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])

# Understanding Masking in Sequence Models

* And then we have a mask fill. 
* This one's going to be very important later because in order to actually get to this point, all we do is we just exponentiate every element in here. 
* So if you exponentiate zero, if you exponentiate zero, it'll become one. If you exponentiate negative infinity, it'll become zero. 
* All that's going on here is we're doing approximately 2.71. And this is a constant that we use in the dot exp function. 
* And then we're putting this to whatever power is in that current slot. So we have a zero here. So 2.71 to the zeroth is equal to one, 2.71 to the one is equal to 2.71. 
* And then, 2.71 to the negative infinity is, of course, zero. So that's pretty much how we get from this to this. And we're simply just masking these over. 
* So that's great. And I sort of showcase what the exp does. We're just using this one right here. We're using this output and we're just plugging it into here. 
* So it'll go from negative infinity to zero and then zero to one. So that's how we get from here to here.

## Why Did We Make the TRIL Values -inf?

We set the above-diagonal values to `-inf` instead of simply making them zero because of how the **softmax** function operates in neural networks, particularly in autoregressive models.

1. **Masking Future Information**:
   In autoregressive models, we need to predict the next element in a sequence using only the elements that have already been seen. The above-diagonal elements represent future information that should not be accessible during prediction. Masking these elements ensures that the model does not "peek" at future values.

2. **Effect of Exponentiation**:
   When we exponentiate the tensor values during calculations, setting above-diagonal elements to `-inf` results in:
   - \( e^{-\infty} = 0 \)
   - This means these positions contribute zero to the softmax computation, effectively removing their influence.

3. **Using Zeroes**:
   If we set the above-diagonal elements directly to zero:
   - \( e^0 = 1 \)
   - These values would still contribute to the sum in the softmax denominator, allowing them to influence the output probabilities, which we do not want.

### Example Matrix
Consider this example matrix:
```python
tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
```
 - Here, the values above the diagonal (2, 3, 6) represent future information that the model should not use for predictions.
 - By masking them with -inf, we ensure that when the softmax function is applied, those positions will contribute zero probabilities.
### Conclusion
The use of -inf for the upper triangular elements is a crucial step in maintaining the causal relationship in sequence predictions. It ensures that the model learns to make predictions based solely on the available past and present information, thereby preventing it from using any future data.



In [8]:
out = torch.zeros(5, 5).masked_fill(torch.tril(torch.ones(5, 5)) == 0, float('-inf'))
out

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

In [10]:
torch.exp(out)

tensor([[2.7183, 1.0000, 1.0000, 1.0000, 1.0000],
        [2.7183, 2.7183, 1.0000, 1.0000, 1.0000],
        [2.7183, 2.7183, 2.7183, 1.0000, 1.0000],
        [2.7183, 2.7183, 2.7183, 2.7183, 1.0000],
        [2.7183, 2.7183, 2.7183, 2.7183, 2.7183]])

In [11]:
input = torch.zeros(2, 3, 4)
out1 = input.transpose(0, 1)
out2 = input.transpose(-2,-1)
print(out1.shape)
print(out2.shape)
# torch.permute works the same but you provide the new order of dimensions instead of the dimensions you'd like to swap.

torch.Size([3, 2, 4])
torch.Size([2, 4, 3])


In [12]:
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6])
tensor3 = torch.tensor([7, 8, 9])

# Stack the tensors along a new dimension
stacked_tensor = torch.stack([tensor1, tensor2, tensor3])
stacked_tensor


tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [31]:
import torch.nn as nn
import torch

sample = torch.tensor([10.0, 10.0, 10.0])
linear = nn.Linear(3, 3, bias=False)

# Print the initialized weights
print("Initialized weights:")
print(linear.weight)

# Compute the output
output = linear(sample)
print("Output:")
print(output)

Initialized weights:
Parameter containing:
tensor([[-0.2160,  0.4554,  0.4358],
        [ 0.0618, -0.4993, -0.4779],
        [-0.3244,  0.2271,  0.0778]], requires_grad=True)
Output:
tensor([ 6.7517, -9.1543, -0.1949], grad_fn=<SqueezeBackward4>)


In [32]:
linear.weight

Parameter containing:
tensor([[-0.2160,  0.4554,  0.4358],
        [ 0.0618, -0.4993, -0.4779],
        [-0.3244,  0.2271,  0.0778]], requires_grad=True)

# Softmax Function

The **Softmax function** is commonly used in machine learning, particularly in classification tasks. It converts a vector of raw scores (logits) into probabilities, making it easier to interpret the model's outputs as probabilities for each class.

## Mathematical Definition

Given a vector \(\mathbf{z} = [z_1, z_2, \ldots, z_n]\), the softmax function transforms this vector into a probability distribution over \(n\) classes as follows:

We basically exponentiate each value in the tensor, add them up, and then individual exponentiated value divided by the total.


## Properties of Softmax

1. **Output Range**: The output of the softmax function is in the range \((0, 1)\), making it interpretable as probabilities.
2. **Sum to One**: The sum of all the probabilities output by softmax is equal to 1:
   \[
   \sum_{i=1}^{n} \sigma(\mathbf{z})_i = 1
   \]
3. **Sensitive to Input Values**: The softmax function is sensitive to the input values. Even small changes in the input can significantly change the output probabilities.

## Example Usage in PyTorch

Here’s an example of how to use the softmax function in PyTorch:

```python
import torch
import torch.nn.functional as F

# Create a tensor
tensor1 = torch.tensor([1.0, 2.0, 3.0])

# Apply softmax using torch.nn.functional.softmax()
softmax_output = F.softmax(tensor1, dim=0)

print(softmax_output) 


In [33]:
import torch.nn.functional as F

# Create a tensor
tensor1 = torch.tensor([1.0, 2.0, 3.0])

# Apply softmax using torch.nn.functional.softmax()
softmax_output = F.softmax(tensor1, dim=0)

print(softmax_output) 

tensor([0.0900, 0.2447, 0.6652])


# Understanding the Embedding Layer in PyTorch

1. **What is an Embedding Layer?**
    - An embedding layer is a way to convert discrete items (like words or tokens) into numerical vectors (lists of numbers). These vectors are useful because they allow machine learning models to understand and process the items more effectively.

2. **Setting Up the Embedding Layer**:
    ```python
    vocab_size = 80
    embedding_dim = 6
    embedding = nn.Embedding(vocab_size, embedding_dim)
    ```
    - **`vocab_size = 80`**: This means you have 80 unique items (like words) that you want to represent. Think of this as having a vocabulary of 80 words.
    - **`embedding_dim = 6`**: Each of these items will be represented as a vector with 6 numbers. So each word will be converted to a list of 6 numbers.
    - **`embedding = nn.Embedding(vocab_size, embedding_dim)`**: This line creates the actual embedding layer, which will learn to represent these 80 words in 6-dimensional space.

3. **Creating Input Indices**:
    ```python
    input_indices = torch.LongTensor([1, 5, 3, 2])
    ```
    - This creates a tensor (a kind of array) that contains indices (or positions) of words you want to convert. Here, `1`, `5`, `3`, and `2` are the indices of the words you want to look up in the embedding layer. 

4. **Using the Embedding Layer**:
    ```python
    embedded_output = embedding(input_indices)
    ```
    - This line takes the indices you've created and looks up their corresponding embedding vectors in the layer. Each index will be replaced with its associated vector of 6 numbers.

5. **Output Shape and Content**:
    ```python
    print(embedded_output.shape)
    print(embedded_output)
    ```
    - **`embedded_output.shape`**: This will show the size of the output. Since you provided 4 indices, the output shape will be `(4, 6)`, meaning you will get 4 vectors, each with 6 numbers.
    - **`embedded_output`**: This will display the actual vectors for the input indices. For example, if the embedding layer learned specific vectors for those indices, the output might look something like this:
      ```
      tensor([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
              [0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
              [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
              [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]])
      ```
    - Here, each row corresponds to one of the input indices, and each column contains one of the 6 values that represent the corresponding word.

**Summary**

- **Embeddings**: They convert items (like words) into numerical representations (vectors) that are easier for models to process.
- **Look-up**: You use indices to look up the corresponding vectors in the embedding layer.
- **Output**: The output is a collection of vectors that represent the words or tokens based on the input indices.


In [17]:
# Initialize an embedding layer
vocab_size = 80
embedding_dim = 6
embedding = nn.Embedding(vocab_size, embedding_dim)

# Create some input indices
input_indices = torch.LongTensor([1, 5, 3, 2])

# Apply the embedding layer
embedded_output = embedding(input_indices)

# The output will be a tensor of shape (4, 100), where 4 is the number of inputs
# and 100 is the dimensionality of the embedding vectors
print(embedded_output.shape)
print(embedded_output)


torch.Size([4, 6])
tensor([[-0.1182, -0.0551,  2.0661, -0.1224,  0.8691, -0.2577],
        [-1.9364,  0.1501,  0.0839, -1.7630,  0.7601,  0.5639],
        [ 0.3985,  0.8211, -1.7631, -0.2576, -0.7638,  0.1854],
        [ 1.1235, -0.2815, -0.2051,  0.3126, -0.2904,  2.0155]],
       grad_fn=<EmbeddingBackward0>)


# Matrix Multiplication of 3x2 and 2x3 matrix

In [4]:
a = torch.tensor([[1,2],[3,4],[5,6]])
b = torch.tensor([[7,8,9],[10,11,12]])
# print(a @ b)
print(torch.matmul(a, b))

tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])


In [37]:
int_64 = torch.randint(1, (3, 2)).float()  
print(int_64)
# 1 is the lower bound, (3, 2) is the shape of the tensor, which will have 3 rows and 2 columns.
# type int64
float_32 = torch.rand(2,3) #This converts the integer tensor to a float tensor
# print(int_64.dtype, float_32.dtype)
print(float_32)
result = torch.matmul(int_64, float_32)
print(result)

tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[0.5358, 0.3800, 0.2044],
        [0.2654, 0.4078, 0.5171]])
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


# 3D Tensors and Shape
Check the following code
- The first dimension (2) represents the number of "batches" or 2 "slices."
- The second dimension (3) represents the number of rows in each slice.
- The third dimension (5) represents the number of elements in each row.

In [38]:
a = torch.rand(2, 3, 5)
print(a.shape)
x, y, z = a.shape
a = a.view(x,y,z)
# print(x, y, z)
print(a.shape)

torch.Size([2, 3, 5])
torch.Size([2, 3, 5])


In [10]:
input = torch.rand((4, 8, 10))
B, T, C = input.shape
# B: a batch size or number of sequences
# T: the sequence length or the number of time steps (T)
# C: the number of features or channels (C) at each time step.
output = input.view(B*T, C) #Making it a 2D vector
print(output)
# print(input)
print(output[:, -1, :]) #Will throw an error as we only have a 2d view now

tensor([[8.4784e-01, 7.1257e-01, 5.8318e-02, 1.1662e-01, 5.6686e-01, 1.9303e-01,
         6.6632e-01, 6.3045e-01, 1.1654e-01, 2.2591e-01],
        [4.4540e-01, 1.8662e-01, 1.6590e-01, 7.8310e-01, 2.8952e-01, 3.6106e-01,
         2.4187e-01, 5.7216e-01, 5.1018e-01, 8.8388e-01],
        [2.1610e-01, 4.2283e-01, 2.7590e-01, 1.3142e-02, 2.8317e-02, 9.7419e-01,
         9.3365e-01, 6.6609e-01, 5.2375e-01, 8.5033e-02],
        [7.0206e-01, 5.1114e-01, 7.0352e-03, 2.6985e-01, 9.7451e-01, 1.1618e-01,
         5.5879e-01, 8.5950e-01, 3.5737e-01, 6.6099e-01],
        [8.6752e-01, 1.4063e-01, 1.3130e-01, 4.8753e-01, 1.3561e-01, 2.6779e-01,
         4.9860e-01, 9.8521e-01, 2.7027e-01, 8.7181e-01],
        [9.0724e-04, 9.4394e-01, 5.9852e-01, 2.3980e-01, 2.1402e-02, 2.2790e-01,
         9.2466e-01, 1.5136e-01, 3.5337e-01, 2.2062e-01],
        [8.0447e-01, 4.4472e-01, 5.8952e-01, 4.1328e-01, 8.5890e-01, 6.8109e-01,
         4.5790e-01, 6.8573e-01, 7.8548e-01, 2.3513e-01],
        [1.1809e-01, 5.6395

IndexError: too many indices for tensor of dimension 2

In [12]:
x = torch.tensor([10], dtype=torch.float32)
y = F.tanh(x)
print(y)

tensor([1.])
