In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PytorchMultiHeadAttention(nn.Module):
    """Reference multi-head self-attention that exposes its intermediates.

    Every intermediate tensor of the forward pass is stored on ``self``
    (``x_flattened``, ``Xq1`` ... ``Xv3``, ``XkT``, ``attn_scores``,
    ``attn_weights``, ``attn_output``, ``permute_output``, ``output``) and is
    asked to retain its gradient, so each stage can be inspected after
    ``backward()``.

    Args:
        n_embd: Embedding width E of the input and of the output.
        n_head: Number of attention heads H; must divide ``n_embd``.
        causal: If True, position t only attends to positions <= t.
        p_dropout: Stored on ``self.dropout``. Dropout is not applied anywhere
            in this class.
        bias: Whether the four linear projections carry a bias term.

    Raises:
        ValueError: If ``n_embd`` is not divisible by ``n_head``.
    """

    def __init__(self, n_embd, n_head, causal=True, p_dropout=0.1, bias=True):
        super().__init__()
        # Fail early with a clear message instead of an opaque view() error
        # inside the first forward pass.
        if n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        self.n_embd = n_embd
        self.n_head = n_head
        self.causal = causal
        self.attn_hidden_dim = n_embd // n_head
        self.dropout = p_dropout

        self.q_projection = nn.Linear(n_embd, n_embd, bias=bias)
        self.k_projection = nn.Linear(n_embd, n_embd, bias=bias)
        self.v_projection = nn.Linear(n_embd, n_embd, bias=bias)
        self.out_projection = nn.Linear(n_embd, n_embd, bias=bias)

    @staticmethod
    def _retain_grad(*tensors):
        """Call ``retain_grad()`` on every tensor that takes part in autograd.

        ``retain_grad()`` raises ``RuntimeError`` on a tensor with
        ``requires_grad=False`` (e.g. under ``torch.no_grad()`` or for an input
        that does not require grad), so such tensors are skipped.
        """
        for t in tensors:
            if t.requires_grad:
                t.retain_grad()

    def create_causal_mask(self, seq_len, device, dtype=None):
        """Return an additive (1, 1, T, T) mask: ``-inf`` above the diagonal, 0 elsewhere.

        Args:
            seq_len: Sequence length T.
            device: Device on which the mask is allocated.
            dtype: Optional dtype of the mask; ``None`` uses torch's default dtype.
        """
        mask = torch.triu(
            torch.ones((1, 1, seq_len, seq_len), device=device, dtype=dtype),
            diagonal=1,
        )
        mask = mask.masked_fill(mask == 1, float('-inf'))
        return mask

    def reshape_and_multiply_layer(self, out, m, x):
        """Apply module ``m`` to ``x`` and split the result into heads.

        Args:
            out: Ignored; kept only so the signature stays unchanged.
            m: Callable applied to the flattened (B*T, E) input.
            x: Input of shape (B, T, E).

        Returns:
            Contiguous tensor of shape (B, H, T, D).
        """
        B, T, E = x.shape
        x_flattened = x.reshape(B * T, E)                  # (B*T, E)
        out = m(x_flattened)                            # (B*T, E)
        out = out.view(B, T, self.n_head, self.attn_hidden_dim)  # (B, T, H, D)
        out = out.permute(0, 2, 1, 3).contiguous()   # (B, H, T, D)
        return out

    def project_to_query_key_value(self, x):
        """Project ``x`` to per-head queries, transposed keys and values.

        Args:
            x: Input of shape (B, T, E).

        Returns:
            Tuple ``(q, kT, v)`` with shapes (B, H, T, D), (B, H, D, T) and
            (B, H, T, D). Every intermediate is also stored on ``self``.
        """
        B, T, E = x.shape
        H, D = self.n_head, self.attn_hidden_dim
        self.x_flattened = x.reshape(B * T, E)                  # (B*T, E)

        self.Xq1 = self.q_projection(self.x_flattened)          # (B*T, E)
        self.Xq2 = self.Xq1.view(B, T, H, D)                    # (B, T, H, D)
        self.Xq3 = self.Xq2.permute(0, 2, 1, 3).contiguous()    # (B, H, T, D)
        self.Xqflattened = self.x_flattened

        self.Xk1 = self.k_projection(self.x_flattened)
        self.Xk2 = self.Xk1.view(B, T, H, D)
        self.Xk3 = self.Xk2.permute(0, 2, 1, 3).contiguous()
        self.Xkflattened = self.x_flattened

        self.Xv0 = self.v_projection(self.x_flattened)
        self.Xv1 = self.Xv0.view(B, T, E)
        self.Xv2 = self.Xv1.view(B, T, H, D)
        self.Xv3 = self.Xv2.permute(0, 2, 1, 3).contiguous()
        self.Xvflattened = self.x_flattened

        self.XkT = self.Xk3.permute(0, 1, 3, 2).contiguous()    # (B, H, D, T)

        # Xqflattened / Xkflattened / Xvflattened alias x_flattened, so a
        # single entry covers all three.
        self._retain_grad(
            self.Xq1, self.Xq2, self.Xq3,
            self.Xk1, self.Xk2, self.Xk3,
            self.Xv0, self.Xv1, self.Xv2, self.Xv3,
            self.x_flattened,
            self.XkT,
        )
        return self.Xq3, self.XkT, self.Xv3

    def self_attention(self, q, kT, v):
        """Scaled dot-product attention over already-split heads.

        Args:
            q: Queries of shape (B, H, T, D).
            kT: Transposed keys of shape (B, H, D, T).
            v: Values of shape (B, H, T, D).

        Returns:
            Tensor of shape (B, T, H*D) with the heads merged back together.
        """
        B, H, T, D = q.shape
        scale = D ** 0.5
        self.attn_scores = torch.matmul(q, kT) / scale                          # (B, H, T, T)
        self.v = v
        if self.causal:
            # Build the mask in the score dtype so the sum does not promote
            # the scores to a different precision than ``v``.
            mask = self.create_causal_mask(T, q.device, dtype=self.attn_scores.dtype)  # (1, 1, T, T)
            self.attn_scores = self.attn_scores + mask                          # broadcasted
        self.attn_weights = F.softmax(self.attn_scores, 3)                      # (B, H, T, T)
        self.attn_output = torch.matmul(self.attn_weights, self.v)              # (B, H, T, D)
        self.permute_output = self.attn_output.permute(0, 2, 1, 3).contiguous() # (B, T, H, D)
        self.output = self.permute_output.view(B, T, H * D)                     # (B, T, E)
        self._retain_grad(
            self.v,
            self.attn_scores,
            self.attn_weights,
            self.attn_output,
            self.permute_output,
            self.output,
        )
        return self.output

    def forward(self, x):
        """Run self-attention on ``x`` of shape (B, T, E); returns shape (B, T, E)."""
        B, T, E = x.shape
        q, kT, v = self.project_to_query_key_value(x)
        out = self.self_attention(q, kT, v)
        out = self.out_projection(out.view(B * T, E)).view(B, T, E)
        return out

In [2]:
import numpy as np
import torch
import os

def load_numpy_array(arr_path):
    """Read the array stored at ``arr_path`` with ``np.load`` and return it as float32."""
    with open(arr_path, 'rb') as handle:
        loaded = np.load(handle)
    return loaded.astype(np.float32)

def test_multihead_attention_student(batch_size, queries_len, n_embd, num_heads, p_dropout):
    """Check ``PytorchMultiHeadAttention`` against stored reference arrays.

    Fixture files are read from ``./tests/data/multihead_attention`` and are
    named ``<batch_size>_<queries_len>_<n_embd>_<num_heads>_<suffix>.npy``.
    The stored weights and weight gradients are transposed before being
    copied into / compared with ``nn.Linear.weight``.

    Args:
        batch_size, queries_len, n_embd, num_heads: Select the fixture set;
            ``n_embd`` and ``num_heads`` also configure the layer.
        p_dropout: Dropout probability forwarded to the layer constructor.

    Raises:
        AssertionError: If the forward output or any gradient differs from the
            reference by more than atol=rtol=1e-5.
    """
    test_dir = './tests/data/multihead_attention'
    test_str = '_'.join(map(str, (batch_size, queries_len, n_embd, num_heads)))

    def load(suffix):
        return load_numpy_array(os.path.join(test_dir, f'{test_str}_{suffix}.npy'))

    projections = ('q', 'k', 'v', 'out')

    # Inputs and weights.
    data = load('data')
    weights = {name: load(f'w_{name}') for name in projections}

    # Expected forward result and gradients.
    result_ = load('result')
    x_grad = load('x_grad')
    weight_grads = {name: load(f'w_{name}_grad') for name in projections}

    # The input must require grad so that X.grad is populated by backward().
    X = torch.from_numpy(data).requires_grad_(True)

    # Forward p_dropout instead of silently falling back to the class default.
    layer = PytorchMultiHeadAttention(
        n_embd, num_heads, causal=True, p_dropout=p_dropout, bias=False
    )

    # Load the provided weights into the four projections.
    with torch.no_grad():
        for name in projections:
            projection = getattr(layer, f'{name}_projection')
            projection.weight.copy_(torch.from_numpy(weights[name].T))

    # Forward pass.
    result = layer(X)
    np.testing.assert_allclose(result.detach().numpy(), result_, atol=1e-5, rtol=1e-5)

    # Backward pass from a scalar so that every gradient is defined.
    result.sum().backward()

    np.testing.assert_allclose(X.grad.numpy(), x_grad, atol=1e-5, rtol=1e-5)
    for name in projections:
        projection = getattr(layer, f'{name}_projection')
        np.testing.assert_allclose(
            projection.weight.grad.numpy(), weight_grads[name].T, atol=1e-5, rtol=1e-5
        )
    print("All tests passed successfully!")

In [3]:
# Run the PyTorch reference check with batch_size=128, queries_len=32,
# n_embd=256, num_heads=8, p_dropout=0.0.
# The function has no return statement, so `m` is bound to None.
m = test_multihead_attention_student(128,32,256,8,0.0)

All tests passed successfully!


In [4]:
from minitorch import MultiHeadAttention
import minitorch
from minitorch.cuda_kernel_ops import CudaKernelOps
import numpy as np
from minitorch.tensor import tensor, tensor_from_numpy
from minitorch.module import Module, Parameter
from minitorch.tensor_ops import *


# Tensor backend built from CudaKernelOps; it is passed to the minitorch
# tensors and attention layers created in the cells below.
backend = minitorch.TensorBackend(CudaKernelOps)
def test_multihead_attention_student_minitorch(batch_size, queries_len, n_embd, num_heads, p_dropout, backend):
    """Check ``minitorch.MultiHeadAttention`` against the stored reference arrays.

    Reads the same fixture set as the PyTorch check, named
    ``<batch_size>_<queries_len>_<n_embd>_<num_heads>_<suffix>.npy`` under
    ``./tests/data/multihead_attention``. The forward output and the
    output-projection weight gradient are asserted; the remaining reference
    gradients are loaded but not compared.
    """
    test_dir = f'./tests/data/multihead_attention'
    test_str = '_'.join(map(str, (batch_size, queries_len, n_embd, num_heads)))

    def fixture(suffix):
        return load_numpy_array(os.path.join(test_dir, f'{test_str}_{suffix}.npy'))

    projection_names = ('q', 'k', 'v', 'out')

    data = fixture('data')
    weights = {name: fixture(f'w_{name}') for name in projection_names}
    result_ = fixture('result')
    x_grad = fixture('x_grad')
    weight_grads = {name: fixture(f'w_{name}_grad') for name in projection_names}

    X = minitorch.tensor_from_numpy(data, backend, True)

    layer = minitorch.MultiHeadAttention(n_embd, num_heads, True, p_dropout, bias=False, backend=backend)

    # The stored weights are assigned as-is (no transpose) to each projection.
    for name in projection_names:
        projection = getattr(layer, f'{name}_projection')
        projection.weights.value = minitorch.tensor_from_numpy(
            weights[name], backend=backend, requires_grad=True
        )

    result = layer(X)
    np.testing.assert_allclose(result.to_numpy(), result_, atol=1e-5, rtol=1e-5)

    result.sum().backward()

    np.testing.assert_allclose(
        layer.out_projection.weights.value.grad.to_numpy(),
        weight_grads['out'],
        atol=1e-5,
        rtol=1e-5,
    )
    # Disabled checks, kept for reference:
    # np.testing.assert_allclose(X.grad.to_numpy(), x_grad, atol=1e-5, rtol=1e-5)
    # NOTE(review): the next check pairs the v-projection gradient with the
    # q reference; confirm which reference is intended before enabling it.
    # np.testing.assert_allclose(layer.v_projection.weights.value.grad.to_numpy(), weight_grads['q'], atol=1e-5, rtol=1e-5)


In [5]:
# Run the minitorch check on the same fixture set as the PyTorch check
# (128, 32, 256, 8) with p_dropout=0.0.
# The function has no return statement, so `m2` is bound to None.
m2 = test_multihead_attention_student_minitorch(128,32,256,8,0.0,backend)

In [6]:
# Sanity check that gradients flow back through minitorch's permute:
# summing a permuted tensor should give every input element a gradient of 1.
x = tensor_from_numpy(np.random.randn(2, 3, 4),backend, requires_grad=True)
y = x.permute(0, 2, 1)  # now shape (2, 4, 3)
out = y.sum()
out.backward()

print(x.grad)  # should be all ones, same shape as x


[
	[
		[1.000000 1.000000 1.000000 1.000000]
		[1.000000 1.000000 1.000000 1.000000]
		[1.000000 1.000000 1.000000 1.000000]]
	[
		[1.000000 1.000000 1.000000 1.000000]
		[1.000000 1.000000 1.000000 1.000000]
		[1.000000 1.000000 1.000000 1.000000]]]


In [7]:
# Build a minitorch and a PyTorch attention layer with identical weights and
# an identical input, run forward + backward on both, and leave the layers in
# the namespace so their gradients can be inspected in the following cells.
heads = 2
np.random.seed(10)
# NOTE: despite the name, this is a seeded standard-normal 10x10 matrix, not ones.
np10x10_ones = np.random.normal(0,1,(10,10)).astype(np.float32)
# np_ones  = np.random.normal(0,1,(1,5,10)).astype(np.float32)
# Input of shape (1, 5, 10) filled with ones.
np_ones = np.ones((1,5,10)).astype(np.float32)

# Positional arguments presumably follow (n_embd, n_head, causal, p_dropout, bias, backend),
# mirroring the keyword call in test_multihead_attention_student_minitorch -- TODO confirm.
minitorchAttention=MultiHeadAttention(10, heads, True, 0.0, False, backend)  
# All four projections share the same weight matrix.
minitorchAttention.q_projection.weights.value = tensor_from_numpy(np10x10_ones, backend, True)
minitorchAttention.k_projection.weights.value = tensor_from_numpy(np10x10_ones, backend, True)
minitorchAttention.v_projection.weights.value = tensor_from_numpy(np10x10_ones, backend, True)
minitorchAttention.out_projection.weights.value = tensor_from_numpy(np10x10_ones, backend, True)
x =  tensor_from_numpy(np_ones, backend, True)
y=minitorchAttention(x)
y.sum().backward()

pytorchAttention = PytorchMultiHeadAttention(10, heads, causal=True, p_dropout=0.0, bias=False)
# The PyTorch layer receives the transposed matrix; presumably minitorch stores
# its weights as (in, out) while nn.Linear.weight is (out, in) -- TODO confirm.
with torch.no_grad():
    pytorchAttention.q_projection.weight.copy_(torch.from_numpy(np10x10_ones.T))
    pytorchAttention.k_projection.weight.copy_(torch.from_numpy(np10x10_ones.T))
    pytorchAttention.v_projection.weight.copy_(torch.from_numpy(np10x10_ones.T))
    pytorchAttention.out_projection.weight.copy_(torch.from_numpy(np10x10_ones.T))
inputTensor = torch.from_numpy(np_ones).requires_grad_(True)
# inputTensor.retain_grad()
pytorch_y=pytorchAttention(inputTensor)
pytorch_y.sum().backward()

In [8]:
minitorchAttention.v_projection.weights.value.grad


[
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]
	[2.990415 8.881533 17.826761 4.652535 -8.729689 21.574926 -8.850663 8.025425 -9.896193 3.233282]]

In [9]:
pytorchAttention.v_projection.weight.grad.T

tensor([[ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  8.8815, 17.8268,  4.6525, -8.7297, 21.5749, -8.8507,  8.0254,
         -9.8962,  3.2333],
        [ 2.9904,  

In [10]:
# NOTE(review): in the recorded run this raised AttributeError because
# `minitorchAttention.Xqflattened` evaluated to None, so there was no `.grad` to read.
minitorchAttention.Xqflattened.grad

AttributeError: 'NoneType' object has no attribute 'grad'

In [None]:
minitorchAttention.x_flattened.grad

In [None]:
minitorchAttention.v_projection.weights.value.shape[0]

In [None]:
minitorchAttention.Xv1.grad

In [None]:
pytorchAttention.Xv2.grad.stride()

In [None]:
pytorchAttention.Xv1.grad

In [None]:
# minitorchAttention.Xv2.grad.view(1,5,10)


In [None]:
pytorchAttention.Xv2.grad.shape

In [None]:
pytorchAttention.Xv1.grad.shape

In [None]:
# Print the raw strides and the shape of the minitorch layer's `Xv3` tensor
# (presumably the counterpart of the PyTorch layer's `Xv3` -- TODO confirm).
t = minitorchAttention.Xv3
print(t._tensor._strides)
print(t.shape)

In [None]:
# Print the strides and shape of the PyTorch layer's `Xv3`, i.e. the value
# projection after the (0, 2, 1, 3) permute and `.contiguous()`.
u = pytorchAttention.Xv3
print(u.stride())
print(u.shape)

In [11]:
from minitorch import DecoderLM

In [12]:
import datasets

In [13]:
from datasets import list_datasets

In [14]:
a = list_datasets()

In [15]:
a

['nvidia/OpenMathReasoning',
 'nvidia/Nemotron-CrossThink',
 'nvidia/OpenCodeReasoning',
 'rajpurkarlab/ReXGradient-160K',
 'OpenGVLab/InternVL-Data',
 'deepseek-ai/DeepSeek-ProverBench',
 'nyuuzyou/svgfind',
 'fka/awesome-chatgpt-prompts',
 'Eureka-Lab/PHYBench',
 'nvidia/Llama-Nemotron-Post-Training-Dataset',
 'nvidia/When2Call',
 'BramVanroy/CommonCrawl-CreativeCommons',
 'FreedomIntelligence/medical-o1-reasoning-SFT',
 'Anthropic/values-in-the-wild',
 'HuggingFaceFW/fineweb',
 'Amod/mental_health_counseling_conversations',
 'zwhe99/DeepMath-103K',
 'openai/gsm8k',
 'open-r1/OpenR1-Math-220k',
 'LLM360/MegaMath',
 'Giova-tech/sentiment-analysis-test',
 'Mxode/Chinese-Instruct',
 'syCen/CameraBench',
 'ZennyKenny/synthetic_vc_financial_decisions_reasoning_dataset',
 'reyavir/PromptEvals',
 'Aleph-Alpha/Aleph-Alpha-GermanWeb',
 'qwertychri/sentiment-analysis-test',
 'nvidia/dynpose-100k',
 'nvidia/describe-anything-dataset',
 'quotientai/HalluMix',
 'Felipeit/sentiment-analysis-test',

In [16]:
len(a)

378777

In [17]:
# NOTE(review): the recorded run failed here with FileNotFoundError -- the
# dataset was found neither as a local script nor on the Hugging Face Hub.
datasets.load_dataset('nvidia/OpenMathReasoning', split='train')

FileNotFoundError: Couldn't find a dataset script at /home/alien/projects/llmsys_s25_hw2/nvidia/OpenMathReasoning/OpenMathReasoning.py or any data file in the same directory. Couldn't find 'nvidia/OpenMathReasoning' on the Hugging Face Hub either: FileNotFoundError: Dataset 'nvidia/OpenMathReasoning' doesn't exist on the Hub. If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`.

In [22]:
from datasets import load_dataset

# Load the dataset named "iwslt14-de-en-preprocess"; the recorded run prepared
# config "de-en" and generated train, test and validation splits.
ds = load_dataset("iwslt14-de-en-preprocess")

Downloading and preparing dataset iwslt14-de-en-preprocess/de-en to /home/alien/.cache/huggingface/datasets/iwslt14-de-en-preprocess/de-en/1.0.0/c16e61361ef4d92649321b04532f0d57c5a783ad7fd32afe4a9649a6b6107f8a...


Downloading data:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset iwslt14-de-en-preprocess downloaded and prepared to /home/alien/.cache/huggingface/datasets/iwslt14-de-en-preprocess/de-en/1.0.0/c16e61361ef4d92649321b04532f0d57c5a783ad7fd32afe4a9649a6b6107f8a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]