
## Load Weights in C


In [1]:

import torch


In [2]:

import numpy as np
from transformers import GPT2LMHeadModel

# Load GPT-2 model
model_name = "gpt2"  # You can also use "gpt2-medium" or other variants
model      = GPT2LMHeadModel.from_pretrained(model_name)

# Access the weights of the final linear layer (logits projection).
# PyTorch stores lm_head.weight as (vocab_size, hidden_size). The single-layer C
# programs in this notebook compute `input @ W` and index the matrix as
# W[k * output_dim + j], i.e. they expect a row-major (hidden_size, vocab_size)
# array, so transpose before writing.
weights = np.ascontiguousarray(model.lm_head.weight.detach().numpy().T, dtype=np.float32)
assert weights.shape == (model.config.n_embd, model.config.vocab_size)

# Save the weights to a raw float32 binary file (no header, C order)
weights.tofile("gpt2_weights.bin")



In [3]:

import numpy as np
from transformers import GPT2LMHeadModel


def _as_numpy(param):
    """Return a model parameter's values as a NumPy array detached from autograd."""
    return param.detach().numpy()


# Load GPT-2 model
model_name = "gpt2"  # Or "gpt2-medium", "gpt2-large", etc.
model = GPT2LMHeadModel.from_pretrained(model_name)
transformer = model.transformer

# Tensors are written to disk in this dict's insertion order, so the order of the
# entries below IS the file layout: embeddings, then each block, then ln_f, then lm_head.
weights = {
    'token_embedding': _as_numpy(transformer.wte.weight),
    'position_embedding': _as_numpy(transformer.wpe.weight),
}

# Per-block tensors: fused QKV, attention projection, the two MLP matrices and
# both layer norms (weights only for the linear layers; their biases are not exported).
for i, block in enumerate(transformer.h):
    prefix = f"block_{i}"
    weights.update({
        f"{prefix}_attn_qkv": _as_numpy(block.attn.c_attn.weight),
        f"{prefix}_attn_proj": _as_numpy(block.attn.c_proj.weight),
        f"{prefix}_mlp_fc": _as_numpy(block.mlp.c_fc.weight),
        f"{prefix}_mlp_proj": _as_numpy(block.mlp.c_proj.weight),
        f"{prefix}_ln1_weight": _as_numpy(block.ln_1.weight),
        f"{prefix}_ln1_bias": _as_numpy(block.ln_1.bias),
        f"{prefix}_ln2_weight": _as_numpy(block.ln_2.weight),
        f"{prefix}_ln2_bias": _as_numpy(block.ln_2.bias),
    })

# Final layer normalization
weights['ln_f_weight'] = _as_numpy(transformer.ln_f.weight)
weights['ln_f_bias'] = _as_numpy(transformer.ln_f.bias)

# Final linear layer (logits projection)
weights['lm_head'] = _as_numpy(model.lm_head.weight)

# Record format: the tensor's shape as int32 values (one per dimension, no rank
# prefix), immediately followed by its data as float32.
with open("gpt2_weights_ALL.bin", "wb") as f:
    for tensor in weights.values():
        np.array(tensor.shape, dtype=np.int32).tofile(f)
        tensor.astype(np.float32).tofile(f)




## C code


In [None]:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Matrix multiplication
// Dense row-major product C = A * B, where A is M x N, B is N x K and C is M x K.
// Each element of C is reset to zero and then accumulated in place.
void matmul(float* A, float* B, float* C, int M, int N, int K) {
    for (int row = 0; row < M; ++row) {
        float* a_row = A + row * N;
        float* c_row = C + row * K;
        for (int col = 0; col < K; ++col) {
            c_row[col] = 0;
            for (int inner = 0; inner < N; ++inner) {
                c_row[col] += a_row[inner] * B[inner * K + col];
            }
        }
    }
}

// Simplified forward pass
// Single-layer forward pass: treats `input` as one row of `input_dim` values and
// multiplies it by `weights` (input_dim x output_dim, row-major) into `output`.
void forward(float* input, float* weights, float* output, int input_dim, int output_dim) {
    const int batch_rows = 1;  // one input vector per call
    matmul(input, weights, output, batch_rows, input_dim, output_dim);
}

// Loads the projection matrix from gpt2_weights.bin, pushes an all-ones vector
// through it and prints the first ten outputs. Returns 1 on any I/O or
// allocation failure instead of dereferencing a NULL pointer.
int main() {
    int input_dim = 768;    // GPT hidden size
    int output_dim = 50257; // Vocabulary size
    size_t weight_count = (size_t)input_dim * (size_t)output_dim;

    // Load weights: raw float32 values, read as a row-major input_dim x output_dim matrix
    FILE* weight_file = fopen("gpt2_weights.bin", "rb");
    if (!weight_file) {
        fprintf(stderr, "Failed to open gpt2_weights.bin\n");
        return 1;
    }

    // Both large buffers live on the heap; the output alone is ~200 KB.
    float* weights = malloc(weight_count * sizeof(float));
    float* output = malloc((size_t)output_dim * sizeof(float));
    if (!weights || !output) {
        fprintf(stderr, "Out of memory\n");
        fclose(weight_file);
        free(weights);
        free(output);
        return 1;
    }

    size_t values_read = fread(weights, sizeof(float), weight_count, weight_file);
    fclose(weight_file);
    if (values_read != weight_count) {
        fprintf(stderr, "gpt2_weights.bin is too short: read %zu of %zu floats\n",
                values_read, weight_count);
        free(weights);
        free(output);
        return 1;
    }

    // Input vector
    float input[input_dim];
    for (int i = 0; i < input_dim; i++) input[i] = 1.0f;  // Example input

    forward(input, weights, output, input_dim, output_dim);

    // Print the first ten outputs (not sorted, so not the "top" predictions)
    for (int i = 0; i < 10; i++) {
        printf("Output[%d]: %f\n", i, output[i]);
    }

    free(weights);
    free(output);
    return 0;
}



## ALL weights


In [None]:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Constants
#define MAX_SEQ_LEN 1024
#define DIM 768
#define NUM_HEADS 12
#define NUM_BLOCKS 12
#define VOCAB_SIZE 50257

// Helper: Matrix multiplication
// Helper: dense row-major product C = A * B with A (M x N), B (N x K), C (M x K).
// Each element of C is reset to zero and then accumulated in place.
void matmul(float* A, float* B, float* C, int M, int N, int K) {
    for (int row = 0; row < M; ++row) {
        float* a_row = A + row * N;
        float* c_row = C + row * K;
        for (int col = 0; col < K; ++col) {
            c_row[col] = 0;
            for (int inner = 0; inner < N; ++inner) {
                c_row[col] += a_row[inner] * B[inner * K + col];
            }
        }
    }
}

// Helper: Layer normalization
void layer_norm(float* input, float* output, float* weight, float* bias, int dim) {
    float mean = 0, variance = 0;
    for (int i = 0; i < dim; i++) mean += input[i];
    mean /= dim;

    for (int i = 0; i < dim; i++) variance += (input[i] - mean) * (input[i] - mean);
    variance /= dim;

    float epsilon = 1e-5;
    for (int i = 0; i < dim; i++) {
        output[i] = (input[i] - mean) / sqrt(variance + epsilon) * weight[i] + bias[i];
    }
}

// Helper: Self-attention
void self_attention(float* qkv, float* output, int seq_len, int head_dim, int n_heads) {
    int dim = head_dim * n_heads;
    float* Q = qkv;
    float* K = qkv + seq_len * dim;
    float* V = qkv + 2 * seq_len * dim;

    float scores[seq_len * seq_len];
    float attention[seq_len * seq_len];

    // Compute scores
    matmul(Q, K, scores, seq_len, head_dim, seq_len);
    for (int i = 0; i < seq_len * seq_len; i++) scores[i] /= sqrt(head_dim);

    // Softmax
    for (int i = 0; i < seq_len; i++) {
        float sum = 0;
        for (int j = 0; j < seq_len; j++) {
            scores[i * seq_len + j] = exp(scores[i * seq_len + j]);
            sum += scores[i * seq_len + j];
        }
        for (int j = 0; j < seq_len; j++) {
            attention[i * seq_len + j] = scores[i * seq_len + j] / sum;
        }
    }

    // Compute attention output
    matmul(attention, V, output, seq_len, seq_len, head_dim);
}

// Forward pass for one block
// Forward pass for one transformer block (pre-norm residual form):
//
//   resid  = input + proj( attention( ln1(input) ) )
//   output = resid + proj_fc( fc( ln2(resid) ) )
//
// `input` and `output` are [seq_len x dim] row-major. Layer norm is applied to
// every token row separately, and the residual stream is kept in its own buffer
// so the second residual adds the un-normalised activations.
// All scratch space is one zero-initialised heap allocation, so large seq_len
// values do not overflow the stack.
//
// NOTE(review): the feed-forward part is kept as in the original sketch, two
// dim x dim products with no activation and no biases. GPT-2's real MLP is
// dim -> 4*dim, GELU, 4*dim -> dim with biases; this signature carries no bias
// pointers, so that cannot be expressed here without changing the interface.
void forward_block(float* input, float* output, float* qkv_weight, float* proj_weight, float* fc_weight, float* proj_fc_weight, float* ln1_weight, float* ln1_bias, float* ln2_weight, float* ln2_bias, int seq_len, int dim) {
    if (seq_len <= 0 || dim <= 0) return;

    const size_t n = (size_t)seq_len * (size_t)dim;

    // Layout of the scratch arena: normed (n) | qkv (3n) | attn_output (n) | resid (n) | fc_output (n)
    float* scratch = (float*)calloc(7 * n, sizeof(float));
    if (!scratch) {
        fprintf(stderr, "forward_block: out of memory\n");
        exit(1);
    }
    float* normed = scratch;
    float* qkv = normed + n;
    float* attn_output = qkv + 3 * n;
    float* resid = attn_output + n;
    float* fc_output = resid + n;

    // Layer norm 1, one token row at a time
    for (int t = 0; t < seq_len; t++) {
        layer_norm(input + (size_t)t * dim, normed + (size_t)t * dim, ln1_weight, ln1_bias, dim);
    }

    // Self-attention
    matmul(normed, qkv_weight, qkv, seq_len, dim, 3 * dim);
    self_attention(qkv, attn_output, seq_len, dim / NUM_HEADS, NUM_HEADS);

    // Project attention output and add the first residual
    matmul(attn_output, proj_weight, resid, seq_len, dim, dim);
    for (size_t i = 0; i < n; i++) resid[i] += input[i];

    // Layer norm 2 into a separate buffer so `resid` stays intact
    for (int t = 0; t < seq_len; t++) {
        layer_norm(resid + (size_t)t * dim, normed + (size_t)t * dim, ln2_weight, ln2_bias, dim);
    }

    // Feedforward
    matmul(normed, fc_weight, fc_output, seq_len, dim, dim);
    matmul(fc_output, proj_fc_weight, output, seq_len, dim, dim);

    // Add the second residual (the un-normalised stream)
    for (size_t i = 0; i < n; i++) output[i] += resid[i];

    free(scratch);
}

// Forward pass for GPT
// Forward pass for GPT.
//
//   input              : seq_len token ids, stored as floats (converted to int here)
//   token_embedding    : [VOCAB_SIZE x dim] row-major
//   position_embedding : [MAX_SEQ_LEN x dim] row-major
//   block_weights[b]   : the eight tensors of block b, in forward_block's argument order
//
// Embeds the tokens, runs NUM_BLOCKS transformer blocks while swapping two heap
// buffers, then projects to vocabulary logits.
//
// NOTE(review): the signature has no output parameter, so the logits are computed
// and then released; expose them through a new parameter if callers need them.
// NOTE(review): matmul reads lm_head as a [dim x VOCAB_SIZE] row-major matrix;
// confirm the weight file stores it in that orientation. No final layer norm is
// applied because no ln_f parameters are passed in.
void gpt_forward(float* input, float* token_embedding, float* position_embedding, float* lm_head, float* block_weights[][8], int seq_len, int dim) {
    if (seq_len <= 0 || seq_len > MAX_SEQ_LEN || dim <= 0) {
        fprintf(stderr, "gpt_forward: invalid seq_len=%d or dim=%d\n", seq_len, dim);
        return;
    }

    const size_t n = (size_t)seq_len * (size_t)dim;
    float* hidden = (float*)malloc(n * sizeof(float));
    float* next = (float*)malloc(n * sizeof(float));
    // seq_len * VOCAB_SIZE floats is far too large for the stack
    float* logits = (float*)malloc((size_t)seq_len * VOCAB_SIZE * sizeof(float));
    if (!hidden || !next || !logits) {
        fprintf(stderr, "gpt_forward: out of memory\n");
        free(hidden);
        free(next);
        free(logits);
        return;
    }

    // Token and position embeddings
    for (int i = 0; i < seq_len; i++) {
        const int token = (int)input[i];
        if (token < 0 || token >= VOCAB_SIZE) {
            fprintf(stderr, "gpt_forward: token id %d at position %d is out of range\n", token, i);
            free(hidden);
            free(next);
            free(logits);
            return;
        }
        for (int j = 0; j < dim; j++) {
            hidden[(size_t)i * dim + j] = token_embedding[(size_t)token * dim + j] +
                                          position_embedding[(size_t)i * dim + j];
        }
    }

    // Transformer blocks: each block reads `hidden` and writes `next`, then the
    // two pointers trade places so `hidden` always holds the latest activations.
    for (int i = 0; i < NUM_BLOCKS; i++) {
        forward_block(hidden, next, block_weights[i][0], block_weights[i][1], block_weights[i][2], block_weights[i][3], block_weights[i][4], block_weights[i][5], block_weights[i][6], block_weights[i][7], seq_len, dim);
        float* temp = hidden;
        hidden = next;
        next = temp;
    }

    // Final projection
    matmul(hidden, lm_head, logits, seq_len, dim, VOCAB_SIZE);

    free(hidden);
    free(next);
    free(logits);
}

// Load weights from file
// Reads one tensor record as written by the notebook's full export cell: `ndim`
// int32 extents followed by the float32 payload. Verifies that the element count
// matches `expected_count`, then returns a freshly malloc'ed buffer owned by the
// caller. Any mismatch, short read or allocation failure aborts the program.
// Assumes `int` is 32 bits wide, matching the np.int32 headers.
static float* read_tensor(FILE* file, int ndim, size_t expected_count, const char* label) {
    int shape[2] = {1, 1};
    if (ndim < 1 || ndim > 2 || fread(shape, sizeof(int), (size_t)ndim, file) != (size_t)ndim) {
        fprintf(stderr, "Failed to read shape header for %s.\n", label);
        exit(1);
    }

    size_t count = 1;
    for (int d = 0; d < ndim; d++) {
        if (shape[d] <= 0) {
            fprintf(stderr, "Invalid shape header for %s.\n", label);
            exit(1);
        }
        count *= (size_t)shape[d];
    }
    if (count != expected_count) {
        fprintf(stderr, "Unexpected size for %s: file has %zu values, expected %zu.\n",
                label, count, expected_count);
        exit(1);
    }

    float* data = (float*)malloc(count * sizeof(float));
    if (!data) {
        fprintf(stderr, "Out of memory while loading %s.\n", label);
        exit(1);
    }
    if (fread(data, sizeof(float), count, file) != count) {
        fprintf(stderr, "Weights file ended early while reading %s.\n", label);
        exit(1);
    }
    return data;
}

// Load weights from file.
//
// Expects the record stream produced by the export cell that writes
// gpt2_weights_ALL.bin: token embedding, position embedding, eight tensors per
// block (fused QKV, attention projection, MLP fc, MLP projection, ln1 weight/bias,
// ln2 weight/bias), the final layer norm weight/bias, then lm_head. Every record
// is a shape header followed by float32 data.
//
// Outputs are malloc'ed; the caller frees them. block_weights[i][0..7] follow the
// order above. The final layer norm is read and discarded because this interface
// has nowhere to return it. Exits with status 1 on any failure.
void load_weights(const char* filename, float** token_embedding, float** position_embedding, float** lm_head, float* block_weights[][8], int num_blocks) {
    // Rank and element count of each per-block tensor, in file order
    static const int block_rank[8] = {2, 2, 2, 2, 1, 1, 1, 1};
    const size_t block_count[8] = {
        (size_t)DIM * 3 * DIM,  // fused QKV weight
        (size_t)DIM * DIM,      // attention output projection
        (size_t)DIM * 4 * DIM,  // MLP expansion
        (size_t)4 * DIM * DIM,  // MLP projection
        DIM, DIM, DIM, DIM      // ln1 weight, ln1 bias, ln2 weight, ln2 bias
    };

    FILE* file = fopen(filename, "rb");
    if (!file) {
        fprintf(stderr, "Failed to open weights file.\n");
        exit(1);
    }

    // Token and position embeddings
    *token_embedding = read_tensor(file, 2, (size_t)DIM * VOCAB_SIZE, "token_embedding");
    *position_embedding = read_tensor(file, 2, (size_t)MAX_SEQ_LEN * DIM, "position_embedding");

    // Transformer block weights
    for (int i = 0; i < num_blocks; i++) {
        for (int j = 0; j < 8; j++) {
            block_weights[i][j] = read_tensor(file, block_rank[j], block_count[j], "block tensor");
        }
    }

    // Final layer norm sits between the blocks and lm_head in the file; consume it
    // so the stream stays aligned.
    free(read_tensor(file, 1, DIM, "ln_f_weight"));
    free(read_tensor(file, 1, DIM, "ln_f_bias"));

    // Logits projection
    *lm_head = read_tensor(file, 2, (size_t)DIM * VOCAB_SIZE, "lm_head");

    fclose(file);
}

// Loads the full GPT-2 weight dump, runs one forward pass over an 8-token
// example sequence and releases every buffer.
int main() {
    int seq_len = 8;  // Example sequence length
    int dim = DIM;

    // Weight pointers, filled in by load_weights
    float* token_embedding;
    float* position_embedding;
    float* lm_head;
    float* block_weights[NUM_BLOCKS][8];

    // Load weights from the full dump written by the export cell
    // (gpt2_weights.bin only contains the lm_head matrix).
    load_weights("gpt2_weights_ALL.bin", &token_embedding, &position_embedding, &lm_head, block_weights, NUM_BLOCKS);

    // Example input: token IDs, stored as floats because gpt_forward takes float*
    float input[MAX_SEQ_LEN] = {0, 1, 2, 3, 4, 5, 6, 7};

    // Perform GPT forward pass
    gpt_forward(input, token_embedding, position_embedding, lm_head, block_weights, seq_len, dim);

    printf("GPT Forward Pass Complete.\n");

    // Free allocated memory
    free(token_embedding);
    free(position_embedding);
    free(lm_head);
    for (int i = 0; i < NUM_BLOCKS; i++) {
        for (int j = 0; j < 8; j++) {
            free(block_weights[i][j]);
        }
    }

    return 0;
}




## Faster C (single layer, BLAS)


In [None]:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cblas.h>

// Matrix multiplication using BLAS
// Row-major product C = A * B delegated to BLAS sgemm, with A (M x N),
// B (N x K) and C (M x K). C is overwritten (beta = 0).
void matmul(float* A, float* B, float* C, int M, int N, int K) {
    const int lda = N;  // row stride of A
    const int ldb = K;  // row stride of B
    const int ldc = K;  // row stride of C
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                M, K, N,
                1.0, A, lda,
                B, ldb,
                0.0, C, ldc);
}

// Simplified feedforward pass of a single layer
// Single-layer feedforward pass: `input` is one row of `input_dim` values,
// `weights` is input_dim x output_dim (row-major), `output` gets output_dim values.
void feedforward(float* input, float* weights, float* output, int input_dim, int output_dim) {
    const int batch_rows = 1;  // one input vector per call
    matmul(input, weights, output, batch_rows, input_dim, output_dim);
}

// Load weights from a binary file
// Load `size` raw float32 values from a binary file.
// Returns a malloc'ed buffer the caller must free. Prints a message and exits
// with status 1 if the count is not positive, the file cannot be opened, memory
// runs out, or the file holds fewer than `size` values.
float* load_weights(const char* filename, int size) {
    if (size <= 0) {
        fprintf(stderr, "Error: invalid weight count %d for %s\n", size, filename);
        exit(1);
    }

    FILE* file = fopen(filename, "rb");
    if (!file) {
        fprintf(stderr, "Error: Unable to open file %s\n", filename);
        exit(1);
    }

    float* weights = malloc((size_t)size * sizeof(float));
    if (!weights) {
        fprintf(stderr, "Error: out of memory loading %s\n", filename);
        fclose(file);
        exit(1);
    }

    size_t values_read = fread(weights, sizeof(float), (size_t)size, file);
    fclose(file);
    if (values_read != (size_t)size) {
        fprintf(stderr, "Error: %s holds %zu floats, expected %d\n", filename, values_read, size);
        free(weights);
        exit(1);
    }
    return weights;
}

// Simple softmax function
void softmax(float* logits, int size) {
    float max = logits[0];
    for (int i = 1; i < size; i++) {
        if (logits[i] > max) max = logits[i];
    }
    float sum = 0.0;
    for (int i = 0; i < size; i++) {
        logits[i] = expf(logits[i] - max);  // Prevent overflow
        sum += logits[i];
    }
    for (int i = 0; i < size; i++) {
        logits[i] /= sum;
    }
}

// Loads the projection matrix, runs an all-ones vector through it, converts the
// result to probabilities and prints the first ten entries.
int main() {
    // Model dimensions
    int input_dim = 768;    // GPT hidden size
    int output_dim = 50257; // GPT vocabulary size

    // Load pre-trained weights (e.g., from PyTorch or custom binary format)
    float* weights = load_weights("gpt2_weights.bin", input_dim * output_dim);

    // Input vector (example input, normally generated from embeddings)
    float input[input_dim];
    for (int i = 0; i < input_dim; i++) input[i] = 1.0f;  // Example input

    // Output vector
    float* output = malloc((size_t)output_dim * sizeof(float));
    if (!output) {
        fprintf(stderr, "Error: out of memory for output vector\n");
        free(weights);
        return 1;
    }

    // Perform a forward pass
    feedforward(input, weights, output, input_dim, output_dim);

    // Apply softmax to logits
    softmax(output, output_dim);

    // Print the first ten entries (values are probabilities after the softmax)
    for (int i = 0; i < 10; i++) {
        printf("Logit[%d]: %f\n", i, output[i]);
    }

    // Free memory
    free(weights);
    free(output);

    return 0;
}


In [None]:

# Install the OpenBLAS development package needed for <cblas.h> and -lopenblas
sudo apt-get install libopenblas-dev

# Build the BLAS-backed program (links OpenBLAS and libm)
gcc -o gpt_optimized gpt_optimized.c -lopenblas -lm
# Build the plain-C program (links libm only)
gcc -o gpt_forward gpt_forward.c -lm


# Run the BLAS-backed build
./gpt_optimized


In [None]:

import torch

# Save weights in binary format.
# NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a source you trust.
# map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only machine.
model = torch.load("gpt2_model.pth", map_location="cpu")
weights = model["decoder"]["linear.weight"].detach().cpu().numpy()

# The C programs fread() the file as raw 4-byte floats, so force float32 regardless
# of the dtype the checkpoint was saved in.
with open("gpt2_weights.bin", "wb") as f:
    f.write(weights.astype("float32").tobytes())


In [None]:
'''

Logit[0]: 0.123456
Logit[1]: 0.098765
'''



## Faster C again (full model, BLAS)


In [None]:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cblas.h>  // Include BLAS header

// Constants
#define MAX_SEQ_LEN 1024
#define DIM 768
#define NUM_HEADS 12
#define NUM_BLOCKS 12
#define VOCAB_SIZE 50257

// Helper: Optimized matrix multiplication using BLAS
// Row-major product C = A * B delegated to BLAS sgemm, with A (M x N),
// B (N x K) and C (M x K). With alpha = 1 and beta = 0, C is overwritten.
void matmul(float* A, float* B, float* C, int M, int N, int K) {
    const int lda = N;  // row stride of A
    const int ldb = K;  // row stride of B
    const int ldc = K;  // row stride of C
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                M, K, N,
                1.0, A, lda,
                B, ldb,
                0.0, C, ldc);
}

// Helper: Layer normalization
void layer_norm(float* input, float* output, float* weight, float* bias, int dim) {
    float mean = 0, variance = 0;
    for (int i = 0; i < dim; i++) mean += input[i];
    mean /= dim;

    for (int i = 0; i < dim; i++) variance += (input[i] - mean) * (input[i] - mean);
    variance /= dim;

    float epsilon = 1e-5;
    for (int i = 0; i < dim; i++) {
        output[i] = (input[i] - mean) / sqrt(variance + epsilon) * weight[i] + bias[i];
    }
}

// Helper: Self-attention using BLAS
void self_attention(float* qkv, float* output, int seq_len, int head_dim, int n_heads) {
    int dim = head_dim * n_heads;
    float* Q = qkv;
    float* K = qkv + seq_len * dim;
    float* V = qkv + 2 * seq_len * dim;

    float scores[seq_len * seq_len];
    float attention[seq_len * seq_len];

    // Compute attention scores
    matmul(Q, K, scores, seq_len, head_dim, seq_len);
    for (int i = 0; i < seq_len * seq_len; i++) scores[i] /= sqrt(head_dim);

    // Apply softmax
    for (int i = 0; i < seq_len; i++) {
        float sum = 0;
        for (int j = 0; j < seq_len; j++) {
            scores[i * seq_len + j] = exp(scores[i * seq_len + j]);
            sum += scores[i * seq_len + j];
        }
        for (int j = 0; j < seq_len; j++) {
            attention[i * seq_len + j] = scores[i * seq_len + j] / sum;
        }
    }

    // Compute attention output
    matmul(attention, V, output, seq_len, seq_len, head_dim);
}

// Forward pass for one block
// Forward pass for one transformer block (pre-norm residual form):
//
//   resid  = input + proj( attention( ln1(input) ) )
//   output = resid + proj_fc( fc( ln2(resid) ) )
//
// `input` and `output` are [seq_len x dim] row-major. Layer norm is applied to
// every token row separately, and the residual stream is kept in its own buffer
// so the second residual adds the un-normalised activations.
// All scratch space is one zero-initialised heap allocation, so large seq_len
// values do not overflow the stack.
//
// NOTE(review): the feed-forward part is kept as in the original sketch, two
// dim x dim products with no activation and no biases. GPT-2's real MLP is
// dim -> 4*dim, GELU, 4*dim -> dim with biases; this signature carries no bias
// pointers, so that cannot be expressed here without changing the interface.
void forward_block(float* input, float* output, float* qkv_weight, float* proj_weight, float* fc_weight, float* proj_fc_weight, float* ln1_weight, float* ln1_bias, float* ln2_weight, float* ln2_bias, int seq_len, int dim) {
    if (seq_len <= 0 || dim <= 0) return;

    const size_t n = (size_t)seq_len * (size_t)dim;

    // Layout of the scratch arena: normed (n) | qkv (3n) | attn_output (n) | resid (n) | fc_output (n)
    float* scratch = (float*)calloc(7 * n, sizeof(float));
    if (!scratch) {
        fprintf(stderr, "forward_block: out of memory\n");
        exit(1);
    }
    float* normed = scratch;
    float* qkv = normed + n;
    float* attn_output = qkv + 3 * n;
    float* resid = attn_output + n;
    float* fc_output = resid + n;

    // Layer norm 1, one token row at a time
    for (int t = 0; t < seq_len; t++) {
        layer_norm(input + (size_t)t * dim, normed + (size_t)t * dim, ln1_weight, ln1_bias, dim);
    }

    // Self-attention
    matmul(normed, qkv_weight, qkv, seq_len, dim, 3 * dim);
    self_attention(qkv, attn_output, seq_len, dim / NUM_HEADS, NUM_HEADS);

    // Project attention output and add the first residual
    matmul(attn_output, proj_weight, resid, seq_len, dim, dim);
    for (size_t i = 0; i < n; i++) resid[i] += input[i];

    // Layer norm 2 into a separate buffer so `resid` stays intact
    for (int t = 0; t < seq_len; t++) {
        layer_norm(resid + (size_t)t * dim, normed + (size_t)t * dim, ln2_weight, ln2_bias, dim);
    }

    // Feedforward
    matmul(normed, fc_weight, fc_output, seq_len, dim, dim);
    matmul(fc_output, proj_fc_weight, output, seq_len, dim, dim);

    // Add the second residual (the un-normalised stream)
    for (size_t i = 0; i < n; i++) output[i] += resid[i];

    free(scratch);
}

// Forward pass for GPT
// Forward pass for GPT.
//
//   input              : seq_len token ids, stored as floats (converted to int here)
//   token_embedding    : [VOCAB_SIZE x dim] row-major
//   position_embedding : [MAX_SEQ_LEN x dim] row-major
//   block_weights[b]   : the eight tensors of block b, in forward_block's argument order
//
// Embeds the tokens, runs NUM_BLOCKS transformer blocks while swapping two heap
// buffers, then projects to vocabulary logits.
//
// NOTE(review): the signature has no output parameter, so the logits are computed
// and then released; expose them through a new parameter if callers need them.
// NOTE(review): matmul reads lm_head as a [dim x VOCAB_SIZE] row-major matrix;
// confirm the weight file stores it in that orientation. No final layer norm is
// applied because no ln_f parameters are passed in.
void gpt_forward(float* input, float* token_embedding, float* position_embedding, float* lm_head, float* block_weights[][8], int seq_len, int dim) {
    if (seq_len <= 0 || seq_len > MAX_SEQ_LEN || dim <= 0) {
        fprintf(stderr, "gpt_forward: invalid seq_len=%d or dim=%d\n", seq_len, dim);
        return;
    }

    const size_t n = (size_t)seq_len * (size_t)dim;
    float* hidden = (float*)malloc(n * sizeof(float));
    float* next = (float*)malloc(n * sizeof(float));
    // seq_len * VOCAB_SIZE floats is far too large for the stack
    float* logits = (float*)malloc((size_t)seq_len * VOCAB_SIZE * sizeof(float));
    if (!hidden || !next || !logits) {
        fprintf(stderr, "gpt_forward: out of memory\n");
        free(hidden);
        free(next);
        free(logits);
        return;
    }

    // Token and position embeddings
    for (int i = 0; i < seq_len; i++) {
        const int token = (int)input[i];
        if (token < 0 || token >= VOCAB_SIZE) {
            fprintf(stderr, "gpt_forward: token id %d at position %d is out of range\n", token, i);
            free(hidden);
            free(next);
            free(logits);
            return;
        }
        for (int j = 0; j < dim; j++) {
            hidden[(size_t)i * dim + j] = token_embedding[(size_t)token * dim + j] +
                                          position_embedding[(size_t)i * dim + j];
        }
    }

    // Transformer blocks: each block reads `hidden` and writes `next`, then the
    // two pointers trade places so `hidden` always holds the latest activations.
    for (int i = 0; i < NUM_BLOCKS; i++) {
        forward_block(hidden, next, block_weights[i][0], block_weights[i][1], block_weights[i][2], block_weights[i][3], block_weights[i][4], block_weights[i][5], block_weights[i][6], block_weights[i][7], seq_len, dim);
        float* temp = hidden;
        hidden = next;
        next = temp;
    }

    // Final projection
    matmul(hidden, lm_head, logits, seq_len, dim, VOCAB_SIZE);

    free(hidden);
    free(next);
    free(logits);
}

// Load weights from file
// Reads one tensor record as written by the notebook's full export cell: `ndim`
// int32 extents followed by the float32 payload. Verifies that the element count
// matches `expected_count`, then returns a freshly malloc'ed buffer owned by the
// caller. Any mismatch, short read or allocation failure aborts the program.
// Assumes `int` is 32 bits wide, matching the np.int32 headers.
static float* read_tensor(FILE* file, int ndim, size_t expected_count, const char* label) {
    int shape[2] = {1, 1};
    if (ndim < 1 || ndim > 2 || fread(shape, sizeof(int), (size_t)ndim, file) != (size_t)ndim) {
        fprintf(stderr, "Failed to read shape header for %s.\n", label);
        exit(1);
    }

    size_t count = 1;
    for (int d = 0; d < ndim; d++) {
        if (shape[d] <= 0) {
            fprintf(stderr, "Invalid shape header for %s.\n", label);
            exit(1);
        }
        count *= (size_t)shape[d];
    }
    if (count != expected_count) {
        fprintf(stderr, "Unexpected size for %s: file has %zu values, expected %zu.\n",
                label, count, expected_count);
        exit(1);
    }

    float* data = (float*)malloc(count * sizeof(float));
    if (!data) {
        fprintf(stderr, "Out of memory while loading %s.\n", label);
        exit(1);
    }
    if (fread(data, sizeof(float), count, file) != count) {
        fprintf(stderr, "Weights file ended early while reading %s.\n", label);
        exit(1);
    }
    return data;
}

// Load weights from file.
//
// Expects the record stream produced by the export cell that writes
// gpt2_weights_ALL.bin: token embedding, position embedding, eight tensors per
// block (fused QKV, attention projection, MLP fc, MLP projection, ln1 weight/bias,
// ln2 weight/bias), the final layer norm weight/bias, then lm_head. Every record
// is a shape header followed by float32 data.
//
// Outputs are malloc'ed; the caller frees them. block_weights[i][0..7] follow the
// order above. The final layer norm is read and discarded because this interface
// has nowhere to return it. Exits with status 1 on any failure.
void load_weights(const char* filename, float** token_embedding, float** position_embedding, float** lm_head, float* block_weights[][8], int num_blocks) {
    // Rank and element count of each per-block tensor, in file order
    static const int block_rank[8] = {2, 2, 2, 2, 1, 1, 1, 1};
    const size_t block_count[8] = {
        (size_t)DIM * 3 * DIM,  // fused QKV weight
        (size_t)DIM * DIM,      // attention output projection
        (size_t)DIM * 4 * DIM,  // MLP expansion
        (size_t)4 * DIM * DIM,  // MLP projection
        DIM, DIM, DIM, DIM      // ln1 weight, ln1 bias, ln2 weight, ln2 bias
    };

    FILE* file = fopen(filename, "rb");
    if (!file) {
        fprintf(stderr, "Failed to open weights file.\n");
        exit(1);
    }

    // Token and position embeddings
    *token_embedding = read_tensor(file, 2, (size_t)DIM * VOCAB_SIZE, "token_embedding");
    *position_embedding = read_tensor(file, 2, (size_t)MAX_SEQ_LEN * DIM, "position_embedding");

    // Transformer block weights
    for (int i = 0; i < num_blocks; i++) {
        for (int j = 0; j < 8; j++) {
            block_weights[i][j] = read_tensor(file, block_rank[j], block_count[j], "block tensor");
        }
    }

    // Final layer norm sits between the blocks and lm_head in the file; consume it
    // so the stream stays aligned.
    free(read_tensor(file, 1, DIM, "ln_f_weight"));
    free(read_tensor(file, 1, DIM, "ln_f_bias"));

    // Logits projection
    *lm_head = read_tensor(file, 2, (size_t)DIM * VOCAB_SIZE, "lm_head");

    fclose(file);
}

// Loads the full GPT-2 weight dump, runs one BLAS-backed forward pass over an
// 8-token example sequence and releases every buffer.
int main() {
    int seq_len = 8;  // Example sequence length
    int dim = DIM;

    // Configure OpenBLAS threads
    openblas_set_num_threads(4);

    // Weight pointers, filled in by load_weights
    float* token_embedding;
    float* position_embedding;
    float* lm_head;
    float* block_weights[NUM_BLOCKS][8];

    // Load weights from the full dump written by the export cell
    // (gpt2_weights.bin only contains the lm_head matrix).
    load_weights("gpt2_weights_ALL.bin", &token_embedding, &position_embedding, &lm_head, block_weights, NUM_BLOCKS);

    // Example input: token IDs. gpt_forward takes float*, so the ids are stored
    // as floats; passing an int array would reinterpret the bits.
    float input[MAX_SEQ_LEN] = {0, 1, 2, 3, 4, 5, 6, 7};

    // Perform GPT forward pass
    gpt_forward(input, token_embedding, position_embedding, lm_head, block_weights, seq_len, dim);

    printf("GPT Forward Pass Complete.\n");

    // Free allocated memory
    free(token_embedding);
    free(position_embedding);
    free(lm_head);
    for (int i = 0; i < NUM_BLOCKS; i++) {
        for (int j = 0; j < 8; j++) {
            free(block_weights[i][j]);
        }
    }

    return 0;
}



In [None]:

# Libraries must come after the source file: the linker resolves symbols left to
# right, so a -lopenblas placed before gpt.c can leave the cblas_* calls unresolved.
gcc -o gpt gpt.c -lopenblas -lm



In [None]:

# Run the binary produced by the gcc command in the previous cell
./gpt

