<a href="https://colab.research.google.com/github/profitmonk/TA-Lib.github.io/blob/main/Prod_10032024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import numpy as np

# Block Matrix Multiplier
class BlockMatrixMultiplier:
    def __init__(self, block_size, input_precision='INT8', accumulator_precision='INT32'):
        self.block_size = block_size  # Block size NxN
        self.input_precision = input_precision
        self.accumulator_precision = accumulator_precision

    def pad_matrix(self, matrix, target_shape):
        """
        Pads the matrix to the target shape.
        matrix: Input matrix to be padded
        target_shape: (rows, cols) - shape to pad the matrix to
        """
        original_shape = matrix.shape
        padded_matrix = np.zeros(target_shape, dtype=matrix.dtype)
        padded_matrix[:original_shape[0], :original_shape[1]] = matrix
        return padded_matrix

    def multiply_blocks(self, matrix_a, matrix_b):
        """
        Simulates block matrix multiplication for quantized inputs with padding.
        matrix_a: (MxK), matrix_b: (KxN), Block Size: (NxN)
        """
        # Determine original matrix dimensions
        original_m, original_k1 = matrix_a.shape
        original_k2, original_n = matrix_b.shape
        print("hey",original_k1,original_k2)
        assert original_k1 == original_k2, "Matrix dimensions do not match for multiplication."

        # Calculate the padded dimensions to make them multiples of the block size
        padded_m = ((original_m + self.block_size - 1) // self.block_size) * self.block_size
        padded_k = ((original_k1 + self.block_size - 1) // self.block_size) * self.block_size
        padded_n = ((original_n + self.block_size - 1) // self.block_size) * self.block_size

        # Pad the matrices
        matrix_a_padded = self.pad_matrix(matrix_a, (padded_m, padded_k))
        matrix_b_padded = self.pad_matrix(matrix_b, (padded_k, padded_n))

        # Initialize the padded result matrix
        #result_matrix_padded = np.zeros((padded_m, padded_n), dtype=self.accumulator_precision)
        result_matrix_padded = np.zeros((padded_m, padded_n), dtype=np.int32 if self.accumulator_precision == 'INT32' else np.float32) # Changed 'INT32' to np.int32
        # Convert matrices to the specified precision
        matrix_a_padded = matrix_a_padded.astype(np.int8 if self.input_precision == 'INT8' else np.float32)
        matrix_b_padded = matrix_b_padded.astype(np.int8 if self.input_precision == 'INT8' else np.float32)

        # Block matrix multiplication
        for i in range(0, padded_m, self.block_size):
            for j in range(0, padded_n, self.block_size):
                for k in range(0, padded_k, self.block_size):
                    # Extract blocks of size NxN (or smaller if near boundary)
                    a_block = matrix_a_padded[i:i+self.block_size, k:k+self.block_size]
                    b_block = matrix_b_padded[k:k+self.block_size, j:j+self.block_size]

                    # Perform block multiplication
                    partial_result = np.dot(a_block, b_block)

                    # Accumulate the result in the final matrix
                    result_matrix_padded[i:i+self.block_size, j:j+self.block_size] += partial_result.astype(np.int32 if self.accumulator_precision == 'INT32' else np.float32)

        # Trim the result matrix back to the original size
        result_matrix = result_matrix_padded[:original_m, :original_n]

        return result_matrix


# Element-Wise Operations
class ElementWiseOperations:
    def __init__(self, input_precision='INT8'):
        self.input_precision = input_precision

    def add(self, tensor_a, tensor_b):
        tensor_a = tensor_a.astype(np.int8)
        tensor_b = tensor_b.astype(np.int8)
        return np.add(tensor_a, tensor_b).astype(np.int32)


# Activation Functions
class ActivationFunctions:
    def relu(self, tensor):
        return np.maximum(tensor, 0)


# Pooling Operations
class PoolingOperations:
    def max_pooling(self, tensor, kernel_size, stride):
        output_shape = (
            (tensor.shape[0] - kernel_size) // stride + 1,
            (tensor.shape[1] - kernel_size) // stride + 1
        )
        pooled_tensor = np.zeros(output_shape)
        for i in range(0, tensor.shape[0] - kernel_size + 1, stride):
            for j in range(0, tensor.shape[1] - kernel_size + 1, stride):
                pooled_tensor[i // stride, j // stride] = np.max(
                    tensor[i:i + kernel_size, j:j + kernel_size])
        return pooled_tensor


# ResNet Block Simulator
class ResNetBlockSimulator:
    def __init__(self, block_size=4):
        self.block_size = block_size
        self.matrix_multiplier = BlockMatrixMultiplier(block_size)
        self.element_ops = ElementWiseOperations(input_precision='INT8')
        self.activation_funcs = ActivationFunctions()

    def conv2d(self, input_tensor, kernel):
        """
        Simulate a 2D convolution using matrix multiplication.
        input_tensor: (m x n) input matrix (for simplicity)
        kernel: (k x k) convolution kernel matrix
        """
        # Simulating convolution as a matrix multiplication for simplicity
        output = self.matrix_multiplier.multiply_blocks(input_tensor, kernel)
        return output

    def resnet_block(self, input_tensor, conv_kernel1, conv_kernel2, downsample=False):
        """
        Simulates a simplified ResNet block.
        input_tensor: Input feature map
        conv_kernel1: First convolution kernel
        conv_kernel2: Second convolution kernel
        downsample: If True, simulate downsampling with stride.
        """
        # First convolution followed by ReLU activation
        conv1_output = self.conv2d(input_tensor, conv_kernel1)
        relu1_output = self.activation_funcs.relu(conv1_output)

        # Second convolution
        conv2_output = self.conv2d(relu1_output, conv_kernel2)

        # Residual connection (element-wise addition of input_tensor and conv2_output)
        if downsample:
            input_tensor = self.conv2d(input_tensor, np.eye(conv_kernel1.shape[0]))  # Simulate downsampling

        residual_output = self.element_ops.add(input_tensor, conv2_output)

        # ReLU activation on the residual output
        final_output = self.activation_funcs.relu(residual_output)

        return final_output


# Full ResNet-18 Implementation
class ResNet18Simulator:
    def __init__(self, block_size=4):
        self.block_size = block_size
        self.resnet_simulator = ResNetBlockSimulator(block_size)
        self.pooling_ops = PoolingOperations()

    def initial_conv(self, input_tensor):
        # Initial Conv Layer (7x7 kernel, 64 filters, stride 2)
        conv_kernel = np.random.randint(0, 10, size=(7, 7), dtype=np.int8)
        return self.resnet_simulator.conv2d(input_tensor, conv_kernel)

    def simulate_resnet(self, input_tensor, conv_kernels):
        """
        Simulate the full ResNet-18 architecture.
        input_tensor: Input feature map
        conv_kernels: List of convolution kernels for the blocks
        """
        # Initial Conv + MaxPooling
        input_tensor = self.initial_conv(input_tensor)
        input_tensor = self.pooling_ops.max_pooling(input_tensor, kernel_size=3, stride=2)

        # Four stages of residual blocks
        for i in range(0, len(conv_kernels), 4):
            # Each stage has two residual blocks
            conv_kernel1 = conv_kernels[i]
            conv_kernel2 = conv_kernels[i+1]
            input_tensor = self.resnet_simulator.resnet_block(input_tensor, conv_kernel1, conv_kernel2, downsample=(i > 0))

            conv_kernel3 = conv_kernels[i+2]
            conv_kernel4 = conv_kernels[i+3]
            input_tensor = self.resnet_simulator.resnet_block(input_tensor, conv_kernel3, conv_kernel4)

        # Global average pooling (for simplicity, we will simulate a pooling operation)
        input_tensor = self.pooling_ops.max_pooling(input_tensor, kernel_size=input_tensor.shape[0], stride=1)

        # Fully connected layer (for classification)
        fc_weights = np.random.randint(0, 10, size=(input_tensor.size, 1000), dtype=np.int8)
        output_tensor = np.dot(input_tensor.flatten(), fc_weights)

        return output_tensor


# Verification for Block Matrix Multiplication
def verify_block_matrix_multiplication(matrix_a, matrix_b, block_size):
    # Initialize the block matrix multiplier
    block_multiplier = BlockMatrixMultiplier(block_size, input_precision='INT8', accumulator_precision='INT32')

    # Perform block matrix multiplication
    block_result = block_multiplier.multiply_blocks(matrix_a, matrix_b)

    # Perform standard matrix multiplication using numpy for verification
    standard_result = np.dot(matrix_a.astype(np.int32), matrix_b.astype(np.int32))

    # Verify correctness by comparing the results
    if np.array_equal(block_result, standard_result):
        print("Block Matrix Multiplication is correct!")
    else:
        print("Block Matrix Multiplication is incorrect!")
        print("Block Matrix Result:\n", block_result)
        print("Standard Matrix Result:\n", standard_result)


# Example usage of ResNet18Simulator and matrix multiplication checker
if __name__ == "__main__":
    # Test case for block matrix multiplication
    matrix_a = np.random.randint(0, 2, size=(7, 5), dtype=np.int8)  # Not a multiple of block size
    matrix_b = np.random.randint(0, 2, size=(5, 6), dtype=np.int8)  # Not a multiple of block size
    block_size = 4

    print("Verifying block matrix multiplication...")
    verify_block_matrix_multiplication(matrix_a, matrix_b, block_size)

    # Test case for ResNet-18 simulation
    print("\nSimulating ResNet-18...")
    input_tensor = np.random.randint(0, 10, size=(224, 224), dtype=np.int8)

    # Convolution kernels for ResNet-18 (8 blocks = 16 convolutions)
    conv_kernels = [np.random.randint(0, 10, size=(3, 3), dtype=np.int8) for _ in range(16)]

    # Initialize and simulate ResNet-18
    resnet18_simulator = ResNet18Simulator(block_size=4)
    output_tensor = resnet18_simulator.simulate_resnet(input_tensor, conv_kernels)
    print("ResNet-18 Output (after Fully Connected Layer):\n", output_tensor)

Verifying block matrix multiplication...
hey 5 5
Block Matrix Multiplication is correct!

Simulating ResNet-18...
hey 224 7


AssertionError: Matrix dimensions do not match for multiplication.

In [15]:
original_m, original_k1 = matrix_a.shape
original_k2, original_n = matrix_b.shape

In [12]:
matrix_a

array([[8, 7, 1, 1, 3],
       [3, 4, 4, 9, 9],
       [2, 5, 5, 2, 8],
       [1, 8, 3, 5, 8],
       [5, 4, 9, 9, 6],
       [8, 8, 1, 4, 7],
       [6, 0, 0, 8, 7]], dtype=int8)

In [13]:
matrix_b

array([[5, 6, 2, 1, 2, 7],
       [1, 1, 7, 1, 9, 3],
       [6, 9, 5, 3, 1, 3],
       [0, 5, 5, 1, 0, 3],
       [1, 6, 9, 8, 4, 3]], dtype=int8)

In [16]:
        block_size = 4
        # Calculate the padded dimensions to make them multiples of the block size
        padded_m = ((original_m + block_size - 1) // block_size) * block_size
        padded_k = ((original_k1 + block_size - 1) // block_size) * block_size
        padded_n = ((original_n + block_size - 1) // block_size) * block_size

In [20]:
original_m, original_k1, original_n, padded_m, padded_k, padded_n

(7, 5, 6, 8, 8, 8)

In [23]:
    def pad_matrix(matrix, target_shape):
        """
        Pads the matrix to the target shape.
        matrix: Input matrix to be padded
        target_shape: (rows, cols) - shape to pad the matrix to
        """
        original_shape = matrix.shape
        padded_matrix = np.zeros(target_shape, dtype=matrix.dtype)
        padded_matrix[:original_shape[0], :original_shape[1]] = matrix
        return padded_matrix
matrix_a_padded = pad_matrix(matrix_a, (padded_m, padded_k))
matrix_b_padded = pad_matrix(matrix_b, (padded_k, padded_n))
matrix_a_padded, matrix_b_padded

(array([[8, 7, 1, 1, 3, 0, 0, 0],
        [3, 4, 4, 9, 9, 0, 0, 0],
        [2, 5, 5, 2, 8, 0, 0, 0],
        [1, 8, 3, 5, 8, 0, 0, 0],
        [5, 4, 9, 9, 6, 0, 0, 0],
        [8, 8, 1, 4, 7, 0, 0, 0],
        [6, 0, 0, 8, 7, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8),
 array([[5, 6, 2, 1, 2, 7, 0, 0],
        [1, 1, 7, 1, 9, 3, 0, 0],
        [6, 9, 5, 3, 1, 3, 0, 0],
        [0, 5, 5, 1, 0, 3, 0, 0],
        [1, 6, 9, 8, 4, 3, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8))

In [31]:
        result_matrix_padded = np.zeros((padded_m, padded_n), dtype=np.int32)
        result_matrix_padded

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [32]:
        for i in range(0, padded_m, block_size):
            for j in range(0, padded_n, block_size):
                for k in range(0, padded_k, block_size):
                    # Extract blocks of size NxN (or smaller if near boundary)
                    a_block = matrix_a_padded[i:i+block_size, k:k+block_size]
                    b_block = matrix_b_padded[k:k+block_size, j:j+block_size]

                    # Perform block multiplication
                    partial_result = np.dot(a_block, b_block)

                    # Accumulate the result in the final matrix
                    result_matrix_padded[i:i+block_size, j:j+block_size] += partial_result.astype(np.int32)
                    print (i,j,k, a_block, b_block, partial_result, result_matrix_padded)
result_matrix_padded

0 0 0 [[8 7 1 1]
 [3 4 4 9]
 [2 5 5 2]
 [1 8 3 5]] [[5 6 2 1]
 [1 1 7 1]
 [6 9 5 3]
 [0 5 5 1]] [[ 53  69  75  19]
 [ 43 103  99  28]
 [ 45  72  74  24]
 [ 31  66  98  23]] [[ 53  69  75  19   0   0   0   0]
 [ 43 103  99  28   0   0   0   0]
 [ 45  72  74  24   0   0   0   0]
 [ 31  66  98  23   0   0   0   0]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0]]
0 0 4 [[3 0 0 0]
 [9 0 0 0]
 [8 0 0 0]
 [8 0 0 0]] [[1 6 9 8]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]] [[ 3 18 27 24]
 [ 9 54 81 72]
 [ 8 48 72 64]
 [ 8 48 72 64]] [[ 56  87 102  43   0   0   0   0]
 [ 52 157 180 100   0   0   0   0]
 [ 53 120 146  88   0   0   0   0]
 [ 39 114 170  87   0   0   0   0]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0]]
0 4 0 [[8 7 1 1]
 [3 4 4 9]
 [2 5 5 2]
 [1 8 3 5]] [[2 7 0 0]
 [9 3 0 0]
 [1 3 0 0]
 [0 3 0 0]] [[80 83  0  

array([[ 56,  87, 102,  43,  92,  92,   0,   0],
       [ 52, 157, 180, 100,  82,  99,   0,   0],
       [ 53, 120, 146,  88,  86,  74,   0,   0],
       [ 39, 114, 170,  87, 109,  79,   0,   0],
       [ 89, -60, -74,  93,  79, 119,   0,   0],
       [ 61, 127, 160,  79, 117, 116,   0,   0],
       [ 37, 118, 115,  70,  40,  87,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [36]:
np.dot(matrix_a.astype(np.int32), matrix_b.astype(np.int32))

array([[ 56,  87, 102,  43,  92,  92],
       [ 52, 157, 180, 100,  82,  99],
       [ 53, 120, 146,  88,  86,  74],
       [ 39, 114, 170,  87, 109,  79],
       [ 89, 196, 182,  93,  79, 119],
       [ 61, 127, 160,  79, 117, 116],
       [ 37, 118, 115,  70,  40,  87]], dtype=int32)

In [34]:
matrix_a

array([[8, 7, 1, 1, 3],
       [3, 4, 4, 9, 9],
       [2, 5, 5, 2, 8],
       [1, 8, 3, 5, 8],
       [5, 4, 9, 9, 6],
       [8, 8, 1, 4, 7],
       [6, 0, 0, 8, 7]], dtype=int8)

In [35]:
matrix_b

array([[5, 6, 2, 1, 2, 7],
       [1, 1, 7, 1, 9, 3],
       [6, 9, 5, 3, 1, 3],
       [0, 5, 5, 1, 0, 3],
       [1, 6, 9, 8, 4, 3]], dtype=int8)

In [42]:
import numpy as np

# Block Matrix Multiplier
class BlockMatrixMultiplier:
    def __init__(self, block_size, input_precision='int8', accumulator_precision='int32'):
        self.block_size = block_size  # Block size NxN
        self.input_precision = input_precision
        self.accumulator_precision = accumulator_precision

    def pad_matrix(self, matrix, target_shape):
        """
        Pads the matrix to the target shape using the data type of the input matrix.
        """
        original_shape = matrix.shape
        padded_matrix = np.zeros(target_shape, dtype=matrix.dtype)
        padded_matrix[:original_shape[0], :original_shape[1]] = matrix
        return padded_matrix

    def multiply_blocks(self, matrix_a, matrix_b):
        """
        Simulates block matrix multiplication for quantized inputs with padding.
        matrix_a: (MxK), matrix_b: (KxN), Block Size: (NxN)
        """
        original_m, original_k1 = matrix_a.shape
        original_k2, original_n = matrix_b.shape
        print("hey",original_k1,original_k2)

        assert original_k1 == original_k2, "Matrix dimensions do not match for multiplication."

        # Calculate the padded dimensions to make them multiples of the block size
        padded_m = ((original_m + self.block_size - 1) // self.block_size) * self.block_size
        padded_k = ((original_k1 + self.block_size - 1) // self.block_size) * self.block_size
        padded_n = ((original_n + self.block_size - 1) // self.block_size) * self.block_size

        # Pad the matrices using their own data types
        matrix_a_padded = self.pad_matrix(matrix_a, (padded_m, padded_k))
        matrix_b_padded = self.pad_matrix(matrix_b, (padded_k, padded_n))

        # Initialize the padded result matrix using accumulator's data type
        result_matrix_padded = np.zeros((padded_m, padded_n), dtype=np.int32)

        # Convert matrices to the specified precision (only if input_precision is not already correct)
        matrix_a_padded = matrix_a_padded.astype(np.int8 if self.input_precision == 'int8' else np.float32)
        matrix_b_padded = matrix_b_padded.astype(np.int8 if self.input_precision == 'int8' else np.float32)

        # Block matrix multiplication
        for i in range(0, padded_m, self.block_size):
            for j in range(0, padded_n, self.block_size):
                for k in range(0, padded_k, self.block_size):
                    a_block = matrix_a_padded[i:i+self.block_size, k:k+self.block_size]
                    b_block = matrix_b_padded[k:k+self.block_size, j:j+self.block_size]
                    partial_result = np.dot(a_block, b_block)
                    result_matrix_padded[i:i+self.block_size, j:j+self.block_size] += partial_result.astype(np.int32)

        # Trim the result matrix back to the original size
        result_matrix = result_matrix_padded[:original_m, :original_n]
        return result_matrix


# im2col function
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    """
    Rearranges image blocks into columns for convolution as matrix multiplication.
    input_data: (H, W, C) input feature map or image
    filter_h: Filter height
    filter_w: Filter width
    stride: Stride for the convolution
    pad: Padding for the input
    """
    H, W, C = input_data.shape  # Height, Width, Channels
    out_h = (H + 2 * pad - filter_h) // stride + 1
    out_w = (W + 2 * pad - filter_w) // stride + 1

    # Add padding to the input data if required
    img = np.pad(input_data, [(pad, pad), (pad, pad), (0, 0)], 'constant')

    # Prepare the columns
    col = np.zeros((filter_h * filter_w * C, out_h * out_w))

    # Slide the filter over the image and collect patches
    for y in range(0, out_h):
        y_min = y * stride
        y_max = y_min + filter_h

        for x in range(0, out_w):
            x_min = x * stride
            x_max = x_min + filter_w

            patch = img[y_min:y_max, x_min:x_max, :].reshape(-1)
            col[:, y * out_w + x] = patch

    return col


# ResNet Block Simulator
class ResNetBlockSimulator:
    def __init__(self, block_size=4):
        self.block_size = block_size
        self.matrix_multiplier = BlockMatrixMultiplier(block_size)

    def conv2d(self, input_tensor, kernel, stride=1, pad=1):
        """
        Simulate a 2D convolution using matrix multiplication via im2col.
        input_tensor: (H, W, C) input feature map or image
        kernel: (filter_h, filter_w, C, out_channels) convolution kernel
        """
        filter_h, filter_w, in_channels, out_channels = kernel.shape

        # Apply im2col to the input data
        input_col = im2col(input_tensor, filter_h, filter_w, stride=stride, pad=pad)

        # Reshape the kernel into matrix form
        kernel_col = kernel.reshape(filter_h * filter_w * in_channels, out_channels)

        # Perform matrix multiplication using block matrix multiplier
        output_col = self.matrix_multiplier.multiply_blocks(input_col, kernel_col)

        # Reshape the output to the correct dimensions (H_out, W_out, out_channels)
        output_h = (input_tensor.shape[0] + 2 * pad - filter_h) // stride + 1
        output_w = (input_tensor.shape[1] + 2 * pad - filter_w) // stride + 1
        output_tensor = output_col.reshape(output_h, output_w, out_channels)

        return output_tensor

    def resnet_block(self, input_tensor, conv_kernel1, conv_kernel2, stride=1):
        """
        Simulates a ResNet block with two convolutional layers and a residual connection.
        """
        # First convolution using im2col + matrix multiplication
        conv1_output = self.conv2d(input_tensor, conv_kernel1, stride=stride)

        # Second convolution
        conv2_output = self.conv2d(conv1_output, conv_kernel2)

        # Residual connection (adding the input to the output of the second conv layer)
        residual_output = conv2_output + input_tensor  # Assuming input and output have the same shape
        return residual_output


# Verification for Block Matrix Multiplication
def verify_block_matrix_multiplication(matrix_a, matrix_b, block_size):
    # Initialize the block matrix multiplier
    block_multiplier = BlockMatrixMultiplier(block_size, input_precision='int8', accumulator_precision='int32')

    # Perform block matrix multiplication
    block_result = block_multiplier.multiply_blocks(matrix_a, matrix_b)

    # Perform standard matrix multiplication using numpy for verification
    standard_result = np.dot(matrix_a, matrix_b)

    # Verify correctness by comparing the results
    if np.array_equal(block_result, standard_result):
        print("Block Matrix Multiplication is correct!")
    else:
        print("Block Matrix Multiplication is incorrect!")
        print("Block Matrix Result:\n", block_result)
        print("Standard Matrix Result:\n", standard_result)


# Example usage of the ResNetBlockSimulator with im2col and matrix multiplication checker
if __name__ == "__main__":
    # Test case for block matrix multiplication
    matrix_a = np.random.randint(0, 2, size=(7, 5), dtype=np.int8)  # Not a multiple of block size
    matrix_b = np.random.randint(0, 2, size=(5, 6), dtype=np.int8)  # Not a multiple of block size
    block_size = 4

    print("Verifying block matrix multiplication...")
    verify_block_matrix_multiplication(matrix_a, matrix_b, block_size)

    # Test case for ResNet block simulation
    print("\nSimulating ResNet block...")
    # Input tensor (e.g., 224x224 image with 3 channels)
    input_tensor = np.random.randint(0, 10, size=(224, 224, 3), dtype=np.int8)

    # Convolution kernels (3x3 filters)
    conv_kernel1 = np.random.randint(0, 10, size=(3, 3, 3, 64), dtype=np.int8)  # First conv layer
    conv_kernel2 = np.random.randint(0, 10, size=(3, 3, 64, 64), dtype=np.int8)  # Second conv layer

    # Initialize the ResNet block simulator
    resnet_block_simulator = ResNetBlockSimulator(block_size=4)

    # Simulate a ResNet block
    output_tensor = resnet_block_simulator.resnet_block(input_tensor, conv_kernel1, conv_kernel2)
    print("Output of the ResNet block:\n", output_tensor)

Verifying block matrix multiplication...
hey 5 5
Block Matrix Multiplication is correct!

Simulating ResNet block...
hey 50176 27


AssertionError: Matrix dimensions do not match for multiplication.

In [45]:
import numpy as np

# im2col function for converting the input into columns
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    """
    Rearranges image blocks into columns for convolution as matrix multiplication.
    input_data: (H, W, C) input feature map or image
    filter_h: Filter height
    filter_w: Filter width
    stride: Stride for the convolution
    pad: Padding for the input
    """
    H, W, C = input_data.shape  # Height, Width, Channels
    out_h = (H + 2 * pad - filter_h) // stride + 1
    out_w = (W + 2 * pad - filter_w) // stride + 1

    # Add padding to the input data if required
    img = np.pad(input_data, [(pad, pad), (pad, pad), (0, 0)], 'constant')

    # Prepare the columns
    col = np.zeros((filter_h * filter_w * C, out_h * out_w))

    # Slide the filter over the image and collect patches
    for y in range(0, out_h):
        y_min = y * stride
        y_max = y_min + filter_h

        for x in range(0, out_w):
            x_min = x * stride
            x_max = x_min + filter_w

            patch = img[y_min:y_max, x_min:x_max, :].reshape(-1)
            col[:, y * out_w + x] = patch

    return col

# Input image of size (224, 224, 3)
input_image = np.random.randn(224, 224, 3).astype(np.float32)

# 7x7 filter with 3 input channels (RGB) and 64 output channels
filter_h, filter_w, in_channels, out_channels = 7, 7, 3, 64
conv_kernel = np.random.randn(filter_h, filter_w, in_channels, out_channels).astype(np.float32)

# Convert the input image to column format (im2col)
input_col = im2col(input_image, filter_h, filter_w, stride=2, pad=3)

# Reshape the convolutional kernel into a matrix for multiplication
conv_kernel_col = conv_kernel.reshape(filter_h * filter_w * in_channels, out_channels)

# Perform matrix multiplication
output_col = np.dot(input_col.T, conv_kernel_col)

# Reshape the output back into the correct output dimensions
output_h = (input_image.shape[0] + 2 * 3 - filter_h) // 2 + 1
output_w = (input_image.shape[1] + 2 * 3 - filter_w) // 2 + 1
output = output_col.reshape(output_h, output_w, out_channels)

print("Output shape:", output.shape)  # Should be (112, 112, 64)

print("input shape:", input_col.shape)  # Should be (147, 12544)

print("kernel shape:", conv_kernel_col.shape)  # Should be (147, 12544)

Output shape: (112, 112, 64)
input shape: (147, 12544)
kernel shape: (147, 64)


In [47]:
import numpy as np

# im2col function for converting the input into columns
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    """
    Rearranges image blocks into columns for convolution as matrix multiplication.
    input_data: (H, W, C) input feature map or image
    filter_h: Filter height
    filter_w: Filter width
    stride: Stride for the convolution
    pad: Padding for the input
    """
    H, W, C = input_data.shape  # Height, Width, Channels
    out_h = (H + 2 * pad - filter_h) // stride + 1
    out_w = (W + 2 * pad - filter_w) // stride + 1

    # Add padding to the input data if required
    img = np.pad(input_data, [(pad, pad), (pad, pad), (0, 0)], 'constant')

    # Prepare the columns
    col = np.zeros((filter_h * filter_w * C, out_h * out_w))

    # Slide the filter over the image and collect patches
    for y in range(0, out_h):
        y_min = y * stride
        y_max = y_min + filter_h

        for x in range(0, out_w):
            x_min = x * stride
            x_max = x_min + filter_w

            patch = img[y_min:y_max, x_min:x_max, :].reshape(-1)
            col[:, y * out_w + x] = patch

    return col

# Convolution layer
def conv_layer(input_tensor, conv_kernel, stride=1, pad=1):
    filter_h, filter_w, in_channels, out_channels = conv_kernel.shape

    # Convert input image to column format (im2col)
    input_col = im2col(input_tensor, filter_h, filter_w, stride=stride, pad=pad)

    # Reshape convolution kernel into matrix format
    conv_kernel_col = conv_kernel.reshape(filter_h * filter_w * in_channels, out_channels)

    # Perform matrix multiplication
    output_col = np.dot(input_col.T, conv_kernel_col)

    # Reshape output back into the correct dimensions
    output_h = (input_tensor.shape[0] + 2 * pad - filter_h) // stride + 1
    output_w = (input_tensor.shape[1] + 2 * pad - filter_w) // stride + 1
    output = output_col.reshape(output_h, output_w, out_channels)

    return output

# Max pooling layer
def max_pooling(input_data, kernel_size=3, stride=2, pad=1):
    H, W, C = input_data.shape
    out_h = (H + 2 * pad - kernel_size) // stride + 1
    out_w = (W + 2 * pad - kernel_size) // stride + 1

    # Add padding
    input_data_padded = np.pad(input_data, [(pad, pad), (pad, pad), (0, 0)], mode='constant')

    # Initialize output
    pooled_output = np.zeros((out_h, out_w, C))

    # Perform max pooling
    for c in range(C):
        for y in range(out_h):
            y_min = y * stride
            y_max = y_min + kernel_size
            for x in range(out_w):
                x_min = x * stride
                x_max = x_min + kernel_size
                pooled_output[y, x, c] = np.max(input_data_padded[y_min:y_max, x_min:x_max, c])

    return pooled_output

# Residual block with two 3x3 convolution layers and a 1x1 downsample if needed
def resnet18_residual_block(input_tensor, in_channels, out_channels, stride=1):
    # First 3x3 convolution
    conv_kernel1 = np.random.randn(3, 3, in_channels, out_channels).astype(np.float32)
    output1 = conv_layer(input_tensor, conv_kernel1, stride=stride, pad=1)

    # Second 3x3 convolution
    conv_kernel2 = np.random.randn(3, 3, out_channels, out_channels).astype(np.float32)
    output2 = conv_layer(output1, conv_kernel2, stride=1, pad=1)

    # Check if the input shape and output shape are the same
    if input_tensor.shape != output2.shape:
        # Perform downsampling with 1x1 convolution if shapes differ
        downsample_kernel = np.random.randn(1, 1, in_channels, out_channels).astype(np.float32)
        input_resized = conv_layer(input_tensor, downsample_kernel, stride=stride, pad=0)
    else:
        input_resized = input_tensor

    # Add the residual connection (input_tensor added to the final output)
    residual_output = output2 + input_resized

    return residual_output

# Global Average Pooling
def global_avg_pooling(input_tensor):
    return np.mean(input_tensor, axis=(0, 1))  # Average over height and width

# Fully Connected Layer (FC)
def fully_connected(input_vector, fc_weights):
    return np.dot(input_vector, fc_weights)

# ResNet-18 architecture, layer by layer
def resnet18_forward(input_image):
    # Initial convolution layer (7x7, stride 2, padding 3, 64 output channels)
    conv_kernel_init = np.random.randn(7, 7, 3, 64).astype(np.float32)
    output = conv_layer(input_image, conv_kernel_init, stride=2, pad=3)
    print("Initial Conv Output Shape:", output.shape)

    # Max pooling layer (3x3, stride 2, padding 1)
    output = max_pooling(output)
    print("Max Pool Output Shape:", output.shape)

    # Residual Block Stage 1 (2 blocks, 64 channels)
    output = resnet18_residual_block(output, 64, 64, stride=1)
    output = resnet18_residual_block(output, 64, 64, stride=1)
    print("Stage 1 Output Shape:", output.shape)

    # Residual Block Stage 2 (2 blocks, 128 channels, first block with stride 2)
    output = resnet18_residual_block(output, 64, 128, stride=2)
    output = resnet18_residual_block(output, 128, 128, stride=1)
    print("Stage 2 Output Shape:", output.shape)

    # Residual Block Stage 3 (2 blocks, 256 channels, first block with stride 2)
    output = resnet18_residual_block(output, 128, 256, stride=2)
    output = resnet18_residual_block(output, 256, 256, stride=1)
    print("Stage 3 Output Shape:", output.shape)

    # Residual Block Stage 4 (2 blocks, 512 channels, first block with stride 2)
    output = resnet18_residual_block(output, 256, 512, stride=2)
    output = resnet18_residual_block(output, 512, 512, stride=1)
    print("Stage 4 Output Shape:", output.shape)

    # Global Average Pooling
    output = global_avg_pooling(output)
    print("Global Average Pooling Output Shape:", output.shape)

    # Fully connected layer
    fc_weights = np.random.randn(512, 1000).astype(np.float32)  # 1000 output classes
    output = fully_connected(output, fc_weights)
    print("Fully Connected Output Shape:", output.shape)

    return output

# Example usage of ResNet-18 architecture
input_image = np.random.randn(224, 224, 3).astype(np.float32)  # Input image of size (224, 224, 3)
output_resnet18 = resnet18_forward(input_image)
print("Final Output Shape:", output_resnet18.shape)

Initial Conv Output Shape: (112, 112, 64)
Max Pool Output Shape: (56, 56, 64)
Stage 1 Output Shape: (56, 56, 64)
Stage 2 Output Shape: (28, 28, 128)
Stage 3 Output Shape: (14, 14, 256)
Stage 4 Output Shape: (7, 7, 512)
Global Average Pooling Output Shape: (512,)
Fully Connected Output Shape: (1000,)
Final Output Shape: (1000,)


In [49]:
import numpy as np

# Block Matrix Multiplier
class BlockMatrixMultiplier:
    def __init__(self, block_size, input_precision='int8', accumulator_precision='int32'):
        self.block_size = block_size  # Block size NxN
        self.input_precision = input_precision
        self.accumulator_precision = accumulator_precision

    def pad_matrix(self, matrix, target_shape):
        """
        Pads the matrix to the target shape using the data type of the input matrix.
        """
        original_shape = matrix.shape
        padded_matrix = np.zeros(target_shape, dtype=matrix.dtype)
        padded_matrix[:original_shape[0], :original_shape[1]] = matrix
        return padded_matrix

    def multiply_blocks(self, matrix_a, matrix_b):
        """
        Simulates block matrix multiplication for quantized inputs with padding.
        matrix_a: (MxK), matrix_b: (KxN), Block Size: (NxN)
        """
        original_m, original_k1 = matrix_a.shape
        original_k2, original_n = matrix_b.shape

        assert original_k1 == original_k2, "Matrix dimensions do not match for multiplication."

        # Calculate the padded dimensions to make them multiples of the block size
        padded_m = ((original_m + self.block_size - 1) // self.block_size) * self.block_size
        padded_k = ((original_k1 + self.block_size - 1) // self.block_size) * self.block_size
        padded_n = ((original_n + self.block_size - 1) // self.block_size) * self.block_size

        # Pad the matrices using their own data types
        matrix_a_padded = self.pad_matrix(matrix_a, (padded_m, padded_k))
        matrix_b_padded = self.pad_matrix(matrix_b, (padded_k, padded_n))

        # Initialize the padded result matrix using accumulator's data type
        result_matrix_padded = np.zeros((padded_m, padded_n), dtype=np.int32)

        # Convert matrices to the specified precision (only if input_precision is not already correct)
        matrix_a_padded = matrix_a_padded.astype(np.int8 if self.input_precision == 'int8' else np.float32)
        matrix_b_padded = matrix_b_padded.astype(np.int8 if self.input_precision == 'int8' else np.float32)

        # Block matrix multiplication
        for i in range(0, padded_m, self.block_size):
            for j in range(0, padded_n, self.block_size):
                for k in range(0, padded_k, self.block_size):
                    a_block = matrix_a_padded[i:i+self.block_size, k:k+self.block_size]
                    b_block = matrix_b_padded[k:k+self.block_size, j:j+self.block_size]
                    partial_result = np.dot(a_block, b_block)
                    result_matrix_padded[i:i+self.block_size, j:j+self.block_size] += partial_result.astype(np.int32)

        # Trim the result matrix back to the original size
        result_matrix = result_matrix_padded[:original_m, :original_n]
        return result_matrix


# im2col function for converting the input into columns
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    """
    Rearranges image blocks into columns for convolution as matrix multiplication.
    input_data: (H, W, C) input feature map or image
    filter_h: Filter height
    filter_w: Filter width
    stride: Stride for the convolution
    pad: Padding for the input
    """
    H, W, C = input_data.shape  # Height, Width, Channels
    out_h = (H + 2 * pad - filter_h) // stride + 1
    out_w = (W + 2 * pad - filter_w) // stride + 1

    # Add padding to the input data if required
    img = np.pad(input_data, [(pad, pad), (pad, pad), (0, 0)], 'constant')

    # Prepare the columns
    col = np.zeros((filter_h * filter_w * C, out_h * out_w))

    # Slide the filter over the image and collect patches
    for y in range(0, out_h):
        y_min = y * stride
        y_max = y_min + filter_h

        for x in range(0, out_w):
            x_min = x * stride
            x_max = x_min + filter_w

            patch = img[y_min:y_max, x_min:x_max, :].reshape(-1)
            col[:, y * out_w + x] = patch

    return col

# Convolution layer with block matrix multiplication
def conv_layer(input_tensor, conv_kernel, block_multiplier, stride=1, pad=1):
    filter_h, filter_w, in_channels, out_channels = conv_kernel.shape

    # Convert input image to column format (im2col)
    input_col = im2col(input_tensor, filter_h, filter_w, stride=stride, pad=pad)

    # Reshape convolution kernel into matrix format
    conv_kernel_col = conv_kernel.reshape(filter_h * filter_w * in_channels, out_channels)

    # Perform matrix multiplication using the block matrix multiplier
    output_col = block_multiplier.multiply_blocks(input_col.T, conv_kernel_col)

    # Reshape output back into the correct dimensions
    output_h = (input_tensor.shape[0] + 2 * pad - filter_h) // stride + 1
    output_w = (input_tensor.shape[1] + 2 * pad - filter_w) // stride + 1
    output = output_col.reshape(output_h, output_w, out_channels)

    return output

# Max pooling layer
def max_pooling(input_data, kernel_size=3, stride=2, pad=1):
    H, W, C = input_data.shape
    out_h = (H + 2 * pad - kernel_size) // stride + 1
    out_w = (W + 2 * pad - kernel_size) // stride + 1

    # Add padding
    input_data_padded = np.pad(input_data, [(pad, pad), (pad, pad), (0, 0)], mode='constant')

    # Initialize output
    pooled_output = np.zeros((out_h, out_w, C))

    # Perform max pooling
    for c in range(C):
        for y in range(out_h):
            y_min = y * stride
            y_max = y_min + kernel_size
            for x in range(out_w):
                x_min = x * stride
                x_max = x_min + kernel_size
                pooled_output[y, x, c] = np.max(input_data_padded[y_min:y_max, x_min:x_max, c])

    return pooled_output

# Dedicated Functional Unit (DFU) for element-wise operations (e.g., addition)
class DFU:
    @staticmethod
    def add(tensor_a, tensor_b):
        return tensor_a + tensor_b

    @staticmethod
    def relu(tensor):
        return np.maximum(0, tensor)

# Residual block with two 3x3 convolution layers and a 1x1 downsample if needed
def resnet18_residual_block(input_tensor, in_channels, out_channels, block_multiplier, stride=1):
    # First 3x3 convolution
    conv_kernel1 = np.random.randn(3, 3, in_channels, out_channels).astype(np.float32)
    output1 = conv_layer(input_tensor, conv_kernel1, block_multiplier, stride=stride, pad=1)

    # Second 3x3 convolution
    conv_kernel2 = np.random.randn(3, 3, out_channels, out_channels).astype(np.float32)
    output2 = conv_layer(output1, conv_kernel2, block_multiplier, stride=1, pad=1)

    # Check if the input shape and output shape are the same
    if input_tensor.shape != output2.shape:
        # Perform downsampling with 1x1 convolution if shapes differ
        downsample_kernel = np.random.randn(1, 1, in_channels, out_channels).astype(np.float32)
        input_resized = conv_layer(input_tensor, downsample_kernel, block_multiplier, stride=stride, pad=0)
    else:
        input_resized = input_tensor

    # Add the residual connection (input_tensor added to the final output)
    residual_output = DFU.add(output2, input_resized)
    return DFU.relu(residual_output)

# Global Average Pooling
def global_avg_pooling(input_tensor):
    return np.mean(input_tensor, axis=(0, 1))  # Average over height and width

# Fully Connected Layer (FC)
def fully_connected(input_vector, fc_weights):
    return np.dot(input_vector, fc_weights)

# ResNet-18 architecture, layer by layer
def resnet18_forward(input_image, block_multiplier):
    # Initial convolution layer (7x7, stride 2, padding 3, 64 output channels)
    conv_kernel_init = np.random.randn(7, 7, 3, 64).astype(np.float32)
    output = conv_layer(input_image, conv_kernel_init, block_multiplier, stride=2, pad=3)
    print("Initial Conv Output Shape:", output.shape)

    # Max pooling layer (3x3, stride 2, padding 1)
    output = max_pooling(output)
    print("Max Pool Output Shape:", output.shape)

    # Residual Block Stage 1 (2 blocks, 64 channels)
    output = resnet18_residual_block(output, 64, 64, block_multiplier, stride=1)
    output = resnet18_residual_block(output, 64, 64, block_multiplier, stride=1)
    print("Stage 1 Output Shape:", output.shape)

    # Residual Block Stage 2 (2 blocks, 128 channels, first block with stride 2)
    output = resnet18_residual_block(output, 64, 128, block_multiplier, stride=2)
    output = resnet18_residual_block(output, 128, 128, block_multiplier, stride=1)
    print("Stage 2 Output Shape:", output.shape)

    # Residual Block Stage 3 (2 blocks, 256 channels, first block with stride 2)
    output = resnet18_residual_block(output, 128, 256, block_multiplier, stride=2)
    output = resnet18_residual_block(output, 256, 256, block_multiplier, stride=1)
    print("Stage 3 Output Shape:", output.shape)

    # Residual Block Stage 4 (2 blocks, 512 channels, first block with stride 2)
    output = resnet18_residual_block(output, 256, 512, block_multiplier, stride=2)
    output = resnet18_residual_block(output, 512, 512, block_multiplier, stride=1)
    print("Stage 4 Output Shape:", output.shape)

    # Global Average Pooling
    output = global_avg_pooling(output)
    print("Global Average Pooling Output Shape:", output.shape)

    # Fully connected layer
    fc_weights = np.random.randn(512, 1000).astype(np.float32)  # 1000 output classes
    output = fully_connected(output, fc_weights)
    print("Fully Connected Output Shape:", output.shape)

    return output

# Example usage of ResNet-18 architecture with block matrix multiplication
block_multiplier = BlockMatrixMultiplier(block_size=4)
input_image = np.random.randn(224, 224, 3).astype(np.float32)  # Input image of size (224, 224, 3)
output_resnet18 = resnet18_forward(input_image, block_multiplier)
print("Final Output Shape:", output_resnet18.shape)

KeyboardInterrupt: 

Here is the full transcript of the questions asked for the session today

Apologies for the confusion. Let me provide a more comprehensive transcript, including the missing questions. Here’s the full list of questions you’ve asked throughout this session:

	1.	Pls write detailed technical feature requirements for a hardware matrix multiplier for quantized neural network inference computation.
	2.	Also add requirements to support the other PyTorch tensor operators which might not be supported through matrix multiplication. Do this through a dedicated functional unit for such purpose.
	3.	For each requirements that you created - can you write a functional simulator model. Use any language that’s suitable.
	4.	Let us modify the multiplier to a block multiplier of NxN.
	5.	Add code to confirm if the matrix multiplication result was correct.
	6.	OK. Put all the functional models together in one script.
	7.	How would a resnet be mapped on this functional simulator?
	8.	Consolidate the full functional model including the resnet simulator code in one script.
	9.	Upgrade the matrix multiplication functional code to extend the input matrix sizes to a multiple of matrix multiplier block sizes - but output only the dimensions that are needed.
	10.	Upgrade the full script with this improved version and run it to check for correctness for matrix multiplication and resnet example.
	11.	Now expand the script to include the actual resnet computation for resnet18.
	12.	Do not delete the matrix multiply checker code from the script. Add it back.
	13.	Now expand the script to include the actual resnet computation for resnet18.
	14.	Let us first start with how to convert the resnet18 computation to matrix multiplications - layer by layer. Start with an example.
	15.	OK. Now convert the resnet18 layer by layer to the matrix style multiplication.
	16.	Assemble the python code that converts resnet18 to matrix multiplication for all layers.
	17.	Input_2 and output_resized are not the same shape in this code. Something is wrong. What is it?
	18.	Now integrate this whole code into the matrix multiplication and the DFU codebase.
	19.	Please give me the full transcript of all the questions that I asked.

I hope this is now complete. Let me know if you need further clarifications or adjustments!