# Performance Profiling with Intel(R) VTune(TM) Amplifier

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import multiprocessing, numpy, ctypes, os
import numpy.random as rnd
import time

# Use numpy.random_intel when it can be imported, otherwise fall back to the
# stock numpy.random. Only a failed import should trigger the fallback: a bare
# ``except`` would also swallow KeyboardInterrupt / SystemExit and hide
# unrelated errors.
try:
    import numpy.random_intel as rnd
    numpy_ver = "intel"
except ImportError:
    import numpy.random as rnd
    numpy_ver = "std"

def multiply(start):
    """Accumulate one chunk of rows of ``A x B`` into the shared result buffer.

    Processes rows ``start`` up to (but not including) ``start + segment``,
    clamped to ``n`` so that the last chunk does not run past the end of the
    matrix when ``n`` is not a multiple of ``segment``.

    The operands ``A`` and ``B``, the sizes ``n`` and ``m``, the chunk size
    ``segment`` and the output buffer ``shm_arr`` are read from module-level
    globals. Results are added in place to the buffer; nothing is returned.

    The pure-Python triple loop is intentionally left unvectorized.
    """
    # View the shared buffer as an n x n matrix of C floats.
    arr = numpy.frombuffer(shm_arr.get_obj(), dtype=ctypes.c_float)
    C = arr.reshape((n, n))

    stop = min(start + segment, n)
    # Inner (contraction) dimension: rows of B, not its column count.
    inner = len(B)

    for i in range(start, stop):
        for k in range(inner):
            for j in range(m):
                C[i][j] += A[i][k] * B[k][j]

def parallelMatrixProduct(A, B, threadsNum):
    """Compute the matrix product with a pool of worker processes.

    Starts ``threadsNum`` worker processes and hands each call of
    ``multiply`` one chunk of ``segment`` rows. The workers write into the
    module-level shared buffer ``shm_arr``.

    ``A`` and ``B`` are accepted for interface symmetry but are not used
    here: ``multiply`` reads the module-level ``A`` and ``B``.

    Returns the shared buffer viewed as an ``n x n`` array of C floats.
    """
    pool = multiprocessing.Pool(threadsNum)
    try:
        # map() blocks until every chunk has been processed.
        pool.map(multiply, range(0, n, segment))
    finally:
        # Shut the workers down even if a chunk failed, so no processes leak.
        pool.close()
        pool.join()

    arr = numpy.frombuffer(shm_arr.get_obj(), dtype=ctypes.c_float)
    return arr.reshape((n, n))


def blasMatrixProduct(A, B):
    """Return the product of ``A`` and ``B`` as computed by ``numpy.dot``."""
    product = numpy.dot(A, B)
    return product

if __name__ == '__main__':
    # Benchmark driver: time the naive multi-process product against
    # numpy.dot on the same pair of random 360 x 360 matrices.
    A = rnd.normal(size=(360, 360))
    B = rnd.normal(size=(360, 360))

    # number of rows of A
    n = len(A)
    # number of columns of B
    m = len(B[0])

    # Arrange work distribution between worker processes: each task handles
    # `segment` consecutive rows, with at least one row per task.
    threadsNum = multiprocessing.cpu_count()
    segment = max(1, n // threadsNum)

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()
    # allocating shared memory for multiprocess access
    shm_arr = multiprocessing.Array(ctypes.c_float, n*n)
    C = parallelMatrixProduct(A, B, threadsNum)
    elapsed_time = time.perf_counter() - start_time
    print("--- Simple multithreaded matrix multiplication ---")
    print("segment=", segment)
    print("threads num=", threadsNum)
    print("Matrix size n x m", n, m)
    print("Elapsed time = ", elapsed_time)

    start_time = time.perf_counter()
    C = blasMatrixProduct(A, B)
    elapsed_time = time.perf_counter() - start_time
    print("--- BLAS multithreaded matrix multiplication ---")
    print("numpy version:", numpy_ver)
    print("Matrix size n x m", n, m)
    print("Elapsed time = ", elapsed_time)
    
    

# os.system("taskset -p 0xff %d" % os.getpid())


--- Simple multithreaded matrix multiplication ---
segment= 90
threads num= 4
Matrix size n x m 360 360
Elapsed time =  54.6752815246582
--- BLAS multithreaded matrix multiplication ---
numpy version: intel
Matrix size n x m 360 360
Elapsed time =  0.06852483749389648
