Procediamo con l'import delle librerie e con la verifica dell'ambiente CUDA

In [1]:
from numba import vectorize
import numpy as np
import time
import numba.cuda.api,numba.cuda.cudadrv.libs
import tensorflow as tf


# Report which CUDA toolkit libraries numba can locate (one entry is printed per library).
numba.cuda.cudadrv.libs.test()
# List the CUDA devices numba detects and whether each one is supported.
numba.cuda.api.detect()

Finding cublas
	ERROR: can't locate lib
Finding cusparse
	located at /usr/local/cuda/lib64/libcusparse.so.10.1.105
Finding cufft
	located at /usr/local/cuda/lib64/libcufft.so.10.1.105
Finding curand
	located at /usr/local/cuda/lib64/libcurand.so.10.1.105
Finding nvvm
	located at /usr/local/cuda/nvvm/lib64/libnvvm.so
	finding libdevice for compute_20...	ok
	finding libdevice for compute_30...	ok
	finding libdevice for compute_35...	ok
	finding libdevice for compute_50...	ok
Found 1 CUDA devices
id 0     b'GeForce GTX 1050'                              [SUPPORTED]
                      compute capability: 6.1
                           pci device id: 0
                              pci bus id: 1
Summary:
	1/1 devices are supported


True

In [2]:
# Element-wise float32 addition compiled by numba as a CUDA ufunc:
# the scalar body below is applied to each pair of elements of the inputs.
@vectorize(['float32(float32, float32)'], target='cuda')
def parallelized_add(a, b):
    return a + b

def normal_add(a, b):
    """Add two indexable sequences element by element in pure Python.

    This is the slow reference implementation the GPU version is timed against.

    Args:
        a: First operand; its length decides how many elements are summed.
        b: Second operand; must be indexable at every position of ``a``.

    Returns:
        A new 1-D ``np.float32`` array holding ``a[i] + b[i]`` for each ``i``.
    """
    sums = [a[idx] + b[idx] for idx in range(len(a))]
    return np.array(sums, dtype=np.float32)
    

In [3]:
# Size of the 1-D benchmark inputs (note: a float literal, passed as the stop value of np.arange).
n = 1e8
# Two identical float32 ranges used as operands for the addition benchmark.
# NOTE: float32 cannot represent every integer above 2**24 exactly, so the
# largest entries are rounded; this is irrelevant for a pure timing comparison.
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)


In [4]:

# Time the CUDA ufunc against the pure-Python loop on the same 1-D inputs.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, so the interval cannot be distorted by
# system clock adjustments.
# NOTE(review): the GPU timing presumably also includes host<->device copies
# of the NumPy arrays, not just the kernel itself — confirm if that matters.
start_parallelized = time.perf_counter()
c = parallelized_add(a, b)
end_parallelized = time.perf_counter()
delta = end_parallelized - start_parallelized
print('parallelized_add executed in {0} seconds'.format(delta))


start_normal = time.perf_counter()
c = normal_add(a, b)
end_normal = time.perf_counter()
delta = end_normal - start_normal
print('normal_add executed in {0} seconds'.format(delta))



parallelized_add executed in 0.5228691101074219 seconds
normal_add executed in 27.878769636154175 seconds


**Due dimensioni**

In [2]:
# Element-wise float32 addition compiled by numba as a CUDA ufunc.
# A single signature is enough: the signature describes the scalar element
# types only, not the number of array dimensions, so listing the identical
# signature several times added nothing.
@vectorize(['float32(float32, float32)'], target='cuda')
def parallelized_add_2d(a, b):
    return a + b


def normal_add_2d(a, b):
    """Add two 2-D arrays element by element with explicit Python iteration.

    This is the slow reference implementation the GPU version is timed against.

    Args:
        a: First operand; its first two dimensions define the output shape.
        b: Second operand; indexed at the same ``[i][j]`` positions as ``a``.

    Returns:
        A new ``np.float32`` array of shape ``(a.shape[0], a.shape[1])`` whose
        entry ``[i, j]`` is ``a[i][j] + b[i][j]``.
    """
    n_rows = a.shape[0]
    n_cols = a.shape[1]
    out = np.ones((n_rows, n_cols)).astype(np.float32)
    # Visit every (row, column) pair in row-major order.
    for row, col in np.ndindex(n_rows, n_cols):
        out[row][col] = a[row][col] + b[row][col]
    return np.array(out, dtype=np.float32)

In [4]:
# Benchmark inputs: two random n x p float32 matrices.
n = 10000
p = 10000
a = np.random.random((n, p)).astype(np.float32)
b = np.random.random((n, p)).astype(np.float32)

# Time the CUDA ufunc. The timer variables are named after what they measure
# (they were previously copy-pasted as start_normal/end_normal), and
# time.perf_counter() replaces time.time() because it is monotonic and has
# the highest available resolution.
start_parallelized = time.perf_counter()
c = parallelized_add_2d(a, b)
end_parallelized = time.perf_counter()
delta = end_parallelized - start_parallelized
print('parallelized_add_2d executed in {0} seconds'.format(delta))


# Time the pure-Python nested loop on the same inputs; the label now names
# the function that is actually being measured.
start_normal = time.perf_counter()
c = normal_add_2d(a, b)
end_normal = time.perf_counter()
delta = end_normal - start_normal
print('normal_add_2d executed in {0} seconds'.format(delta))

parallelized_add_2d executed in 0.38671278953552246 seconds
normal_add executed in 65.92687273025513 seconds


In [3]:
# Benchmark: matrix product a @ b.T computed by TensorFlow vs. NumPy.
n = 10000
p = 10000
a = np.random.random((n, p)).astype(np.float32)
b = np.random.random((n, p)).astype(np.float32)

# Convert the inputs up front so the conversion is not part of the timed region.
a_tensor = tf.convert_to_tensor(a, dtype=tf.float32)
b_tensor = tf.convert_to_tensor(b, dtype=tf.float32)

# NOTE(review): depending on the TensorFlow version and execution mode,
# tf.matmul may return before the product has actually been computed (graph
# construction only, or asynchronous GPU execution), so this interval may
# under-report the real cost — TODO confirm and force evaluation if needed.
start_tf = time.perf_counter()
c = tf.matmul(a_tensor, tf.transpose(b_tensor))
end_tf = time.perf_counter()
delta = end_tf - start_tf
# The label names the operation actually timed (it used to say 'normal_add').
print('tf.matmul executed in {0} seconds'.format(delta))

start_n = time.perf_counter()
c = np.dot(a, b.T)
end_n = time.perf_counter()
delta = end_n - start_n
# The label names the operation actually timed (it used to say 'normal_add').
print('np.dot executed in {0} seconds'.format(delta))

normal_add executed in 0.07178807258605957 seconds
normal_add executed in 23.933459758758545 seconds
