# Comparación Numpy vs. Dask

In [1]:
import dask.dataframe as dd
from dask.distributed import Client
import dask.array as da
import time 
import numpy as np



In [2]:
#Creación de datos
dNP = np.random.randint(0, 100, size=(50000, 50000))
dNP


array([[31, 93, 22, ..., 87, 12, 77],
       [27, 22, 75, ..., 72, 39, 46],
       [11, 58,  7, ..., 55, 22,  0],
       ...,
       [20, 80, 78, ..., 65, 77, 39],
       [38, 10, 34, ...,  2, 82, 56],
       [63, 87, 97, ...,  9, 60,  1]])

In [3]:
dD =da.random.randint(0, 100, size=(50000, 50000),chunks=(1000, 1000))

Operaciones Matemáticas

In [4]:
#Sacar media de las filas
#NP
startTimeNP = time.time()
media_filasN = np.mean(dNP, axis=1)
print(f"Medias de las filas: {media_filasN}")
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
media_filasD = dD.mean(axis=1)
print(f"Medias de las filas: {media_filasD}")
print(time.time() -startTimeD)



Medias de las filas: [49.47088 49.55498 49.72404 ... 49.44952 49.40524 49.43996]
23.49649214744568
Medias de las filas: dask.array<mean_agg-aggregate, shape=(50000,), dtype=float64, chunksize=(1000,), chunktype=numpy.ndarray>
0.04595518112182617


In [5]:
media_filasD

Unnamed: 0,Array,Chunk
Bytes,390.62 kiB,7.81 kiB
Shape,"(50000,)","(1000,)"
Dask graph,50 chunks in 5 graph layers,50 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 390.62 kiB 7.81 kiB Shape (50000,) (1000,) Dask graph 50 chunks in 5 graph layers Data type float64 numpy.ndarray",50000  1,

Unnamed: 0,Array,Chunk
Bytes,390.62 kiB,7.81 kiB
Shape,"(50000,)","(1000,)"
Dask graph,50 chunks in 5 graph layers,50 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [8]:
#Máximos
#NP
startTimeNP = time.time()
maxN = np.max(dNP)
print(f"Máximo es: {maxN}")
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
maxD = dD.max()
print(f"Máximo es: {maxD}")
print(time.time() -startTimeD)

Máximo es: 99
21.56166982650757
Máximo es: dask.array<max-aggregate, shape=(), dtype=int64, chunksize=(), chunktype=numpy.ndarray>
0.02472996711730957


In [9]:
#Suma de dos columnas
#NP
startTimeNP = time.time()
suma_colN = np.sum(dNP[:, [0, 1]], axis=1)
print(f"La suma es: {suma_colN}")
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
suma_colD =  dD[:, [0, 1]].sum(axis=1)
print(f"La suma es: {suma_colD}")
print(time.time() -startTimeD)



La suma es: [ 80 103 137 ... 121 125  74]
5.777093887329102
La suma es: dask.array<sum-aggregate, shape=(50000,), dtype=int64, chunksize=(1000,), chunktype=numpy.ndarray>
0.025713205337524414


In [11]:
suma_colD.visualize(engine="cytoscape")

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

Manipulación de los arreglos

In [13]:
#Slicing: Obtener las primeras dos filas y columnas de cada array
#NP
startTimeNP = time.time()
subNP = dNP[slice(0, 10000), slice(0, 10000)]
subNP
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
subD =  dD[slice(0, 10000), slice(0, 10000)]
print(time.time() -startTimeD)

#En este caso, si usamos un subconjunto pequeño es más rápido np array

0.00011873245239257812
0.0069730281829833984


In [14]:
#Slicing: Obtener las primeras dos filas y columnas de cada array
#NP
startTimeNP = time.time()
subNP = dNP[slice(0, 10000), slice(0, 10000)]
subNP
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
subD =  dD[slice(0, 10000), slice(0, 10000)]
print(time.time() -startTimeD)

6.198883056640625e-05
0.00038814544677734375


In [15]:
#Trasponer filas y columnas
#NP
startTimeNP = time.time()
traspN = np.transpose(dNP)
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
traspN = da.transpose(dD)
print(time.time() -startTimeD)


8.320808410644531e-05
0.0010349750518798828


In [4]:
dNP2 = np.random.randint(0, 100, size=(50000, 5))
dD2 = da.random.randint(0, 100, size=(50000, 5),chunks=(1000, 1000))


In [7]:
#Dask
startTimeD = time.time()
concat_resultD = da.concatenate([dD, dD2], axis=1)
print(time.time() - startTimeD)


: 

In [None]:
#NP
startTimeNP = time.time()
concat_resultN = np.concatenate((dNP, dNP2), axis=1)
print(time.time() -startTimeNP)