In [11]:
import cloudpickle
from scipy.sparse import dok_matrix
from scipy.sparse import tril


def load_dict(zeilen,spalten):
    folder = "..\\coocurrence_blocks"
    if(spalten > zeilen):
        template = "block_{i}_{j}".format(i=spalten,j=zeilen)
    else:
        template = "block_{i}_{j}".format(i=zeilen,j=spalten)

    file_path = folder + '\\' + template
    with open(file_path, 'rb+') as file:
        co_occurences = cloudpickle.load(file)
    
    return co_occurences


def load_co_occurence(zeilen,spalten):
    co_occurences = load_dict(zeilen,spalten)
    coocurrence = dok_matrix((20000,20000),dtype='i')
    
    coocurrence._update(co_occurences) # dok_matrix updates #7673 pull request

    if spalten > zeilen :
        print('transposing')
        coocurrence = coocurrence.transpose()
    
    if spalten == zeilen:
        print('mirroring')
        print(coocurrence.toarray())
        coocurrence = coocurrence + tril(coocurrence,k=-1).transpose()
    
    return coocurrence

In [12]:
import time
tic = time.perf_counter()
coocurrence = load_co_occurence(0,0)
numpy = coocurrence.toarray()
toc = time.perf_counter()
print(f" in {toc - tic:0.4f} seconds")
print(numpy)

mirroring
[[    2570        0        0 ...        0        0        0]
 [     104    63009        0 ...        0        0        0]
 [    1027    16840 25500923 ...        0        0        0]
 ...
 [       0        0        0 ...        1        0        0]
 [       0        0        0 ...        0        1        0]
 [       0        0        2 ...        0        0        4]]
 in 68.3597 seconds
[[    2570      104     1027 ...        0        0        0]
 [     104    63009    16840 ...        0        0        0]
 [    1027    16840 25500923 ...        0        0        2]
 ...
 [       0        0        0 ...        1        0        0]
 [       0        0        0 ...        0        1        0]
 [       0        0        2 ...        0        0        4]]


In [3]:
tic = time.perf_counter()
coocurrence = load_co_occurence(1,0)
numpy = coocurrence.toarray()
toc = time.perf_counter()
print(f" in {toc - tic:0.4f} seconds")
print(numpy)

 in 24.3296 seconds
[[  0   0 770 ...   0   0   0]
 [  0   0 472 ...   0   0   0]
 [  0   0  13 ...   0   0   0]
 ...
 [  0   0  16 ...   0   0   0]
 [  0   1 143 ...   0   0   0]
 [  0   0 298 ...   0   0   0]]


In [4]:
tic = time.perf_counter()
coocurrence = load_co_occurence(0,1)
numpy = coocurrence.toarray()
toc = time.perf_counter()
print(f" in {toc - tic:0.4f} seconds")
print(numpy)

transposing
 in 40.1776 seconds
[[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   1   0]
 [770 472  13 ...  16 143 298]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]


In [5]:
import scipy
coocurrence = load_co_occurence(1,0).tocoo()
scipy.sparse.save_npz('./speed_load_test.npz',coocurrence)

In [6]:
tic = time.perf_counter()
coocurrence = scipy.sparse.load_npz('./speed_load_test.npz')
numpy = coocurrence.toarray()
toc = time.perf_counter()
print(f" in {toc - tic:0.4f} seconds")
print(numpy)

 in 2.2427 seconds
[[  0   0 770 ...   0   0   0]
 [  0   0 472 ...   0   0   0]
 [  0   0  13 ...   0   0   0]
 ...
 [  0   0  16 ...   0   0   0]
 [  0   1 143 ...   0   0   0]
 [  0   0 298 ...   0   0   0]]


In [7]:
import h5py
f = h5py.File('./speed_load_test.hdf5', "w")#plus experiment name
HDF_matrix = f.create_dataset("co-ocurrence", (20000, 20000))
coocurrence_dict = load_dict(1,0)
numpy = coocurrence.toarray()
HDF_matrix[:,:] =  numpy
print(HDF_matrix)

<HDF5 dataset "co-ocurrence": shape (20000, 20000), type "<f4">


In [8]:
print(HDF_matrix[:][:])
f.close()

[[  0.   0. 770. ...   0.   0.   0.]
 [  0.   0. 472. ...   0.   0.   0.]
 [  0.   0.  13. ...   0.   0.   0.]
 ...
 [  0.   0.  16. ...   0.   0.   0.]
 [  0.   1. 143. ...   0.   0.   0.]
 [  0.   0. 298. ...   0.   0.   0.]]


In [None]:
print('speedtest hdf load')
tic = time.perf_counter()
f = h5py.File('./speed_load_test.hdf5', "r+")
HDF_matrix = f.get("co-ocurrence")
numpy = HDF_matrix[:]
toc = time.perf_counter()
print(f" in {toc - tic:0.4f} seconds")
print(numpy)
print(type(numpy))

In [19]:
import numpy as np
print('speedtest hdf + transpose')
tic = time.perf_counter()
f = h5py.File('./speed_load_test.hdf5', "r+")
HDF_matrix = f.get("co-ocurrence")
numpy = HDF_matrix[:]
numpy = np.transpose(numpy)
toc = time.perf_counter()
print(f" in {toc - tic:0.4f} seconds")
print(numpy)
print(type(numpy))

 in 0.5442 seconds
[[  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   1.   0.]
 [770. 472.  13. ...  16. 143. 298.]
 ...
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]]
<class 'numpy.ndarray'>


In [21]:
f = h5py.File('./speed_load_mirrored.hdf5', "w")#plus experiment name
HDF_matrix = f.create_dataset("co-ocurrence", (20000, 20000))
coocurrence_dict = load_dict(1,0)
numpy = coocurrence.toarray()
HDF_matrix[:,:] = numpy
print(HDF_matrix[:][:])
f.close()

print('speedtest hdf + mirroring')

tic = time.perf_counter()
f = h5py.File('./speed_load_test.hdf5', "r+")
HDF_matrix = f.get("co-ocurrence")
numpy = HDF_matrix[:]
numpy = numpy + tril(numpy,k=-1).transpose()
toc = time.perf_counter()
print(f" in {toc - tic:0.4f} seconds")
print(numpy)
print(type(numpy))

[[  0.   0. 770. ...   0.   0.   0.]
 [  0.   0. 472. ...   0.   0.   0.]
 [  0.   0.  13. ...   0.   0.   0.]
 ...
 [  0.   0.  16. ...   0.   0.   0.]
 [  0.   1. 143. ...   0.   0.   0.]
 [  0.   0. 298. ...   0.   0.   0.]]
speedtest hdf + mirroring
 in 4.8703 seconds
[[  0.   0. 770. ...   0.   0.   0.]
 [  0.   0. 472. ...   0.   1.   0.]
 [  0.   0.  13. ...  16. 143. 298.]
 ...
 [  0.   0.  16. ...   0.   0.   0.]
 [  0.   1. 143. ...   0.   0.   0.]
 [  0.   0. 298. ...   0.   0.   0.]]
<class 'numpy.matrix'>
