In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [2]:
# standard form
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
sp.csr_matrix((data, (row, col)), shape=(3, 3)).toarray()

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]], dtype=int64)

In [3]:
# indptr form
indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
sp.csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]])

In [4]:
standard_df = pd.DataFrame(
    {
        'row': np.array([0, 0, 1, 2, 2, 2]),
        'col': np.array([0, 2, 2, 0, 1, 2]),
        'data': np.array([1, 2, 3, 4, 5, 6])
    }
)
standard_df

Unnamed: 0,row,col,data
0,0,0,1
1,0,2,2
2,1,2,3
3,2,0,4
4,2,1,5
5,2,2,6


In [5]:
standard_df.reset_index().groupby('row').agg(indptr = ('index', 'min'))

Unnamed: 0_level_0,indptr
row,Unnamed: 1_level_1
0,0
1,2
2,3


In [6]:
standard_df

Unnamed: 0,row,col,data
0,0,0,1
1,0,2,2
2,1,2,3
3,2,0,4
4,2,1,5
5,2,2,6


In [7]:
num_rows = 1000
num_cols = 5000
matrix = sp.random(num_rows, num_cols, density=0.01, format='coo', dtype=np.int64)

In [8]:
matrix

<1000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 50000 stored elements in COOrdinate format>

In [9]:
def get_csr_representation(
    df, 
    row_id_col = 'row_id', 
    col_id_col='col_id', 
    data_col='data', 
    row_id_is_index=False,
    is_sorted=True,
    include_last_indptr=False
    ):
    
    if row_id_is_index:
        df = df.reset_index()
        
    if not is_sorted:
        df = df.sort_values([row_id_col, col_id_col])
    
    indptr = df.reset_index().groupby(row_id_col).agg(indptr = ('index', 'min')).indptr.values
    
    if include_last_indptr:
        indptr = np.concatenate(
            (indptr, np.array([df.shape[0]]))
        )
    
    return {
        'data': df.data.values,
        'col': df.col.values,
        'indptr': indptr
    }

In [10]:
df = pd.DataFrame({
    'row': matrix.row,
    'col': matrix.col,
    'data': matrix.data
})

df = df.sort_values(['row', 'col'], ignore_index=True)

csr_representation = get_csr_representation(df, 
                                            row_id_col='row', 
                                            col_id_col = 'col',
                                            row_id_is_index=False, 
                                            is_sorted=True,
                                            include_last_indptr=True
                                           )

In [11]:
csr_representation

{'data': array([ 3629560214291314907, -8924361107475166815, -4794540946872473863,
        ..., -8046420516069156777, -1833940076661959568,
         6173096225330117154]),
 'col': array([ 199,  462,  713, ..., 4626, 4705, 4724], dtype=int32),
 'indptr': array([    0,    50,    96, ..., 49903, 49949, 50000])}

In [12]:
matrix2 = sp.csr_matrix((csr_representation['data'], 
                         csr_representation['col'], 
                         csr_representation['indptr']),
                       shape=(num_rows, num_cols))

In [13]:
(matrix2.tocoo() != matrix).sum()

0

In [25]:
x = {}
isinstance(x, dict)

True

In [24]:
# batching the data
batch_size = 64
batch_id = 1
chunk_min = batch_size*batch_id
chunk_max = chunk_min+batch_size
print(chunk_min)
print(chunk_max)
# Performing the filter
df_chunk = df.set_index('row').loc[0:10]
df_chunk.index = df_chunk.index - df_chunk.index.min()
df_chunk
# csr_chunk = get_csr_representation(df_chunk)

64
128


Unnamed: 0_level_0,col,data
row,Unnamed: 1_level_1,Unnamed: 2_level_1
0,199,3629560214291314907
0,462,-8924361107475166815
0,713,-4794540946872473863
0,793,4761907142362924347
0,839,970646068969879643
...,...,...
10,4079,6992509387460850002
10,4243,615142908061491888
10,4724,-4545715588585677169
10,4733,5784821252224610880
