In [None]:
import os
from pathlib import Path

import numpy as np
import scipy.sparse as sp
import xclib.data.data_utils as du
from tqdm.auto import tqdm

In [None]:
def _draw_unused_labels(rng, n_labels, used, size):
    """Draw `size` distinct labels from `range(n_labels)` that are not in `used`.

    `used` must contain unique values and `size` must not exceed
    `n_labels - len(used)`; the caller guarantees both.
    """
    if 2 * (len(used) + size) > n_labels:
        # Few free labels left: enumerate them so sampling is guaranteed to finish.
        free = np.setdiff1d(np.arange(n_labels), used, assume_unique=True)
        return rng.choice(free, size=size, replace=False)

    # Many free labels: rejection sampling. Fewer than half of the labels are
    # ever excluded, so each candidate is accepted with probability > 1/2.
    picked = np.empty(0, dtype=np.int64)
    while len(picked) < size:
        cand = rng.integers(0, n_labels, size=size - len(picked))
        cand = cand[~np.isin(cand, used)]
        picked = np.unique(np.concatenate((picked, cand)))
    # np.unique sorts; shuffle so labels are not tied to positions by value.
    return rng.permutation(picked)


def add_noise_to_matrix(mat, pct=0.5, seed=None):
    """Replace a fraction of each row's column indices with random wrong ones.

    For every row, `int(row_nnz * pct)` stored entries are picked at random and
    their column index is replaced by a column that the row did not originally
    contain. Replacement columns are distinct within a row, so the result never
    has duplicate entries and every replaced entry is genuinely different from
    the original row. Stored values, the number of entries per row and the
    shape are unchanged.

    Parameters
    ----------
    mat : scipy sparse matrix
        Input matrix; converted to CSR if it is not already. It is not modified.
    pct : float, default 0.5
        Fraction of each row's entries to replace, in [0, 1]. The per-row count
        is truncated, so rows with a single entry are left untouched for pct < 1.
    seed : None, int or numpy.random.Generator, default None
        Passed to `np.random.default_rng`. Use a fixed value for reproducible
        noise; None draws fresh entropy on every call.

    Returns
    -------
    scipy.sparse.csr_matrix
        New matrix with sorted column indices that shares no buffers with `mat`.

    Raises
    ------
    ValueError
        If `pct` is outside [0, 1].
    """
    if not 0 <= pct <= 1:
        raise ValueError(f'pct must be in [0, 1], got {pct}')

    mat = mat.tocsr()
    rng = np.random.default_rng(seed)
    n_labels = mat.shape[1]
    indices = mat.indices.copy()

    for start, stop in tqdm(zip(mat.indptr, mat.indptr[1:]), total=mat.shape[0]):
        n_row = int(stop - start)
        row_labels = np.unique(mat.indices[start:stop])
        # Cannot inject more distinct wrong labels than there are unused columns.
        n_noise = min(int(n_row * pct), n_labels - len(row_labels))
        if n_noise == 0:
            continue
        positions = rng.permutation(n_row)[:n_noise] + start
        indices[positions] = _draw_unused_labels(rng, n_labels, row_labels, n_noise)

    # Copy data/indptr so in-place operations on the result (such as the
    # sort below) can never write through to the caller's matrix.
    noisy_mat = sp.csr_matrix((mat.data.copy(), indices, mat.indptr.copy()),
                              shape=mat.shape, dtype=mat.dtype)
    noisy_mat.sort_indices()
    return noisy_mat
    

In [None]:
def add_noise(fname, pct=0.5):
    """Read a sparse matrix file, add label noise to it and save the result.

    The matrix is read with `du.read_sparse_file`, corrupted with
    `add_noise_to_matrix`, and written with `sp.save_npz` into the same
    directory as the input under the name `<stem>_noise-<PPP>`, where `<PPP>`
    is the noise percentage zero-padded to three digits (`050` for pct=0.5).

    Parameters
    ----------
    fname : str or path-like
        Path of the sparse matrix file to read.
    pct : float, default 0.5
        Fraction of each row's entries to replace; forwarded to
        `add_noise_to_matrix`.

    Returns
    -------
    The noisy matrix returned by `add_noise_to_matrix`.
    """
    mat = du.read_sparse_file(fname)
    noisy_mat = add_noise_to_matrix(mat, pct=pct)

    fname = Path(fname)
    # Round instead of truncating: pct * 100 is not always exact in floating
    # point (0.29 * 100 == 28.999999999999996), and int() would name the file 028.
    pct_tag = int(round(pct * 100))
    name = f'{fname.stem}_noise-{pct_tag:03d}'
    sp.save_npz(fname.parent / name, noisy_mat)
    return noisy_mat
    

In [None]:
# Generate 50%-noise copies of the three `category_*_X_Y.txt` matrices of this
# dataset (presumably the train / test / label splits, going by the file names).
# add_noise saves each result next to its input as `<stem>_noise-050.npz`.
# NOTE(review): absolute, user-specific path; change it to run elsewhere.
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-WikiSeeAlsoTitles-320K/'

add_noise(f'{data_dir}/category_trn_X_Y.txt', pct=0.5)
add_noise(f'{data_dir}/category_tst_X_Y.txt', pct=0.5)
# Last expression of the cell: its return value is what the notebook displays.
add_noise(f'{data_dir}/category_lbl_X_Y.txt', pct=0.5)

  0%|          | 0/693082 [00:00<?, ?it/s]

  0%|          | 0/177515 [00:00<?, ?it/s]

  0%|          | 0/312330 [00:00<?, ?it/s]

<312330x656086 sparse matrix of type '<class 'numpy.float32'>'
	with 1126099 stored elements in Compressed Sparse Row format>

In [None]:
# Same procedure for the `hyper_link_*_X_Y.txt` matrices of a second dataset.
# This rebinds `data_dir`; the later cell that reloads
# `hyper_link_tst_X_Y.txt` relies on the value set here.
# NOTE(review): absolute, user-specific path; change it to run elsewhere.
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-WikiTitles-500K'

add_noise(f'{data_dir}/hyper_link_trn_X_Y.txt', pct=0.5)
add_noise(f'{data_dir}/hyper_link_tst_X_Y.txt', pct=0.5)
# Last expression of the cell: its return value is what the notebook displays.
add_noise(f'{data_dir}/hyper_link_lbl_X_Y.txt', pct=0.5)

  0%|          | 0/1813391 [00:00<?, ?it/s]

  0%|          | 0/783743 [00:00<?, ?it/s]

  0%|          | 0/501070 [00:00<?, ?it/s]

<501070x2148579 sparse matrix of type '<class 'numpy.float32'>'
	with 85390 stored elements in Compressed Sparse Row format>

In [None]:
def verify(mat, noisy_mat):
    """Return the mean per-row fraction of `mat` entries that survive in `noisy_mat`.

    For each row, the number of stored entries of the element-wise product
    `mat * noisy_mat` is divided by the number of stored entries of `mat`.
    Rows of `mat` without any stored entry are left out of the average.
    """
    labels_per_row = mat.getnnz(axis=1)
    overlap = mat.multiply(noisy_mat)
    kept_per_row = overlap.getnnz(axis=1)

    # Skip empty rows so the ratio never divides by zero.
    has_labels = labels_per_row > 0
    kept_ratio = kept_per_row[has_labels] / labels_per_row[has_labels]
    return kept_ratio.mean()
    

In [None]:
# Reload the original `hyper_link_tst` matrix and the noisy copy that
# add_noise(..., pct=0.5) saved next to it, for comparison in the next cell.
# Uses whatever `data_dir` is currently bound to in the kernel.
mat = du.read_sparse_file(f'{data_dir}/hyper_link_tst_X_Y.txt')
noisy_mat = sp.load_npz(f'{data_dir}/hyper_link_tst_X_Y_noise-050.npz')

In [None]:
# Mean per-row fraction of original entries still present in the noisy matrix.
# Values above 0.5 are expected for pct=0.5: the per-row replacement count is
# truncated, so rows with an odd number of entries keep more than half.
verify(mat, noisy_mat)

0.5466656174185117