### Inspired by <a href="https://github.com/shu65/open-problems-multimodal/blob/main/script/make_compressed_dataset.py">make_compressed_dataset.py</a> from the shu65/open-problems-multimodal repository

In [1]:
import argparse
import os

import numpy as np
import pandas as pd
import scipy
import scipy.sparse


In [3]:
def convert_to_parquet(filename, out_filename):
    """Re-save a CSV file as Parquet.

    Args:
        filename: Path of the CSV file to load with ``pd.read_csv``.
        out_filename: Output path *without* extension; ``".parquet"`` is
            appended to it.
    """
    pd.read_csv(filename).to_parquet(out_filename + ".parquet")

In [6]:
def convert_h5_to_sparse_csr(filename, out_filename, chunksize=2500):
    """Convert an HDF5 table to a sparse matrix plus its row/column labels.

    The file is read ``chunksize`` rows at a time with ``pd.read_hdf`` so that
    only one dense chunk is held in memory at once. Each chunk is converted to
    a ``scipy.sparse.csr_matrix`` and the chunks are stacked vertically once
    the whole file has been read. The cumulative number of rows read is
    printed after every chunk as a progress indicator.

    Two files are written:

    * ``out_filename + "_values.sparse"`` via ``scipy.sparse.save_npz`` (which
      appends ``.npz`` when the name does not already end with it) holding
      the stacked values.
    * ``out_filename + "_idxcol.npz"`` via ``np.savez`` holding the arrays
      ``index`` (row labels) and ``columns`` (column labels). If those labels
      are object-dtype (e.g. Python strings), loading them back with
      ``np.load`` requires ``allow_pickle=True``.

    Args:
        filename: Path of the HDF5 file to read.
        out_filename: Output path prefix; the suffixes above are appended.
        chunksize: Number of rows to read per iteration. Must be >= 1.

    Raises:
        ValueError: If ``chunksize`` is smaller than 1, if the file contains
            no rows, or if a chunk's columns differ from the first chunk's.
    """
    # Validate before touching the file: a non-positive chunk size would
    # otherwise read nothing (0) or produce meaningless slices (negative).
    if chunksize < 1:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")

    sparse_chunks = []
    index_chunks = []
    columns_name = None
    total_rows = 0
    start = 0

    while True:
        df_chunk = pd.read_hdf(filename, start=start, stop=start + chunksize)
        n_rows = len(df_chunk)
        if n_rows == 0:
            # Reached when the row count is an exact multiple of chunksize
            # (or the file is empty).
            break

        chunk_columns = df_chunk.columns.to_numpy()
        if columns_name is None:
            columns_name = chunk_columns
        elif not np.array_equal(columns_name, chunk_columns):
            # Explicit raise instead of ``assert`` so the check survives
            # ``python -O``; array_equal also copes with differing lengths.
            raise ValueError(
                f"columns of rows {start}..{start + n_rows - 1} in {filename!r} "
                "differ from the columns of the first chunk"
            )

        sparse_chunks.append(scipy.sparse.csr_matrix(df_chunk.to_numpy()))
        index_chunks.append(df_chunk.index.to_numpy())
        # Drop the dense chunk before the next read to keep peak memory low.
        del df_chunk

        total_rows += n_rows
        print(total_rows)

        if n_rows < chunksize:
            # A short chunk means the end of the file was reached.
            break
        start += chunksize

    if not sparse_chunks:
        # scipy.sparse.vstack([]) would fail with an unhelpful message.
        raise ValueError(f"no rows could be read from {filename!r}")

    all_data_sparse = scipy.sparse.vstack(sparse_chunks)
    del sparse_chunks

    all_indices = np.hstack(index_chunks)

    scipy.sparse.save_npz(out_filename + "_values.sparse", all_data_sparse)
    np.savez(out_filename + "_idxcol.npz", index=all_indices, columns=columns_name)

In [8]:
def compress(data_dir, output_data_dir):
    """Convert the raw dataset files into compact on-disk formats.

    The three CSV tables are re-saved as Parquet and the six HDF5 tables are
    re-saved as sparse matrices with separate index/column label files.

    Args:
        data_dir: Directory containing the ``<name>.csv`` and ``<name>.h5``
            input files.
        output_data_dir: Directory that receives the converted files; it is
            created if missing. ``None`` means "write next to the inputs",
            i.e. ``data_dir`` is used.
    """
    target_dir = data_dir if output_data_dir is None else output_data_dir

    # make sure you have write access
    os.makedirs(target_dir, exist_ok=True)

    csv_tables = ("evaluation_ids", "metadata", "sample_submission")
    h5_tables = (
        "test_cite_inputs",
        "test_multi_inputs",
        "train_cite_inputs",
        "train_cite_targets",
        "train_multi_inputs",
        "train_multi_targets",
    )

    for stem in csv_tables:
        source_path = os.path.join(data_dir, stem + ".csv")
        convert_to_parquet(source_path, os.path.join(target_dir, stem))

    for stem in h5_tables:
        source_path = os.path.join(data_dir, stem + ".h5")
        convert_h5_to_sparse_csr(source_path, os.path.join(target_dir, stem))

In [13]:
# Source directory: compress() reads the <prefix>.csv and <prefix>.h5 inputs from here.
data_dir = '/arc/project/st-jiaruid-1/yinian/multiome/'
# Destination directory: compress() creates it if needed and writes the converted files here.
output_data_dir = '/scratch/st-jiaruid-1/shenoy/data/multiome-sparse-rc/'

In [None]:
# Run the conversion. The numbers printed below are the cumulative row counts
# emitted after each chunk of each .h5 file (the count restarts for every file).
compress(data_dir, output_data_dir)

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
48663
2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
55935
2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
