# マルチモーダルなシングルセル統合 スパースマトリックスデータセットの作成

In [1]:
# Mount Google Drive at /content/drive so the dataset files referenced by the
# later cells (all under /content/drive/MyDrive/...) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import scipy.sparse

# conversion functions

In [3]:
def convert_to_parquet(filename, out_filename):
    """Re-save a CSV file in Parquet format.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read with ``pd.read_csv``.
    out_filename : str
        Destination path *without* extension; ``".parquet"`` is appended.

    Returns
    -------
    None
        The function is called for its side effect of writing the file.
    """
    # Read first, then write: the whole CSV is loaded into memory in one go.
    pd.read_csv(filename).to_parquet(out_filename + ".parquet")

In [4]:
# Load the table of column names to exclude from the multi inputs; the CSV's
# first column is used as the DataFrame index (index_col=0).
multi_drop_columns = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/multi_drop_columns.csv',index_col=0)

In [5]:
# Preview the first rows of the drop-column table.
multi_drop_columns.head()

Unnamed: 0,multi_drop_columns
0,chr1633700634539
1,chr882278838228737
2,chr1629656630464
3,chr2148881331148882209
4,chr1630875631689


In [6]:
# Flatten the "multi_drop_columns" column into a plain Python list; this global
# is what convert_h5_to_sparse_csr consults to decide which columns to drop.
drop_list = multi_drop_columns.multi_drop_columns.values.tolist()

In [7]:
# Keep only the first 30000 entries of the drop list.
# NOTE(review): 30000 is a magic number whose rationale is not shown in this
# notebook -- consider promoting it to a named constant with an explanation.
drop_list = drop_list[:30000]

In [8]:
# Sanity check: number of column names that will be dropped.
len(drop_list)

30000

In [9]:
import scipy
def convert_h5_to_sparse_csr(filename, out_filename, chunksize=2500, drop_columns=None):
    """Convert an HDF5-stored DataFrame into a SciPy CSR matrix, chunk by chunk.

    The frame is read ``chunksize`` rows at a time with ``pd.read_hdf``; the
    unwanted columns are removed and each chunk is converted to CSR before the
    next one is read, so the dense data is never held in memory all at once.

    Two files are written:

    * ``out_filename + "_values.sparse"`` -- the vertically stacked CSR matrix,
      saved with ``scipy.sparse.save_npz`` (SciPy appends ``.npz`` to the name).
    * ``out_filename + "_idxcol.npz"`` -- the row index (key ``index``) and the
      retained column labels (key ``columns``), saved with ``np.savez``.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file to read.
    out_filename : str
        Path prefix for the two output files.
    chunksize : int, default 2500
        Number of rows read per iteration. Must be positive.
    drop_columns : iterable, optional
        Column labels to remove from every chunk. When omitted, the
        module-level ``drop_list`` is used if it exists (the behaviour this
        function had before the parameter was added); if it does not exist,
        no column is dropped.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive, or if no rows could be read.
    AssertionError
        If a later chunk's retained columns differ from the first chunk's.
    """
    if chunksize <= 0:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")

    if drop_columns is None:
        # Backward-compatible default: fall back to the notebook-level global.
        drop_columns = globals().get("drop_list", ())
    # A set gives O(1) membership tests; scanning a list of tens of thousands
    # of names once per column was the dominant cost of the old loop.
    drop_set = set(drop_columns)

    start = 0
    total_rows = 0

    sparse_chunks_data_list = []
    chunks_index_list = []
    columns_name = None
    while True:
        df_chunk = pd.read_hdf(filename, start=start, stop=start + chunksize)
        n_rows = len(df_chunk)
        if n_rows == 0:
            break

        if drop_set:
            # One vectorised selection instead of one full DataFrame copy per
            # dropped column.
            df_chunk = df_chunk.loc[:, ~df_chunk.columns.isin(drop_set)]

        sparse_chunks_data_list.append(scipy.sparse.csr_matrix(df_chunk.to_numpy()))
        chunks_index_list.append(df_chunk.index.to_numpy())

        chunk_columns = df_chunk.columns.to_numpy()
        if columns_name is None:
            columns_name = chunk_columns
        else:
            # array_equal also copes with a differing number of columns, where
            # an element-wise ``==`` would not yield a usable boolean array.
            assert np.array_equal(columns_name, chunk_columns), (
                "column labels changed between chunks"
            )

        # Release the dense chunk before the next read to keep peak memory low.
        del df_chunk

        total_rows += n_rows
        print(total_rows)
        if n_rows < chunksize:
            # A short chunk means the end of the table has been reached.
            break
        start += chunksize

    if not sparse_chunks_data_list:
        raise ValueError(f"no rows could be read from {filename!r}")

    all_data_sparse = scipy.sparse.vstack(sparse_chunks_data_list)
    del sparse_chunks_data_list

    all_indices = np.hstack(chunks_index_list)

    scipy.sparse.save_npz(out_filename + "_values.sparse", all_data_sparse)
    # NOTE(review): if the index/column labels are object-dtype strings they
    # are pickled, so loading needs np.load(..., allow_pickle=True) -- confirm.
    np.savez(out_filename + "_idxcol.npz", index=all_indices, columns=columns_name)

# H5 > sparse conversion

In [None]:
# Convert train_multi_inputs.h5. Outputs share the "adval_train_multi_inputs4"
# prefix: <prefix>_values.sparse (SciPy appends .npz) and <prefix>_idxcol.npz.
# The printed numbers are the cumulative row count after each chunk.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_inputs.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/adval_train_multi_inputs4")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000


In [None]:
# Convert test_multi_inputs.h5 with the same drop list. Outputs share the
# "adval_test_multi_inputs4" prefix: <prefix>_values.sparse (SciPy appends
# .npz) and <prefix>_idxcol.npz.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_multi_inputs.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/adval_test_multi_inputs4")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000


# csv > parquet conversion

In [None]:
# Writes metadata.parquet into the current working directory.
# NOTE(review): this uses a relative "../dataset" path, unlike the absolute
# Drive paths used for the H5 inputs -- confirm the kernel's working directory.
convert_to_parquet("../dataset/metadata.csv", "metadata")

In [None]:
# Writes evaluation.parquet into the current working directory.
# NOTE(review): the output name drops the "_ids" suffix of the source file, and
# the input path is relative -- confirm both are intended.
convert_to_parquet("../dataset/evaluation_ids.csv", "evaluation")

In [None]:
# Writes sample_submission.parquet into the current working directory.
# NOTE(review): the input path is relative to the kernel's working directory --
# confirm it resolves to the same dataset folder used elsewhere in the notebook.
convert_to_parquet("../dataset/sample_submission.csv", "sample_submission")