# マルチモーダルなシングルセル統合 スパースマトリックスデータセットの作成

In [1]:
# Mount Google Drive at /content/drive; the dataset paths used in the cells
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import scipy.sparse

# conversion functions

In [3]:
def convert_to_parquet(filename, out_filename):
    """Re-save a CSV file as Parquet.

    Args:
        filename: Path of the CSV file, read with ``pd.read_csv`` defaults.
        out_filename: Output path without extension; ``".parquet"`` is
            appended to it.
    """
    pd.read_csv(filename).to_parquet(out_filename + ".parquet")

In [4]:
# Load the table of CITE-seq input columns to exclude; its `drop_columns`
# column is turned into `drop_list` in a later cell.
cite_drop_columns = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/cite_drop_columns.csv',index_col=0)

In [5]:
# Preview the first rows of the drop-column table.
cite_drop_columns.head()

Unnamed: 0,drop_columns
0,ENSG00000230202_AL450405.1
1,ENSG00000105205_CLC
2,ENSG00000229391_HLA-DRB6
3,ENSG00000198804_MT-CO1
4,ENSG00000225630_MTND2P28


In [6]:
# Plain Python list of the column labels to exclude; read as a global by
# convert_h5_to_sparse_csr and by the column-filtering cells.
drop_list = cite_drop_columns["drop_columns"].to_numpy().tolist()

In [None]:
# Load the full train_cite_inputs.h5 table into memory as a DataFrame.
original_train = pd.read_hdf("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_cite_inputs.h5")

In [None]:
# Remove every column listed in drop_list with one boolean mask. Dropping the
# columns one at a time copies the whole DataFrame per dropped column and does
# a linear list lookup per column.
original_train = original_train.loc[:, ~original_train.columns.isin(drop_list)]

In [None]:
# Show (rows, columns) of the training inputs after the column filtering.
original_train.shape

In [None]:
# Persist the filtered training inputs. `key` is a required argument of
# DataFrame.to_hdf (omitting it raises TypeError); mode="w" overwrites any
# existing file, matching how the test set is written.
original_train.to_hdf("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/adval_train_cite_inputs.h5", key="key", mode="w")

In [7]:
# Load the full test_cite_inputs.h5 table into memory as a DataFrame.
original_test = pd.read_hdf('/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_cite_inputs.h5')

In [8]:
# Remove every column listed in drop_list with one boolean mask. Dropping the
# columns one at a time copies the whole DataFrame per dropped column and does
# a linear list lookup per column.
original_test = original_test.loc[:, ~original_test.columns.isin(drop_list)]

In [9]:
# Show (rows, columns) of the test inputs after the column filtering.
original_test.shape

(48663, 11614)

In [11]:
# Persist the filtered test inputs, overwriting any existing file. `key` is
# passed by keyword because positional use is deprecated in pandas 2.x and
# keyword-only afterwards.
original_test.to_hdf("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/adval_test_cite_inputs.h5", key="key", mode="w")

In [None]:
import scipy
def convert_h5_to_sparse_csr(filename, out_filename, chunksize=2500, drop_columns=None):
    """Convert an HDF5 table into a SciPy CSR matrix plus its row/column labels.

    The table is read ``chunksize`` rows at a time so the dense data never has
    to be held in memory all at once. Each chunk is converted to CSR and the
    chunks are stacked vertically at the end.

    Args:
        filename: Path of the HDF5 file, read with ``pd.read_hdf``.
        out_filename: Output path prefix. The matrix is saved with
            ``scipy.sparse.save_npz`` under ``out_filename + "_values.sparse"``
            (SciPy appends ``.npz`` when the name does not end with it) and the
            labels with ``np.savez`` under ``out_filename + "_idxcol.npz"``
            (arrays ``index`` and ``columns``).
        chunksize: Number of rows read per chunk.
        drop_columns: Optional iterable of column labels to exclude. When
            ``None`` the module-level ``drop_list`` is used, which is what the
            function did before this parameter existed.

    Raises:
        ValueError: If the file yields no rows.
        AssertionError: If two chunks disagree on their column labels.
    """
    if drop_columns is None:
        drop_columns = drop_list
    # Build the lookup once; membership tests on a set are O(1).
    excluded = set(drop_columns)

    start = 0
    total_rows = 0
    sparse_chunks_data_list = []
    chunks_index_list = []
    columns_name = None

    while True:
        df_chunk = pd.read_hdf(filename, start=start, stop=start + chunksize)
        # An empty read means the previous chunk ended exactly at the last row.
        if len(df_chunk) == 0:
            break

        # Drop all excluded columns in one call; dropping them one by one
        # copies the whole chunk for every dropped column.
        to_drop = [col for col in df_chunk.columns if col in excluded]
        if to_drop:
            df_chunk = df_chunk.drop(columns=to_drop)

        sparse_chunks_data_list.append(scipy.sparse.csr_matrix(df_chunk.to_numpy()))
        chunks_index_list.append(df_chunk.index.to_numpy())

        chunk_columns = df_chunk.columns.to_numpy()
        if columns_name is None:
            columns_name = chunk_columns
        else:
            # array_equal also copes with a length mismatch between chunks.
            assert np.array_equal(columns_name, chunk_columns), (
                "column labels differ between chunks"
            )

        chunk_rows = len(df_chunk)
        total_rows += chunk_rows
        print(total_rows)

        # Release the dense chunk before reading the next one.
        del df_chunk
        if chunk_rows < chunksize:
            # A short chunk is the last one.
            break
        start += chunksize

    if not sparse_chunks_data_list:
        raise ValueError(f"no rows could be read from {filename!r}")

    all_data_sparse = scipy.sparse.vstack(sparse_chunks_data_list)
    del sparse_chunks_data_list

    all_indices = np.hstack(chunks_index_list)

    scipy.sparse.save_npz(out_filename + "_values.sparse", all_data_sparse)
    np.savez(out_filename + "_idxcol.npz", index=all_indices, columns=columns_name)

# H5 > sparse conversion

In [None]:
# Convert train_multi_targets.h5 into the sparse-values and index/column files.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_targets.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_targets")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
72500
75000
77500
80000
82500
85000
87500
90000
92500
95000
97500
100000
102500
105000
105942


In [None]:
# Convert train_multi_inputs.h5 into the sparse-values and index/column files.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_inputs.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_inputs")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
72500
75000
77500
80000
82500
85000
87500
90000
92500
95000
97500
100000
102500
105000
105942


In [None]:
# Convert train_cite_targets.h5 into the sparse-values and index/column files.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_cite_targets.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_cite_targets")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
70988


In [None]:
# Reads the original train_cite_inputs.h5 but writes under the
# adval_train_cite_inputs prefix: convert_h5_to_sparse_csr removes the
# drop_list columns while converting.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_cite_inputs.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/adval_train_cite_inputs")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
70988


In [None]:
# Convert test_multi_inputs.h5 into the sparse-values and index/column files.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_multi_inputs.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_multi_inputs")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
55935


In [None]:
# Reads the original test_cite_inputs.h5 but writes under the
# adval_test_cite_inputs prefix: convert_h5_to_sparse_csr removes the
# drop_list columns while converting.
convert_h5_to_sparse_csr("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_cite_inputs.h5", "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/adval_test_cite_inputs")

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
48663


# csv > parquet conversion

In [None]:
# Writes metadata.parquet into the current working directory.
# NOTE(review): relative input path, unlike the absolute Drive paths used by
# the other cells - confirm the working directory before running.
convert_to_parquet("../dataset/metadata.csv", "metadata")

In [None]:
# Writes evaluation.parquet (note: not evaluation_ids.parquet) into the
# current working directory.
convert_to_parquet("../dataset/evaluation_ids.csv", "evaluation")

In [None]:
# Writes sample_submission.parquet into the current working directory.
convert_to_parquet("../dataset/sample_submission.csv", "sample_submission")