# Process data for model training

The model requires the data in a specific format. The images and labels were previously concatenated into a smaller number of files to make them easier to upload/download in batches. This notebook splits those concatenated files back into per-protein files in the format the model expects.

In [1]:
import numpy as np
import polars as pl
import os
from tqdm import tqdm

In [2]:
# Data paths used throughout this notebook.

# Inputs: label CSVs (read with pl.read_csv) and the image arrays that go
# with them (read with np.load).
meta_dir = "/dgx1nas1/storage/data/jess/cytoself/labels"
img_dir = "/dgx1nas1/storage/data/jess/cytoself/data"

# Output: destination of the per-protein .npy files written by np.save.
out_dir = "/dgx1nas1/storage/data/jess/cytoself/model_data"

# Not referenced by the cells visible here; presumably the sample data shipped
# with the cytoself repo, kept as a format reference -- TODO confirm.
sample_dir = "/dgx1nas1/storage/data/jess/repos/cytoself/sample_data"

In [9]:
# List the label and image files. os.listdir returns entries in arbitrary
# order, and later cells take the first substring match from these lists, so
# sort them to make that choice reproducible.
meta_files = sorted(os.listdir(meta_dir))
img_files = sorted(os.listdir(img_dir))

# Root name shared by a label file and its image file: the label file name
# with 'label_' and '.csv' removed. Only CSV entries are considered so that
# stray directory entries (e.g. hidden checkpoint folders) do not turn into
# roots that match no image file.
root_files = [
    f.replace('label_', '').replace('.csv', '')
    for f in meta_files
    if f.endswith('.csv')
]

In [8]:
# Scratch cell: look at the label rows of a single protein from one label
# file before converting everything.
rf = "label_data00.csv"
mf = [f for f in meta_files if rf in f][0]  # first listed file whose name contains rf

# Add each row's position in the CSV as an 'index' column.
meta = pl.read_csv(f"{meta_dir}/{mf}").with_row_index('index')

# Distinct values of the 'name' column; take whichever comes first.
proteins = meta.get_column("name").unique().to_list()
prot = proteins[0]

# Rows for that protein, their row positions, and the rows as a plain array
# (without the helper 'index' column).
prot_df = meta.filter(pl.col("name") == prot)
inds = prot_df.get_column('index').to_list()
prot_df = prot_df.drop('index').to_numpy()
prot_df

array([['ENSG00000117758', 'STX12', 'vesicles', ..., None, 84, 27930],
       ['ENSG00000117758', 'STX12', 'vesicles', ..., None, 84, 27930],
       ['ENSG00000117758', 'STX12', 'vesicles', ..., None, 84, 27930],
       ...,
       ['ENSG00000117758', 'STX12', 'vesicles', ..., None, 84, 7744],
       ['ENSG00000117758', 'STX12', 'vesicles', ..., None, 84, 7744],
       ['ENSG00000117758', 'STX12', 'vesicles', ..., None, 84, 7744]],
      dtype=object)

In [10]:
# Reformat data to match what model is expecting
# Time: takes ~45s per input file
#
# For every root name: load the label CSV and the matching image array, then
# write three .npy files per protein name found in the labels:
#   <prot>_label.npy  label rows for that protein
#   <prot>_pro.npy    channel 0 of that protein's images
#   <prot>_nuc.npy    channel 1 of that protein's images
# (presumably the protein and nucleus channels, going by the file suffixes)

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for rf in root_files:
    # Pair label and image files by substring match on the root name.
    mf = [f for f in meta_files if rf in f][0]
    imgf = [f for f in img_files if rf in f][0]

    # 'index' holds each row's position in the CSV; it is used below to pick
    # the corresponding entries along the first axis of the image array.
    meta = pl.read_csv(f"{meta_dir}/{mf}").with_row_index('index')
    # NOTE(review): allow_pickle=True will unpickle arbitrary objects; only
    # load files from a trusted source.
    dat = np.load(f"{img_dir}/{imgf}", allow_pickle=True)

    proteins = meta.select("name").to_series().unique().to_list()
    for prot in tqdm(proteins):
        prot_df = meta.filter(pl.col("name") == prot)
        inds = prot_df.select('index').to_series().to_list()
        # Label rows without the helper 'index' column, computed once.
        labels = prot_df.drop('index').to_numpy()

        # NOTE(review): output names depend only on the protein name, so a
        # protein present in more than one input file would be overwritten by
        # the later file -- confirm each protein lives in exactly one file.
        np.save(f"{out_dir}/{prot}_label.npy", labels)
        np.save(f"{out_dir}/{prot}_pro.npy", dat[inds, :, :, 0])
        np.save(f"{out_dir}/{prot}_nuc.npy", dat[inds, :, :, 1])

100%|██████████| 131/131 [00:00<00:00, 228.08it/s]
100%|██████████| 131/131 [00:00<00:00, 240.66it/s]
100%|██████████| 131/131 [00:00<00:00, 247.91it/s]
100%|██████████| 131/131 [00:00<00:00, 222.81it/s]
100%|██████████| 131/131 [00:00<00:00, 238.96it/s]
100%|██████████| 132/132 [00:00<00:00, 267.09it/s]
100%|██████████| 131/131 [00:00<00:00, 236.57it/s]
100%|██████████| 131/131 [00:00<00:00, 255.07it/s]
100%|██████████| 131/131 [00:00<00:00, 242.29it/s]
100%|██████████| 131/131 [00:00<00:00, 261.68it/s]
