# Process data for model training

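The model requires the data in a specific format. The images and labels were previously concatenated into a smaller number of files to make them easier to upload/download in batches. This script splits those batch files back into the per-protein files the model expects: for each protein, one label file (`<name>_label.npy`) and one image file per channel (`<name>_pro.npy`, `<name>_nuc.npy`).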

In [1]:
import numpy as np
import polars as pl
import os
from tqdm import tqdm

In [2]:
# Filesystem locations used by this notebook.

# Inputs: label CSVs and the image arrays they describe.
meta_dir = "/dgx1nas1/storage/data/jess/cytoself/labels"
img_dir = "/dgx1nas1/storage/data/jess/cytoself/data"

# Output: where the per-protein .npy files are written.
out_dir = "/dgx1nas1/storage/data/jess/cytoself/model_data"

# sample_data folder under repos/cytoself (not referenced by the cells shown here).
sample_dir = "/dgx1nas1/storage/data/jess/repos/cytoself/sample_data"

In [5]:
# Collect the input files.
# os.listdir returns every directory entry in arbitrary order, so sort for a
# reproducible processing order and keep only the .csv label files (stray
# entries such as hidden files would otherwise become bogus roots).
meta_files = sorted(f for f in os.listdir(meta_dir) if f.endswith('.csv'))
img_files = sorted(os.listdir(img_dir))

# Root name used to pair a label file with its image file: the label filename
# with 'label_' and '.csv' removed.
root_files = [f.replace('label_', '').replace('.csv', '') for f in meta_files]

In [6]:
# Reformat data to match what model is expecting
# Time: takes ~45s per input file


def _match_one(root, candidates, kind):
    """Return the one filename in `candidates` that belongs to `root`.

    A filename matches when it contains `root`. If several names match
    (e.g. root 'data1' is also contained in 'data10.npy'), the choice is
    narrowed to names whose stem (name without extension) ends with `root`.

    Args:
        root: root name shared by a label file and its image file.
        candidates: filenames to search.
        kind: short description of the file type, used in error messages.

    Raises:
        FileNotFoundError: no candidate contains `root`.
        ValueError: more than one candidate matches and narrowing by stem
            does not leave exactly one.
    """
    hits = [f for f in candidates if root in f]
    if not hits:
        raise FileNotFoundError(f"no {kind} file found for root '{root}'")
    if len(hits) == 1:
        return hits[0]

    exact = [f for f in hits if os.path.splitext(f)[0].endswith(root)]
    if len(exact) == 1:
        return exact[0]
    raise ValueError(f"ambiguous {kind} file for root '{root}': {hits}")


# np.save does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

for rf in root_files:
    mf = _match_one(rf, meta_files, "label")
    imgf = _match_one(rf, img_files, "image")

    # The row index added here is the position of each label row; it is used
    # below to pick the matching entries along the first axis of the image array.
    meta = pl.read_csv(f"{meta_dir}/{mf}").with_row_index('index')
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only load files from a trusted source, and drop the flag if the arrays
    # are plain numeric.
    dat = np.load(f"{img_dir}/{imgf}", allow_pickle=True)

    # Labels and images are paired purely by position, so a length mismatch
    # means the pairing cannot be trusted.
    if len(dat) != meta.height:
        raise ValueError(
            f"{imgf} has {len(dat)} images but {mf} has {meta.height} label rows"
        )

    proteins = meta.select("name").to_series().unique().to_list()
    for prot in tqdm(proteins):
        # save label info
        prot_df = meta.filter(pl.col("name") == prot)
        np.save(f"{out_dir}/{prot}_label.npy", prot_df.drop('index').to_numpy())

        # save images: last-axis index 0 goes to *_pro, index 1 to *_nuc
        # (presumably the protein and nucleus channels; confirm against the
        # upstream export).
        inds = prot_df.select('index').to_series().to_list()
        np.save(f"{out_dir}/{prot}_pro.npy", dat[inds, :, :, 0])
        np.save(f"{out_dir}/{prot}_nuc.npy", dat[inds, :, :, 1])

100%|██████████| 131/131 [00:00<00:00, 233.60it/s]
100%|██████████| 131/131 [00:00<00:00, 219.14it/s]
100%|██████████| 131/131 [00:00<00:00, 228.81it/s]
100%|██████████| 131/131 [00:00<00:00, 203.01it/s]
100%|██████████| 131/131 [00:00<00:00, 223.24it/s]
100%|██████████| 132/132 [00:00<00:00, 246.18it/s]
100%|██████████| 131/131 [00:00<00:00, 213.74it/s]
100%|██████████| 131/131 [00:00<00:00, 221.97it/s]
100%|██████████| 131/131 [00:00<00:00, 217.58it/s]
100%|██████████| 131/131 [00:00<00:00, 219.63it/s]
