In [1]:
import pandas as pd
import numpy as np
import jsonlines
from tqdm import tqdm
from scipy.io import mmread
import os

In [2]:
# Each duplicate well is 1 datapoint

### Init

In [3]:
# Folder that is scanned for the per-view '.npz' image files.
image_folder = "/mnt/scratch/Son_cellpainting/my_cp_images/"
# Folder that receives one '.jsonl' file per assay column. The trailing slash is
# required: output paths are built by plain string concatenation with this value.
jsonl_folder = "/home/son.ha/FSL_CP_DataPrep/jsonl/"

### Load data

In [4]:
# Feature tables, read in a fixed order: CP, ECFP and RDKit.
feature_csvs = (
    'output/norm_CP_feature_df.csv',
    'output/norm_ECFP_feature_df.csv',
    'output/norm_RDKit_feature_df.csv',
)
CP_df, FP_df, RDKit_df = (pd.read_csv(csv_path) for csv_path in feature_csvs)

In [5]:
# The three feature tables must list the same INCHIKEYs in the same row order,
# so that a row position refers to the same entry in each of them.
cp_keys, rdkit_keys, fp_keys = (
    frame['INCHIKEY'] for frame in (CP_df, RDKit_df, FP_df)
)
assert cp_keys.equals(rdkit_keys)
assert rdkit_keys.equals(fp_keys)

In [6]:
# Assay labels; the first CSV column is used as the index.
assay = pd.read_csv('output/label_df.csv', index_col=0)

# Identifier columns re-indexed by INCHIKEY, then joined to the assay labels on
# the index (merge's default inner join).
# NOTE(review): presumably one row per well survives here, so an INCHIKEY may
# appear several times ("with_dupl") — confirm against the input tables.
well_metadata = FP_df[['INCHIKEY', 'CPD_SMILES', 'SAMPLE_KEY']].set_index('INCHIKEY')
labels_with_dupl = well_metadata.merge(assay, left_index=True, right_index=True)

### Find out which views are missing from each well

In [7]:
# Number of views each well is expected to have.
EXPECTED_VIEWS_PER_WELL = 6

# Map every sample key to the list of view identifiers found in image_folder.
# The first nine characters of a '.npz' file name are taken as the sample key
# and the single character right before the extension as the view.
view_counts = {}
for file in os.listdir(image_folder):
    if file.endswith('.npz'):
        sample_key = file[0:9]
        view = file[-5:-4]
        view_counts.setdefault(sample_key, []).append(view)

# Sort each well's views once, after the scan, instead of re-sorting the list
# on every append. The view identifiers are strings, so the order is lexicographic.
for views in view_counts.values():
    views.sort()

# Wells with the full set of views contribute 0 to the total. Wells that have
# no image file at all never enter view_counts and are therefore not counted.
count_missing = sum(
    EXPECTED_VIEWS_PER_WELL - len(views) for views in view_counts.values()
)
print(f"There are {count_missing} views missing across all wells")

There are 210 views missing across all wells


### Add a views column to the label dataframe

In [8]:
def _retrieve_view(sample_key):
    """Helper function. Retrive a list of available views given sample key"""
    return(view_counts[sample_key])

# Attach to every row the list of views found on disk for its SAMPLE_KEY.
# _retrieve_view indexes view_counts directly, so a SAMPLE_KEY without any
# image file raises KeyError here.
labels_with_dupl['VIEWS'] = labels_with_dupl['SAMPLE_KEY'].apply(_retrieve_view)

In [9]:
# Preview: treat -1 as missing in the column at position 3, keep the rows that
# still have a value there, and display whatever `.loc` returns for the second
# remaining index label (every row carrying that label if it is duplicated).
preview_column = labels_with_dupl.columns[3]
preview_rows = (
    labels_with_dupl.loc[:, ['CPD_SMILES', 'SAMPLE_KEY', preview_column]]
    .replace(-1, np.nan)  # np.nan, because the `np.NaN` alias was removed in NumPy 2.0
    .dropna()
)
labels_with_dupl.loc[preview_rows.index[1]]

Unnamed: 0,CPD_SMILES,SAMPLE_KEY,737823,737824,688422,688810,688812,688816,688724,688739,...,1301717,1301709,688671,845206,1301866,1301890,1301859,1301893,1495346,VIEWS
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25643-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25663-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25664-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25665-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"


In [10]:
# Index label of the first CP_df row for each SAMPLE_KEY. Built once up front,
# so the export loop does a dictionary lookup per record instead of scanning
# the whole CP_df for every record.
cp_row_by_sample = {}
for cp_row, cp_sample_key in zip(CP_df.index, CP_df['SAMPLE_KEY']):
    cp_row_by_sample.setdefault(cp_sample_key, cp_row)

# Every column between the two leading identifier columns and the trailing
# 'VIEWS' column is exported to its own '<column>.jsonl' file.
for column in tqdm(labels_with_dupl.columns[2:-1]):
    # Treat -1 as "no label" and drop those rows for this column.
    temp_df = (
        labels_with_dupl.loc[:, ['CPD_SMILES', 'SAMPLE_KEY', 'VIEWS', column]]
        .replace(-1, np.nan)  # np.nan, because the `np.NaN` alias was removed in NumPy 2.0
        .dropna()
    )
    # One record per remaining row (i.e. per well).
    items = [
        {
            'SMILES': smiles,
            'INCHIKEY': inchikey,
            'SAMPLE_KEY': sample_key,
            'VIEWS': views,
            # A SAMPLE_KEY that is absent from CP_df raises KeyError here
            # (the per-row scan this replaces raised IndexError).
            'NUM_ROW_CP_FEATURES': int(cp_row_by_sample[sample_key]),
            'LABEL': int(label),
        }
        for inchikey, smiles, sample_key, views, label in zip(
            temp_df.index,
            temp_df['CPD_SMILES'],
            temp_df['SAMPLE_KEY'],
            temp_df['VIEWS'],
            temp_df[column],
        )
    ]
    with jsonlines.open(jsonl_folder+column+'.jsonl', 'w') as writer:
        writer.write_all(items)

100%|██████████| 201/201 [11:11<00:00,  3.34s/it]
