In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.io import mmread
import os

In [2]:
# Each duplicate well is 1 datapoint

In [3]:
# Directory that is scanned further down for per-view image files (*.npz).
# NOTE(review): hard-coded absolute path — this only resolves on the original machine.
image_folder = "/mnt/scratch/Son_cellpainting/my_cp_images/"
# NOTE(review): jsonl_folder is not referenced by any cell visible in this notebook — confirm it is still needed.
jsonl_folder = "/home/son.ha/FSL_CP_DataPrep/jsonl/"

### Load Data

In [4]:
# Features
# Three feature tables read from the local output/ directory. Later cells rely on
# every table having an INCHIKEY column and on FP_df also carrying CPD_SMILES and
# SAMPLE_KEY.
# NOTE(review): the "norm_" file prefix suggests these were normalised upstream — confirm.
CP_df = pd.read_csv('output/norm_CP_feature_df.csv')
FP_df = pd.read_csv('output/norm_ECFP_feature_df.csv')
RDKit_df = pd.read_csv('output/norm_RDKit_feature_df.csv')

In [5]:
# Sanity check: the three feature tables must list the same INCHIKEYs in the same
# order. Series.equals is order-sensitive, so a re-sorted table fails as well.
# The messages say which pair of tables disagrees when a check fails.
assert CP_df['INCHIKEY'].equals(RDKit_df['INCHIKEY']), \
    "INCHIKEY column differs between the CP and RDKit feature tables"
assert RDKit_df['INCHIKEY'].equals(FP_df['INCHIKEY']), \
    "INCHIKEY column differs between the RDKit and ECFP feature tables"

In [6]:
# Assay labels: the first CSV column becomes the index of the label table
assay = pd.read_csv('output/label_df.csv', index_col=0)

# Attach the labels to every well by matching the INCHIKEY index on both sides
# (default inner merge, so only keys present in both tables survive).
labels_with_dupl = (
    FP_df[['INCHIKEY', 'CPD_SMILES', 'SAMPLE_KEY']]
    .set_index('INCHIKEY')
    .merge(assay, left_index=True, right_index=True)
)

### Find out which views are missing from each well

In [7]:
# Number of views every well is expected to have
EXPECTED_VIEWS_PER_WELL = 6

# Map each sample key (first 9 characters of the file name, e.g. "24305-D04")
# to the views found on disk (the single character right before ".npz").
view_counts = {}
for file in os.listdir(image_folder):
    if not file.endswith('.npz'):
        continue
    sample_key = file[0:9]
    view = file[-5:-4]
    view_counts.setdefault(sample_key, []).append(view)

# Sort each well's views once, after all files have been collected
for views in view_counts.values():
    views.sort()

# Wells without any image file never enter view_counts, so they do not
# contribute to this total.
count_missing = sum(
    EXPECTED_VIEWS_PER_WELL - len(views)
    for views in view_counts.values()
    if len(views) != EXPECTED_VIEWS_PER_WELL
)
print(f"There are {count_missing} views missing across all wells")

There are 210 views missing across all wells


### Add a views column to the label dataframe

In [8]:
def _retrieve_view(sample_key):
    """Helper function. Look up the list of available views for a sample key.

    Reads the module-level ``view_counts`` mapping; a sample key that is not
    in the mapping raises ``KeyError``.
    """
    available_views = view_counts[sample_key]
    return available_views


# One list of views per row, looked up through that row's SAMPLE_KEY
labels_with_dupl['VIEWS'] = labels_with_dupl['SAMPLE_KEY'].apply(_retrieve_view)

In [9]:
# Preview: keep the rows that have a real label (not -1) in the column at
# position 3, take the second index value among them, and display every row
# of labels_with_dupl that shares this index value.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
labels_with_dupl.loc[
    labels_with_dupl.loc[:, ['CPD_SMILES', 'SAMPLE_KEY', labels_with_dupl.columns[3]]]
    .replace(-1, np.nan)
    .dropna()
    .index[1]
]

Unnamed: 0,CPD_SMILES,SAMPLE_KEY,737823,737824,688422,688810,688812,688816,688724,688739,...,1301717,1301709,688671,845206,1301866,1301890,1301859,1301893,1495346,VIEWS
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25643-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25663-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25664-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"
BBIRKSILZBXBGJ-KDYSTLNUSA-N,OCc1ccc(CO[C@H]2C[C@H](C=C(O2)C(=O)NCc2nc3cccc...,25665-P14,1,0,1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[1, 2, 3, 4, 5, 6]"


In [18]:
# Build one long-format frame per assay: only the wells that have a real label
# for that assay are kept, with the label in LABEL and the assay id in ASSAY.

# Everything except the metadata columns is treated as an assay column. Selecting
# by name (instead of by position) keeps this correct wherever VIEWS ends up.
metadata_columns = ('CPD_SMILES', 'SAMPLE_KEY', 'VIEWS')
assay_columns = [c for c in labels_with_dupl.columns if c not in metadata_columns]

df_for_each_assay = []
for column in tqdm(assay_columns):
    temp_df = (
        labels_with_dupl.loc[:, ['CPD_SMILES', 'SAMPLE_KEY', 'VIEWS', column]]
        .replace(-1, np.nan)   # -1 marks "no label" for this well/assay
        .dropna()              # drop wells without a label for this assay
        .rename(columns={column: 'LABEL'})
        .assign(ASSAY=column)  # returns a new frame, so no explicit copy is needed
    )
    df_for_each_assay.append(temp_df)


100%|██████████| 201/201 [00:04<00:00, 46.68it/s]


In [20]:
# Stack the per-assay frames, turn the index into a regular column and make
# sure that column is called INCHIKEY (reset_index names it 'index' when the
# index itself has no name).
final_label_df = (
    pd.concat(df_for_each_assay)
    .reset_index()
    .rename(columns={'index': 'INCHIKEY'})
)
final_label_df

Unnamed: 0,INCHIKEY,CPD_SMILES,SAMPLE_KEY,VIEWS,LABEL,ASSAY
0,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24305-D04,"[1, 2, 3, 4, 5, 6]",1.0,737823
1,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24306-D04,"[1, 2, 3, 4, 5, 6]",1.0,737823
2,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24307-D04,"[1, 2, 3, 4, 5, 6]",1.0,737823
3,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24352-D04,"[1, 2, 3, 4, 5, 6]",1.0,737823
4,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,25955-D04,"[1, 2, 3, 4, 5, 6]",1.0,737823
...,...,...,...,...,...,...
286336,ZZRJKLPEGBZEAO-UHFFFAOYSA-N,COCCNC1=NN=C(CS1)c1ccc(NC(C)=O)cc1,25738-F12,"[1, 2, 3, 4, 5, 6]",0.0,1495346
286337,ZZZPZDFBUDZIOU-UHFFFAOYSA-N,O=S(=O)(c1ccccc1)n1ccc(n1)-c1cnc(s1)-c1ccccc1,25689-D07,"[1, 2, 3, 4, 5, 6]",0.0,1495346
286338,ZZZPZDFBUDZIOU-UHFFFAOYSA-N,O=S(=O)(c1ccccc1)n1ccc(n1)-c1cnc(s1)-c1ccccc1,25690-D07,"[1, 2, 3, 4, 5, 6]",0.0,1495346
286339,ZZZPZDFBUDZIOU-UHFFFAOYSA-N,O=S(=O)(c1ccccc1)n1ccc(n1)-c1cnc(s1)-c1ccccc1,25692-D07,"[1, 2, 3, 4, 5, 6]",0.0,1495346


In [24]:
# Write the long-format label table to disk; index=False leaves out the integer
# row index produced by reset_index().
# NOTE(review): VIEWS cells are Python lists, so they end up in the CSV as text —
# presumably consumers have to parse them back (e.g. ast.literal_eval); verify.
final_label_df.to_csv('output/FINAL_LABEL_DF.csv', index=False)