In [2]:
import os
import pandas as pd
import re
import shutil

In [2]:
# Keep only annotated compounds: every row whose 'moa' value is missing is
# dropped, and the remaining rows are written to a new CSV.

anno_csv = '/share/data/analyses/silvija/RT/data_cloome/our_images/P102785_cmpds_anno.csv'
anno_moa_csv = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing/P102785_cmpds_anno_MoA.csv'

df = pd.read_csv(anno_csv)
has_moa = df['moa'].notna()
df_cleaned = df[has_moa]
df_cleaned.to_csv(anno_moa_csv, index=False)


In [20]:
# Creating image metadata file
#
# Scans the channel TIFF directory and writes a CSV with one row per
# (well, site). Each row carries one column per fluorescence channel holding
# the file name of that channel's image, or 0 when no file was found for it.
img_directory = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff'  # or: preprocessing_bigger
plate_id = 'P102785'

# Named groups: WELL (one word character followed by digits), SITE and the
# excitation wavelength WAVE. The x/y tile indices are matched but not captured.
pattern = re.compile(r"(?P<WELL>\w\d+)_s(?P<SITE>\d+)_x\d+_y\d+_Fluorescence_(?P<WAVE>\d+)_nm_Ex")

# Excitation wavelength (as it appears in the file name) -> output column name.
channel_columns = {
    '405': 'IMG_Hoechst',
    '488': 'IMG_ERSyto',
    '561': 'IMG_Ph_golgi',
    '638': 'IMG_Mito',
    '730': 'IMG_ERSytoBleed',
}

# Rows indexed by (well, site) so each file is attached to its row with a
# dict lookup instead of a linear scan over every row collected so far.
rows_by_site = {}

for filename in os.listdir(img_directory):
    if not filename.endswith('.tiff'):
        continue

    # search() instead of match(): file names that carry a prefix in front of
    # the well (e.g. "<plate>_A03_s1_...") are still recognised. For names
    # that start with the well, the result is the same as match().
    match = pattern.search(filename)
    if match is None:
        continue

    well = match.group('WELL')
    site = match.group('SITE')
    wave = match.group('WAVE')

    row = rows_by_site.get((well, site))
    if row is None:
        # First file seen for this well/site: create the row with a 0
        # placeholder for every expected channel.
        row = {
            'SAMPLE_KEY': plate_id + "-" + well + "-" + site,
            'PLATE_ID': plate_id,
            'WELL_POSITION': well,
            'SITE': site,
            **dict.fromkeys(channel_columns, 0),
        }
        rows_by_site[(well, site)] = row
    row[wave] = filename

metadata = list(rows_by_site.values())
if not metadata:
    # Without this check the failure would surface later as a KeyError from
    # sort_values() on a frame that has no columns.
    raise ValueError(f"No image files matching the expected name pattern were found in {img_directory}")

metadata_df = pd.DataFrame(metadata).rename(columns=channel_columns)

# NOTE: SITE is still a string here, so the ordering is lexicographic
# ('10' sorts before '2').
metadata_df_sorted = metadata_df.sort_values(by=['PLATE_ID', 'WELL_POSITION', 'SITE'])

csv_filename = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_images.csv'     # Added dir preprocessing_all
metadata_df_sorted.to_csv(csv_filename, index=False)

In [4]:
# Smiles to INCHIKEY
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_inchikey(smiles):
    """Convert a SMILES string to an InChIKey.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule. Any other type (for example the
        NaN/None that a missing annotation produces) is treated as
        "no structure available".

    Returns
    -------
    str or None
        The value returned by ``Chem.MolToInchiKey`` for the parsed molecule,
        or ``None`` when the input is not a non-blank string, when
        ``Chem.MolFromSmiles`` yields a falsy result, or when the conversion
        raises.
    """
    # Reject missing / blank values up front rather than relying on the
    # toolkit raising for them.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return None
        return Chem.MolToInchiKey(mol)
    except Exception:
        # Best effort: a structure that cannot be converted yields None.
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        return None

In [22]:
# Join the per-site image metadata with the compound annotations, set aside
# the image rows that have no annotation, and add an INCHIKEY column derived
# from the SMILES strings.
metadata_file = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_images.csv'            # Added dir preprocessing_all
# Annotation file used for preprocessing_bigger:
#   '/share/data/analyses/silvija/RT/data_cloome/our_images/P102785_cmpds_anno_MoA.csv'
cmpds_anno_file = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/P102785_cmpds_ann_cleaned.csv'

outfile = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785.csv'                         # Added dir preprocessing_all
outfile_removed = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_images-MoA_unclassified.csv'

metadata_df = pd.read_csv(metadata_file)
cmpds_anno_df = pd.read_csv(cmpds_anno_file)

# Left join: every image row is kept; annotation columns stay empty for
# plate/well combinations that are absent from the annotation file.
merged_df = metadata_df.merge(
    cmpds_anno_df,
    how='left',
    left_on=['PLATE_ID', 'WELL_POSITION'],
    right_on=['barcode', 'well-id'],
).rename(columns={
    'cbk-id': 'CPD_NAME',
    'cmpd-name': 'CPD_NAME_TYPE',
    'smiles': 'SMILES',
})

# An empty 'well-id' after the join means the image row found no annotation
# row. Those rows are written to their own file and excluded from the result.
unmatched = merged_df['well-id'].isna()
removed = merged_df[unmatched]
removed.to_csv(outfile_removed, index=False)
merged_df_MoA = merged_df[~unmatched]

# The join keys from the annotation side are no longer needed.
df = merged_df_MoA.drop(columns=['well-id', 'barcode'])
df = df.assign(INCHIKEY=df['SMILES'].apply(smiles_to_inchikey))

column_order = ['SAMPLE_KEY', 'PLATE_ID', 'WELL_POSITION', 'SITE', 'CPD_NAME', 'CPD_NAME_TYPE', 'SMILES', 'INCHIKEY', 'IMG_ERSyto', 'IMG_ERSytoBleed', 'IMG_Hoechst', 'IMG_Mito', 'IMG_Ph_golgi']
final_df = df.loc[:, column_order]

In [23]:
# Keep only the first row for each SAMPLE_KEY, write the result to `outfile`
# and print it.
repeated_key = final_df.duplicated(subset='SAMPLE_KEY', keep='first')
final_df_smiles_unique = final_df[~repeated_key]
final_df_smiles_unique.to_csv(outfile, index=False)
print(final_df_smiles_unique)

         SAMPLE_KEY PLATE_ID WELL_POSITION  SITE   CPD_NAME CPD_NAME_TYPE  \
0     P102785-A03-1  P102785           A03     1  CBK290225     PD 198306   
1     P102785-A03-2  P102785           A03     2  CBK290225     PD 198306   
2     P102785-A03-3  P102785           A03     3  CBK290225     PD 198306   
3     P102785-A03-4  P102785           A03     4  CBK290225     PD 198306   
4     P102785-A03-5  P102785           A03     5  CBK290225     PD 198306   
...             ...      ...           ...   ...        ...           ...   
2452  P102785-P23-5  P102785           P23     5  CBK290225     PD 198306   
2453  P102785-P23-6  P102785           P23     6  CBK290225     PD 198306   
2454  P102785-P23-7  P102785           P23     7  CBK290225     PD 198306   
2455  P102785-P23-8  P102785           P23     8  CBK290225     PD 198306   
2456  P102785-P23-9  P102785           P23     9  CBK290225     PD 198306   

                                           SMILES  \
0     Cc1cc(I)ccc1Nc1c

In [None]:
# Moving images for which MoA is unclassified to another directory
#
# Reads the metadata CSV listing the unclassified image rows and moves every
# image file named in its channel columns from `img_dir` to `img_dir_noMoA`.

img_dir = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_bigger/channels_tiff'
img_dir_noMoA = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_bigger/channels_tiff_noMoA'

metadata_to_move = '/share/data/analyses/silvija/RT/data_cloome/our_images/metadata_P102785_images-MoA_unclassified.csv'

df = pd.read_csv(metadata_to_move)

# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(img_dir_noMoA, exist_ok=True)

image_columns = ['IMG_ERSyto', 'IMG_ERSytoBleed', 'IMG_Hoechst', 'IMG_Mito', 'IMG_Ph_golgi']

# Row by row, channel by channel (same order as the columns above).
for image_files in df[image_columns].itertuples(index=False, name=None):
    for image_filename in image_files:
        # The image-metadata file uses 0 as the placeholder for a channel
        # without an image; after the CSV round trip it can come back as the
        # integer 0 or the string '0'. Neither is a file name, and an integer
        # would make os.path.join raise TypeError, so skip both along with
        # empty cells.
        if pd.isna(image_filename) or str(image_filename) == '0':
            continue

        image_filename = str(image_filename)
        source_path = os.path.join(img_dir, image_filename)
        destination_path = os.path.join(img_dir_noMoA, image_filename)
        if os.path.exists(source_path):
            shutil.move(source_path, destination_path)
            print(f"Moved: {image_filename}")
        else:
            print(f"File not found: {image_filename}")

In [4]:
# Zipping images based on channels
#
# Creates one archive per selected channel, named
# <plate_id>_channel_<k>.zip, where k is the 1-based position of the channel
# in `all_channels`.

img_dir = "/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff"
zip_dir = "/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff_zip"

# Full channel list: it fixes the archive numbering, so zipping only a subset
# still yields the same archive name for a given channel.
all_channels = ['405', '488', '561', '638', '730']
channels = ['638', '730']  # subset to zip in this run; must be taken from all_channels
plate_id = 'P102785'

# `zip` needs the target directory to exist.
os.makedirs(zip_dir, exist_ok=True)

for channel in channels:
    # Raises ValueError if `channel` is not in `all_channels`.
    channel_number = all_channels.index(channel) + 1
    zip_name = f"{plate_id}_channel_{channel_number}.zip"

    # The shell expands the glob; every .tiff whose name contains the channel
    # wavelength is added to the archive.
    command = f"zip {zip_dir}/{zip_name} {img_dir}/*{channel}*.tiff  -r"
    result = os.system(command)

    # Report the archive that was actually written, and do not claim success
    # when the command returned a non-zero status.
    if result == 0:
        print(f"Zipped channel {channel} into {zip_name}")
    else:
        print(f"zip failed for channel {channel} (exit status {result}): {command}")

  adding: share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff/P102785_A03_s1_x0_y0_Fluorescence_638_nm_Ex.tiff (deflated 3%)
  adding: share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff/P102785_A03_s2_x1_y0_Fluorescence_638_nm_Ex.tiff (deflated 3%)
  adding: share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff/P102785_A03_s3_x2_y0_Fluorescence_638_nm_Ex.tiff (deflated 2%)
  adding: share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff/P102785_A03_s4_x0_y1_Fluorescence_638_nm_Ex.tiff (deflated 3%)
  adding: share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff/P102785_A03_s5_x1_y1_Fluorescence_638_nm_Ex.tiff (deflated 2%)
  adding: share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff/P102785_A03_s6_x2_y1_Fluorescence_638_nm_Ex.tiff (deflated 2%)
  adding: share/data/analyses/silvija/RT/data_