In [1]:
import os
import pandas as pd
import re
import shutil

In [2]:
# Keep only compounds that carry a mechanism-of-action ('moa') value.
# DMSO rows are given an explicit 'dmso' label first, so they are not
# discarded by the filter.

df = pd.read_csv('/share/data/analyses/silvija/RT/data_cloome/our_images/P102785_cmpds_anno.csv')

# 13.09. In order to include DMSO for z-score
is_dmso_row = df['SMILES'].eq('[dmso]')
df.loc[is_dmso_row, 'moa'] = 'dmso'

# Rows with a missing 'moa' are left out of the cleaned table.
has_moa = df['moa'].notna()
df_cleaned = df[has_moa]
df_cleaned.to_csv(
    '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing/P102785_cmpds_anno_MoA.csv',
    index=False,
)


In [5]:
# Creating image metadata file: one row per (well, site) with one column per
# fluorescence channel that holds the matching .tiff file name.
img_directory = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff_DMSO' # preprocessing_bigger

metadata = []

# (well, site) -> row dict that is already stored in `metadata`. This gives a
# constant-time lookup instead of re-scanning the whole list for every file.
rows_by_position = {}

pattern = re.compile(r"(?P<plate>\w+)_(?P<WELL>\w\d+)_s(?P<SITE>\d+)_x\d+_y\d+_Fluorescence_(?P<WAVE>\d+)_nm_Ex")

# The plate id is fixed for this notebook (the 'plate' regex group is not used).
plate_id = 'P102785'

# os.listdir returns entries in arbitrary order; sorting makes the result
# reproducible if two files ever map to the same well/site/channel
# (the last one in sorted order is kept).
for filename in sorted(os.listdir(img_directory)):
    if not filename.endswith('.tiff'):
        continue
    match = pattern.match(filename)
    if not match:
        # File name does not follow the expected pattern; ignore it.
        continue

    well = match.group('WELL')
    site = match.group('SITE')
    wave = match.group('WAVE')

    existing_row = rows_by_position.get((well, site))
    if existing_row is None:
        # First image seen for this well/site: start a row in which every
        # channel is 0 until its image file is encountered.
        sample_key = plate_id + "-" + well + "-" + site
        row = {
            'SAMPLE_KEY': sample_key,
            'PLATE_ID': plate_id,
            'WELL_POSITION': well,
            'SITE': site,
            '405': 0,
            '488': 0,
            '561': 0,
            '638': 0,
            '730': 0,
        }
        rows_by_position[(well, site)] = row
        metadata.append(row)
        existing_row = row

    existing_row[wave] = filename

# Wavelength columns are renamed to the stain/channel names used downstream.
metadata_df = pd.DataFrame(metadata).rename(columns={
    '405': 'IMG_Hoechst',
    '488': 'IMG_ERSyto',
    '561': 'IMG_Ph_golgi',
    '638': 'IMG_Mito',
    '730': 'IMG_ERSytoBleed'
})

# NOTE: SITE is a string at this point, so the ordering is lexicographic
# (a site '10' would sort before '2').
metadata_df_sorted = metadata_df.sort_values(by=['PLATE_ID', 'WELL_POSITION', 'SITE'])

csv_filename = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_images_DMSO.csv'     # Added dir preprocessing_all   #13.09 _DMSO
metadata_df_sorted.to_csv(csv_filename, index=False)

In [7]:
# Smiles to INCHIKEY
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_inchikey(smiles):
    """Convert a SMILES string to an InChIKey.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    str or None
        The value returned by ``Chem.MolToInchiKey`` for the parsed molecule,
        or ``None`` when the input is not a string (e.g. a missing value),
        when ``Chem.MolFromSmiles`` yields no molecule, or when the
        conversion raises an exception.
    """
    # Missing values (None / float NaN) are not SMILES strings; return early
    # instead of relying on an exception from the parser.
    if not isinstance(smiles, str):
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return None
        return Chem.MolToInchiKey(mol)
    except Exception:
        # Best-effort conversion: a failure for one molecule yields None.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return None

In [11]:
# 13.09 : _DMSO
# Attach the compound annotations to the per-image metadata and bring the
# result into the final column layout.
metadata_file = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_images_DMSO.csv'            # Added dir preprocessing_all
#cmpds_anno_file = '/share/data/analyses/silvija/RT/data_cloome/our_images/P102785_cmpds_anno_MoA.csv'                            # Used for preprocessing_bigger
cmpds_anno_file = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/P102785_cmpds_ann_cleaned_DMSO.csv'

outfile = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_DMSO.csv'                         # Added dir preprocessing_all
outfile_removed = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_images-MoA_unclassified_DMSO.csv'

metadata_df = pd.read_csv(metadata_file)
cmpds_anno_df = pd.read_csv(cmpds_anno_file)

# Left join: every image row is kept; annotation columns are NaN where the
# (plate, well) pair has no entry in the annotation table.
merged_df = metadata_df.merge(
    cmpds_anno_df,
    how='left',
    left_on=['PLATE_ID', 'WELL_POSITION'],
    right_on=['barcode', 'well-id'],
).rename(columns={
    'cbk-id': 'CPD_NAME',
    'cmpd-name': 'CPD_NAME_TYPE',
    'smiles': 'SMILES'
})

# Rows without an annotation match (NaN in 'well-id') are saved to a separate
# file and excluded from the main table.
no_annotation = merged_df['well-id'].isna()
removed = merged_df[no_annotation]
removed.to_csv(outfile_removed, index=False)
merged_df_MoA = merged_df[~no_annotation]

# 13.09 DMSO: every remaining row gets the same fixed SMILES string, and the
# InChIKey is derived from that column.
df = (
    merged_df_MoA
    .drop(columns=['well-id', 'barcode'])
    .assign(
        SMILES="CS(C)=O",
        INCHIKEY=lambda frame: frame['SMILES'].apply(smiles_to_inchikey),
    )
)

column_order = [
    'SAMPLE_KEY', 'PLATE_ID', 'WELL_POSITION', 'SITE',
    'CPD_NAME', 'CPD_NAME_TYPE', 'SMILES', 'INCHIKEY',
    'IMG_ERSyto', 'IMG_ERSytoBleed', 'IMG_Hoechst', 'IMG_Mito', 'IMG_Ph_golgi',
]
final_df = df.loc[:, column_order]

In [12]:
# Keep only the first row for each SAMPLE_KEY, then save and show the result.
is_repeated_key = final_df.duplicated(subset='SAMPLE_KEY', keep='first')
final_df_smiles_unique = final_df[~is_repeated_key]
final_df_smiles_unique.to_csv(outfile, index=False)
print(final_df_smiles_unique)

        SAMPLE_KEY PLATE_ID WELL_POSITION  SITE CPD_NAME       CPD_NAME_TYPE  \
0    P102785-A02-1  P102785           A02     1   [dmso]  Dimethyl Sulfoxide   
1    P102785-A02-2  P102785           A02     2   [dmso]  Dimethyl Sulfoxide   
2    P102785-A02-3  P102785           A02     3   [dmso]  Dimethyl Sulfoxide   
3    P102785-A02-4  P102785           A02     4   [dmso]  Dimethyl Sulfoxide   
4    P102785-A02-5  P102785           A02     5   [dmso]  Dimethyl Sulfoxide   
..             ...      ...           ...   ...      ...                 ...   
211  P102785-P04-5  P102785           P04     5   [dmso]  Dimethyl Sulfoxide   
212  P102785-P04-6  P102785           P04     6   [dmso]  Dimethyl Sulfoxide   
213  P102785-P04-7  P102785           P04     7   [dmso]  Dimethyl Sulfoxide   
214  P102785-P04-8  P102785           P04     8   [dmso]  Dimethyl Sulfoxide   
215  P102785-P04-9  P102785           P04     9   [dmso]  Dimethyl Sulfoxide   

      SMILES                     INCHIK

In [None]:
# Moving images for which MoA is unclassified to another directory.
# Every file named in one of the image columns of the metadata CSV is moved
# from img_dir to img_dir_noMoA.

img_dir = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_bigger/channels_tiff'
img_dir_noMoA = '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_bigger/channels_tiff_noMoA'

metadata_to_move = '/share/data/analyses/silvija/RT/data_cloome/our_images/metadata_P102785_images-MoA_unclassified.csv'

df = pd.read_csv(metadata_to_move)

# exist_ok=True replaces the separate existence check and does nothing if
# the directory is already there.
os.makedirs(img_dir_noMoA, exist_ok=True)

image_columns = ['IMG_ERSyto', 'IMG_ERSytoBleed', 'IMG_Hoechst', 'IMG_Mito', 'IMG_Ph_golgi']

for index, row in df.iterrows():
    for col in image_columns:
        image_filename = row[col]
        # Skip cells without a file name. The image metadata built in this
        # notebook uses 0 as the placeholder for a channel without an image;
        # after a CSV round trip that can be the number 0 or the string '0',
        # and a numeric value would make os.path.join raise a TypeError.
        if pd.isna(image_filename) or image_filename in (0, '0'):
            continue

        source_path = os.path.join(img_dir, image_filename)
        destination_path = os.path.join(img_dir_noMoA, image_filename)
        if os.path.exists(source_path):
            shutil.move(source_path, destination_path)
            print(f"Moved: {image_filename}")
        else:
            print(f"File not found: {image_filename}")

In [None]:
# Zipping images based on channels: one archive per channel, written to
# zip_dir with the external `zip` command.

img_dir = "/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff"
zip_dir = "/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/channels_tiff_zip"

# All five wavelengths in the order used throughout this notebook. The
# archive number of a channel is its 1-based position in this list, so the
# numbering stays correct whichever subset is selected in `channels`.
all_channels = ['405', '488', '561', '638', '730']
channels = ['638', '730'] #'405', '488', '561',
plate_id = 'P102785'

# zip cannot create the archive if its target directory is missing.
os.makedirs(zip_dir, exist_ok=True)

for channel in channels:
    channel_number = all_channels.index(channel) + 1
    zip_name = f"{plate_id}_channel_{channel_number}.zip"

    # NOTE(review): the glob matches the channel digits anywhere in the file
    # name, not only in the wavelength field - confirm that no other part of
    # the names (e.g. coordinates) can contain them.
    command = f"zip {zip_dir}/{zip_name} {img_dir}/*{channel}*.tiff  -r"
    result = os.system(command)

    # Report the archive that was actually written, and do not claim success
    # when the command returned a non-zero status.
    if result == 0:
        print(f"Zipped channel {channel} into {zip_name}")
    else:
        print(f"zip failed for channel {channel} (exit status {result})")

In [None]:
# 13.09 MERGING metadata_P102785 files
import pandas as pd

# Load the DMSO and the non-DMSO metadata tables.
DMSO_df, noDMSO_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_DMSO.csv',
        '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785.csv',
    )
)

# Stack both tables (DMSO rows first) and order the rows by SAMPLE_KEY.
concat_df = pd.concat((DMSO_df, noDMSO_df))
sorted_df = concat_df.sort_values('SAMPLE_KEY')

sorted_df.to_csv(
    '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all/metadata_P102785_ALL.csv',
    index=False,
)

In [6]:
# 25.09 Checking if it will work only with one crop image

import pandas as pd

top10 = pd.read_csv('/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all_singleCrop/metadata_P102785_top10.csv')

# Keep only the rows whose CROP_POS is "2-1".
is_selected_crop = top10["CROP_POS"].eq("2-1")
top10_single = top10.loc[is_selected_crop]

top10_single.to_csv(
    '/share/data/analyses/silvija/RT/data_cloome/our_images/preprocessing_all_singleCrop/metadata_P102785_top10_single.csv',
    index=False,
)
