# In [4]:
import pandas as pd
import json
import os

# Load the image metadata and keep only rows whose DAPI path name contains
# the literal text "Week1_".
#   * regex=False: the pattern is plain text, not a regular expression.
#   * na=False: a missing path counts as "no match" instead of putting NaN
#     into the boolean mask, which would make the row selection raise.
#   * .copy(): the filtered frame becomes independent, so adding a column
#     below does not write into a slice (SettingWithCopyWarning).
img_meta = pd.read_csv("BBBC021_v1_image.csv")
img_meta = img_meta.loc[
    img_meta["Image_PathName_DAPI"].str.contains("Week1_", regex=False, na=False)
].copy()

# Path of each DAPI image: "<Image_PathName_DAPI>/<Image_FileName_DAPI>".
img_meta["full_image_path"] = (
    img_meta["Image_PathName_DAPI"] + "/" + img_meta["Image_FileName_DAPI"]
)

# Compound and mechanism-of-action tables, reduced to the columns used below.
comp = pd.read_csv("BBBC021_v1_compound.csv")
moa = pd.read_csv("BBBC021_v1_moa.csv")
comp = comp[["compound", "smiles"]]
moa = moa[["compound", "concentration", "moa"]]

# Attach SMILES by compound name, then MoA by (compound, concentration).
# Both are left joins, so every image row survives with NaN where no match
# exists.
# NOTE(review): the concentration key is matched by exact equality; this
# presumably works because both columns are parsed from CSV the same way --
# confirm if concentrations are ever rounded differently between files.
# NOTE(review): both lookup tables carry a "compound" column, so after the
# second merge pandas is expected to expose them with its default _x/_y
# suffixes; nothing below reads those columns.
df = img_meta.merge(
    comp,
    left_on="Image_Metadata_Compound",
    right_on="compound",
    how="left",
).merge(
    moa,
    left_on=["Image_Metadata_Compound", "Image_Metadata_Concentration"],
    right_on=["compound", "concentration"],
    how="left",
)

# Keep only rows that have both a MoA label and a SMILES string.
df = df.dropna(subset=["moa", "smiles"])

# Root folder where the Week1 images are stored.
# NOTE(review): nothing in this cell reads root_folder; it is kept because
# later cells may rely on it -- confirm before removing.
root_folder = 'Week1_22123'

# Write the training set as JSON Lines: one object per row of df, with
#   "image": the value of df["full_image_path"]
#   "text":  a caption built from compound, SMILES, concentration and MoA.
# The encoding is stated explicitly so the file does not depend on the
# platform default. json.dumps escapes non-ASCII characters by default
# (the "µ" in the caption becomes \u00b5), so the bytes written are ASCII.
with open("bbbc021_week1_training.jsonl", "w", encoding="utf-8") as fout:
    # Iterate only the columns that are needed, in lockstep, instead of
    # building a full Series per row with iterrows().
    for image_path, compound, smiles, conc, moa_name in zip(
        df["full_image_path"],
        df["Image_Metadata_Compound"],
        df["smiles"],
        df["Image_Metadata_Concentration"],
        df["moa"],
    ):
        text = (f"Compound: {compound}, Smiles: {smiles}, "
                f"Concentration: {conc} µM, Mechanism of Action: {moa_name}")

        fout.write(json.dumps({"image": image_path, "text": text}) + "\n")
