In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Load the datasets
# This table supplies the INCHIKEY merge key, the assay label columns and the
# cell-count feature columns used by the rest of the notebook. The path is
# relative to the notebook's working directory.
cp_count_sanchez = pd.read_csv('CP_count_Sanchez.csv')

# Join a data-split table with the CP count table on the compound key
def merge_data(data, cp_data):
    """Inner-join ``data`` with ``cp_data`` on their shared ``INCHIKEY`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Left table; must contain an ``INCHIKEY`` column.
    cp_data : pandas.DataFrame
        Right table; must contain an ``INCHIKEY`` column.

    Returns
    -------
    pandas.DataFrame
        Only the rows whose ``INCHIKEY`` occurs in both tables, carrying the
        columns of both inputs.
    """
    joined = pd.merge(data, cp_data, how='inner', on='INCHIKEY')
    return joined


# Columns of the CP count table that are NOT assay labels: the identifier /
# row-id columns and the Cells_/Cytoplasm_/Nuclei_ object-number columns.
_non_assay_columns = frozenset({
    'INCHIKEY',
    'Unnamed: 0',
    'Cells_Number_Object_Number',
    'Cells_Neighbors_FirstClosestObjectNumber_5',
    'Cells_Neighbors_FirstClosestObjectNumber_Adjacent',
    'Cells_Neighbors_SecondClosestObjectNumber_5',
    'Cells_Neighbors_SecondClosestObjectNumber_Adjacent',
    'Cells_Parent_Nuclei',
    'Cytoplasm_Number_Object_Number',
    'Cytoplasm_Parent_Cells',
    'Cytoplasm_Parent_Nuclei',
    'Nuclei_Neighbors_FirstClosestObjectNumber_1',
    'Nuclei_Neighbors_SecondClosestObjectNumber_1',
    'Nuclei_Number_Object_Number',
    'InChIKey',
})

# Extract assays (columns) to be used for logistic regression: every
# remaining column, kept in the table's original column order.
assay_columns = [
    column_name
    for column_name in cp_count_sanchez.columns
    if column_name not in _non_assay_columns
]

# Replace -1 with NaN only in assay columns, so that a later
# `dropna(subset=[assay])` discards those rows for that assay.
# NOTE(review): presumably -1 encodes "no label" in the source file — confirm.
cp_count_sanchez[assay_columns] = cp_count_sanchez[assay_columns].replace(-1, np.nan)



In [4]:
# Show the loaded table (pandas elides the middle rows and columns)
cp_count_sanchez

Unnamed: 0.1,Unnamed: 0,INCHIKEY,1,2,3,4,5,6,7,8,...,Cells_Neighbors_SecondClosestObjectNumber_5,Cells_Neighbors_SecondClosestObjectNumber_Adjacent,Cells_Parent_Nuclei,Cytoplasm_Number_Object_Number,Cytoplasm_Parent_Cells,Cytoplasm_Parent_Nuclei,Nuclei_Neighbors_FirstClosestObjectNumber_1,Nuclei_Neighbors_SecondClosestObjectNumber_1,Nuclei_Number_Object_Number,InChIKey
0,0,AACRWZVDRSTLKY-UHFFFAOYSA-N,,,,,,,,,...,-0.054688,-0.054688,0.859375,0.859375,0.859375,0.859375,1.445312,1.007812,0.859375,AACRWZVDRSTLKY-UHFFFAOYSA-N
1,1,AACUKVXTFOXDGE-UHFFFAOYSA-N,,,,,,,,,...,-1.171875,-1.171875,-1.148438,-1.148438,-1.148438,-1.148438,-1.101562,-1.453125,-1.148438,AACUKVXTFOXDGE-UHFFFAOYSA-N
2,2,AADCDMQTJNYOSS-LBPRGKRZSA-N,,,,,,,,,...,-0.425781,-0.425781,0.453125,0.453125,0.453125,0.453125,-0.117188,-0.531250,0.453125,AADCDMQTJNYOSS-LBPRGKRZSA-N
3,3,AADORYZVGJDNSZ-UHFFFAOYSA-N,,,,,,,,,...,1.123188,1.123188,0.521739,0.521739,0.521739,0.521739,1.536232,0.601449,0.521739,AADORYZVGJDNSZ-UHFFFAOYSA-N
4,4,AAEVYOVXGOFMJO-UHFFFAOYSA-N,,,,,,,,,...,-6.882812,-6.882812,-7.218750,-7.218750,-7.218750,-7.218750,-7.679688,-7.773438,-7.218750,AAEVYOVXGOFMJO-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10568,10568,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N,,,,,,,,,...,0.609375,0.609375,0.679688,0.679688,0.679688,0.679688,0.757812,0.757812,0.679688,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N
10569,10569,ZZUFCTLCJUWOSV-UHFFFAOYSA-N,,,,,,,,,...,-10.242188,-10.242188,-10.746094,-10.746094,-10.746094,-10.746094,-11.167969,-10.281250,-10.746094,ZZUFCTLCJUWOSV-UHFFFAOYSA-N
10570,10570,ZZUZYEMRHCMVTB-UHFFFAOYSA-N,,,,,,,,,...,-16.171875,-16.171875,-16.304688,-16.304688,-16.304688,-16.304688,-16.335938,-15.152344,-16.304688,ZZUZYEMRHCMVTB-UHFFFAOYSA-N
10571,10571,ZZVUWRFHKOJYTH-UHFFFAOYSA-N,,,,,,,,,...,-0.640625,-0.640625,-1.664062,-1.664062,-1.664062,-1.664062,-1.257812,-0.289062,-1.664062,ZZVUWRFHKOJYTH-UHFFFAOYSA-N


In [5]:
import pandas as pd
from chembl_structure_pipeline import standardizer
from chembl_structure_pipeline import checker
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import MolStandardize

def smiles_to_inchikey(smiles):
    """Convert a SMILES string to its InChIKey.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    str
        The InChIKey computed by RDKit, or the sentinel string
        ``"Cannot_do"`` when the input cannot be parsed or converted
        (invalid SMILES, non-string input such as NaN, RDKit failure).
    """
    try:
        # Read SMILES and convert it to RDKit mol object; RDKit hands back
        # None (instead of raising) for SMILES it cannot parse.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Cannot_do"

        return Chem.MolToInchiKey(mol)

    except Exception:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate and a long
        # conversion run can be interrupted.
        return "Cannot_do"

[20:22:27] Initializing Normalizer


In [6]:
# Compound annotation table; its CPD_SMILES column is what later cells
# convert to InChIKeys and join onto the CP count table.
identifier = pd.read_csv("data/chemical_annotations.csv")
identifier

Unnamed: 0,BROAD_ID,CPD_NAME,CPD_NAME_TYPE,CPD_SAMPLE_ID,DOS_LIBRARY,SOURCE_NAME,CHEMIST_NAME,VENDOR_CATALOG_ID,CPD_SMILES,USERCOMMENT
0,BRD-A56675431-001-04-0,altizide,INN,SA82748,,Prestwick Chemical Inc.,,Prestw-721,NS(=O)(=O)c1cc2c(NC(CSCC=C)NS2(=O)=O)cc1Cl,
1,BRD-A51829654-001-01-4,"BRL-15,572",common,SA82481,,Biomol International Inc.,,AC-536,OC(CN1CCN(CC1)c1cccc(Cl)c1)C(c1ccccc1)c1ccccc1,
2,BRD-K04046242-001-03-6,equilin,primary-common,SA82922,,Prestwick Chemical Inc.,,Prestw-850,C[C@]12CC[C@H]3C(=CCc4cc(O)ccc34)[C@@H]1CCC2=O,
3,BRD-K16508793-001-01-8,diazepam,INN,SA59660,,MicroSource Discovery Systems Inc.,,1900003,CN1c2ccc(Cl)cc2C(=NCC1=O)c1ccccc1,
4,BRD-K09397065-001-01-6,SR 57227A,to-be-curated,SA82504,,Biomol International Inc.,,AC-561,NC1CCN(CC1)c1cccc(Cl)n1,
...,...,...,...,...,...,...,...,...,...,...
30611,BRD-K47092271-001-01-7,BRD-K47092271,BROAD_CPD_ID,SA799574,Benzofuran Library,Broad Institute of MIT and Harvard,,,COCC(=O)Nc1ccc2O[C@@H]3[C@@H](C[C@H](CC(=O)NCC...,
30612,BRD-K30358777-001-01-0,BRD-K30358777,BROAD_CPD_ID,SA800605,Benzofuran Library,Broad Institute of MIT and Harvard,,,COc1ccc(CNC(=O)C[C@@H]2C[C@H]3[C@H](Oc4ccc(NC(...,
30613,BRD-K32423836-001-01-9,BRD-K32423836,BROAD_CPD_ID,SA799806,Benzofuran Library,Broad Institute of MIT and Harvard,,,COCCNC(=O)C[C@H]1C[C@@H]2[C@@H](Oc3ccc(NC(=O)C...,
30614,BRD-K28250273-001-01-2,BRD-K28250273,BROAD_CPD_ID,SA1482018,Azetidine Nitrile Full Library,Broad Institute of MIT and Harvard,,,OC[C@H]1[C@H]([C@H](C#N)N1C(=O)Nc1cccc(F)c1)c1...,


In [7]:
from pandarallel import pandarallel
# Start the pandarallel worker pool that backs the `.parallel_apply` calls.
# NOTE(review): nb_workers=80 is hard-coded — confirm it suits the machine this runs on.
pandarallel.initialize(progress_bar=True, nb_workers=80)

# Turn off RDKit's 'rdApp.*' log channels for the bulk conversion below
RDLogger.DisableLog('rdApp.*')  
# One InChIKey per annotation row; rows that fail conversion receive the
# "Cannot_do" sentinel returned by smiles_to_inchikey.
identifier["INCHIKEY"] = identifier["CPD_SMILES"].parallel_apply(smiles_to_inchikey)
identifier

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=383), Label(value='0 / 383'))), HB…

Unnamed: 0,BROAD_ID,CPD_NAME,CPD_NAME_TYPE,CPD_SAMPLE_ID,DOS_LIBRARY,SOURCE_NAME,CHEMIST_NAME,VENDOR_CATALOG_ID,CPD_SMILES,USERCOMMENT,INCHIKEY
0,BRD-A56675431-001-04-0,altizide,INN,SA82748,,Prestwick Chemical Inc.,,Prestw-721,NS(=O)(=O)c1cc2c(NC(CSCC=C)NS2(=O)=O)cc1Cl,,VGLGVJVUHYTIIU-UHFFFAOYSA-N
1,BRD-A51829654-001-01-4,"BRL-15,572",common,SA82481,,Biomol International Inc.,,AC-536,OC(CN1CCN(CC1)c1cccc(Cl)c1)C(c1ccccc1)c1ccccc1,,QJHCTHPYUOXOGM-UHFFFAOYSA-N
2,BRD-K04046242-001-03-6,equilin,primary-common,SA82922,,Prestwick Chemical Inc.,,Prestw-850,C[C@]12CC[C@H]3C(=CCc4cc(O)ccc34)[C@@H]1CCC2=O,,WKRLQDKEXYKHJB-HFTRVMKXSA-N
3,BRD-K16508793-001-01-8,diazepam,INN,SA59660,,MicroSource Discovery Systems Inc.,,1900003,CN1c2ccc(Cl)cc2C(=NCC1=O)c1ccccc1,,AAOVKJBEBIDNHE-UHFFFAOYSA-N
4,BRD-K09397065-001-01-6,SR 57227A,to-be-curated,SA82504,,Biomol International Inc.,,AC-561,NC1CCN(CC1)c1cccc(Cl)n1,,WPVVMKYQOMJPIN-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...
30611,BRD-K47092271-001-01-7,BRD-K47092271,BROAD_CPD_ID,SA799574,Benzofuran Library,Broad Institute of MIT and Harvard,,,COCC(=O)Nc1ccc2O[C@@H]3[C@@H](C[C@H](CC(=O)NCC...,,NDUFHUOGHUIKBH-OFYLTFBOSA-N
30612,BRD-K30358777-001-01-0,BRD-K30358777,BROAD_CPD_ID,SA800605,Benzofuran Library,Broad Institute of MIT and Harvard,,,COc1ccc(CNC(=O)C[C@@H]2C[C@H]3[C@H](Oc4ccc(NC(...,,UPMRBEOCYVNRMT-OXPRRSNCSA-N
30613,BRD-K32423836-001-01-9,BRD-K32423836,BROAD_CPD_ID,SA799806,Benzofuran Library,Broad Institute of MIT and Harvard,,,COCCNC(=O)C[C@H]1C[C@@H]2[C@@H](Oc3ccc(NC(=O)C...,,HRBYSLULMJVYNQ-TXDFXGTJSA-N
30614,BRD-K28250273-001-01-2,BRD-K28250273,BROAD_CPD_ID,SA1482018,Azetidine Nitrile Full Library,Broad Institute of MIT and Harvard,,,OC[C@H]1[C@H]([C@H](C#N)N1C(=O)Nc1cccc(F)c1)c1...,,NDKYDEPQZPJKDZ-ULQDDVLXSA-N


In [8]:
# Attach each compound's SMILES to the CP count table through the shared
# INCHIKEY column. This is an inner join, so a CP row is repeated once per
# annotation row carrying the same INCHIKEY.
Merged_df = cp_count_sanchez.merge(
    identifier[["INCHIKEY", "CPD_SMILES"]],
    how="inner",
    on="INCHIKEY",
)
Merged_df

Unnamed: 0.1,Unnamed: 0,INCHIKEY,1,2,3,4,5,6,7,8,...,Cells_Neighbors_SecondClosestObjectNumber_Adjacent,Cells_Parent_Nuclei,Cytoplasm_Number_Object_Number,Cytoplasm_Parent_Cells,Cytoplasm_Parent_Nuclei,Nuclei_Neighbors_FirstClosestObjectNumber_1,Nuclei_Neighbors_SecondClosestObjectNumber_1,Nuclei_Number_Object_Number,InChIKey,CPD_SMILES
0,0,AACRWZVDRSTLKY-UHFFFAOYSA-N,,,,,,,,,...,-0.054688,0.859375,0.859375,0.859375,0.859375,1.445312,1.007812,0.859375,AACRWZVDRSTLKY-UHFFFAOYSA-N,[O-][N+](=O)c1ccccc1S(=O)(=O)N1CCCCC1
1,1,AACUKVXTFOXDGE-UHFFFAOYSA-N,,,,,,,,,...,-1.171875,-1.148438,-1.148438,-1.148438,-1.148438,-1.101562,-1.453125,-1.148438,AACUKVXTFOXDGE-UHFFFAOYSA-N,CCOC(=O)C(CC)Sc1ncnc2n(nnc12)-c1ccc(F)cc1
2,2,AADCDMQTJNYOSS-LBPRGKRZSA-N,,,,,,,,,...,-0.425781,0.453125,0.453125,0.453125,0.453125,-0.117188,-0.531250,0.453125,AADCDMQTJNYOSS-LBPRGKRZSA-N,CCN1CCC[C@H]1CNC(=O)c1c(O)c(CC)cc(Cl)c1OC
3,3,AADORYZVGJDNSZ-UHFFFAOYSA-N,,,,,,,,,...,1.123188,0.521739,0.521739,0.521739,0.521739,1.536232,0.601449,0.521739,AADORYZVGJDNSZ-UHFFFAOYSA-N,COc1ccc(C2=CC(c3ccc(OC)c(OC)c3)n3nnnc3N2)c(OC)c1
4,4,AAEVYOVXGOFMJO-UHFFFAOYSA-N,,,,,,,,,...,-6.882812,-7.218750,-7.218750,-7.218750,-7.218750,-7.679688,-7.773438,-7.218750,AAEVYOVXGOFMJO-UHFFFAOYSA-N,CSc1nc(NC(C)C)nc(NC(C)C)n1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10677,10568,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N,,,,,,,,,...,0.609375,0.679688,0.679688,0.679688,0.679688,0.757812,0.757812,0.679688,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N,Oc1c(CN2CCCCC2)c(=O)oc2ccccc12
10678,10569,ZZUFCTLCJUWOSV-UHFFFAOYSA-N,,,,,,,,,...,-10.242188,-10.746094,-10.746094,-10.746094,-10.746094,-11.167969,-10.281250,-10.746094,ZZUFCTLCJUWOSV-UHFFFAOYSA-N,NS(=O)(=O)c1cc(C(O)=O)c(NCc2ccco2)cc1Cl
10679,10570,ZZUZYEMRHCMVTB-UHFFFAOYSA-N,,,,,,,,,...,-16.171875,-16.304688,-16.304688,-16.304688,-16.304688,-16.335938,-15.152344,-16.304688,ZZUZYEMRHCMVTB-UHFFFAOYSA-N,NS(=O)(=O)C#Cc1ccccc1
10680,10571,ZZVUWRFHKOJYTH-UHFFFAOYSA-N,,,,,,,,,...,-0.640625,-1.664062,-1.664062,-1.664062,-1.664062,-1.257812,-0.289062,-1.664062,ZZVUWRFHKOJYTH-UHFFFAOYSA-N,CN(C)CCOC(c1ccccc1)c1ccccc1


In [9]:
# Count how many merged rows share each "Unnamed: 0" value; a count above 1
# means the merge produced more than one row for that value.
# NOTE(review): "Unnamed: 0" looks like the pre-merge row id — confirm.
Merged_df["Unnamed: 0"].value_counts()

1184     2
8193     2
4853     2
8964     2
8965     2
        ..
3539     1
3541     1
3542     1
3543     1
10572    1
Name: Unnamed: 0, Length: 10573, dtype: int64

In [10]:
# Keep only the first merged row for each "Unnamed: 0" value and renumber the index.
# NOTE(review): this rebinds `cp_count_sanchez` to the merged table, so re-running
# the merge cell afterwards would start from an already-merged frame.
cp_count_sanchez = Merged_df.drop_duplicates(subset=["Unnamed: 0"], keep="first").reset_index(drop=True)
cp_count_sanchez

Unnamed: 0.1,Unnamed: 0,INCHIKEY,1,2,3,4,5,6,7,8,...,Cells_Neighbors_SecondClosestObjectNumber_Adjacent,Cells_Parent_Nuclei,Cytoplasm_Number_Object_Number,Cytoplasm_Parent_Cells,Cytoplasm_Parent_Nuclei,Nuclei_Neighbors_FirstClosestObjectNumber_1,Nuclei_Neighbors_SecondClosestObjectNumber_1,Nuclei_Number_Object_Number,InChIKey,CPD_SMILES
0,0,AACRWZVDRSTLKY-UHFFFAOYSA-N,,,,,,,,,...,-0.054688,0.859375,0.859375,0.859375,0.859375,1.445312,1.007812,0.859375,AACRWZVDRSTLKY-UHFFFAOYSA-N,[O-][N+](=O)c1ccccc1S(=O)(=O)N1CCCCC1
1,1,AACUKVXTFOXDGE-UHFFFAOYSA-N,,,,,,,,,...,-1.171875,-1.148438,-1.148438,-1.148438,-1.148438,-1.101562,-1.453125,-1.148438,AACUKVXTFOXDGE-UHFFFAOYSA-N,CCOC(=O)C(CC)Sc1ncnc2n(nnc12)-c1ccc(F)cc1
2,2,AADCDMQTJNYOSS-LBPRGKRZSA-N,,,,,,,,,...,-0.425781,0.453125,0.453125,0.453125,0.453125,-0.117188,-0.531250,0.453125,AADCDMQTJNYOSS-LBPRGKRZSA-N,CCN1CCC[C@H]1CNC(=O)c1c(O)c(CC)cc(Cl)c1OC
3,3,AADORYZVGJDNSZ-UHFFFAOYSA-N,,,,,,,,,...,1.123188,0.521739,0.521739,0.521739,0.521739,1.536232,0.601449,0.521739,AADORYZVGJDNSZ-UHFFFAOYSA-N,COc1ccc(C2=CC(c3ccc(OC)c(OC)c3)n3nnnc3N2)c(OC)c1
4,4,AAEVYOVXGOFMJO-UHFFFAOYSA-N,,,,,,,,,...,-6.882812,-7.218750,-7.218750,-7.218750,-7.218750,-7.679688,-7.773438,-7.218750,AAEVYOVXGOFMJO-UHFFFAOYSA-N,CSc1nc(NC(C)C)nc(NC(C)C)n1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10568,10568,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N,,,,,,,,,...,0.609375,0.679688,0.679688,0.679688,0.679688,0.757812,0.757812,0.679688,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N,Oc1c(CN2CCCCC2)c(=O)oc2ccccc12
10569,10569,ZZUFCTLCJUWOSV-UHFFFAOYSA-N,,,,,,,,,...,-10.242188,-10.746094,-10.746094,-10.746094,-10.746094,-11.167969,-10.281250,-10.746094,ZZUFCTLCJUWOSV-UHFFFAOYSA-N,NS(=O)(=O)c1cc(C(O)=O)c(NCc2ccco2)cc1Cl
10570,10570,ZZUZYEMRHCMVTB-UHFFFAOYSA-N,,,,,,,,,...,-16.171875,-16.304688,-16.304688,-16.304688,-16.304688,-16.335938,-15.152344,-16.304688,ZZUZYEMRHCMVTB-UHFFFAOYSA-N,NS(=O)(=O)C#Cc1ccccc1
10571,10571,ZZVUWRFHKOJYTH-UHFFFAOYSA-N,,,,,,,,,...,-0.640625,-1.664062,-1.664062,-1.664062,-1.664062,-1.257812,-0.289062,-1.664062,ZZVUWRFHKOJYTH-UHFFFAOYSA-N,CN(C)CCOC(c1ccccc1)c1ccccc1


In [11]:
from rdkit import Chem
from rdkit.Chem import Descriptors

# Compute molecular weight and logP for a single SMILES string
def calculate_mw_logp(smiles):
    """Return ``(MW, logP)`` for the molecule encoded by ``smiles``.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    tuple
        ``(Descriptors.MolWt(mol), Descriptors.MolLogP(mol))`` for a parsed
        molecule, or ``(None, None)`` when RDKit yields no molecule for the
        SMILES.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        # Invalid SMILES: nothing to compute
        return None, None

    molecular_weight = Descriptors.MolWt(molecule)
    log_p = Descriptors.MolLogP(molecule)
    return molecular_weight, log_p

# Apply the function to each SMILES in the column and store results.
# Each row yields one (MW, logP) tuple; zip(*...) regroups those tuples into
# one sequence of MW values and one of logP values, assigned as new columns.
cp_count_sanchez["MW"], cp_count_sanchez["logP"] = zip(*cp_count_sanchez["CPD_SMILES"].parallel_apply(calculate_mw_logp))
cp_count_sanchez

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=133), Label(value='0 / 133'))), HB…

Unnamed: 0.1,Unnamed: 0,INCHIKEY,1,2,3,4,5,6,7,8,...,Cytoplasm_Number_Object_Number,Cytoplasm_Parent_Cells,Cytoplasm_Parent_Nuclei,Nuclei_Neighbors_FirstClosestObjectNumber_1,Nuclei_Neighbors_SecondClosestObjectNumber_1,Nuclei_Number_Object_Number,InChIKey,CPD_SMILES,MW,logP
0,0,AACRWZVDRSTLKY-UHFFFAOYSA-N,,,,,,,,,...,0.859375,0.859375,0.859375,1.445312,1.007812,0.859375,AACRWZVDRSTLKY-UHFFFAOYSA-N,[O-][N+](=O)c1ccccc1S(=O)(=O)N1CCCCC1,270.310,1.7694
1,1,AACUKVXTFOXDGE-UHFFFAOYSA-N,,,,,,,,,...,-1.148438,-1.148438,-1.148438,-1.101562,-1.453125,-1.148438,AACUKVXTFOXDGE-UHFFFAOYSA-N,CCOC(=O)C(CC)Sc1ncnc2n(nnc12)-c1ccc(F)cc1,361.402,2.7834
2,2,AADCDMQTJNYOSS-LBPRGKRZSA-N,,,,,,,,,...,0.453125,0.453125,0.453125,-0.117188,-0.531250,0.453125,AADCDMQTJNYOSS-LBPRGKRZSA-N,CCN1CCC[C@H]1CNC(=O)c1c(O)c(CC)cc(Cl)c1OC,340.851,2.8307
3,3,AADORYZVGJDNSZ-UHFFFAOYSA-N,,,,,,,,,...,0.521739,0.521739,0.521739,1.536232,0.601449,0.521739,AADORYZVGJDNSZ-UHFFFAOYSA-N,COc1ccc(C2=CC(c3ccc(OC)c(OC)c3)n3nnnc3N2)c(OC)c1,395.419,2.7635
4,4,AAEVYOVXGOFMJO-UHFFFAOYSA-N,,,,,,,,,...,-7.218750,-7.218750,-7.218750,-7.679688,-7.773438,-7.218750,AAEVYOVXGOFMJO-UHFFFAOYSA-N,CSc1nc(NC(C)C)nc(NC(C)C)n1,241.364,2.2341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10568,10568,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N,,,,,,,,,...,0.679688,0.679688,0.679688,0.757812,0.757812,0.679688,ZZUCJGSOKDNIEZ-UHFFFAOYSA-N,Oc1c(CN2CCCCC2)c(=O)oc2ccccc12,259.305,2.4845
10569,10569,ZZUFCTLCJUWOSV-UHFFFAOYSA-N,,,,,,,,,...,-10.746094,-10.746094,-10.746094,-11.167969,-10.281250,-10.746094,ZZUFCTLCJUWOSV-UHFFFAOYSA-N,NS(=O)(=O)c1cc(C(O)=O)c(NCc2ccco2)cc1Cl,330.749,1.8907
10570,10570,ZZUZYEMRHCMVTB-UHFFFAOYSA-N,,,,,,,,,...,-16.304688,-16.304688,-16.304688,-16.335938,-15.152344,-16.304688,ZZUZYEMRHCMVTB-UHFFFAOYSA-N,NS(=O)(=O)C#Cc1ccccc1,181.216,0.2841
10571,10571,ZZVUWRFHKOJYTH-UHFFFAOYSA-N,,,,,,,,,...,-1.664062,-1.664062,-1.664062,-1.257812,-0.289062,-1.664062,ZZVUWRFHKOJYTH-UHFFFAOYSA-N,CN(C)CCOC(c1ccccc1)c1ccccc1,255.361,3.3542


In [12]:
# List the last 16 column names to check the newly added columns
cp_count_sanchez.columns[-16:]

Index(['Cells_Number_Object_Number',
       'Cells_Neighbors_FirstClosestObjectNumber_5',
       'Cells_Neighbors_FirstClosestObjectNumber_Adjacent',
       'Cells_Neighbors_SecondClosestObjectNumber_5',
       'Cells_Neighbors_SecondClosestObjectNumber_Adjacent',
       'Cells_Parent_Nuclei', 'Cytoplasm_Number_Object_Number',
       'Cytoplasm_Parent_Cells', 'Cytoplasm_Parent_Nuclei',
       'Nuclei_Neighbors_FirstClosestObjectNumber_1',
       'Nuclei_Neighbors_SecondClosestObjectNumber_1',
       'Nuclei_Number_Object_Number', 'InChIKey', 'CPD_SMILES', 'MW', 'logP'],
      dtype='object')

In [20]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler


# Feature columns used as model inputs by perform_logistic_regression and
# selected per assay in process_split.
train_cols = ['Cells_Number_Object_Number', 'MW', 'logP']


# Function to perform logistic regression and get the best AUC-ROC score
def perform_logistic_regression(train, val, test, assay):
    """Tune and evaluate a logistic-regression model for one assay.

    The regularisation strength ``C`` is chosen from ``10**-6 .. 10**6`` by
    validation ROC-AUC. The selected model is then refit on train +
    validation and scored on the test set.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames holding the feature columns named in the module-level
        ``train_cols`` and the label column ``assay``.
    assay : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(best_C, val_auc, test_auc, N_labels_test)``. The first three are
        ``None`` when any split is empty or holds a single class, in which
        case no model is fitted. ``N_labels_test`` is always the number of
        test rows.
    """
    X_train = train[train_cols]
    y_train = train[assay]
    X_val = val[train_cols]
    y_val = val[assay]
    X_test = test[train_cols]
    y_test = test[assay]

    N_labels_test = len(y_test)

    # Check if any of the datasets are empty or only one class is present
    for labels in (y_train, y_val, y_test):
        if len(labels) == 0 or len(labels.unique()) == 1:
            return None, None, None, N_labels_test

    # Normalize features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    C_param_range = [10**i for i in range(-6, 7)]

    # Start below any attainable AUC so the first candidate is always taken.
    # A 0 start with a strict ">" left best_C as None when every validation
    # AUC was exactly 0.0, and callers read None as "assay skipped".
    best_auc = float("-inf")
    best_C = None
    best_model = None

    for C in C_param_range:
        model = LogisticRegression(C=C, max_iter=3000, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate on validation set
        y_prob = model.predict_proba(X_val_scaled)[:, 1]
        val_auc = roc_auc_score(y_val, y_prob)

        # Strict ">" keeps the smallest C among ties
        if val_auc > best_auc:
            best_auc = val_auc
            best_C = C
            best_model = model

    # Retrain the best model on the combined training and validation set,
    # scaled with the same train-fitted scaler
    X_train_val_scaled = scaler.transform(pd.concat([X_train, X_val]))
    y_train_val = pd.concat([y_train, y_val])
    best_model.fit(X_train_val_scaled, y_train_val)

    # Evaluate the best model on the test set
    y_prob = best_model.predict_proba(X_test_scaled)[:, 1]
    test_auc = roc_auc_score(y_test, y_prob)

    return best_C, best_auc, test_auc, N_labels_test


def process_split(train_path, val_path, test_path, split_label):
    """Run the per-assay logistic-regression baseline on one data split.

    Parameters
    ----------
    train_path, val_path, test_path : str
        CSV files of the train / validation / test partitions of the split.
    split_label : str
        Label stored in the ``split`` column of the output and shown in the
        progress bar.

    Returns
    -------
    pandas.DataFrame
        One row per assay for which a model was fitted, with columns
        ``assay``, ``best_C``, ``val_auc``, ``test_auc``, ``split`` and
        ``N_labels_test``. Assays for which ``perform_logistic_regression``
        returns ``best_C`` of ``None`` are omitted.
    """
    # Read the three partitions, then join each with the feature table
    raw_frames = [pd.read_csv(path) for path in (train_path, val_path, test_path)]
    train_merged, val_merged, test_merged = (
        merge_data(frame, cp_count_sanchez) for frame in raw_frames
    )

    records = []
    for assay in tqdm(assay_columns, desc=f"Processing {split_label}"):
        # Restrict each partition to rows labelled for this assay and to
        # the label + feature columns
        needed = [assay] + train_cols
        train_assay = train_merged.dropna(subset=[assay])[needed]
        val_assay = val_merged.dropna(subset=[assay])[needed]
        test_assay = test_merged.dropna(subset=[assay])[needed]

        best_C, val_auc, test_auc, N_labels_test = perform_logistic_regression(
            train_assay, val_assay, test_assay, assay
        )

        # None signals that no model was fitted for this assay
        if best_C is None:
            continue

        records.append(dict(
            assay=assay,
            best_C=best_C,
            val_auc=val_auc,
            test_auc=test_auc,
            split=split_label,
            N_labels_test=N_labels_test,
        ))

    # Convert results to DataFrame
    return pd.DataFrame(records)

# Paths to data splits: (train CSV, validation CSV, test CSV, split label)
data_splits = [
    ('data/datasplit1-train.csv', 'data/datasplit1-val.csv', 'data/datasplit1-test.csv', 'split1'),
    ('data/datasplit2-train.csv', 'data/datasplit2-val.csv', 'data/datasplit2-test.csv', 'split2'),
    ('data/datasplit3-train.csv', 'data/datasplit3-val.csv', 'data/datasplit3-test.csv', 'split3')
]

# Process each split, collecting the per-split tables in a list
split_frames = []

for train_path, val_path, test_path, split_label in data_splits:
    split_results = process_split(train_path, val_path, test_path, split_label)
    split_frames.append(split_results)

# Combine once at the end instead of growing a DataFrame inside the loop
# from an empty pd.DataFrame(); fall back to an empty frame if no splits
# were configured, since pd.concat rejects an empty list.
all_results = pd.concat(split_frames, ignore_index=True) if split_frames else pd.DataFrame()

# Save the combined results to a CSV file
all_results.to_csv('logistic_regression_results_CellCount_MW_logP.csv', index=False)

Processing split1: 100%|██████████| 209/209 [00:17<00:00, 12.21it/s]
Processing split2: 100%|██████████| 209/209 [00:17<00:00, 11.79it/s]
Processing split3: 100%|██████████| 209/209 [00:17<00:00, 11.89it/s]


In [22]:
# Show the combined per-assay results for all splits
all_results

Unnamed: 0,assay,best_C,val_auc,test_auc,split,N_labels_test
0,2,0.100000,0.700000,0.581633,split1,21
1,4,0.000001,0.333333,0.415385,split1,18
2,5,1.000000,0.750000,0.566667,split1,11
3,8,0.000001,0.266667,0.555556,split1,10
4,9,0.100000,0.800000,0.525714,split1,32
...,...,...,...,...,...,...
582,204,10.000000,0.730159,0.670996,split3,32
583,205,10.000000,0.778912,0.727273,split3,52
584,206,0.000001,1.000000,0.888889,split3,15
585,208,0.000001,0.500000,0.568627,split3,20
