In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

In [2]:
# Read BBBC021_v1_compound.csv (compound names and their SMILES strings) as a pandas dataframe.
# The file can be downloaded from https://data.broadinstitute.org/bbbc/BBBC021/BBBC021_v1_compound.csv
smiles_df = pd.read_csv('/datasets/BBBC021/BBBC021_v1_compound.csv')

In [3]:
# Show the compound table exactly as loaded from the CSV.
smiles_df

Unnamed: 0,compound,smiles
0,DMSO,
1,leupeptin,CC(C)C[C@H](NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N...
2,taxol,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](O)C[C@H]3OC[C...
3,camptothecin,CC[C@@]1(O)C(=O)OCC2=C1C=C3N(Cc4cc5ccccc5nc34)...
4,Cdk1 inhibitor III,CCOC(=O)c1cnc2oc3ccc(O)cc3c2c1c4ccccc4
...,...,...
108,LY-294002,O=C1C=C(Oc2c1cccc2c3ccccc3)N4CCOCC4
109,SB-202190,Oc1ccc(cc1)c2nc(c3ccc(F)cc3)c([nH]2)c4ccncc4
110,AZ701,CN(CCOc1cccc2ncnc(Nc3ccc(OCc4ccccn4)c(Cl)c3)c1...
111,AZ-K,


In [4]:
# Discard every row that has a missing value in any column, then renumber
# the remaining rows from 0 so positions and index labels agree.
smiles_df = smiles_df.dropna().reset_index(drop=True)

# SMILES strings in row order; this is what the descriptor generator consumes.
smiles_list = smiles_df.loc[:, 'smiles'].values

smiles_df

Unnamed: 0,compound,smiles
0,leupeptin,CC(C)C[C@H](NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N...
1,taxol,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](O)C[C@H]3OC[C...
2,camptothecin,CC[C@@]1(O)C(=O)OCC2=C1C=C3N(Cc4cc5ccccc5nc34)...
3,Cdk1 inhibitor III,CCOC(=O)c1cnc2oc3ccc(O)cc3c2c1c4ccccc4
4,"3,3'-diaminobenzidine",Nc1ccc(cc1N)c2ccc(N)c(N)c2
...,...,...
93,UO-126,N\\C(=C(\\C#N)/C(=C(\\N)/Sc1ccccc1N)/C#N)\\Sc2...
94,LY-294002,O=C1C=C(Oc2c1cccc2c3ccccc3)N4CCOCC4
95,SB-202190,Oc1ccc(cc1)c2nc(c3ccc(F)cc3)c([nH]2)c4ccncc4
96,AZ701,CN(CCOc1cccc2ncnc(Nc3ccc(OCc4ccccn4)c(Cl)c3)c1...


In [5]:
# Build the RDKit2D descriptor generator and list each output column
# together with the name of its numpy type.
generator = MakeGenerator(("RDKit2D",))
for column_name, column_type in generator.GetColumns():
    print("{}({})".format(column_name, column_type.__name__))

RDKit2D_calculated(bool)
BalabanJ(float64)
BertzCT(float64)
Chi0(float64)
Chi0n(float64)
Chi0v(float64)
Chi1(float64)
Chi1n(float64)
Chi1v(float64)
Chi2n(float64)
Chi2v(float64)
Chi3n(float64)
Chi3v(float64)
Chi4n(float64)
Chi4v(float64)
EState_VSA1(float64)
EState_VSA10(float64)
EState_VSA11(float64)
EState_VSA2(float64)
EState_VSA3(float64)
EState_VSA4(float64)
EState_VSA5(float64)
EState_VSA6(float64)
EState_VSA7(float64)
EState_VSA8(float64)
EState_VSA9(float64)
ExactMolWt(float64)
FpDensityMorgan1(float64)
FpDensityMorgan2(float64)
FpDensityMorgan3(float64)
FractionCSP3(float64)
HallKierAlpha(float64)
HeavyAtomCount(float64)
HeavyAtomMolWt(float64)
Ipc(float64)
Kappa1(float64)
Kappa2(float64)
Kappa3(float64)
LabuteASA(float64)
MaxAbsEStateIndex(float64)
MaxAbsPartialCharge(float64)
MaxEStateIndex(float64)
MaxPartialCharge(float64)
MinAbsEStateIndex(float64)
MinAbsPartialCharge(float64)
MinEStateIndex(float64)
MinPartialCharge(float64)
MolLogP(float64)
MolMR(float64)
MolWt(float64)

In [6]:
# Compute the descriptors for every SMILES string across `n_jobs` workers.
# `data` ends up with one entry per element of `smiles_list`, in the same order.
n_jobs = 16
worker_pool = Parallel(n_jobs=n_jobs)
smiles_progress = tqdm(smiles_list, position=0, leave=True)
data = worker_pool(delayed(generator.process)(smiles) for smiles in smiles_progress)

  0%|          | 0/98 [00:00<?, ?it/s]

In [7]:
# Show the cleaned compound table again for reference.
smiles_df

Unnamed: 0,compound,smiles
0,leupeptin,CC(C)C[C@H](NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N...
1,taxol,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](O)C[C@H]3OC[C...
2,camptothecin,CC[C@@]1(O)C(=O)OCC2=C1C=C3N(Cc4cc5ccccc5nc34)...
3,Cdk1 inhibitor III,CCOC(=O)c1cnc2oc3ccc(O)cc3c2c1c4ccccc4
4,"3,3'-diaminobenzidine",Nc1ccc(cc1N)c2ccc(N)c(N)c2
...,...,...
93,UO-126,N\\C(=C(\\C#N)/C(=C(\\N)/Sc1ccccc1N)/C#N)\\Sc2...
94,LY-294002,O=C1C=C(Oc2c1cccc2c3ccccc3)N4CCOCC4
95,SB-202190,Oc1ccc(cc1)c2nc(c3ccc(F)cc3)c([nH]2)c4ccncc4
96,AZ701,CN(CCOc1cccc2ncnc(Nc3ccc(OCc4ccccn4)c(Cl)c3)c1...


In [8]:
# Assemble the per-compound descriptor vectors into one 2-D array
# (one row per compound, one column per generator output).
#
# Entries of `data` can be None (the fill loop below has always allowed for
# that), so the row width is taken from the first successful result instead
# of a fixed position such as data[1], which raises a TypeError whenever that
# particular entry happens to be None.
computed_rows = [row for row in data if row is not None]
if not computed_rows:
    raise ValueError('Descriptor generation returned no result for any SMILES string')
n_features = len(computed_rows[0])

# Rows without a result keep the all-ones placeholder used so far.
embedding = np.ones((smiles_df.shape[0], n_features))
print(embedding.shape)
print(len(data))

failed_compounds = []
for row_idx, row in enumerate(data):
    if row is None:
        failed_compounds.append(smiles_df['compound'].iloc[row_idx])
    else:
        embedding[row_idx] = row

# Make placeholder rows visible instead of letting them pass silently.
if failed_compounds:
    print(f'No descriptors computed for: {failed_compounds}')

embedding[1]

(98, 201)
98


array([ 1.00000000e+00,  1.38744766e+00,  2.28034627e+03,  4.49402179e+01,
        3.52327485e+01,  3.52327485e+01,  2.92722899e+01,  2.04728500e+01,
        2.04728500e+01,  1.73540403e+01,  1.73540403e+01,  1.34798552e+01,
        1.34798552e+01,  1.01825405e+01,  1.01825405e+01,  1.12604095e+02,
        3.92922681e+01,  4.79453718e+00,  3.53008156e+01,  5.56345149e+00,
        1.38474744e+01,  2.59802085e+01,  9.27102463e+01,  0.00000000e+00,
        5.31678860e+00,  2.36843148e+01,  8.53330955e+02,  8.22580645e-01,
        1.40322581e+00,  1.95161290e+00,  4.46808511e-01, -5.74000000e+00,
        6.20000000e+01,  8.02510000e+02,  1.95363426e+13,  4.43203576e+01,
        1.65128455e+01,  7.29440771e+00,  3.57885407e+02,  1.54837950e+01,
        4.55736924e-01,  1.54837950e+01,  3.38014233e-01,  2.88534398e-03,
        3.38014233e-01, -2.39095815e+00, -4.55736924e-01,  3.73570000e+00,
        2.17690100e+02,  8.53918000e+02,  4.00000000e+00,  1.50000000e+01,
        3.00000000e+00,  

In [9]:
# Wrap the descriptor matrix in a DataFrame indexed by compound name.
# `pd` comes from the import cell at the top of the notebook, so it is not
# re-imported here.
# Column labels are positional placeholders: latent_0, latent_1, ...
latent_columns = [f'latent_{i}' for i in range(embedding.shape[1])]
df = pd.DataFrame(data=embedding,
                  index=smiles_df['compound'],
                  columns=latent_columns)
df

Unnamed: 0_level_0,latent_0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,...,latent_191,latent_192,latent_193,latent_194,latent_195,latent_196,latent_197,latent_198,latent_199,latent_200
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
leupeptin,1.0,5.060057,593.845766,23.267220,18.709979,18.709979,13.954504,10.367655,10.367655,8.307227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.099043
taxol,1.0,1.387448,2280.346270,44.940218,35.232749,35.232749,29.272290,20.472850,20.472850,17.354040,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129786
camptothecin,1.0,1.813414,1164.420305,18.051677,14.151808,14.151808,12.525024,8.585127,8.585127,6.791866,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532764
Cdk1 inhibitor III,1.0,2.138325,1083.202781,17.388541,13.522431,13.522431,12.185872,7.955084,7.955084,5.596047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.557654
"3,3'-diaminobenzidine",1.0,2.502080,485.037124,11.706742,8.773503,8.773503,7.575387,4.880768,4.880768,3.687393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UO-126,1.0,2.415200,886.631991,19.104084,13.639127,15.272120,12.472764,7.323111,8.956104,4.981885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267079
LY-294002,1.0,1.910040,880.005356,15.648054,12.696538,12.696538,11.326500,7.777836,7.777836,5.596592,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728379
SB-202190,1.0,1.885294,993.562933,17.225404,13.147808,13.147808,12.203510,7.762952,7.762952,5.598673,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.574239
AZ701,1.0,1.493865,1300.537790,24.622745,19.018057,19.773986,17.046045,10.855231,11.233195,7.574371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342808


In [11]:
# Drop first feature from generator (RDKit2D_calculated).
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['latent_0'], errors='ignore')

# Drop near-constant columns (standard deviation at or below `threshold`).
# The labels are read straight from the index of the std Series rather than
# rebuilt from positional offsets ("latent_{position + 1}"), which only match
# while the column labels are still contiguous and start at latent_1.
threshold = 0.001
column_std = df.std()
low_variance_columns = column_std[column_std <= threshold].index.tolist()
print(f'Deleting columns with std<={threshold}: {low_variance_columns}')
df = df.drop(columns=low_variance_columns)

Deleting columns with std<=0.001: ['latent_61', 'latent_90', 'latent_103', 'latent_119', 'latent_137', 'latent_143', 'latent_146', 'latent_147', 'latent_148', 'latent_150', 'latent_152', 'latent_153', 'latent_160', 'latent_161', 'latent_164', 'latent_165', 'latent_168', 'latent_176', 'latent_177', 'latent_178', 'latent_182', 'latent_183', 'latent_187', 'latent_189', 'latent_193', 'latent_194', 'latent_196']


In [12]:

# Show the descriptor table as it currently stands.
df

Unnamed: 0_level_0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_186,latent_188,latent_190,latent_191,latent_192,latent_195,latent_197,latent_198,latent_199,latent_200
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
leupeptin,5.060057,593.845766,23.267220,18.709979,18.709979,13.954504,10.367655,10.367655,8.307227,8.307227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.099043
taxol,1.387448,2280.346270,44.940218,35.232749,35.232749,29.272290,20.472850,20.472850,17.354040,17.354040,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129786
camptothecin,1.813414,1164.420305,18.051677,14.151808,14.151808,12.525024,8.585127,8.585127,6.791866,6.791866,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532764
Cdk1 inhibitor III,2.138325,1083.202781,17.388541,13.522431,13.522431,12.185872,7.955084,7.955084,5.596047,5.596047,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.557654
"3,3'-diaminobenzidine",2.502080,485.037124,11.706742,8.773503,8.773503,7.575387,4.880768,4.880768,3.687393,3.687393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UO-126,2.415200,886.631991,19.104084,13.639127,15.272120,12.472764,7.323111,8.956104,4.981885,7.149439,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267079
LY-294002,1.910040,880.005356,15.648054,12.696538,12.696538,11.326500,7.777836,7.777836,5.596592,5.596592,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728379
SB-202190,1.885294,993.562933,17.225404,13.147808,13.147808,12.203510,7.762952,7.762952,5.598673,5.598673,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.574239
AZ701,1.493865,1300.537790,24.622745,19.018057,19.773986,17.046045,10.855231,11.233195,7.574371,7.981571,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342808


In [13]:
# Z-score each descriptor column: subtract the column mean, then divide by
# the column standard deviation.
column_mean = df.mean()
column_std = df.std()
normalized_df = (df - column_mean) / column_std

In [14]:
# Character mapping for compound names: space, '/', '(' and ')' each become
# '-', and apostrophes are removed.
_COMPOUND_NAME_TABLE = str.maketrans({' ': '-', '/': '-', '(': '-', ')': '-', "'": None})


def _refine_compound_name(name):
    """Return `name` lower-cased and passed through `_COMPOUND_NAME_TABLE`."""
    return name.lower().translate(_COMPOUND_NAME_TABLE)


refined_index = [_refine_compound_name(name) for name in normalized_df.index.tolist()]
normalized_df.index = refined_index

# Preview goes last: only the final expression of a cell is displayed, so the
# earlier mid-cell `.head()` call produced no output.
normalized_df.head()

In [15]:
# Replace any remaining NaN entries with 0.
normalized_df = normalized_df.fillna(0)

In [16]:
# save normalized_df to csv
# The output folder is created first: DataFrame.to_csv raises an error when
# the parent directory does not exist.
output_path = Path('required_file/perturbation_embedding_bbbc021.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
normalized_df.to_csv(output_path)