# Lab Rotation - Nuno Martinho (6)

#### DrugBank input and curation

In [9]:
# Inputs

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import molvs
from molvs import Standardizer
from molvs import fragment
from molvs import charge
from molvs import tautomer
from rdkit.Chem.Scaffolds.MurckoScaffold import MakeScaffoldGeneric, GetScaffoldForMol
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.rdinchi import InchiToInchiKey, MolToInchi
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, rdFMCS
from rdkit.Chem.Draw import DrawingOptions
from IPython.display import SVG
from rdkit.Chem.PandasTools import AddMoleculeColumnToFrame
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

#### After converting DrugBank_all.sdf to .smi using open babel...

In [10]:
# Import database: delimited by ";" 

data = pd.read_csv("DrugBankAll_03_01_2024.smi", usecols=[0])

In [11]:
data

Unnamed: 0,SMILES
0,C(=O)([C@H](CC(C)C)NC(=O)[C@H](Cc1ccc(cc1)O)NC...
1,N1[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(...
2,O=C(N1[C@@H](CCC1)C(=O)NNC(=O)N)[C@@H](NC(=O)[...
3,[C@@H](C(=O)NCC(=O)N[C@@H](C(=O)N[C@@H](C)C(=O...
4,NC(=O)CNC(=O)[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H...
...,...
11923,Cc1cc(on1)/C(=C/Nc1ccc(Cl)cc1)/C(=O)Nc1ccc(Cl)...
11924,Cc1onc(c1COc1ccc(nn1)C(=O)NC1CCOCC1)c1ccc(C)nc1\t
11925,Cn1cc(cn1)c1cn2nccc2c(n1)c1cnn(c1)[C@@]1(CC#N)...
11926,OC(=O)CCCCC(=O)O.C[C@@H](N)COc1ccc(cc1)c1cnc2c...


In [12]:
# Filter out rows with invalid SMILES strings
valid_smiles_indices = []
for i, smiles in enumerate(data["SMILES"]):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        valid_smiles_indices.append(i)

# Update DataFrame with valid SMILES strings
data = data.iloc[valid_smiles_indices].reset_index(drop=True)

[15:06:04] Explicit valence for atom # 3 N, 5, is greater than permitted
[15:06:04] Explicit valence for atom # 6 N, 5, is greater than permitted
[15:06:04] Explicit valence for atom # 8 N, 5, is greater than permitted
[15:06:05] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[15:06:05] Explicit valence for atom # 20 O, 3, is greater than permitted
[15:06:05] Explicit valence for atom # 16 O, 3, is greater than permitted
[15:06:05] Unusual charge on atom 42 number of radical electrons set to zero
[15:06:06] Explicit valence for atom # 0 Be, 3, is greater than permitted
[15:06:07] Explicit valence for atom # 50 N, 4, is greater than permitted
[15:06:07] Explicit valence for atom # 1 Cl, 4, is greater than permitted
[15:06:07] Explicit valence for atom # 6 N, 5, is greater than permitted
[15:06:08] Explicit valence for atom # 0 B, 5, is greater than permitted


In [13]:
# Calculate molecular weights for valid SMILES strings
data["Molecular_Weight"] = [Chem.rdMolDescriptors.CalcExactMolWt(Chem.MolFromSmiles(data["SMILES"][i])) for i in range(len(data))]

[15:06:14] Unusual charge on atom 42 number of radical electrons set to zero


In [14]:
# Filter out rows with Molecular_Weight >= 1000
filtered_data = data[data["Molecular_Weight"] < 1000]

# Optionally, you can reset the index of the filtered DataFrame
filtered_data.reset_index(drop=True, inplace=True)

In [15]:
filtered_data

Unnamed: 0,SMILES,Molecular_Weight
0,P(=O)(OCc1c(c(O)c(nc1)C)C=O)(O)O\t114,247.024574
1,O=c1nc([nH]c2NCC(Nc12)CNc1ccc(cc1)C(=O)N[C@@H]...,445.170981
2,c1(C[C@H](N)C(=O)O)c[nH]cn1\t,155.069477
3,n1(c2c(nc1)c(ncn2)N)[C@@H]1O[C@@H]([C@H]([C@H]...,398.137239
4,OC(=O)C(=O)C\t,88.016044
...,...,...
11631,Cc1cc(C)c2CN(Cc2n1)C(=O)CC1CN(C1)c1ccnc(c1)C(F...,390.166746
11632,Cc1cc(on1)/C(=C/Nc1ccc(Cl)cc1)/C(=O)Nc1ccc(Cl)...,387.054132
11633,Cc1onc(c1COc1ccc(nn1)C(=O)NC1CCOCC1)c1ccc(C)nc1\t,409.175004
11634,Cn1cc(cn1)c1cn2nccc2c(n1)c1cnn(c1)[C@@]1(CC#N)...,383.160692


In [16]:
def standardize_molecules(smiles_list):
    unsalt = molvs.fragment.LargestFragmentChooser()
    s = molvs.Standardizer()
    u = molvs.charge.Uncharger()
    t = molvs.tautomer.TautomerCanonicalizer()
    std_smi = []
    inchik = []
    for smi in tqdm(smiles_list, desc="Standardizing molecules", unit="molecule"):
        try:
            m = Chem.MolFromSmiles(smi)
            if m is None:
                print(f"Invalid SMILES string: {smi}")
                continue  # Skip to the next SMILES
            # step 1: sanitize, disconnect metals, normalize chemotypes, reionize acids
            m = s.standardize(m)
            # step 2: disconnect salts
            m = unsalt(m)
            # step 3: neutralize
            m = u.uncharge(m)
            inchik.append(Chem.MolToInchiKey(m))
            std_smi.append(Chem.MolToSmiles(m))
        except Exception as e:
            print(f"Error processing SMILES: {smi}")
            print(f"Error message: {str(e)}")
            continue
    return inchik, std_smi

# Apply standardization to filtered SMILES
inchikey, std_mols = standardize_molecules(filtered_data["SMILES"])

# Add standardized data to the DataFrame
filtered_data["InChIKeys"] = inchikey
filtered_data["Standardized SMILES"] = std_mols

# Saving this data
filtered_data.to_csv('DrugBank_curated.csv', index=False)

filtered_data.shape

Standardizing molecules:  12%|█▏        | 1421/11636 [00:05<00:37, 274.25molecule/s][15:06:40] Unusual charge on atom 2 number of radical electrons set to zero
[15:06:40] Unusual charge on atom 0 number of radical electrons set to zero
[15:06:40] Unusual charge on atom 19 number of radical electrons set to zero
[15:06:40] Unusual charge on atom 19 number of radical electrons set to zero
[15:06:40] Unusual charge on atom 0 number of radical electrons set to zero
Standardizing molecules:  27%|██▋       | 3094/11636 [00:11<00:31, 267.97molecule/s][15:06:47] bond type above 3 (17) is treated as unspecified!
[15:06:47] bond type above 3 (17) is treated as unspecified!
[15:06:47] bond type above 3 (17) is treated as unspecified!
[15:06:47] bond type above 3 (17) is treated as unspecified!
[15:06:47] Invalid InChI prefix in generating InChI Key
Standardizing molecules:  30%|██▉       | 3438/11636 [00:12<00:26, 303.74molecule/s][15:06:48] bond type above 3 (17) is treated as unspecified!
[15:0

(11636, 4)

In [17]:
filtered_data

Unnamed: 0,SMILES,Molecular_Weight,InChIKeys,Standardized SMILES
0,P(=O)(OCc1c(c(O)c(nc1)C)C=O)(O)O\t114,247.024574,NGVDGCNFYWLIFO-UHFFFAOYSA-N,Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1,O=c1nc([nH]c2NCC(Nc12)CNc1ccc(cc1)C(=O)N[C@@H]...,445.170981,MSTNYGQPCMXVAQ-KIYNQFGBSA-N,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)N[C@@H](CC...
2,c1(C[C@H](N)C(=O)O)c[nH]cn1\t,155.069477,HNDVDQJCIGZPNO-YFKPBYRVSA-N,N[C@@H](Cc1c[nH]cn1)C(=O)O
3,n1(c2c(nc1)c(ncn2)N)[C@@H]1O[C@@H]([C@H]([C@H]...,398.137239,MEFKEPWMEQBLKI-AIRLBKTGSA-N,C[S+](CC[C@H](N)C(=O)[O-])C[C@H]1O[C@@H](n2cnc...
4,OC(=O)C(=O)C\t,88.016044,LCTONWCANYUPML-UHFFFAOYSA-N,CC(=O)C(=O)O
...,...,...,...,...
11631,Cc1cc(C)c2CN(Cc2n1)C(=O)CC1CN(C1)c1ccnc(c1)C(F...,390.166746,DTCZNKWBDTXEBS-UHFFFAOYSA-N,Cc1cc(C)c2c(n1)CN(C(=O)CC1CN(c3ccnc(C(F)(F)F)c...
11632,Cc1cc(on1)/C(=C/Nc1ccc(Cl)cc1)/C(=O)Nc1ccc(Cl)...,387.054132,VMAKIACTLSBBIY-BOPFTXTBSA-N,Cc1cc(/C(=C/Nc2ccc(Cl)cc2)C(=O)Nc2ccc(Cl)cc2)on1
11633,Cc1onc(c1COc1ccc(nn1)C(=O)NC1CCOCC1)c1ccc(C)nc1\t,409.175004,ACZCJTHHWMBFKC-UHFFFAOYSA-N,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
11634,Cn1cc(cn1)c1cn2nccc2c(n1)c1cnn(c1)[C@@]1(CC#N)...,383.160692,XPLZTJWZDBFWDE-OYOVHJISSA-N,Cn1cc(-c2cn3nccc3c(-c3cnn([C@]4(CC#N)C[C@@H](C...


In [18]:
# Calculation of Morgan Fingerprints

standard = []
morgan = []

for smiles in tqdm(filtered_data["Standardized SMILES"], desc="Calculating Morgan Fingerprints", unit="molecule"):
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            print(f"Invalid SMILES string: {smiles}")
            continue  # Skip to the next SMILES
        standard.append(smiles)
        morgan.append(AllChem.GetMorganFingerprintAsBitVect(m, useChirality=True, radius=2, nBits = 1024))
    except Exception as e:
        print(f"Error processing SMILES: {smiles}")
        print(f"Error message: {str(e)}")
        continue


# Verify if it has the same number of rows as our dataframe

print(len(morgan))

# Add it on our dataframe and save it as csv

new_data = pd.DataFrame(data=None, columns=["Standardized SMILES", "Morgan Fingerprints"])
new_data["Morgan Fingerprints"] = morgan
new_data["Standardized SMILES"] = standard

new_data.to_csv('DrugBank_curated_WithMorgan.csv', index=False)

Calculating Morgan Fingerprints:  10%|█         | 1211/11636 [00:00<00:03, 2872.84molecule/s][15:07:32] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13
Calculating Morgan Fingerprints:  13%|█▎        | 1547/11636 [00:00<00:03, 3028.07molecule/s][15:07:33] Explicit valence for atom # 6 B, 5, is greater than permitted
Calculating Morgan Fingerprints:  16%|█▌        | 1866/11636 [00:00<00:03, 3077.60molecule/s]

Invalid SMILES string: O=C1C=CC(=O)N1CCc1cccc1
Invalid SMILES string: N[C@@H](CCCC[BH](O)(O)O)C(=O)O


Calculating Morgan Fingerprints:  19%|█▉        | 2195/11636 [00:00<00:03, 3125.80molecule/s][15:07:33] Explicit valence for atom # 13 B, 5, is greater than permitted
[15:07:33] Explicit valence for atom # 16 B, 5, is greater than permitted
Calculating Morgan Fingerprints:  25%|██▍       | 2888/11636 [00:00<00:02, 3176.39molecule/s]

Invalid SMILES string: CC(=O)N[C@H](Cc1ccc(Cl)cc1)[BH](O)(O)OC[C@H](N)C(=O)O
Invalid SMILES string: CC(=O)N[C@H](Cc1cccc2ccccc12)[BH](O)(O)OC[C@H](N)C(=O)O


[15:07:33] Explicit valence for atom # 16 B, 5, is greater than permitted
[15:07:33] Explicit valence for atom # 13 B, 5, is greater than permitted
Calculating Morgan Fingerprints:  28%|██▊       | 3207/11636 [00:01<00:02, 3049.35molecule/s][15:07:33] Explicit valence for atom # 6 B, 5, is greater than permitted
Calculating Morgan Fingerprints:  33%|███▎      | 3821/11636 [00:01<00:02, 3044.31molecule/s]

Invalid SMILES string: CC(=O)N[C@@H](Cc1cccc2ccccc12)[BH](O)(O)OC[C@H](N)C(=O)O
Invalid SMILES string: CC(=O)N[C@@H](Cc1ccc(Cl)cc1)[BH](O)(O)OC[C@H](N)C(=O)O
Invalid SMILES string: N[C@@H](CSCC[BH](O)(O)O)C(=O)O


[15:07:33] Explicit valence for atom # 6 B, 5, is greater than permitted
Calculating Morgan Fingerprints:  38%|███▊      | 4474/11636 [00:01<00:02, 2969.82molecule/s]

Invalid SMILES string: N[C@@H](CC/C=C/[BH](O)(O)O)C(=O)O


Calculating Morgan Fingerprints:  58%|█████▊    | 6743/11636 [00:02<00:01, 2698.85molecule/s][15:07:34] Explicit valence for atom # 16 B, 5, is greater than permitted
[15:07:34] Explicit valence for atom # 16 B, 5, is greater than permitted
[15:07:34] Explicit valence for atom # 13 B, 5, is greater than permitted
[15:07:34] Explicit valence for atom # 13 B, 5, is greater than permitted
Calculating Morgan Fingerprints:  61%|██████    | 7068/11636 [00:02<00:01, 2765.65molecule/s]

Invalid SMILES string: CC(=O)N[C@@H](Cc1cccc2ccccc12)[BH](O)(O)O
Invalid SMILES string: CC(=O)N[C@H](Cc1cccc2ccccc12)[BH](O)(O)O
Invalid SMILES string: CC(=O)N[C@H](Cc1ccc(Cl)cc1)[BH](O)(O)O
Invalid SMILES string: CC(=O)N[C@@H](Cc1ccc(Cl)cc1)[BH](O)(O)O


Calculating Morgan Fingerprints:  94%|█████████▍| 10931/11636 [00:04<00:00, 2240.57molecule/s][15:07:36] Explicit valence for atom # 1 B, 5, is greater than permitted
Calculating Morgan Fingerprints:  98%|█████████▊| 11403/11636 [00:04<00:00, 2283.84molecule/s]

Invalid SMILES string: F[BH](F)(F)F


Calculating Morgan Fingerprints: 100%|██████████| 11636/11636 [00:04<00:00, 2660.51molecule/s]


11623


In [19]:
new_data

Unnamed: 0,Standardized SMILES,Morgan Fingerprints
0,Cc1ncc(COP(=O)(O)O)c(C=O)c1O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)N[C@@H](CC...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,N[C@@H](Cc1c[nH]cn1)C(=O)O,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,C[S+](CC[C@H](N)C(=O)[O-])C[C@H]1O[C@@H](n2cnc...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,CC(=O)C(=O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
11618,Cc1cc(C)c2c(n1)CN(C(=O)CC1CN(c3ccnc(C(F)(F)F)c...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11619,Cc1cc(/C(=C/Nc2ccc(Cl)cc2)C(=O)Nc2ccc(Cl)cc2)on1,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
11620,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11621,Cn1cc(-c2cn3nccc3c(-c3cnn([C@]4(CC#N)C[C@@H](C...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [20]:
# Put it as numpy arrays

fingerprints_array = np.array(new_data["Morgan Fingerprints"].tolist())
smilescodes_array = np.array(new_data["Standardized SMILES"].tolist())

# Save the NumPy arrays to .npy files
np.save('Morgan_Fingerprints_DrugBank.npy', fingerprints_array)
np.save('Standardized_SMILES_DrugBank.npy', smilescodes_array)

In [21]:
# Calculation of Chemical Descriptors

# Upload information

smilescodes_array_DrugBank = np.load('Standardized_SMILES_DrugBank.npy')

# Retrieve the several descriptors

descriptors_list = []
for descriptor_column in Descriptors._descList:
    descriptors_list.append(descriptor_column[0])

# Calculating Chemical Descriptors

calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_list)

In [23]:
standards_with_chemical_descriptors = []
descriptors = []

# Write everything into a file

with open("ChemicalrdkitDescriptors_DrugBank.txt", "w") as ChemicalrdkitDescriptors_file:
    ChemicalrdkitDescriptors_file.write(",".join(descriptor_each for descriptor_each in descriptors_list))
    ChemicalrdkitDescriptors_file.write("\n")
    for smi in tqdm(smilescodes_array_DrugBank, desc="Calculating Chemical Descriptors", unit="molecule"):
        tmp_calc = calculator.CalcDescriptors(Chem.MolFromSmiles(smi))
        if tmp_calc is not None:
            ChemicalrdkitDescriptors_file.write(",".join(map(str, tmp_calc)) + "\n")
            standards_with_chemical_descriptors.append(smi)
        else:
            print(f"Invalid SMILES string: {smiles}")
ChemicalrdkitDescriptors_file.close()

Calculating Chemical Descriptors:  88%|████████▊ | 10271/11623 [04:14<00:42, 31.45molecule/s]Traceback (most recent call last):
  File "C:\Users\guigo\AppData\Roaming\Python\Python311\site-packages\rdkit\ML\Descriptors\MoleculeDescriptors.py", line 88, in CalcDescriptors
    res[i] = fn(mol)
             ^^^^^^^
  File "C:\Users\guigo\AppData\Roaming\Python\Python311\site-packages\rdkit\Chem\SpacialScore.py", line 72, in SPS
    return _SpacialScore(mol, normalize=normalize).score
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\guigo\AppData\Roaming\Python\Python311\site-packages\rdkit\Chem\SpacialScore.py", line 95, in __init__
    self.score /= self.mol.GetNumHeavyAtoms()
ZeroDivisionError: division by zero
Calculating Chemical Descriptors: 100%|██████████| 11623/11623 [04:55<00:00, 39.31molecule/s]


In [29]:
# To see if we have any nulls (to verify if everything was calculated, nulls should be counted and equal to 0)

ChemicalDescriptors = pd.read_csv("ChemicalrdkitDescriptors_DrugBank.txt")
ChemicalDescriptors["Standardized_SMILES"] = standards_with_chemical_descriptors
nan_count = ChemicalDescriptors.isnull().sum().sum()
nan_count

2228

In [30]:
ChemicalDescriptors    

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,Standardized_SMILES
0,10.687482,10.687482,0.082454,-4.613810,0.523900,11.437500,247.143,237.063,247.024574,88,...,0,0,0,0,0,0,0,0,0,Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1,12.285513,12.285513,0.028067,-1.301134,0.251235,15.437500,445.436,422.252,445.170981,170,...,0,0,0,0,0,0,0,0,0,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)N[C@@H](CC...
2,10.262535,10.262535,0.262731,-1.006481,0.541194,12.818182,155.157,146.085,155.069477,60,...,0,0,0,0,0,0,0,0,0,N[C@@H](Cc1c[nH]cn1)C(=O)O
3,10.698110,10.698110,0.210831,-1.284365,0.344970,27.703704,398.445,376.269,398.137239,148,...,0,0,0,0,0,0,0,0,0,C[S+](CC[C@H](N)C(=O)[O-])C[C@H]1O[C@@H](n2cnc...
4,9.543981,9.543981,0.824074,-1.379630,0.445928,7.500000,88.062,84.030,88.016044,34,...,0,0,0,0,0,0,0,0,0,CC(=O)C(=O)O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11618,12.804245,12.804245,0.074456,-4.452853,0.805267,16.892857,390.409,369.241,390.166746,148,...,0,0,0,0,0,0,0,0,0,Cc1cc(C)c2c(n1)CN(C(=O)CC1CN(c3ccnc(C(F)(F)F)c...
11619,12.721365,12.721365,0.301389,-0.342652,0.576065,11.269231,388.254,373.134,387.054132,132,...,0,0,0,0,0,0,0,0,0,Cc1cc(/C(=C/Nc2ccc(Cl)cc2)C(=O)Nc2ccc(Cl)cc2)on1
11620,12.317116,12.317116,0.106692,-0.245383,0.661156,14.466667,409.446,386.262,409.175004,156,...,0,0,0,0,0,0,0,0,0,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
11621,9.283128,9.283128,0.023547,-0.411819,0.534950,20.862069,383.419,366.283,383.160692,142,...,0,0,0,0,0,0,0,0,0,Cn1cc(-c2cn3nccc3c(-c3cnn([C@]4(CC#N)C[C@@H](C...


In [31]:
# Since there are some Nones in our dataframe, we need to drop those rows

ChemicalDescriptors_cleaned = ChemicalDescriptors.dropna()

In [32]:
ChemicalDescriptors_cleaned.iloc[:, :-1]

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,10.687482,10.687482,0.082454,-4.613810,0.523900,11.437500,247.143,237.063,247.024574,88,...,0,0,0,0,0,0,0,0,0,0
1,12.285513,12.285513,0.028067,-1.301134,0.251235,15.437500,445.436,422.252,445.170981,170,...,0,0,0,0,0,0,0,0,0,0
2,10.262535,10.262535,0.262731,-1.006481,0.541194,12.818182,155.157,146.085,155.069477,60,...,0,0,0,0,0,0,0,0,0,0
3,10.698110,10.698110,0.210831,-1.284365,0.344970,27.703704,398.445,376.269,398.137239,148,...,0,0,0,0,0,0,0,0,0,0
4,9.543981,9.543981,0.824074,-1.379630,0.445928,7.500000,88.062,84.030,88.016044,34,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11618,12.804245,12.804245,0.074456,-4.452853,0.805267,16.892857,390.409,369.241,390.166746,148,...,0,0,0,0,0,0,0,0,0,0
11619,12.721365,12.721365,0.301389,-0.342652,0.576065,11.269231,388.254,373.134,387.054132,132,...,0,0,0,0,0,0,0,0,0,0
11620,12.317116,12.317116,0.106692,-0.245383,0.661156,14.466667,409.446,386.262,409.175004,156,...,0,0,0,0,0,0,0,0,0,0
11621,9.283128,9.283128,0.023547,-0.411819,0.534950,20.862069,383.419,366.283,383.160692,142,...,0,0,0,0,0,0,0,0,0,0


In [33]:
np.save('ChemicalDescriptors_DrugBank.npy', ChemicalDescriptors_cleaned.iloc[:, :-1])
np.save("Standardized_SMILES_DrugBank_WithChemicalDescriptors.npy", ChemicalDescriptors_cleaned.iloc[:, -1])
np.save('DescriptorsList_DrugBank.npy', descriptors_list)

ChemicalDescriptors_cleaned.to_csv('DrugBank_curated_ChemicalDescriptors.csv', index=False)