In [1]:
from rdkit.Chem import rdMolDescriptors, MolFromSmiles
from tqdm import tqdm #my setup does not currently support those jupyter widgets
from functools import partial
import numpy as np
from pubchempy import get_compounds
import pandas as pd

In [2]:
# Dataset label; it is interpolated into the processed-data file name when exporting and reloading.
ds_name = 'DarkChemicalMatter'

In [3]:
# Load the raw table; later cells read its `smiles` column.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV — consider `index_col=0` if nothing
# downstream relies on that column.
df = pd.read_csv('../data/dark_chemical_matter.csv')

## Calculate Morgan Fingerprints

In [4]:
# Label used to build the name of the fingerprint column.
fp_name = 'morgan'

# Morgan fingerprint builder with radius 2 and chirality enabled; every other
# option is left at whatever RDKit uses by default.
fingerprint_function = partial(
    rdMolDescriptors.GetMorganFingerprintAsBitVect,
    radius=2,
    useChirality=True,
)

In [5]:
def row_to_fingerprint(row):
    """Convert one DataFrame row into a fingerprint bit array.

    Parameters
    ----------
    row : pandas.Series (or any object exposing a ``smiles`` attribute)
        Row holding the molecule's SMILES string in ``row.smiles``.

    Returns
    -------
    numpy.ndarray or float
        1-D ``uint8`` array with one 0/1 entry per fingerprint bit, or
        ``np.nan`` when the SMILES is missing or cannot be parsed.

    Notes
    -----
    ``np.nan`` is only a marker: ``apply`` keeps the row, so failed rows must
    be removed afterwards (e.g. with ``dropna`` on the fingerprint column).
    """
    smiles = row.smiles
    if not isinstance(smiles, str):
        # Missing values (None / float NaN) cannot be parsed; mark as failed
        # instead of letting the parser raise.
        return np.nan
    mol = MolFromSmiles(smiles)
    if mol is None:
        # The parser returns None for SMILES it cannot interpret.
        return np.nan
    bit_string = fingerprint_function(mol).ToBitString()
    # Decode the ASCII '0'/'1' characters in one vectorised step instead of a
    # per-character Python loop; subtracting ord('0') maps them to 0/1.
    ascii_codes = np.frombuffer(bit_string.encode('ascii'), dtype=np.uint8)
    return ascii_codes - np.uint8(ord('0'))

In [6]:
# Register `progress_apply` on pandas objects so the row-wise pass reports progress.
tqdm.pandas()
df[f'{fp_name}_fingerprint'] = df.progress_apply(
    row_to_fingerprint,
    axis='columns',
)

  6%|▌         | 8190/139352 [00:05<01:32, 1417.49it/s]RDKit ERROR: [17:12:06] SMILES Parse Error: syntax error while parsing: [S](=O)(=O)(N(C(=O)C)|[Na+])C1=CC=C(N)C=C1
RDKit ERROR: [17:12:06] SMILES Parse Error: Failed parsing SMILES '[S](=O)(=O)(N(C(=O)C)|[Na+])C1=CC=C(N)C=C1' for input: '[S](=O)(=O)(N(C(=O)C)|[Na+])C1=CC=C(N)C=C1'
 35%|███▍      | 48677/139352 [00:34<01:08, 1320.02it/s]RDKit ERROR: [17:12:35] SMILES Parse Error: syntax error while parsing: [K+]|OC(=N[S](=O)(=O)C1=CC=C(N=NN(C)C)C=C1)CC
RDKit ERROR: [17:12:35] SMILES Parse Error: Failed parsing SMILES '[K+]|OC(=N[S](=O)(=O)C1=CC=C(N=NN(C)C)C=C1)CC' for input: '[K+]|OC(=N[S](=O)(=O)C1=CC=C(N=NN(C)C)C=C1)CC'
 40%|████      | 55956/139352 [00:39<01:01, 1350.12it/s]RDKit ERROR: [17:12:40] SMILES Parse Error: syntax error while parsing: [K+]|OC2=CC(=O)N(C1=CC=C(F)C=C1)C(=N2)SC
RDKit ERROR: [17:12:40] SMILES Parse Error: Failed parsing SMILES '[K+]|OC2=CC(=O)N(C1=CC=C(F)C=C1)C(=N2)SC' for input: '[K+]|OC2=CC(=O)N(C1=CC=C(F

In [7]:
# Remove rows whose SMILES could not be converted. The check is restricted to
# the fingerprint column so that a missing value in an unrelated column is
# neither dropped nor miscounted as a failed conversion.
df_out = df.dropna(subset=[f'{fp_name}_fingerprint'])
print('Number failed molecule conversions:', len(df) - len(df_out))

# Drop the raw SMILES column and label every remaining compound as inactive.
df_out = df_out.drop(columns='smiles').assign(activity='Inactive')

df_out.head()

Number failed molecule conversions: 13


Unnamed: 0,InChI_Key,set,morgan_fingerprint,activity
0,XQXPVVBIMDBYFF-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
1,ZFXYFBGIUFBOJW-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
2,KPSRODZRAIWAKH-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
3,WWNNZCOKKKDOPX-UHFFFAOYSA-O,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
4,ZIUHHBKFKCYYJD-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive


## Export

In [8]:
# Persist the processed frame (fingerprint arrays included) as a pickle named after the dataset.
df_out.to_pickle(f'../processed_data/{ds_name}_processed.pkl.gz')  # gzip compresses from 300MB to 10MB... wow

In [9]:
# Example: reload the exported frame.
# Pickle files can execute arbitrary code when loaded, so only read files from a trusted source.
df_in = pd.read_pickle(f'../processed_data/{ds_name}_processed.pkl.gz')  # pandas automatically decompresses based on the file name

In [10]:
# Sanity check: row count followed by a preview of the reloaded frame.
print('length:', df_in.shape[0])
df_in.head()

length: 139339


Unnamed: 0,InChI_Key,set,morgan_fingerprint,activity
0,XQXPVVBIMDBYFF-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
1,ZFXYFBGIUFBOJW-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
2,KPSRODZRAIWAKH-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
3,WWNNZCOKKKDOPX-UHFFFAOYSA-O,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
4,ZIUHHBKFKCYYJD-UHFFFAOYSA-N,both,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Inactive
