In [None]:
from rdkit.Chem import rdMolDescriptors, MolFromSmiles
from rdkit.Chem import SaltRemover, QED, rdMolDescriptors
from molvs import Standardizer
from tqdm import tqdm #my setup does not currently support those jupyter widgets
from functools import partial
import numpy as np
from pubchempy import get_compounds
import pandas as pd

In [None]:
# Short dataset identifier; it is interpolated into the processed-data file name below.
ds_name = "DarkChemicalMatter"

In [None]:
# Load the raw table; later cells read its `smiles` column.
raw_csv_path = '../data/dark_chemical_matter.csv'
df = pd.read_csv(raw_csv_path)

## Calculate Morgan Fingerprints

In [None]:
# Prefix of the fingerprint column that is added to the DataFrame.
fp_name = 'morgan'

# Morgan bit-vector fingerprint with radius and chirality handling fixed up front,
# so callers only pass the molecule. The bit-vector length is not set here and is
# therefore whatever GetMorganFingerprintAsBitVect uses by default.
fingerprint_function = partial(
    rdMolDescriptors.GetMorganFingerprintAsBitVect,
    useChirality=True,
    radius=2,
)

In [None]:
# Shared helpers, built once and reused for every molecule in row_to_fingerprint.
# Both are constructed without arguments, i.e. with their library defaults.
remover = SaltRemover.SaltRemover()
s = Standardizer()

In [None]:
def row_to_fingerprint(row):
    """Convert one DataFrame row into a fingerprint bit array.

    The SMILES string in ``row.smiles`` is parsed, standardized with the
    module-level ``Standardizer`` (``s``), stripped of salts with the
    module-level ``SaltRemover`` (``remover``) and handed to
    ``fingerprint_function``.

    Parameters
    ----------
    row : object with a ``smiles`` attribute
        Typically a row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    numpy.ndarray or float
        A 1-D ``uint8`` array with one 0/1 entry per fingerprint bit, or
        ``np.nan`` when the SMILES value is missing or cannot be parsed.
    """
    smiles = row.smiles
    # A missing CSV value arrives as float NaN (or None). The RDKit parser is
    # expected to reject non-string input with an exception rather than return
    # None, which would abort the whole apply, so treat it as a failed row.
    if not isinstance(smiles, str):
        return np.nan

    mol = MolFromSmiles(smiles)
    if mol is None:
        # NaN only marks the row as failed; the row itself is removed later
        # by the dropna step, not by apply.
        return np.nan

    mol = s.standardize(mol)
    mol = remover.StripMol(mol)
    fp = fingerprint_function(mol)
    bit_string = fp.ToBitString()
    # The bit string consists of ASCII '0'/'1' characters, so subtracting
    # ord('0') from its raw bytes yields the 0/1 values as uint8 without a
    # Python-level int() call per bit.
    ascii_bits = np.frombuffer(bit_string.encode('ascii'), dtype=np.uint8)
    return ascii_bits - np.uint8(ord('0'))

In [None]:
# Enable DataFrame.progress_apply so the row-wise conversion reports progress.
tqdm.pandas()
fingerprint_column = f'{fp_name}_fingerprint'
df[fingerprint_column] = df.progress_apply(row_to_fingerprint, axis=1)

In [None]:
# Keep only rows whose fingerprint was computed. Restricting dropna to the
# fingerprint column means a missing value in some unrelated column neither
# discards a valid molecule nor inflates the failure count printed below.
fingerprint_column = f'{fp_name}_fingerprint'
df_out = df.dropna(subset=[fingerprint_column])
print('Number failed molecule conversions:', len(df) - len(df_out))

# The raw SMILES strings are dropped from the exported table.
df_out = df_out.drop('smiles', axis=1)

# Every remaining row gets the same constant activity label.
df_out['activity'] = 'Inactive'

df_out.head()

## Export

In [None]:
# The '.gz' suffix makes pandas gzip the pickle
# (original author's note: compresses from 300MB to 10MB).
output_path = f'../processed_data/{ds_name}_processed.pkl.gz'
df_out.to_pickle(output_path)

In [None]:
# Example of loading the processed data back. pandas picks the decompression
# from the file name, so no extra argument is needed.
processed_path = f'../processed_data/{ds_name}_processed.pkl.gz'
df_in = pd.read_pickle(processed_path)

In [None]:
# Quick check of the reloaded table: row count, then a preview of the first rows.
row_count = len(df_in)
print('length:', row_count)
df_in.head()