In [42]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from ipypb import track
import tables
from multiprocessing.pool import ThreadPool as Pool
from itertools import zip_longest
import pandas as pd



file_name = 'nature_activity.csv'
smiles = list(pd.read_csv(file_name, index_col=0).index)


In [43]:
def GetFPFromSmiles(smiles):
    if smiles is not None:
        return AllChem.GetMorganFingerprintAsBitVect(AllChem.MolFromSmiles(smiles), 2, 2048)
    
import time
batch_size = 50
list_size = len(smiles)
t0 = time.time()

with tables.open_file(f'finger_print_nature_activity_ecfp4.h5', 'w') as fp_array:
    distEcfp4 = fp_array.create_earray(fp_array.root, 'distByEcfp4', tables.IntAtom(), shape=(0, 2048), expectedrows=list_size)

    args = list([iter(smiles)]* batch_size)
    indexer = zip_longest(*args, fillvalue=None)

    for i in enumerate(indexer):
        p = Pool(50)
        fps = list(i[1])
        while None in fps:
            fps.remove(None)

        fp_batch = p.map(GetFPFromSmiles, fps)
        p.close()
        p.join()

        if fp_batch is not None:
            distEcfp4.append(np.array(fp_batch))
            print(f'\rrow {i[0]} {time.asctime()} eta: {(time.time() - t0) / (i[0] + 1) * (list_size/batch_size - i[0] - 1)} sec.', end='')
        else:
            print('cannot find FP')
            break
            

row 52 Tue Dec 15 10:22:08 2020 eta: -0.27418967193027693 sec.

In [44]:
import tables
with tables.open_file('finger_print_nature_activity_ecfp4.h5','r') as f2:
    f2_=f2.root.distByEcfp4[:]

In [45]:
print(f2_.shape)
pd.DataFrame(f2_).to_csv('nature_ecfp4.csv')

(2607, 2048)
