Import saved reaction data

In [1]:
from __future__ import print_function

import os
import re
import cPickle, gzip
import progressbar

from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# Location of the gzip-compressed stream of pickled (line, ref) records.
data_directory = 'data'
processed_data_filename = 'processed_data.pkl.gz'
processed_data_filepath = os.path.join(data_directory, processed_data_filename)

# Record count used as the progress-bar maximum (here and in later cells).
data_length = 1094235

bar = progressbar.ProgressBar(max_value=data_length)

# Every (line, ref) pair read from the file, in file order.
rsmi_data = []

# NOTE(review): unpickling executes arbitrary code; only load files you produced yourself.
with gzip.open(processed_data_filepath, 'rb') as data_file:
    i = 1
    while True:
        # The file holds back-to-back pickles; EOFError marks the end of the stream.
        try:
            record = cPickle.load(data_file)
        except EOFError:
            break

        line, ref = record
        rsmi_data.append((line, ref))

        bar.update(i)
        i += 1

bar.finish()

100% (1094235 of 1094235) |################| Elapsed Time: 0:00:13 Time: 0:00:13


In [3]:
def cano(smiles):
    """Canonicalize a SMILES string by round-tripping it through RDKit.

    Args:
        smiles: SMILES text. The empty string is returned unchanged.

    Returns:
        The result of ``Chem.MolToSmiles`` on the parsed molecule, or ``''``
        when ``smiles`` is ``''``.

    Raises:
        ValueError: if ``Chem.MolFromSmiles`` returns ``None`` for ``smiles``
            (i.e. the string could not be parsed).
    """
    if smiles == '':
        return ''

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with a message naming the input instead of letting
        # MolToSmiles(None) raise an opaque argument error.
        raise ValueError('could not parse SMILES: %r' % (smiles,))

    return Chem.MolToSmiles(mol)

In [5]:
import parser.Smipar as Smipar

bar = progressbar.ProgressBar(max_value=data_length)

# Reserved symbols; they are placed ahead of the data-derived tokens when the
# vocabulary lists are built.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_START_VOCAB = [_PAD, _GO, _EOS]

# Positions of the reserved symbols inside _START_VOCAB.
PAD_ID = 0
GO_ID = 1
EOS_ID = 2

# token -> number of occurrences over all successfully processed reactions
vocab_reactants = {}
vocab_products = {}

# index into rsmi_data -> reaction SMILES whose processing raised an exception
error_rsmi = {}


def _tokenize_side(smiles):
    """Canonicalize one side of a reaction and return its tokens.

    The canonical SMILES is split on '.', each piece is passed to
    Smipar.parser_list (presumably a SMILES tokenizer -- confirm against the
    parser module), and the pieces are re-joined with a '.' token.
    """
    tokens = []
    for molecule in cano(smiles).split('.'):
        tokens += Smipar.parser_list(molecule)
        tokens += '.'
    tokens.pop()  # drop the trailing '.' separator
    return tokens


def _count_tokens(tokens, vocab):
    """Add one to vocab[token] for every token in tokens (in place)."""
    for token in tokens:
        vocab[token] = vocab.get(token, 0) + 1


for i, record in enumerate(rsmi_data):

    rsmi = record[0]

    try:
        # Reaction SMILES layout: reactants>agents>products
        split_rsmi = rsmi.split('>')
        reactant_list = _tokenize_side(split_rsmi[0])
        agent_list = _tokenize_side(split_rsmi[1])
        product_list = _tokenize_side(split_rsmi[2])

        # The input-side sequence is reactants, a '>' token, then agents.
        reactant_list += '>'
        reactant_list += agent_list

        _count_tokens(reactant_list, vocab_reactants)
        _count_tokens(product_list, vocab_products)
    except Exception:
        # Best effort: remember the failing reaction and carry on.
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so this long-running loop can be interrupted.
        error_rsmi[i] = rsmi

    bar.update(i)

bar.finish()

100% (1094235 of 1094235) |################| Elapsed Time: 3:59:09 Time: 3:59:09


In [11]:
# Number of reaction SMILES recorded in error_rsmi, i.e. entries whose processing raised.
len(error_rsmi)

360

In [6]:
# Vocabulary lists: the reserved symbols first, then every token ordered by
# descending count. sorted() is stable, so tokens with equal counts keep the
# dict's iteration order.
reactants_token_list = _START_VOCAB \
        + sorted(vocab_reactants, key=vocab_reactants.get, reverse=True)

products_token_list = _START_VOCAB \
        + sorted(vocab_products, key=vocab_products.get, reverse=True)

vocab_dict_path = 'data/vocab/vocab_dict.pkl.gz'
vocab_list_path = 'data/vocab/vocab_list.pkl.gz'

# gzip.open(..., 'wb') does not create missing directories; make sure the
# target directory exists so the counts computed above are not lost to an
# IOError at save time.
for output_path in (vocab_dict_path, vocab_list_path):
    output_directory = os.path.dirname(output_path)
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

# Both files are written with pickle protocol 2.
with gzip.open(vocab_dict_path, 'wb') as dict_file:
    cPickle.dump((vocab_reactants, vocab_products), dict_file, 2)

with gzip.open(vocab_list_path, 'wb') as list_file:
    cPickle.dump((reactants_token_list, products_token_list), list_file, 2)

In [7]:
# Data reloader: restores the vocabulary objects from the saved files.

def _load_gzip_pickle(path):
    """Return the single object pickled in the gzip-compressed file at path."""
    with gzip.open(path, 'rb') as handle:
        return cPickle.load(handle)


vocab_reactants, vocab_products = _load_gzip_pickle('data/vocab/vocab_dict.pkl.gz')

reactants_token_list, products_token_list = _load_gzip_pickle('data/vocab/vocab_list.pkl.gz')

In [8]:
def _show_token_list(token_list):
    """Print the length, the first 100 entries and the last 15 entries of token_list."""
    print(len(token_list))
    print(token_list[:100])
    print(token_list[-15:])


_show_token_list(reactants_token_list)

print('--------')

_show_token_list(products_token_list)

311
['_PAD', '_GO', '_EOS', u'C', u'c', u'(', u')', u'O', u'1', '.', u'=', u'2', u'N', u'n', u'Cl', '>', u'F', u'3', u'[O-]', u'S', u'-', u'Br', u'[Na+]', u'#', u'[K+]', u'4', u'[N+]', u'[nH]', u's', u'[OH-]', u'[H-]', u'I', u'o', u'P', u'B', u'[Cl-]', u'[Si]', u'[Li+]', u'[Cs+]', u'[NH4+]', u'[N-]', u'5', u'[H]', u'[Li]', u'[Br-]', u'[BH4-]', u'[Cu]', u'[Al+3]', u'[I-]', u'[P-]', u'[Mg+]', u'[Pd]', u'[Pd+2]', u'[BH-]', u'[F-]', u'[Sn]', u'[BH3-]', u'[Na]', u'[Fe]', u'[SiH]', u'[P+]', u'[n+]', u'[Zn]', u'[B-]', u'[C-]', u'[nH+]', u'[NH3+]', u'[K]', u'[Cu+2]', u'[Mg]', u'6', u'[Al+]', u'[S-]', u'[Mn]', u'[Al]', u'[Mg+2]', u'[Zn+2]', u'[Pt]', u'[Ti+4]', u'[Cr]', u'[AlH]', u'[I+3]', u'[Ca+2]', u'[Cl+]', u'[SiH2]', u'[NH2+]', u'[Zn+]', u'[Ni]', u'[S+]', u'[Ru]', u'[Cl+3]', u'[PH]', u'[SH]', u'[NH-]', u'[O+]', u'[Se]', u'[Os]', u'[NH+]', u'[Ag+]', u'[SnH]']
[u'[CH3+]', u'[Gd+3]', u'[Ru-2]', u'[IH2+2]', u'[Dy+3]', u'[CuH]', u'[Tl]', u'[Ti+]', u'[Ag-2]', u'[SH2+]', u'[Gd]', u'[PH4]', u'[Sb+3]

In [9]:
def _show_counts(token_list, vocab):
    """Print each token at positions 3..19 of token_list with its count from vocab."""
    for token in token_list[3:20]:
        print(token, vocab.get(token))


_show_counts(reactants_token_list, vocab_reactants)

print('--------')

_show_counts(products_token_list, vocab_products)

C 14348134
c 13693318
( 6842501
) 6842501
O 5526281
1 4629064
. 3209210
= 2845895
2 2131286
N 2031518
n 1347535
Cl 1228733
> 1093874
F 954930
3 676466
[O-] 463321
S 324056
--------
c 11601344
C 7763270
( 4688274
) 4688274
1 2869962
O 2691996
2 2082412
= 1528896
N 1383581
n 1236040
3 935880
F 660705
- 348700
Cl 326498
4 295640
S 182413
[nH] 121866


In [10]:
# Total number of counted tokens on each side (sum of all per-token counts).
reactant_token_total = sum(vocab_reactants.values())
product_token_total = sum(vocab_products.values())

print(reactant_token_total)
print(product_token_total)

70907758
44141777
