Import saved reaction data

In [1]:
from __future__ import print_function

import os
import re
import cPickle, gzip
import progressbar
from rdkit.Chem import AllChem

In [2]:
# Location of the gzip-compressed file holding the pickled (line, ref) records.
data_directory = 'data'
processed_data_filename = 'processed_data.pkl.gz'
processed_data_filepath = os.path.join(data_directory, processed_data_filename)
# Hard-coded total used as the progress bar maximum
# (presumably the number of records in the file -- TODO confirm).
data_length = 1094235

bar = progressbar.ProgressBar(max_value=data_length)

rsmi_data = []

# NOTE(security): unpickling can execute arbitrary code stored in the file;
# only load archives produced by a trusted preprocessing run.
with gzip.open(processed_data_filepath, 'rb') as data_file:
    i = 1
    while True:
        try:
            line, ref = cPickle.load(data_file)
        except EOFError:
            # cPickle.load raises EOFError once no further record can be read.
            break
        rsmi_data.append((line, ref))
        # Clamp the value: if the file holds more records than data_length,
        # an out-of-range update would abort the load with a ValueError.
        bar.update(min(i, data_length))
        i += 1

# Force the final redraw; updates are throttled, so without this the bar
# is left at its last drawn state (below 100%).
bar.finish()

 99% (1088828 of 1094235) |################ | Elapsed Time: 0:00:13 ETA: 0:00:00

In [15]:
import parser.Smipar as Smipar

# Special tokens placed at the start of every vocabulary list.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_START_VOCAB = [_PAD, _GO, _EOS]

# Indices of the special tokens within _START_VOCAB.
PAD_ID = 0
GO_ID = 1
EOS_ID = 2


def _tokenize_side(side):
    """Tokenize one '.'-separated part of a reaction SMILES string.

    Every molecule in `side` is tokenized with Smipar.parser_list and the
    per-molecule token runs are joined with a single '.' token between them.

    Returns the flat list of tokens.
    """
    tokens = []
    for molecule in side.split('.'):
        tokens.extend(Smipar.parser_list(molecule))
        tokens.append('.')
    # str.split always yields at least one item, so there is always exactly
    # one trailing '.' to drop here.
    tokens.pop()
    return tokens


# token -> number of occurrences over the whole data set
vocab_reactants = {}
vocab_products = {}

# Size the bar from the data actually iterated so it cannot overrun.
bar = progressbar.ProgressBar(max_value=len(rsmi_data))

for i, record in enumerate(rsmi_data):

    # Each record is a (line, ref) pair; only the reaction string is used.
    rsmi = record[0]

    # The string is split as reactants>agents>products.
    split_rsmi = rsmi.split('>')
    reactant_list = _tokenize_side(split_rsmi[0])
    agent_list = _tokenize_side(split_rsmi[1])
    product_list = _tokenize_side(split_rsmi[2])

    # Reactant-side sequence: reactant tokens, a '>' token, then agent tokens.
    reactant_list.append('>')
    reactant_list.extend(agent_list)

    for reactant_token in reactant_list:
        vocab_reactants[reactant_token] = vocab_reactants.get(reactant_token, 0) + 1

    for product_token in product_list:
        vocab_products[product_token] = vocab_products.get(product_token, 0) + 1

    # Report the number of records processed so far.
    bar.update(i + 1)

# Force the final redraw so the bar does not stay below 100%.
bar.finish()

 99% (1094226 of 1094235) |################ | Elapsed Time: 3:45:15 ETA: 0:00:00

In [18]:
# Token lists: the special tokens first, then every counted token ordered
# from most to least frequent.
reactants_token_list = _START_VOCAB \
        + sorted(vocab_reactants, key=vocab_reactants.get, reverse=True)

products_token_list = _START_VOCAB \
        + sorted(vocab_products, key=vocab_products.get, reverse=True)

# gzip.open() does not create missing directories; make sure the target
# exists so the counts computed above are not lost to an IOError.
vocab_directory = os.path.join('data', 'vocab')
if not os.path.isdir(vocab_directory):
    os.makedirs(vocab_directory)

# Raw counts (token -> occurrences) for both sides; pickle protocol 2.
with gzip.open(os.path.join(vocab_directory, 'vocab_dict.pkl.gz'), 'wb') as dict_file:
    cPickle.dump((vocab_reactants, vocab_products), dict_file, 2)

# Frequency-ordered token lists for both sides; pickle protocol 2.
with gzip.open(os.path.join(vocab_directory, 'vocab_list.pkl.gz'), 'wb') as list_file:
    cPickle.dump((reactants_token_list, products_token_list), list_file, 2)

In [3]:
# data reloader
#
# Reads the token counts and the ordered token lists back from the two
# gzip-compressed pickle files under data/vocab.
# NOTE: unpickling can execute code stored in the file; only load trusted files.

with gzip.open('data/vocab/vocab_dict.pkl.gz', 'rb') as dict_file:
    vocab_counts = cPickle.load(dict_file)
vocab_reactants, vocab_products = vocab_counts

with gzip.open('data/vocab/vocab_list.pkl.gz', 'rb') as list_file:
    token_lists = cPickle.load(list_file)
reactants_token_list, products_token_list = token_lists

In [5]:
# For the reactant list and then the product list, print its length
# followed by its first 100 entries.
for token_list in (reactants_token_list, products_token_list):
    print(len(token_list))
    print(token_list[:100])

326
['_PAD', '_GO', '_EOS', u'C', u'c', u'(', u')', u'O', u'1', '.', u'=', u'N', u'2', u'n', u'Cl', '>', u'F', u'3', u'[O-]', u'S', u'Br', u'[Na+]', u'-', u'#', u'[K+]', u'4', u'[N+]', u'[C@H]', u'[C@@H]', u'[nH]', u's', u'[OH-]', u'[H-]', u'I', u'o', u'/', u'P', u'[Cl-]', u'B', u'[Si]', u'[Li+]', u'[Cs+]', u'[NH4+]', u'[N-]', u'5', u'[H]', u'[Li]', u'[Br-]', u'[BH4-]', u'[Cu]', u'[Al+3]', u'[I-]', u'[P-]', u'[Mg+]', u'[Pd]', u'[Pd+2]', u'[BH-]', u'\\', u'[C@]', u'[F-]', u'[Sn]', u'[C@@]', u'[BH3-]', u'[Na]', u'[Fe]', u'[SiH]', u'[P+]', u'[n+]', u'[Zn]', u'[B-]', u'[C-]', u'[nH+]', u'[NH3+]', u'[K]', u'[Cu+2]', u'[Mg]', u'6', u'[Al+]', u'[S-]', u'[Mn]', u'[Al]', u'[Mg+2]', u'[Zn+2]', u'[Pt]', u'[Ti+4]', u'[Cr]', u'[AlH]', u'[Ca+2]', u'[SiH2]', u'[NH2+]', u'[Zn+]', u'b', u'[Ni]', u'[S+]', u'[Ru]', u'[NH-]', u'[SH]', u'[PH]', u'[O+]', u'[Se]']
197
['_PAD', '_GO', '_EOS', u'c', u'C', u'(', u')', u'1', u'O', u'2', u'=', u'N', u'n', u'3', u'F', u'Cl', u'-', u'4', u'S', u'[C@H]', u'[C@@H]', 

In [8]:
# Print "token count" for list positions 3..19 of each token list,
# reactants first, with a dashed line separating the two sections.
sections = [
    (reactants_token_list, vocab_reactants),
    (products_token_list, vocab_products),
]

for section_index, (token_list, counts) in enumerate(sections):
    if section_index > 0:
        print('--------')
    for token in token_list[3:20]:
        print(token, counts.get(token))

C 14349258
c 13448060
( 6852744
) 6852744
O 5543060
1 4637546
. 3210541
= 2950216
N 2150475
2 2133048
n 1262323
Cl 1230744
> 1094234
F 955206
3 673958
[O-] 456584
S 325640
--------
c 11326705
C 7796306
( 4698161
) 4698161
1 2880258
O 2704185
2 2081720
= 1636582
N 1521848
n 1131947
3 935410
F 660949
Cl 326621
- 311248
4 290488
S 185083
[C@H] 126285


In [9]:
# Total number of counted token occurrences on each side.
# dict.values() is used instead of the Python-2-only itervalues(); the sums
# are identical and the cell no longer depends on the interpreter version.
print(sum(vocab_reactants.values()))
print(sum(vocab_products.values()))

71097704
44304102
