In [None]:
# ====================================================
# Library
# ====================================================
import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import torch


import re
# Token pattern; alternatives are tried left to right at each position:
#   \d+              a run of digits (one number)
#   [A-Z][a-z]?      an upper-case letter plus an optional lower-case one (C, Cl, Br)
#   [^A-Za-z\d/\s]   any other single symbol such as - , ( ) +
#   /[a-z]           a slash followed by one lower-case letter (/h, /i, ...)
# The literal is a raw string because '\d' inside a normal string literal is an
# invalid escape sequence. Whitespace is never emitted as a token, so applying
# l_split to text that is already split leaves it unchanged.
PATTEN = re.compile(r'\d+|[A-Z][a-z]?|[^A-Za-z\d/\s]|/[a-z]')


def l_split(s):
    """Split a string into tokens and join them with single spaces.

    Args:
        s: text to tokenise.

    Returns:
        The tokens matched by ``PATTEN``, in order, separated by one space.
        Characters matched by no alternative (whitespace, a lower-case letter
        that follows neither an upper-case letter nor '/') are dropped.
    """
    return ' '.join(PATTEN.findall(s))
# ====================================================
# Tokenizer
# ====================================================
class Tokenizer(object):
    """Fixed-vocabulary tokenizer for space-separated token strings.

    Every encoded sequence has the shape
    ``[start id of the column, token ids..., '<eos>' id]``.
    """

    def __init__(self):
        # token string -> integer id; empty until fit_on_texts() is called
        self.stoi = {}
        # integer id -> token string; inverse of stoi
        self.itos = {}
        # column name -> token that opens a sequence of that column
        # (column order used elsewhere: c, h, b, t, m, s, i)
        self.col2sos = {
            'formula': '<sos>',
            'c': '/c',
            'h': '/h',
            'b': '/b',
            't': '/t',
            'm': '/m',
            's': '/s',
            'i': '/i'}

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.stoi)

    def fit_on_texts(self):
        """Fill ``stoi`` / ``itos`` from the built-in vocabulary.

        No corpus is scanned. The vocabulary is the integers 0..179 as
        strings followed by the symbols listed below. Ids are list positions,
        so reordering the list changes every id after the moved entry.
        """
        vocab = [str(number) for number in range(180)]
        vocab.extend([
            # punctuation
            '(', ')', '+', ',', '-',
            # per-column start tokens (see col2sos)
            '/b', '/c', '/h', '/i', '/m', '/s', '/t',
            # upper-case symbols produced by the splitter
            'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'P', 'S', 'Si', 'T', 'D',
            # special tokens; 'X' is presumably the placeholder for an empty
            # column value -- TODO confirm against the data preparation
            '<sos>', '<eos>', '<pad>', 'X',
        ])
        self.stoi.update({token: index for index, token in enumerate(vocab)})
        self.itos = {index: token for token, index in self.stoi.items()}

    def text_to_sequence(self, text, col):
        """Encode one space-separated token string.

        Args:
            text: tokens separated by single spaces.
            col: column name; selects the start token via ``col2sos``.

        Returns:
            List of ids: start token, one id per token, then '<eos>'.

        Raises:
            KeyError: unknown ``col`` or a token missing from the vocabulary
                (this includes the empty token produced by an empty ``text``).
        """
        sequence = [self.stoi[self.col2sos[col]]]
        sequence.extend(self.stoi[token] for token in text.split(' '))
        sequence.append(self.stoi['<eos>'])
        return sequence

    def texts_to_sequences(self, texts, col='formula'):
        """Encode several texts that all belong to column ``col``.

        ``col`` defaults to 'formula' (start token '<sos>'). It is forwarded
        to ``text_to_sequence``, which requires it; without it this method
        raised TypeError on every call.
        """
        return [self.text_to_sequence(text, col) for text in texts]

    def sequence_to_text(self, sequence):
        """Decode every id in ``sequence`` and concatenate without separator."""
        return ''.join(self.itos[index] for index in sequence)

    def sequences_to_texts(self, sequences):
        """Apply ``sequence_to_text`` to each sequence; returns a list."""
        return [self.sequence_to_text(sequence) for sequence in sequences]

    def predict_caption(self, sequence):
        """Decode ids up to, and excluding, the first '<eos>' or '<pad>'."""
        pieces = []
        for index in sequence:
            if index == self.stoi['<eos>'] or index == self.stoi['<pad>']:
                break
            pieces.append(self.itos[index])
        return ''.join(pieces)

    def predict_captions(self, sequences):
        """Apply ``predict_caption`` to each sequence; returns a list."""
        return [self.predict_caption(sequence) for sequence in sequences]
    
# Order in which assemble() appends the layers: c -> h -> b -> t -> m -> s -> i

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Read the training labels (path is relative to the notebook's working
# directory) and report the frame's shape. Later cells read the columns
# 'formula', 'c', 'h', 'b', 't', 'm', 's', 'i', 'i_flg' and 'InChI' from it.
train = pd.read_csv('../input/arranged-bms-train-labels/arranged_bms_train_labels.csv')
print(f'train.shape: {train.shape}')

In [None]:
# train = train[:1000]

In [None]:
# train.head()

In [None]:
# Columns that are split into space-separated tokens below.
# 'm' and 's' are not part of this tuple.
seq_columns = (
    'formula',
    'c',
    'h',
    'b',
    't',
    'i',
)

In [None]:
# Tokenise each sequence column with l_split (progress_apply shows a tqdm bar).
# The columns are overwritten in place, so run this cell once per freshly
# loaded `train`.
for c in seq_columns:
    train[c] = (train[c].progress_apply(l_split))

In [None]:
# c = 'i'
# train[c] = (train[c].progress_apply(split_form2))

In [None]:
# Display the 'i' values of the rows whose i_flg equals 1.
train.i[train.i_flg == 1]

In [None]:
# Preview the first rows of the columns listed in seq_columns.
train.loc[:, seq_columns].head()

In [None]:
# Build the tokenizer from its built-in vocabulary (fit_on_texts takes no
# corpus), print the token -> id table and save the object.
# NOTE(review): torch.save stores the whole Tokenizer instance, so loading
# 'tokenizer2.pth' presumably needs the same class definition available --
# confirm in the consuming notebook.
tokenizer = Tokenizer()
tokenizer.fit_on_texts()
print(f"tokenizer.stoi: {tokenizer.stoi}")
torch.save(tokenizer, 'tokenizer2.pth')
print('Saved tokenizer')

In [None]:
# Persist the frame as it is at this point of the notebook. This happens
# before the cell that replaces 'X' cells with '', so any 'X' values are still
# present in the pickle.
train.to_pickle('train2.pkl')
# The message names the file that was actually written.
print('Saved preprocessed train2.pkl')

In [None]:
# Preview the first rows of the frame after tokenisation.
train.head()

In [None]:
# Encode every value of the selected columns. The encoded result is discarded,
# so apart from the progress output the loop only matters when it fails:
# text_to_sequence raises KeyError for a token that is not in the vocabulary.
# NOTE(review): seq_columns[3:] skips the first three entries of seq_columns --
# confirm that leaving those columns unchecked is intended.
for c in seq_columns[3:]:
    print(c)
    for text in tqdm(train[c].values):
        seq = tokenizer.text_to_sequence(text, c)

In [None]:
# Replace cells whose value is exactly 'X' with '' so that assemble() below
# sees an empty string for them.
# NOTE(review): 'X' looks like the placeholder for an absent layer -- confirm.
train=train.replace('X', '')

In [None]:
def assemble(formula, c, h, b, t, m, s, i):
    """Rebuild a full InChI string from its separate layer strings.

    The result always starts with ``InChI=1S/{formula}/c{c}`` (the '/c' part
    is emitted even when ``c`` is ''). Each remaining layer is appended, in
    the order h, b, t, m, s, i, only when it is not the empty string. All
    spaces are stripped from the final string.

    ``m`` and ``s`` are appended after a bare '/', unlike the other layers
    which get '/h', '/b', '/t', '/i' -- presumably those two values already
    carry their own layer letter; verify against the data.
    """
    pieces = [f"InChI=1S/{formula}/c{c}"]
    optional_layers = (
        ('/h', h),
        ('/b', b),
        ('/t', t),
        ('/', m),
        ('/', s),
        ('/i', i),
    )
    for prefix, value in optional_layers:
        if value != '':
            pieces.append(f'{prefix}{value}')
    return ''.join(pieces).replace(' ', '')

In [None]:
# Every layer column, in the positional order assemble() expects.
all_columns = ['formula', "c", "h", "b", "t", "m", "s", "i"]
# Walk the columns row by row and rebuild one InChI string per row.
inchi_preds = np.array([
    assemble(*row_layers)
    for row_layers in zip(*(train[name] for name in all_columns))
])

In [None]:
# Display the distinct values found in the 'm' and 's' columns.
train.m.unique(), train.s.unique()

In [None]:
# Display, for each column in seq_columns, the entry at key 0 (the first row
# when the frame has the default integer index -- verify if it was re-indexed).
[train[c1][0] for c1 in seq_columns]

In [None]:
# Compare each reassembled string with the 'InChI' column, element-wise;
# .all() is True only when every row matches.
eq = ((inchi_preds) == train.InChI)
eq.all()

In [None]:
# Invert the match mask and count the rows that did not round-trip.
neq = np.logical_not(eq)
neq.sum()

In [None]:
# i=0
# inchi_preds[neq][0]

In [None]:
# Print the reassembled string and the reference InChI for key 0, one above
# the other, for a visual comparison.
print(inchi_preds[0])
print(train.InChI[0])

In [None]:
# vocabs = ['formula_vocab'] + [f'{c}_vocab' for c in seq_columns]

In [None]:
def length(text):
    """Return how many pieces ``text`` has when split on single spaces.

    An empty string counts as one (empty) piece, and each extra consecutive
    space adds one more piece.
    """
    # Splitting on ' ' always yields one more piece than there are spaces.
    return text.count(' ') + 1

In [None]:
# Longest token count per sequence column, measured with length(). The start
# and '<eos>' ids that Tokenizer.text_to_sequence adds are not part of this
# count.
max_length = {}
for v in seq_columns:
    # per-row token counts for this column
    l = train[v].apply(length)
    max_length[v] = l.max()
    print(v, max_length[v])

In [None]:
# Save the {column: longest token count} dict next to the tokenizer file.
torch.save(max_length, 'max_length.pth')
print('Saved max_length')