In [None]:
# %matplotlib inline

import pandas as pd
import re
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import cv2

from utils import get_train_file_path

tqdm.pandas()

In [None]:
# Read the label table and add an 'image_path' column obtained by passing
# every 'image_id' through get_train_file_path.
train_csv = pd.read_csv(
    '/workdir/data/bms-molecular-translation/train_labels.csv'
).assign(
    image_path=lambda frame: frame['image_id'].apply(get_train_file_path)
)
print(f'train_csv.shape: {train_csv.shape}')

In [None]:
# Preview the first 15 training images on a 5x3 grid.
plt.figure(figsize=(40, 40))
for i in range(15):
    image_path = train_csv.loc[i, 'image_path']
    image = cv2.imread(image_path)
    # cv2.imread does not raise on an unreadable file, it returns None;
    # fail here with the offending path instead of deep inside imshow.
    if image is None:
        raise FileNotFoundError(f'cannot read image: {image_path}')
    plt.subplot(5, 3, i+1)
    # OpenCV decodes colour images in BGR channel order, whereas imshow
    # interprets a 3-channel array as RGB, so convert before displaying.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
def split_form(form):
    """Split a formula string into space-separated symbols and counts.

    The input is cut into chunks that each start at an uppercase letter.
    Within a chunk, the leading run of non-digit characters is the symbol
    and whatever remains once the symbol text is removed is the count.
    A chunk without a count contributes only its symbol.

    Returns the tokens joined by single spaces, with trailing spaces removed
    (e.g. ``"C13H20OS"`` -> ``"C 13 H 20 O S"``).
    """
    pieces = []
    for chunk in re.findall(r"[A-Z][^A-Z]*", form):
        symbol = re.match(r"\D+", chunk).group()
        count = chunk.replace(symbol, "")
        pieces.append(symbol if count == "" else f"{symbol} {count}")
    return " ".join(pieces).rstrip(' ')


def split_form2(form):
    """Tokenize the InChI layers that follow the chemical formula.

    ``form`` is scanned for chunks that start with a lowercase letter and
    extend up to (not including) the next lowercase letter. Each chunk is
    emitted as ``/<letter>`` followed by its content split into tokens:
    every run of ASCII digits is one token and every other character is a
    token of its own. ``/`` characters inside a chunk are discarded, because
    the separator is re-emitted as part of the ``/<letter>`` token.

    Characters that sit between the layer letter and the first digit (the
    ``+`` in ``q+1``, the ``(H,`` in ``h(H,1,2)``) are kept as tokens, so no
    character of a layer other than ``/`` is lost.

    Args:
        form: Layer part of an InChI string, e.g. ``"c1-2/h1H3"``. Any text
            before the first lowercase letter is ignored.

    Returns:
        The tokens separated by single spaces, with trailing spaces removed,
        e.g. ``"/c 1 - 2 /h 1 H 3"``. An input without lowercase letters
        yields ``""``.
    """
    string = ''
    for layer in re.findall(r"[a-z][^a-z]*", form):
        # The pattern guarantees exactly one lowercase letter per chunk: its
        # first character, which names the layer.
        prefix = layer[0]
        content = layer[1:].replace('/', "")
        # Digit runs stay together; every other character stands alone.
        pieces = re.findall(r"[0-9]+|[^0-9]", content)
        num_string = ''.join(f"{piece} " for piece in pieces)
        string += f"/{prefix} {num_string}"
    return string.rstrip(' ')

def split_InChI_to_tokens(raw_text):
    """Turn an InChI string into a space-separated token string.

    Everything up to and including the first ``/`` (the constant
    ``InChI=1S/`` prefix) is dropped. The remainder is walked character by
    character and a space is inserted in front of a character whenever it
    starts a new token. Leading spaces are stripped from the result.
    """

    def starts_new_token(before, after):
        """Return True when a space belongs between ``before`` and ``after``."""
        if after.isdigit():
            # a digit opens a token unless it continues a number
            return not before.isdigit()
        if after.isalpha():
            # letters split after a number and at every uppercase letter;
            # a lowercase letter stays attached to what precedes it
            # (e.g. "Br").
            return before.isdigit() or after.isupper()
        # any other symbol is always a token of its own
        return True

    # keep only what follows the first "/" (empty when there is none)
    body = raw_text.partition('/')[2]

    out = []
    last = ''
    for ch in body:
        if starts_new_token(last, ch):
            out.append(' ')
        out.append(ch)
        last = ch
    return ''.join(out).lstrip(' ')

In [None]:
# 'InChI_1' is the second '/'-separated field of the InChI string.
train_csv['InChI_1'] = train_csv['InChI'].progress_apply(lambda x: x.split('/')[1])
# 'InChI_text' = split_form of that field, a single space, then split_form2
# of everything from the third field onwards.
train_csv['InChI_text'] = (
    train_csv['InChI_1'].progress_apply(split_form)
    + ' '
    + train_csv['InChI']
    .progress_apply(lambda x: '/'.join(x.split('/')[2:]))
    .progress_apply(split_form2)
    .values
)

In [None]:
# Tokenize every full InChI string with split_InChI_to_tokens (with a tqdm
# progress bar) and store the result in a new 'InChI_tokens' column.
train_csv['InChI_tokens'] = train_csv['InChI'].progress_apply(split_InChI_to_tokens)

In [None]:
# Check whether the two tokenizations ('InChI_text' vs 'InChI_tokens')
# agree on every row; the cell displays a single boolean.
train_csv['InChI_text'].equals(train_csv['InChI_tokens'])

In [None]:
# Per-row flag: True where both tokenizations produced the same string.
train_csv['compare'] = train_csv["InChI_text"].eq(train_csv["InChI_tokens"]).to_numpy()


In [None]:
# Show only the rows on which the two tokenizations disagree.
train_csv.loc[~train_csv['compare']]

In [None]:
# Print, one per line, the raw InChI of row 2414260 followed by its two
# tokenizations.
print(*train_csv.loc[2414260, ["InChI", "InChI_text", "InChI_tokens"]], sep="\n")