# This notebook counts the number of atoms of each element in every molecule and saves the counts to a CSV file.
## Feel free to use it, and please leave a comment if you have any questions.
"""
### This notebook only counts the quantity of each atom, so identical atoms are combined into a single count.
### For example, for 'InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3',
### the target is 'C13H20OS', which gives C:13, H:20, O:1, S:1.
"""

# Load library

In [None]:
import os
import re
import cv2
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from tqdm.auto import tqdm
tqdm.pandas()
from PIL import Image
import seaborn as sns
import Levenshtein
from albumentations import Compose, Normalize, Resize, RandomResizedCrop,CenterCrop,HorizontalFlip,VerticalFlip,Rotate,RandomContrast,IAAAdditiveGaussianNoise
from albumentations.pytorch import ToTensorV2

# Load data and define helper functions

In [None]:
def load():
    """Read the competition CSV files and attach an image path to every row.

    Returns:
        A ``(train, test)`` tuple of DataFrames. ``train`` comes from
        ``train_labels.csv`` and ``test`` from ``sample_submission.csv``; each
        gains a ``file_path`` column built from its ``image_id`` column.
    """
    train_template = "../input/bms-molecular-translation/train/{}/{}/{}/{}.png"
    test_template = "../input/bms-molecular-translation/test/{}/{}/{}/{}.png"

    def path_builder(template):
        # Images are nested three directories deep, one level per leading
        # character of the image id.
        def build(image_id):
            first, second, third = image_id[0], image_id[1], image_id[2]
            return template.format(first, second, third, image_id)
        return build

    train = pd.read_csv('../input/bms-molecular-translation/train_labels.csv')
    test = pd.read_csv('../input/bms-molecular-translation/sample_submission.csv')

    train_paths = train['image_id'].progress_apply(path_builder(train_template))
    train['file_path'] = train_paths
    test_paths = test['image_id'].progress_apply(path_builder(test_template))
    test['file_path'] = test_paths
    return train, test

def processing(df):
    """Split every InChI string on '/' and append the pieces as columns.

    The input frame is modified in place: it receives an ``InChI_list`` column
    holding the list of '/'-separated pieces.

    Args:
        df: DataFrame with an ``InChI`` column of strings.

    Returns:
        A new DataFrame made of ``df`` plus one column per piece, labelled
        ``InChI_0``, ``InChI_1``, ... in order of position.
    """
    pieces = df['InChI'].progress_apply(lambda inchi: inchi.split('/'))
    df['InChI_list'] = pieces

    # Expanding each list into a Series yields one integer-labelled column per
    # position; the prefix turns those labels into InChI_<position>.
    expanded = df['InChI_list'].progress_apply(pd.Series)
    expanded = expanded.add_prefix('InChI_')

    return pd.concat([df, expanded], axis=1)

def score(y_true, y_pred):
    """Return the mean Levenshtein distance between paired strings.

    Args:
        y_true: Iterable of reference strings.
        y_pred: Iterable of predicted strings, paired with ``y_true`` by
            position (pairing stops at the shorter of the two).

    Returns:
        The arithmetic mean of the per-pair edit distances.
    """
    distances = []
    for true, pred in zip(y_true, y_pred):
        distances.append(Levenshtein.distance(true, pred))
    return np.mean(distances)

In [None]:
def make_name():
    """Map every element symbol to its marked-up replacement text.

    Returns:
        A dict whose keys are the 118 element symbols in reverse
        lexicographic order and whose values are the symbol wrapped as
        ``" <symbol>/"``.
    """
    # Symbols listed one periodic-table period per group of lines.
    symbols = np.array([
        'H', 'He',
        'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
        'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
        'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
        'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
        'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
        'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
        'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy',
        'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt',
        'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn',
        'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf',
        'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds',
        'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og'], dtype='<U2')

    # Descending order puts a two-letter symbol ahead of the one-letter symbol
    # it starts with (e.g. 'Cl' before 'C'), so a regex alternation built from
    # the keys in this order tries the longer symbol first.
    sort_name = np.sort(symbols)[::-1]

    return {symbol: " " + symbol + "/" for symbol in sort_name}

def translate(text, kw, ignore_case=False):
    """Replace every key of ``kw`` found in ``text`` and split the result.

    Args:
        text: String to scan.
        kw: Mapping from search string to replacement string. Keys are tried
            in the mapping's iteration order, so a longer key must come before
            any shorter key it starts with.
        ignore_case: When true, keys match regardless of letter case.

    Returns:
        The substituted text split on single spaces, without the first piece
        (whatever precedes the first space).
    """
    pattern = '|'.join(re.escape(key) for key in kw.keys())

    if not ignore_case:
        substituted = re.sub(pattern, lambda m: kw[m.group()], text)
    else:
        # Look replacements up by lower-cased key so any casing of a match
        # resolves to the same entry.
        lowered = {key.lower(): value for key, value in kw.items()}
        substituted = re.sub(
            pattern,
            lambda m: lowered[m.group().lower()],
            text,
            flags=re.IGNORECASE,
        )

    pieces = substituted.split(" ")
    return pieces[1:]


def _make_df(res, name):
    """Build a table of atom counts with one row per entry of ``res``.

    Counts are collected in a NumPy array and wrapped in a DataFrame once at
    the end, which avoids a slow scalar ``df.loc`` assignment per count.

    Args:
        res: Sequence of token lists. Each token has the form
            ``"<symbol>/<count>"``; an empty count stands for a single atom.
        name: Iterable of column labels (iterating a dict yields its keys).
            Every symbol appearing in ``res`` must be one of these labels.

    Returns:
        An integer DataFrame indexed ``0..len(res)-1`` with one column per
        label in ``name``; cells not mentioned by any token are 0. If a symbol
        occurs more than once in a row, the last occurrence wins.

    Raises:
        KeyError: If a token's symbol is not among the labels in ``name``.
    """
    columns = list(name)
    column_of = {symbol: position for position, symbol in enumerate(columns)}
    counts = np.zeros((len(res), len(columns)), dtype=np.int64)

    for row, tokens in enumerate(tqdm(res)):
        for token in tokens:
            parts = token.split("/")
            # A symbol with no trailing digits denotes exactly one atom.
            quantity = int(parts[1]) if parts[1] else 1
            counts[row, column_of[parts[0]]] = quantity

    return pd.DataFrame(counts, columns=columns, index=range(len(res)))

In [None]:
# Load the train and test CSV files; load() also adds a file_path column to each.
train, test = load()

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

# Processing

In [None]:
# Split each training InChI string on '/' into InChI_0, InChI_1, ... columns.
# Note that processing() also adds an InChI_list column to `train` itself.
trains = processing(train)

# Translate the train.csv formulas into atom counts

In [None]:
def make_df(InChI_1):
    """Turn formula strings into a table of per-atom counts.

    Args:
        InChI_1: Iterable of formula strings such as ``'C13H20OS'``.

    Returns:
        The DataFrame produced by ``_make_df``: one row per formula and one
        column per atom symbol from ``make_name``.
    """
    atom_markers = make_name()

    tokenized = []
    for formula in tqdm(InChI_1):
        tokenized.append(translate(formula, atom_markers))

    return _make_df(tokenized, atom_markers)

In [None]:
# Build the atom-count table from the InChI_1 column (the formula piece,
# e.g. 'C13H20OS') and preview its first rows.
df = make_df(trains.InChI_1)
df.head()

In [None]:
# Save the atom-count table as atom_df.csv (relative to the working directory).
df.to_csv("atom_df.csv")

# Thank you for reading.