# About

An `InChI` string can be split by `/` into several parts (the maximum number of parts in the training data is 11). The first part is the format identifier, which is always the same string (`InChI=1S`) in this competition, and the second part is the **chemical formula**, which gives **the number of atoms** of each element in the molecule.

Let me take `InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3`(`image_id`: `000011a64c74`) as an example.
The second part is `C13H20OS`, which means that the molecule has 13 `Carbon` atoms, 20 `Hydrogen` atoms, one `Oxygen` atom and one `Sulfur` atom.

All the parts of an `InChI` have **variable length** except for the second one (chemical formula), because the kinds of atoms in the training data are **limited** to 12 (`B`, `Br`, `C`, `Cl`, `F`, `H`, `I`, `N`, `O`, `P`, `S`, and `Si`). Therefore, we can represent a chemical formula by a **fixed-length** vector of atom counts and treat chemical formula prediction as a **multi-output regression task**.

I'm sharing the **training process** for this task in another notebook (Note: I used **only 4%** of the data for training):  
https://www.kaggle.com/ttahara/bms-mt-chemical-formula-regression-training
  
@wineplanetary 's notebook ( [Step by Step 2: LS dist < 1 chemical formula](https://www.kaggle.com/wineplanetary/step-by-step-2-ls-dist-1-chemical-formula) ) has already shown us that chemical formula prediction is relatively easy. Thanks!  

In this notebook, to do something different, I try solving the competition task by utilizing the predicted chemical formula.

<br>

NOTE: I skipped calculating the OOF score because of a memory error. Please see the [1st version](https://www.kaggle.com/ttahara/bms-mt-chemical-formula-regression-inference?scriptVersionId=57399601).

# Prepare

## import

In [None]:
import os
import re
import gc
import sys
import yaml
import copy
import random
import shutil
import typing as tp
from pathlib import Path

import Levenshtein
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler


from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
import cv2
import albumentations
from albumentations.core.transforms_interface import ImageOnlyTransform, DualTransform
from albumentations.pytorch import ToTensorV2

import torch
from torch import nn
from torch.utils import data

from cuml.cluster import KMeans

sys.path.append("../input/timm-pytorch-image-models/pytorch-image-models-master")
import timm

In [None]:
# --- directory layout (everything hangs off the parent of the working dir) ---
ROOT = Path.cwd().parent
INPUT = ROOT / "input"
DATA = INPUT / "bms-molecular-translation"
TRAIN, TEST = DATA / "train", DATA / "test"

# Resized 224px copies of the competition images.
_RESIZED_224 = INPUT / "bms-molecular-224px-jpg-padded"
TRAIN_224, TEST_224 = _RESIZED_224 / "train", _RESIZED_224 / "test"

# Outputs of the companion training notebook (settings, weights, OOF files).
TRAINING_OUTPUTS = INPUT / "bms-mt-chemical-formula-regression-training"

# Scratch directory; created on first run.
TMP = ROOT / "tmp"
TMP.mkdir(exist_ok=True)

# --- run configuration ---
RANDAM_SEED = 1086
DEBUG_RUN = False

N_FOLD = 5
FOLDS = list(range(N_FOLD))

# Element symbols whose counts are regressed. The ORDER matters: arr2formula
# addresses elements by their index in this list.
TARGETS = "B Br C Cl F H I N O P S Si".split()
N_TARGETS = len(TARGETS)

# Maximum number of "/"-separated parts of an InChI string in the training data.
MAX_INCHI_N_SPLITS = 11

# Number of k-means clusters used for the mode-string lookup.
N_CLUSTER = 1000

## read competition data and the chemical formula prediction result

In [None]:
# Training labels (used below via their "image_id" and "InChI" columns) and the
# sample submission, whose "image_id" column serves as the list of test images.
train = pd.read_csv(DATA / "train_labels.csv")
smpl_sub = pd.read_csv(DATA / "sample_submission.csv")

In [None]:
# Debug runs only keep the first 12800 test rows so inference finishes quickly.
if DEBUG_RUN:
    smpl_sub = smpl_sub.head(12800).reset_index(drop=True)

print(smpl_sub.shape)

In [None]:
# Artifacts of the training notebook: train_fml supplies the true per-atom
# counts and oof_fml the out-of-fold predictions for them. The two are compared
# and combined row-by-row with `train` below, so all three are assumed to share
# the same row order -- TODO confirm against the training notebook.
train_fml = pd.read_pickle(TRAINING_OUTPUTS / "train_formula_mlskf_5fold.pkl")
oof_fml = pd.read_csv(TRAINING_OUTPUTS / "oof_prediction.csv")

## preprocess

### split InChI into 11 parts

In [None]:
# Per-row InChI statistics: total string length and number of "/"-separated
# parts (number of "/" characters plus one).
train["InChI_length"] = train["InChI"].str.len()
train["InChI_n_splits"] = train["InChI"].str.count("/") + 1

print(train[["InChI_length", "InChI_n_splits"]].describe())

In [None]:
# One growing list per part position; rows with fewer parts are padded with "".
split_inchi_list = [[] for _ in range(MAX_INCHI_N_SPLITS)]

for inchi_str in tqdm(train["InChI"].values):
    parts = inchi_str.split("/")

    # real parts first ...
    for inchi_pos, part in enumerate(parts):
        split_inchi_list[inchi_pos].append(part)

    # ... then empty strings up to the fixed number of columns
    for inchi_pos in range(len(parts), MAX_INCHI_N_SPLITS):
        split_inchi_list[inchi_pos].append("")


# Attach the columns InChI_0 ... InChI_{MAX_INCHI_N_SPLITS - 1} to `train`.
for inchi_pos in range(MAX_INCHI_N_SPLITS):
    train[f"InChI_{inchi_pos}"] = split_inchi_list[inchi_pos]

del split_inchi_list

# Analysis of chemical formula prediction result

## correlation between predicted and target value for atoms

In [None]:
# 3 x 4 grid: one predicted-vs-target scatter plot per atom type.
fig = plt.figure(figsize=(24, 18))
fig.subplots_adjust(wspace=0.4, hspace=0.6)

for panel_no, atom in enumerate(TARGETS, start=1):
    pred = oof_fml[atom].values
    target = train_fml[atom].values

    # summary statistics shown in the panel title
    mse = mean_squared_error(target, pred)
    corr = np.corrcoef(target, pred)[0, 1]

    ax = fig.add_subplot(3, 4, panel_no)
    ax.scatter(pred, target, alpha=0.3)
    ax.set_xlabel("Predicted Value")
    ax.set_ylabel("Target Value")
    ax.set_title(f"atom: {atom}, oof_mse: {mse:.3f}, corr: {corr:.3f}")

# drop the last pair of column arrays before moving on
del pred, target
gc.collect()

## levenshtein distance for chemical formula

In [None]:
def arr2formula(atom_arr):
    """
    Build a chemical formula string from (possibly fractional) atom counts.

    atom_arr: 1d array of shape (12, ), counts ordered like ``TARGETS``.

    Counts are rounded to the nearest integer. Elements whose rounded count is
    below 1 are omitted, and a count of exactly 1 is written without a number.
    When carbon is present the order is C, H, then the remaining elements in
    ``TARGETS`` order; otherwise every element follows ``TARGETS`` order.
    """
    counts = np.round(atom_arr).astype(int)

    carbon_idx, hydrogen_idx = 2, 5  # positions of "C" and "H" in TARGETS
    if counts[carbon_idx] > 0:
        idx_order = [carbon_idx, hydrogen_idx] + [
            idx for idx in range(12) if idx not in (carbon_idx, hydrogen_idx)]
    else:
        idx_order = list(range(12))

    return "".join(
        TARGETS[idx] if counts[idx] == 1 else f"{TARGETS[idx]}{counts[idx]}"
        for idx in idx_order
        if counts[idx] >= 1
    )

In [None]:
# Turn each row of out-of-fold atom-count predictions into a formula string.
oof_fml["pred_formula"] = [arr2formula(atom_arr) for atom_arr in tqdm(oof_fml[TARGETS].values)]

In [None]:
# oof_fml["LS_dist"] =  [
#     Levenshtein.distance(fml, pred_fml) for fml, pred_fml in tqdm(oof_fml[["formula", "pred_formula"]].values)]

In [None]:
# The live computation is commented out (the notebook intro notes that the OOF
# score calculation was skipped because of a memory error); the table printed
# below is a hard-coded copy of an earlier result.
# print(oof_fml[["formula", "pred_formula", "LS_dist"]].head(10))

print(
"""
         formula   pred_formula  LS_dist
0       C13H20OS       C13H20OS        0
1       C21H30O4       C20H30O4        1
2     C24H23N5O4     C23H23N5O4        1
3    C17H24N2O4S    C17H23N2O4S        1
4    C10H19N3O2S   C10H18BrN3O2        4
5  C19H22Br2N2O2  C19H22Br2N2O2        0
6    C17H10BrN3O      C17H10N3O        2
7    C21H21N5O2S    C21H21N5O2S        0
8    C13H18N2O5S    C13H18N2O5S        0
9   C13H15BrN2O3   C13H15BrN2O3        0
""")

In [None]:
# Baseline: mean Levenshtein distance when the single most frequent formula is
# predicted for every row. The computation is commented out; the text printed
# below is a hard-coded copy of an earlier result.
# mode_formula = oof_fml.formula.mode()[0]
# print("mode formula:", mode_formula)
# ls_dist_by_mode = sum([
#     Levenshtein.distance(formula, mode_formula) for formula in oof_fml.formula]) / len(oof_fml)

# print(f"oof Levenshtein distance for Chemical formula **by mode**: {ls_dist_by_mode:.4f}")

print(
"""
mode formula: C15H22N2O2
oof Levenshtein distance for Chemical formula **by mode**: 5.3457
""")

In [None]:
# Mean of the (commented-out) "LS_dist" column; the printed value is a
# hard-coded copy of an earlier result.
# print(f"oof Levenshtein distance for chemical formula by regression model: {oof_fml['LS_dist'].mean():.4f}")


print("oof Levenshtein distance for chemical formula by regression model: 0.9503")

The OOF Levenshtein distance for the chemical formula is 0.95. (You would get 5.3457 if you always predicted the mode string (`C15H22N2O2`).)

Please note that this result comes from a model which was trained on **only 4% of the training data**.  
I've already confirmed in my local environment that models trained on more data can achieve more precise results ;)

# Clustering Approach by predicted number of atoms

I think chemical formula can limit `InChI`'s degrees of freedom.

For example, the total number of atoms excluding `H` has a relatively high correlation with the length of the whole `InChI` and of `InChI_2` (which represents atom connections), as shown in the following plots.

One possible approach that comes to mind right away is clustering molecules by predicted chemical formula (number of atoms).
Using the mode string of each cluster may give a better result than using the mode string of all the training data.

In [None]:
# True values: total atom count excluding hydrogen.
train_fml["n_atoms_ex_H"] = train_fml["n_atoms"] - train_fml["H"]

# Predicted values: sum the raw (unrounded) per-atom predictions, then drop H.
# (This comment line previously began with a stray `3`, which made it a no-op
# expression statement instead of a pure comment.)
oof_fml["n_atoms"] = oof_fml[TARGETS].sum(axis=1)
oof_fml["n_atoms_ex_H"] = oof_fml["n_atoms"] - oof_fml["H"]

In [None]:
# Inputs for both panels: true heavy-atom count and the two string lengths.
heavy_atom_counts = train_fml["n_atoms_ex_H"].values
whole_lengths = train["InChI_length"].values
layer2_lengths = train["InChI_2"].str.len().values

fig = plt.figure(figsize=(12, 5))

# Left panel: heavy-atom count vs. length of the whole InChI string.
corr = np.corrcoef(heavy_atom_counts, whole_lengths)[0, 1]
ax = fig.add_subplot(1, 2, 1)
_ = ax.scatter(heavy_atom_counts, whole_lengths, alpha=0.3)
ax.set_title(f"Total number of atoms ex.H vs. InChI length(corr: {corr:.4f})")
ax.set_xlabel("number of atoms ex, H")
ax.set_ylabel("InChI length")

# Right panel: heavy-atom count vs. length of the InChI_2 part.
corr = np.corrcoef(heavy_atom_counts, layer2_lengths)[0, 1]
ax = fig.add_subplot(1, 2, 2)
_ = ax.scatter(heavy_atom_counts, layer2_lengths, alpha=0.3)
ax.set_title(f"Total number of atoms ex. H vs. InChI_2 length(corr: {corr:.4f})")
ax.set_xlabel("number of atoms ex. H")
ax.set_ylabel("InChI_2 length")

del heavy_atom_counts, whole_lengths, layer2_lengths

Here I use 3 values (the predicted number of `C` atoms, the predicted number of `H` atoms, and the predicted total number of atoms **excluding `H`**) as a feature vector.

In [None]:
def calc_each_LS(y_true, y_pred, n_targets=11):
    """
    Mean Levenshtein distance per column.

    y_true, y_pred: 2d arrays of strings indexed as ``[row, column]``.
    n_targets: number of leading columns to evaluate.

    Returns a list with one mean distance for each of the first ``n_targets``
    columns, averaged over ``len(y_pred)`` rows.
    """
    n_rows = len(y_pred)

    res_list = []
    for col_idx in trange(n_targets, desc='1st loop'):
        total_LS = sum(
            Levenshtein.distance(y_true[row_idx, col_idx], y_pred[row_idx, col_idx])
            for row_idx in trange(n_rows, desc='2nd loop', leave=False)
        )
        res_list.append(total_LS / n_rows)

    return res_list


def join_inchi_parts(pred):
    """
    Assemble the full InChI string from its part columns.

    Reads the columns ``InChI_0`` ... ``InChI_10`` of ``pred``, joins the
    non-empty parts of every row with "/", stores the result in a new
    ``InChI`` column (``pred`` is modified in place) and returns ``pred``.
    """
    part_columns = ["InChI_%d" % pos for pos in range(11)]
    pred["InChI"] = [
        "/".join(part for part in row if part != "")
        for row in tqdm(pred[part_columns].values)
    ]
    return pred

In [None]:
%%time
# Feature matrix for clustering: predicted C count, predicted H count and
# predicted total atom count excluding H (one row per training image).
X = oof_fml[["C", "H", "n_atoms_ex_H"]].values
print(X.shape)

# Standardize the three features, then run k-means with N_CLUSTER clusters.
# The seed comes from the shared RANDAM_SEED constant instead of repeating the
# literal value here.
clustering_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=N_CLUSTER, random_state=RANDAM_SEED)),
])

# The fitted pipeline is reused later to assign test images to clusters.
oof_fml["cluster"] = clustering_pipe.fit_predict(X)

In [None]:
%%time
inchi_cols = [f"InChI_{part_idx}" for part_idx in range(11)]


def _most_frequent(values):
    """Return the value that occurs most often in ``values``."""
    return values.value_counts().index[0]


# For each cluster, the most frequent TRUE string of every InChI part.
mode_by_cluster = train.groupby(oof_fml["cluster"])[inchi_cols].agg(_most_frequent)

# OOF prediction: fixed prefix + predicted formula + per-cluster mode strings.
oof_pred = train[["image_id"]].assign(
    cluster=oof_fml["cluster"],
    InChI_0="InChI=1S",
    InChI_1=oof_fml["pred_formula"],
)
for part_col in inchi_cols[2:]:
    oof_pred[part_col] = oof_pred["cluster"].map(mode_by_cluster[part_col])

oof_pred = join_inchi_parts(oof_pred)

In [None]:
# The clustering inputs are no longer needed; drop them and collect garbage
# (the notebook intro mentions memory errors).
del X; del oof_fml; del train_fml;
gc.collect()

In [None]:
# Per-part and whole-string mean Levenshtein distance of the OOF prediction.
# The computation is commented out; the table printed below is a hard-coded
# copy of an earlier result.
# oof_LS_each = calc_each_LS(
#     train[inchi_cols + ["InChI"]].values, oof_pred[inchi_cols + ["InChI"]].values, 12)

# print(pd.DataFrame(
#     {"parts" : inchi_cols + ["InChI"], "mean_LS": oof_LS_each}))

print(
"""
       parts       mean_LS
0    InChI_0  0.000000e+00
1    InChI_1  9.503252e-01
2    InChI_2  4.487596e+01
3    InChI_3  1.796458e+01
4    InChI_4  1.557185e+00
5    InChI_5  3.846986e-01
6    InChI_6  3.263887e-01
7    InChI_7  2.152764e-02
8    InChI_8  4.488104e-04
9    InChI_9  3.011320e-05
10  InChI_10  8.250192e-07
11     InChI  6.529676e+01
""")

In [None]:
# Whole-string score (last entry of the commented-out oof_LS_each); the printed
# value is a hard-coded copy of an earlier result.
# print(f"CV score by regression and clustering: {oof_LS_each[-1]:.4f}")

print("CV score by regression and clustering: 65.2968")

In [None]:
# Persist the OOF prediction, then release the training frames before test
# inference starts.
oof_pred.to_csv("oof_prediction.csv", index=False)
del oof_pred; del train
gc.collect()

In [None]:
# Extra garbage-collection pass before the models are loaded.
gc.collect()

# Inference Test Data

## definition

### model

In [None]:
class BasicImageModel(nn.Module):
    """
    Image model made of a timm backbone followed by an MLP head.

    Args:
        base_name: name of a model available as an attribute of ``timm.models``.
        dims_head: layer widths of the head, ``[in, hidden..., out]``. If the
            first entry is ``None`` it is replaced by the backbone's
            ``num_features``. The list passed by the caller is not modified.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``dims_head`` has fewer than two entries.
        NotImplementedError: if ``base_name`` is not found in ``timm.models``.
    """

    def __init__(
        self, base_name, dims_head: tp.List[tp.Optional[int]], pretrained=False
    ):
        """Initialize"""
        # nn.Module must be initialized before attributes are assigned to it.
        super(BasicImageModel, self).__init__()
        self.base_name = base_name

        if len(dims_head) < 2:
            raise ValueError(
                f"dims_head needs at least [in_dim, out_dim], got {dims_head!r}")

        # # prepare backbone
        if not hasattr(timm.models, base_name):
            raise NotImplementedError(
                f"unknown timm model name: {base_name!r}")
        base_model = timm.create_model(base_name, pretrained=pretrained)
        in_features = base_model.num_features
        # # remove head classifier
        base_model.reset_classifier(0)
        self.backbone = base_model

        # # prepare head classifier
        # Work on a copy so the caller's settings list stays untouched.
        dims_head = list(dims_head)
        if dims_head[0] is None:
            dims_head[0] = in_features

        # Hidden blocks are Linear -> ReLU -> Dropout; the final layer is a
        # plain Linear. Module order is kept so saved state_dict keys
        # (head.0, head.3, ...) still match.
        layers_list = []
        for in_dim, out_dim in zip(dims_head[:-2], dims_head[1:-1]):
            layers_list.extend([
                nn.Linear(in_dim, out_dim),
                nn.ReLU(), nn.Dropout(0.5),])
        layers_list.append(
            nn.Linear(dims_head[-2], dims_head[-1]))
        self.head = nn.Sequential(*layers_list)

    def forward(self, x):
        """Run the backbone, then the head, and return the head output."""
        h = self.backbone(x)
        h = self.head(h)
        return h

### image dataset

In [None]:
class ImageTransformBase:
    """
    Common base for image transforms built from a declarative spec.

    Args:
        data_augmentations: sequence of (name, params) pairs. Each name is
            resolved to a transform class, instantiated with ``**params``,
            and the instances are chained with ``albumentations.Compose``.
    """

    def __init__(self, data_augmentations: tp.List[tp.Tuple[str, tp.Dict]]):
        """Build the composed augmentation pipeline."""
        pipeline = []
        for aug_name, params in data_augmentations:
            aug_cls = self._get_augmentation(aug_name)
            pipeline.append(aug_cls(**params))
        self.data_aug = albumentations.Compose(pipeline)

    def __call__(self, pair: tp.Tuple[np.ndarray]) -> tp.Tuple[np.ndarray]:
        """Task specific; subclasses have to override this."""
        raise NotImplementedError

    def _get_augmentation(self, aug_name: str) -> tp.Tuple[ImageOnlyTransform, DualTransform]:
        """Resolve an augmentation name to the object it refers to."""
        if not hasattr(albumentations, aug_name):
            # NOTE(review): names that are not attributes of albumentations are
            # passed to eval(), which executes arbitrary expressions. Only use
            # this with settings files you trust.
            return eval(aug_name)
        return getattr(albumentations, aug_name)


class ImageTransformForCls(ImageTransformBase):
    """Transform for (image, label) pairs: augments the image only."""

    def __init__(self, data_augmentations: tp.List[tp.Tuple[str, tp.Dict]]):
        """Delegate pipeline construction to the base class."""
        super().__init__(data_augmentations)

    def __call__(self, in_arrs: tp.Tuple[np.ndarray]) -> tp.Tuple[np.ndarray]:
        """Augment the image of an (image, label) pair; the label passes through."""
        img, label = in_arrs
        return self.data_aug(image=img)["image"], label

In [None]:
class LabeledImageDataset(data.Dataset):
    """
    Dataset class for (image, label) pairs.

    Args:
        file_list: list of (image path, label) tuples.
        transform_list: augmentation spec handed to ``ImageTransformForCls``.
    """

    def __init__(
        self,
        file_list: tp.List[
            tp.Tuple[tp.Union[str, Path], tp.Union[int, float, np.ndarray]]],
        transform_list: tp.List[tp.Dict],
    ):
        """Initialize"""
        self.file_list = file_list
        self.transform = ImageTransformForCls(transform_list)

    def __len__(self):
        """Return Num of Images."""
        return len(self.file_list)

    def __getitem__(self, index):
        """Return the transformed image and its label for the given index."""
        img_path, label = self.file_list[index]
        img = self._read_image_as_array(img_path)

        img, label = self.transform((img, label))
        return img, label

    def _read_image_as_array(self, path: str):
        """
        Read an image file and return it as an RGB numpy.ndarray.

        Raises:
            FileNotFoundError: if OpenCV could not read the file.
        """
        img_arr = cv2.imread(str(path))
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if img_arr is None:
            raise FileNotFoundError(f"could not read image: {path}")
        img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGR2RGB)
        return img_arr

In [None]:
class TestImageDataset(data.Dataset):
    """
    Dataset class for test images with an orientation fix.

    Args:
        file_list: list of (image path, label, (height, width)) tuples.
        transform_list: augmentation spec handed to ``ImageTransformForCls``.

    Items whose recorded height exceeds their width are passed through a
    transpose followed by a vertical flip before the regular transform
    (presumably so all inputs share one orientation -- confirm against the
    training notebook).
    """

    def __init__(
        self,
        file_list: tp.List[
            tp.Tuple[tp.Union[str, Path], tp.Union[int, float, np.ndarray], tp.Tuple[int, int]]],
        transform_list: tp.List[tp.Dict],
    ):
        """Initialize"""
        self.file_list = file_list
        self.transform = ImageTransformForCls(transform_list)
        # p=1.0 makes both steps unconditional; it replaces the deprecated
        # always_apply=True flag with the same effect.
        self.fix_transform = albumentations.Compose([
            albumentations.Transpose(p=1.0),
            albumentations.VerticalFlip(p=1.0)])

    def __len__(self):
        """Return Num of Images."""
        return len(self.file_list)

    def __getitem__(self, index):
        """Return the transformed image and its label for the given index."""
        img_path, label, (height, width) = self.file_list[index]
        img = self._read_image_as_array(img_path)

        if height > width:
            img = self.fix_transform(image=img)['image']

        img, label = self.transform((img, label))
        return img, label

    def _read_image_as_array(self, path: str):
        """
        Read an image file and return it as an RGB numpy.ndarray.

        Raises:
            FileNotFoundError: if OpenCV could not read the file.
        """
        img_arr = cv2.imread(str(path))
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if img_arr is None:
            raise FileNotFoundError(f"could not read image: {path}")
        img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGR2RGB)
        return img_arr

### get data loader

In [None]:
def get_test_loader(
    stgs: tp.Dict,
    test_df: pd.DataFrame,
    dataset_class: tp.Type[data.Dataset]
):
    """
    Create the DataLoader for test inference.

    Args:
        stgs: settings dict; ``stgs["dataset"]["val"]`` is passed as keyword
            arguments to ``dataset_class`` and ``stgs["loader"]["val"]`` to
            ``DataLoader``.
        test_df: frame with an ``image_id`` column, one row per test image.
        dataset_class: dataset class taking a list of
            (path, label, (height, width)) tuples as first argument.

    Returns:
        DataLoader over the resized test images with dummy label -1.

    Raises:
        ValueError: if the size-info file has fewer rows than ``test_df``.
    """
    # NOTE(review): this used to read train_image_size.csv, which pairs test
    # images with the sizes of unrelated training images. Assumes the dataset
    # also ships test_image_size.csv with the same columns -- confirm.
    test_img_size_info = pd.read_csv("../input/bms-mt-image-size-info/test_image_size.csv")

    # Sizes are matched to test_df by row position (assumes both share the same
    # row order -- confirm). zip() would silently drop images if the size table
    # were too short, so check explicitly; a longer table is cut to the first
    # len(test_df) rows, which covers debug runs on a subset.
    n_test = len(test_df)
    if len(test_img_size_info) < n_test:
        raise ValueError(
            f"image size info has {len(test_img_size_info)} rows "
            f"but test_df has {n_test}")
    img_sizes = test_img_size_info[["height", "width"]].values[:n_test].tolist()

    test_file_list = list(zip(
        # # cropped and padded image path
        [
            TEST_224 / f"{img_id}.jpg"
            for img_id in test_df["image_id"].values],
        # # dummy label
        [-1] * n_test,
        # # image size info
        img_sizes,
    ))

    test_dataset = dataset_class(
        test_file_list, **stgs["dataset"]["val"])
    test_loader = data.DataLoader(
        test_dataset, **stgs["loader"]["val"])

    return test_loader

### inference utils

In [None]:
def set_random_seed(seed: int = 42, deterministic: bool = False):
    """
    Seed Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and
    sets ``torch.backends.cudnn.deterministic`` to ``deterministic``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,  # type: ignore
    )
    for seed_fn in seeders:
        seed_fn(seed)

    torch.backends.cudnn.deterministic = deterministic  # type: ignore

In [None]:
def run_inference_loop(stgs, model, loader, device):
    """
    Predict every batch of ``loader`` with ``model`` and stack the outputs.

    The model is moved to ``device`` and put into eval mode. When
    ``stgs["globals"]["use_amp"]`` is truthy the forward pass runs under
    ``torch.cuda.amp.autocast``. Labels yielded by the loader are ignored.

    Returns the batch outputs concatenated into one numpy array.
    """
    model.to(device)
    model.eval()

    batch_outputs = []
    with torch.no_grad():
        for images, _ in tqdm(loader):
            if not stgs["globals"]["use_amp"]:
                output = model(images.to(device))
            else:
                with torch.cuda.amp.autocast():
                    output = model(images.to(device))
            batch_outputs.append(output.detach().cpu().numpy())

    return np.concatenate(batch_outputs)

## inference chemical formula

In [None]:
# One (n_test, N_TARGETS) slab of predictions per fold; averaged afterwards.
test_preds_arr = np.zeros((N_FOLD, len(smpl_sub), N_TARGETS))

for fold_id in range(N_FOLD):
    print(f"[fold {fold_id}]")

    # per-fold settings saved by the training notebook
    tmp_dir = TRAINING_OUTPUTS / f"fold{fold_id}"
    with (tmp_dir / "settings.yml").open("r") as fr:
        tmp_stgs = yaml.safe_load(fr)
    device = torch.device(tmp_stgs["globals"]["device"])

    # test loader using the orientation-fixing dataset
    test_loader = get_test_loader(tmp_stgs, smpl_sub, TestImageDataset)

    # rebuild the architecture without pretrained weights, then load the
    # fold's checkpoint
    model_path = TRAINING_OUTPUTS / f"./best_loss_model_fold{fold_id}.pth"
    tmp_stgs["model"]["params"]["pretrained"] = False
    model = BasicImageModel(**tmp_stgs["model"]["params"])
    model.load_state_dict(torch.load(model_path, map_location=device))

    # predict and store this fold's result
    test_pred = run_inference_loop(tmp_stgs, model, test_loader, device)
    test_preds_arr[fold_id] = test_pred

    # free host and GPU memory before the next fold
    del model, test_pred
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# Average the folds, then free the per-fold array right away.
test_pred_fml = test_preds_arr.mean(axis=0)
del test_preds_arr

# One column of averaged predictions per atom type.
test_pred = smpl_sub[["image_id"]].copy()
for col_idx in range(N_TARGETS):
    test_pred[TARGETS[col_idx]] = test_pred_fml[:, col_idx]

del test_pred_fml

# Formula string per row, plus the totals used as clustering features.
test_pred["pred_formula"] = list(
    map(arr2formula, tqdm(test_pred[TARGETS].values)))

test_pred["n_atoms"] = test_pred[TARGETS].sum(axis=1)
test_pred["n_atoms_ex_H"] = test_pred["n_atoms"] - test_pred["H"]

## apply clustering

In [None]:
# Same three features, in the same order, as used when fitting the pipeline.
cluster_features = ["C", "H", "n_atoms_ex_H"]
X_test = test_pred[cluster_features].values

test_pred["cluster"] = clustering_pipe.predict(X_test)

## make submission

In [None]:
# Fixed prefix + predicted formula, then the per-cluster mode strings for the
# remaining parts InChI_2 ... InChI_10.
sub = smpl_sub[["image_id"]].assign(
    InChI_0="InChI=1S",
    InChI_1=test_pred["pred_formula"],
)
for part_col in (f"InChI_{pos}" for pos in range(2, 11)):
    sub[part_col] = test_pred["cluster"].map(mode_by_cluster[part_col])

sub = join_inchi_parts(sub)
sub.head()

In [None]:
# Only the id and the assembled InChI string go into the submission file.
sub[["image_id", "InChI"]].to_csv("submission.csv", index=False)