# Top

In [None]:
import numpy as np
import pandas as pd
import re
from tqdm.notebook import tqdm

# Register `progress_apply` on pandas objects. `pandas` is called on the tqdm
# class itself; instantiating `tqdm()` first would create an empty progress bar
# that is never iterated or closed.
tqdm.pandas()

In [None]:
# Load the training labels CSV into a DataFrame; later cells use its
# `image_id` and `InChI` columns.
train = pd.read_csv(
    "../input/bms-molecular-translation/train_labels.csv"
)
print(train.shape)  # (number of rows, number of columns)
train.head()

## Extract Elements from InChI

The elements in an InChI string can be extracted from its chemical formula, which is the second entry when the InChI is split on the `/` character.

In [None]:
# An element symbol is one uppercase letter optionally followed by one
# lowercase letter. Compiled once instead of on every row.
_ELEMENT_RE = re.compile(r'[A-Z][a-z]?')


def extract_elements(inchi):
    """Return the element symbols found in the formula part of an InChI string.

    The formula is taken as the second `/`-separated field of the string.
    Symbols are returned in order of appearance as a list of strings.

    A value that is already a list is returned unchanged. This cell overwrites
    `train.InChI` in place, so without this guard re-running the cell would
    call `.split` on a list and raise.
    """
    if isinstance(inchi, list):
        return inchi
    return _ELEMENT_RE.findall(inchi.split('/')[1])


# NOTE: this replaces the raw InChI strings with lists of element symbols;
# the following cells rely on `train.InChI` holding those lists.
train.InChI = train.InChI.progress_apply(extract_elements)
train.head()

We will now gather all the unique elements.

In [None]:
# Collect the distinct element symbols that occur anywhere in the dataset.
# A plain loop is used instead of `progress_apply`: the update is a pure side
# effect, and `apply` would also build a throwaway Series of `None` values.
all_el = set()
for entry_elements in tqdm(train.InChI, desc="collecting elements"):
    all_el.update(entry_elements)

# Print in sorted order so the output does not depend on set iteration order.
print(len(all_el), sorted(all_el))

Next, we will count how many entries contain each element.

In [None]:
# For every element, count the entries whose element list contains it, then
# report the counts from most to least frequent.
elements = sorted(all_el)
counts = sorted(
    (
        (train.InChI.progress_apply(lambda present: symbol in present).sum(), symbol)
        for symbol in elements
    ),
    reverse=True,
)
for hits, symbol in counts:
    print(f'Element {symbol} count: {hits} (approx {hits*100/train.shape[0]:.4}%)')

We will now group each entry based on the presence of specific elements.

In [None]:
# Map each element symbol to a boolean Series (one value per entry) that is
# True where the entry's element list contains that symbol.
groupings = {
    symbol: train.InChI.progress_apply(lambda present: symbol in present)
    for symbol in elements
}

In [None]:
# Map each element symbol to the set of `image_id` values of the entries
# flagged by that element's boolean mask.
group_sets = {
    symbol: set(train.loc[groupings[symbol], "image_id"])
    for symbol in elements
}

After grouping them, we will now gather pairwise information for each pair of elements.

In [None]:
# Intersection size for every unordered pair of distinct elements.
# Each pair is generated exactly once (e2 always comes after e1 in `elements`),
# so the printed list needs no de-duplication. Storing both (e1, e2) and
# (e2, e1) and then printing every other row of the sorted list is unreliable:
# when several pairs share the same count the mirrored entries are no longer
# adjacent, so some pairs are printed twice and others are dropped.
pair_groupings = []
for i, e1 in enumerate(elements):
    for e2 in elements[i + 1:]:
        n = len(group_sets[e1] & group_sets[e2])
        pair_groupings.append((n, e1, e2))

# Most frequent pairs first.
pair_groupings.sort(reverse=True)
for n, e1, e2 in pair_groupings:
    print(f'Lines with both {e1} and {e2}:\t{n}\t{100*n/train.shape[0]:.4}%')

Using this information, we can select the part of the dataset that contains only certain elements. This can be useful when we want to reduce the size of the training data, which spans over 2.4 million entries.

I hope this notebook helps someone out there!

# End