In the Chemistry section of the dataset, some targets are chemical formulas, such as "ho2cco2h". A non-specialized language model has very little to work with in strings like these, so I wrote a short demo of how to parse such formulas using a couple of Python packages (chemparse and pyvalem).

In [None]:
! pip install -q chemparse
! pip install -q pyvalem
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
from pyvalem.formula import Formula
import chemparse

In [None]:
# Load the pickled periodic-table lookup used by atoms_to_str.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a file from a trusted source.
with open('../input/periodictable/periodic_table.p', 'rb') as table_file:
    per_table = pickle.load(table_file)

def atoms_to_str(atoms):
    """Return the ``per_table`` entries for *atoms* joined by single spaces.

    Each atom is lower-cased before the lookup. An atom with no entry in
    ``per_table`` contributes an empty string, so the result may contain
    leading, trailing or doubled spaces.

    ``per_table`` presumably maps lowercase element symbols to element
    names -- confirm against the pickle contents.
    """
    names = []
    for atom in atoms:
        names.append(per_table.get(atom.lower(), ''))
    return ' '.join(names)
    
def parse_formula(text):
    """Expand chemical-formula tokens in *text* into space-separated atom names.

    The text is split on single spaces and every token is handled on its own:

    * ``chemparse.parse_formula`` extracts the token's atoms, and
      ``atoms_to_str`` turns them into a fallback string.
    * If that fallback is shorter than 2 characters, or the token itself is
      shorter than 3 characters, the token is kept unchanged.
    * Otherwise the upper-cased token is parsed with pyvalem's ``Formula``
      and replaced by the lower-cased names of its atoms. If that step
      raises, the chemparse-based fallback string is used instead.

    Returns the processed tokens joined back together with single spaces.
    """
    pieces = []

    for token in text.split(' '):
        # chemparse is consulted first, even for short tokens, so that its
        # result can gate the more expensive pyvalem parse below.
        fallback = atoms_to_str(chemparse.parse_formula(token).keys())

        if len(fallback) < 2 or len(token) < 3:
            # Not recognisably a formula: leave the token as it is.
            pieces.append(token)
            continue

        try:
            parsed = Formula(token.upper())
            expanded = ' '.join(atom.name.lower() for atom in parsed.atoms)
        except Exception:
            # Best effort: any pyvalem failure falls back to the
            # chemparse-derived string rather than aborting the whole text.
            expanded = fallback

        pieces.append(expanded)

    return ' '.join(pieces)
 
def parse_df_formulas(df):
    """Return a copy of *df* whose ``target`` column has been passed through
    ``parse_formula``.

    The input frame is not modified; all work happens on the copy.
    """
    converted = df.copy()
    new_targets = converted.target.apply(parse_formula)
    converted.loc[:, 'target'] = new_targets
    return converted

In [None]:
# Read the competition training set, then build a second frame whose
# `target` column has its formula tokens expanded by parse_df_formulas.
# train_df itself stays untouched so the two can be compared afterwards.
train_df = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
train_df2 = parse_df_formulas(train_df)

In [None]:
# Show every row whose target was rewritten, with the new text in
# `target_replaced` next to the original text in `target_original`.
changed_mask = train_df2.target != train_df.target
original_targets = train_df[['id', 'target']]
train_df2[changed_mask].merge(original_targets, on='id', suffixes=('_replaced', '_original'))

In [None]:
# Report how many rows had their target changed by the formula expansion.
num_modified = len(train_df[train_df2.target != train_df.target])
print('Modified {} samples'.format(num_modified))