In [1]:
import ast
import json
import pickle
import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sklearn.utils import gen_batches

# Data loading and preprocessing

Here we load and merge data from multiple sources.

In [2]:
%%time 

# Load original uspto csv with all fields.
uspto_csv_1 = pd.read_csv('../../data/raw/Extracted_Data_2001_Sep2016_USPTOapplications_new.csv', low_memory=False)
uspto_csv_2 = pd.read_csv('../../data/raw/Extracted_Data_1976_Sep2016_USPTOgrants_new.csv', low_memory=False)
uspto_csv = pd.concat([uspto_csv_1, uspto_csv_2])

print(len(uspto_csv))

# Load original paragraphs DB
with open('../../data/raw/DATASET_PARAGRAPH_Q2_Q3.pickle', 'rb') as f:
    parags_db = pickle.load(f)
    print(len(parags_db))

# Load segmented paragraphs
with open('../../data/processed/uspto_segmented_last.bin', 'rb') as f:
    segm_db = pickle.load(f)
    print(len(segm_db))

3748163
1878060
1872064
CPU times: user 58.5 s, sys: 4.78 s, total: 1min 3s
Wall time: 1min 3s


## Let's merge all this data into a single structure, with additions.

```json
{
    text_id: [
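        'txt_sgm':  Text of the segment
        'src_prg':  Index of the source paragraph
        'sgm_cls':  Class of the segment (e.g. 'reaction set-up')
        'stp_ord':  Step order of the segment (integer)
        'rxn_smi':  Reaction SMILES of the source paragraph (only for 'reaction set-up' segments)
        'prd_str':  Reaction product as given in paragraph (only for 'reaction set-up' segments)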
    ] 
}
```

---

## Match paragraphs with reaction SMILES, and sort.

This will give us a map `src_prg` -> `rxn_smi`

In [3]:
%%time

prg_rxn_sort = (
    parags_db
    .reset_index()
    .merge(
        uspto_csv[['Paragraph Text', 'Reaction Smiles', 'Product List']].drop_duplicates(),
        right_on='Paragraph Text',
        left_on='Paragraph Text',
        how='outer'
    )
    .dropna()
    .drop_duplicates(subset=['index'])
    .sort_values('index')
)

prg_rxn_sort['index'] = prg_rxn_sort['index'].astype(int)
prg_rxn_sort = prg_rxn_sort.set_index('index')

print(f"DF shape: {prg_rxn_sort.shape}")
prg_rxn_sort.head(3)

DF shape: (1878060, 3)
CPU times: user 15.2 s, sys: 519 ms, total: 15.7 s
Wall time: 15.7 s


Unnamed: 0_level_0,Paragraph Text,Reaction Smiles,Product List
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Suspend anhydrous AlCl3 (156 g, 1.15 mol) in t...",[Al+3].[Cl-].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][...,['4-Chloro-1-(4-methyl-phenyl)-butan-1-one']
1,Dissolve 4-chloro-1-(4-isopropyl-phenyl)-butan...,[Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]...,['1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chlo...
2,"Mix 2-ethyl-1-hexanol (6.5 g, 5 mol), triethyl...",C(C(CCCC)C[OH:5])C.C(N(CC)CC)C.[C:17]1([CH2:23...,"['2-phenylaceticacid', '2-(2-ethylhexy)lester']"


## First, clean duplicates in segm_db and paragraphs

In [4]:
# Rows that share the same paragraph text. NOTE: `GroupBy.indices` returns
# integer row *positions* within `prg_rxn_sort`, not its index labels.
p_indices = prg_rxn_sort.groupby('Paragraph Text').indices

# `segm_db` is looked up below by paragraph index label (assumed to be the same
# ids as `prg_rxn_sort.index` -- TODO confirm), so translate positions into
# labels. Positions and labels only coincide while the index is exactly
# 0..N-1, which stops being true as soon as the merge drops a paragraph.
prg_labels = prg_rxn_sort.index.to_numpy()

# Map every paragraph index to one representative index for its text: the
# first occurrence in `prg_rxn_sort` order.
map_idx = {
    prg_labels[pos]: prg_labels[positions[0]]
    for positions in p_indices.values()
    for pos in positions
}

# Re-key the segmentations by representative index. Duplicated paragraphs
# collapse into a single entry (the last segmentation seen wins).
# A KeyError here means a segmented paragraph has no row in `prg_rxn_sort`.
print(len(segm_db))
segm_db = {map_idx[key]: sg for key, sg in segm_db.items()}
print(len(segm_db))

1872064
993733


## Build the list of per-segment dicts with all merged data, as discussed above (`sg_db`)

In [5]:
%%time

sg_db = []
rxn_dict = prg_rxn_sort['Reaction Smiles'].to_dict()
prd_dict = prg_rxn_sort['Product List'].to_dict()

for prg_id, prg in tqdm(segm_db.items()):
    for sgmnt in prg:
        try:
            cls = sgmnt['text class']
    
            feats = {
                'txt_sgm': sgmnt['text segment'].strip("'"),
                'src_prg': prg_id,
                'sgm_cls': sgmnt['text class'],
                'stp_ord': int(sgmnt['step order']),
            }
            
            if cls == 'reaction set-up':
                feats['rxn_smi'] = rxn_dict[prg_id]
                feats['prd_str'] = ast.literal_eval(prd_dict[prg_id])
    
            sg_db.append(feats)
        except:
            sg_db.append(feats)
            pass

print(len(sg_db))

  0%|          | 0/993733 [00:00<?, ?it/s]

2812027
CPU times: user 12.7 s, sys: 82.9 ms, total: 12.8 s
Wall time: 12.7 s


## Now let's filter out by edit distance

If the edit distance between a paragraph and its segmentation is above some threshold, the sample is considered wrong and we discard it. This way we ensure segmentation quality.

In [6]:
%%time

from utils import edit_distance_checkor

def quality_filter(i, failed_edd=1000):
    """Score the segmentation of paragraph `i` against its original text.

    Parameters
    ----------
    i
        Key used to look up both `parags_db` and `segm_db`.
    failed_edd : int, default 1000
        Value returned when the score cannot be computed (missing key, or an
        error raised by `edit_distance_checkor`).

    Returns
    -------
    Whatever `edit_distance_checkor(parags_db[i], segm_db[i])` returns
    (presumably an integer edit distance -- confirm in `utils`), or
    `failed_edd` on failure.
    """
    try:
        return edit_distance_checkor(parags_db[i], segm_db[i])
    except Exception:
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt
        # and SystemExit still stop the long scoring loop.
        return failed_edd


# Score every paragraph/segmentation pair, following `segm_db` iteration order.
edit_distances = np.array([quality_filter(prg_id) for prg_id in segm_db])
print(f"Edit distances of first 10 paragraphs/segmentations: {edit_distances[:10]}")

# How many paragraphs lie strictly above each candidate threshold.
print(f"\n\n{'edd':>4}{'Number of samples':>21}\n")
for threshold in (0, 5, 10, 50, 100, 500, 1000):
    n_above = (edit_distances > threshold).sum()
    print(f"{threshold:>4}{n_above:>10}")

# Paragraphs with edd > 10 are discarded.
remove_prgs = edit_distances > 10

# Expand the per-paragraph flag into one flag per segment, so it lines up
# position by position with `sg_db`.
# NOTE: despite its name, `keep_idx` holds True for segments to be REMOVED;
# the name is kept because the filtering cell reads it (and negates it).
keep_idx = []
for position, prg_id in enumerate(segm_db):
    flag = remove_prgs[position]
    keep_idx.extend(flag for _ in segm_db[prg_id])

print(f"\nlen of discriminator (should be same as current len of sg_db): {len(keep_idx)}\n")
print(f"len of sg_db: {len(sg_db)}\n")

Edit distances of first 10 paragraphs/segmentations: [0 0 0 0 0 1 5 2 1 3]


 edd    Number of samples

   0    453568
   5    129884
  10    113703
  50    104008
 100     92754
 500     26756
1000      3682

len of discriminator (should be same as current len of sg_db): 2812027

len of sg_db: 2812027

CPU times: user 5.08 s, sys: 0 ns, total: 5.08 s
Wall time: 5.08 s


## Finally drop the failed entries and store

In [7]:
# `keep_idx[i]` is True for segments flagged for removal, so only the
# segments whose flag is False are carried over.
sg_db_clean = []
for position, segment in enumerate(sg_db):
    if keep_idx[position]:
        continue
    sg_db_clean.append(segment)
print(f"Dropping {np.sum(keep_idx)} segments")

# Persist the filtered segments.
with open('../../data/processed/sg_db_clean.bin', 'wb') as out_file:
    pickle.dump(sg_db_clean, out_file)

print("Each entry of our dictionary looks something like this:\n")
sg_db_clean[0]

Dropping 225300 segments
Each entry of our dictionary looks something like this:



{'txt_sgm': 'Suspend anhydrous AlCl3 (156 g, 1.15 mol) in toluene (1500 mL) and cool to 2-4° C. Add, by slow addition, a solution of 4-chlorobutyryl chloride (165.5 g, 1.15 mol) in toluene (300 mL). Stir for 15 minutes and pour into stirring ice-water (2.5 L). Stir for 30 hours,',
 'src_prg': 0,
 'sgm_cls': 'reaction set-up',
 'stp_ord': 1,
 'rxn_smi': '[Al+3].[Cl-].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][CH2:8][C:9](Cl)=[O:10].[C:12]1([CH3:18])[CH:17]=[CH:16][CH:15]=[CH:14][CH:13]=1>>[Cl:5][CH2:6][CH2:7][CH2:8][C:9]([C:15]1[CH:16]=[CH:17][C:12]([CH3:18])=[CH:13][CH:14]=1)=[O:10] |f:0.1.2.3|',
 'prd_str': ['4-Chloro-1-(4-methyl-phenyl)-butan-1-one']}

In [8]:
# Number of segments kept after dropping the flagged ones.
len(sg_db_clean)

2586727