In [1]:
import os
import gc
import random
import itertools
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel, T5EncoderModel, get_linear_schedule_with_warmup
import datasets
from datasets import load_dataset, load_metric
import sentencepiece
import argparse
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import AdamW
import pickle
import time
import math
from sklearn.preprocessing import MinMaxScaler
from datasets.utils.logging import disable_progress_bar
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
disable_progress_bar()

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=False)
#     parser.add_argument("--dataset_name", type=str, required=False)
    parser.add_argument("--pretrained_model_name_or_path", type=str, default="sagawa/ZINC-t5", required=False)
    parser.add_argument("--model_name_or_path", type=str, required=False)
    parser.add_argument("--scaler_path", type=str, default="/data2/sagawa/tcrp-regression-model-archive/10-23-1st-new-metric-reactant-product", required=False)
    parser.add_argument("--debug", action='store_true', default=False, required=False)
    parser.add_argument("--batch_size", type=int, default=5, required=False)
    parser.add_argument("--max_len", type=int, default=512, required=False)
    parser.add_argument("--num_workers", type=int, default=1, required=False)
    parser.add_argument("--fc_dropout", type=float, default=0.1, required=False)
    parser.add_argument("--output_dir", type=str, default='./', required=False)
    parser.add_argument("--seed", type=int, default=42, required=False)

    return parser.parse_args()

class CFG():
    data_path='../../all_ord_reaction_uniq_with_attr_v3.tsv'
#     pretrained_model_name_or_path = 'sagawa/ZINC-t5'
    model = 'sagawa/ZINC-t5'
    batch_size = 5 #max_lenを大きくしたらoomしたから15から5に
    seed = 42
    num_workers = 4
    output_dir = './'
    model_name_or_path = '/data2/sagawa/tcrp-regression-model-archive/10-23-1st-new-metric-reactant-product'
    scaler_path = '/data2/sagawa/tcrp-regression-model-archive/10-23-1st-new-metric-reactant-product'

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'

OUTPUT_DIR = CFG.output_dir
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed=CFG.seed)  
    


test_ds = pd.read_csv('../../regression-input-valid.csv')
display(test_ds.head())

# with open(OUTPUT_DIR+'scaler.pkl', 'rb') as f:
#     scaler = pickle.load(f)
with open(CFG.scaler_path + '/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

test_ds = test_ds[['input', 'YIELD']]

test_ds['YIELD'] = scaler.transform(test_ds['YIELD'].values.reshape(-1, 1))

Unnamed: 0,CATALYST,REACTANT,REAGENT,SOLVENT,INTERNAL_STANDARD,NoData,PRODUCT,YIELD,TEMP,input
0,,CCN(CC)CC.CCOC(=O)Cl.NCCC1(O)CCc2ccccc2C12CCCC2,,ClCCl,,,CC#CN1C(=O)C(C)Oc2ccc(-n3c(=O)cc(C(F)(F)F)[nH]...,98.0,,REACTANT:CCN(CC)CC.CCOC(=O)Cl.NCCC1(O)CCc2cccc...
1,[Pd],CC(C)(C)OC(=O)N1Cc2cc([N+](=O)[O-])ccc2C[C@H]1...,,CO,,,c1ccc2sc(-c3nccc4ccccc34)cc2c1,84.0,,REACTANT:CC(C)(C)OC(=O)N1Cc2cc([N+](=O)[O-])cc...
2,,CC(=O)NC[C@H]1CN(c2cc(F)c(N3CCS(=O)(=O)CC3)c(F...,,CC(=O)O,,,CCCCCCCCCCCC(=O)N1CCC[C@H]1C(=O)O,94.0,,REACTANT:CC(=O)NC[C@H]1CN(c2cc(F)c(N3CCS(=O)(=...
3,,C#C[Si](C)(C)C.FC(F)(F)c1ccc(-c2ccc3ncc(I)n3c2...,,,,,COC(=O)c1nc(Br)cc(Br)c1OCc1ccccc1,89.0,,REACTANT:C#C[Si](C)(C)C.FC(F)(F)c1ccc(-c2ccc3n...
4,,CC1CCC(Br)c2ncc(C(=O)O)c(=O)n21.CCCCN.Cl.ClC(C...,,O,,,Cl.Cl.Cl.NCCCC[C@H](N)C(=O)NCC(=O)Nc1ccccc1C(=...,31.0,,REACTANT:CC1CCC(Br)c2ncc(C(=O)O)c(=O)n21.CCCCN...


In [25]:
test_ds = test_ds[:500]

In [31]:
#load tokenizer
try: # load pretrained tokenizer from local directory
    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path+'/tokenizer', return_tensors='pt')
except: # load pretrained tokenizer from huggingface model hub
    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')

def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True, return_attention_mask=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    
    return inputs

class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['input'].values
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        
        return inputs
    

       
class RegressionModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            if 't5' in cfg.pretrained_model_name_or_path:
                self.model = T5EncoderModel.from_pretrained(CFG.pretrained_model_name_or_path, config=self.config)
            else:
                self.model = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path, config=self.config)
        else:
            if True:
                self.model = T5EncoderModel.from_pretrained('sagawa/ZINC-t5', config=self.config)
            else:
                self.model = AutoModel.from_config(self.config)
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
        self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
        self.fc2 = nn.Linear(self.config.hidden_size, 1)
        
    def forward(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        output = self.fc1(self.fc_dropout1(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size))
        output = self.fc2(self.fc_dropout2(output))
        return output
    

    
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions


test_dataset = TestDataset(CFG, test_ds)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []

model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
model.load_state_dict(state)


 

Some weights of the model checkpoint at sagawa/ZINC-t5 were not used when initializing T5EncoderModel: ['decoder.block.11.layer.1.EncDecAttention.k.weight', 'decoder.block.7.layer.1.layer_norm.weight', 'decoder.block.10.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.6.layer.1.EncDecAttention.v.weight', 'decoder.block.6.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.5.layer.0.layer_norm.weight', 'decoder.block.11.layer.0.SelfAttention.k.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.7.layer.0.SelfAttention.q.weight', 'decoder.block.6.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.1.layer.1.EncDecAttention.o.weight', 'decoder.block.6.layer.1.EncDecAttention.o.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.7.layer.1.EncDecAttention.k.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.4.layer.2.

<All keys matched successfully>

In [32]:
prediction = inference_fn(test_loader, model, device)
prediction

  0%|          | 0/100 [00:00<?, ?it/s]

array([[0.63496447],
       [0.88279617],
       [0.6655084 ],
       [0.62741876],
       [0.66845524],
       [0.6833414 ],
       [0.6394496 ],
       [0.59902155],
       [0.5515357 ],
       [0.35360926],
       [0.6612506 ],
       [0.6536778 ],
       [0.17100209],
       [0.76545405],
       [0.55786943],
       [0.56836164],
       [0.64050794],
       [0.507064  ],
       [0.62926114],
       [0.6844653 ],
       [0.8316356 ],
       [0.40217593],
       [0.6495297 ],
       [0.6949899 ],
       [0.5289475 ],
       [0.66726243],
       [0.5060371 ],
       [0.3532765 ],
       [0.5428307 ],
       [0.6304369 ],
       [0.6990107 ],
       [0.37147972],
       [0.5991924 ],
       [0.46930087],
       [0.7791816 ],
       [0.5646428 ],
       [0.44927165],
       [0.72419083],
       [0.7670516 ],
       [0.6665206 ],
       [0.5843389 ],
       [0.6230478 ],
       [0.52603865],
       [0.62091744],
       [0.5787926 ],
       [0.7471688 ],
       [0.5209596 ],
       [0.448

Unnamed: 0,input
0,REACTANT:CCN(CC)CC.CCOC(=O)Cl.NCCC1(O)CCc2cccc


In [9]:
# train = pd.read_csv('../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-train.csv')
# validation = pd.read_csv('../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-valid.csv')
# test = pd.read_csv('../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-test.csv')
from datasets import load_dataset
data_files = {"train": "../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-train.csv", "test": "../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-test.csv", "validation":"../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-valid.csv"}
ds = load_dataset('csv', data_files=data_files)
ds.to_json("../tcrp-test/transformer-chemical-reaction-prediciton/data/ord_datasets.jsonl")

Using custom data configuration default-b2d5b87c6450700e


Downloading and preparing dataset csv/default to /home/sagawa/.cache/huggingface/datasets/csv/default-b2d5b87c6450700e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/sagawa/.cache/huggingface/datasets/csv/default-b2d5b87c6450700e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

AttributeError: 'DatasetDict' object has no attribute 'to_json'

In [6]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /home/sagawa/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [7]:
from huggingface_hub import create_repo

repo_url = create_repo(name="ord-uniq-canonicalized", repo_type="dataset")
repo_url



'https://huggingface.co/datasets/sagawa/ord-uniq-canonicalized'

In [11]:
ds.push_to_hub(repo_id="ord-uniq-canonicalized")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
data_files = {"train": "../tcrp-test/transformer-chemical-reaction-prediciton/data/pubchem-10m-canonicalized-train.csv", "validation":"../tcrp-test/transformer-chemical-reaction-prediciton/data/pubchem-10m-canonicalized-valid.csv"}
ds = load_dataset('csv', data_files=data_files)
ds

Using custom data configuration default-ae5678ec841ec48f
Reusing dataset csv (/home/sagawa/.cache/huggingface/datasets/csv/default-ae5678ec841ec48f/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['smiles'],
        num_rows: 8999964
    })
    validation: Dataset({
        features: ['smiles'],
        num_rows: 999996
    })
})

In [18]:
repo_url = create_repo(name="pubchem-10m-canonicalized", repo_type="dataset")
repo_url



'https://huggingface.co/datasets/sagawa/pubchem-10m-canonicalized'

In [19]:
ds.push_to_hub(repo_id="pubchem-10m-canonicalized")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
ds['train'][:10]

{'Unnamed: 0': [10942073,
  3929867,
  1828479,
  4329395,
  14351797,
  6552409,
  20973385,
  22448411,
  22636960,
  2282403],
 'text': ['O=C1NCCN1[C@@H]1CCC[NH+](Cc2cccc(O)c2)C1',
  'CC(=O)N1c2ccc(S(=O)(=O)N3CCCC3)cc2C[C@@H]1C(=O)NCCN1CCCCCC1',
  'Cc1nc(-c2ccc(NC(=O)[C@H]3C[C@H]4CC[C@@H]3O4)cc2)cs1',
  'COc1ccc(C(=O)[C@H](C)Sc2nc(-c3cccs3)n[n-]2)cc1OC',
  'CCOC(=O)c1sc(NC(=O)CCCS(=O)(=O)c2ccc(F)cc2)nc1-c1ccccc1',
  'O=C(NC[C@H]1CCCO1)c1ccc2c(c1)ncn2-c1ccccc1',
  'O=C(c1cc(F)c(F)cc1F)N1CC[NH+]([C@H](c2ccccc2)c2ccc(F)cc2)CC1',
  'Cc1c(C(=O)N(Cc2ccccc2)[C@@H](C)CCO)cnn1-c1ccccn1',
  'C[C@@H](Oc1cccc(C#N)c1)C(=O)Nc1cc(C2CCOCC2)n[nH]1',
  'CCCCn1c(S[C@@H](C)C(=O)Nc2ccc(C(C)=O)cc2)n[nH]c1=O']}

In [35]:
train = pd.read_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-train.csv")
train = train.rename(columns={'text':'smiles'}).drop(['Unnamed: 0'], axis=1)
train.to_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-train.csv", index=False)
valid = pd.read_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-valid.csv")
valid = valid.rename(columns={'text':'smiles'}).drop(['Unnamed: 0'], axis=1)
valid.to_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-valid.csv", index=False)


In [37]:
data_files = {"train": "../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-train.csv", "validation":"../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-valid.csv"}
ds = load_dataset('csv', data_files=data_files)
ds

Using custom data configuration default-681997cf8c885a68


Downloading and preparing dataset csv/default to /home/sagawa/.cache/huggingface/datasets/csv/default-681997cf8c885a68/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/sagawa/.cache/huggingface/datasets/csv/default-681997cf8c885a68/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['smiles'],
        num_rows: 20693269
    })
    validation: Dataset({
        features: ['smiles'],
        num_rows: 2299253
    })
})

In [38]:
repo_url = create_repo(name="ZINC-canonicalized", repo_type="dataset")
ds.push_to_hub(repo_id="ZINC-canonicalized")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/3 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
ds = load_dataset('sagawa/pubchem-10m-canonicalized')
ds

Downloading:   0%|          | 0.00/801 [00:00<?, ?B/s]

Using custom data configuration sagawa--pubchem-10m-canonicalized-93982af44e6a1c55


Downloading and preparing dataset csv/default (download: 250.77 MiB, generated: 463.22 MiB, post-processed: Unknown size, total: 713.99 MiB) to /home/sagawa/.cache/huggingface/datasets/sagawa___parquet/sagawa--pubchem-10m-canonicalized-93982af44e6a1c55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/26.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/237M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /home/sagawa/.cache/huggingface/datasets/sagawa___parquet/sagawa--pubchem-10m-canonicalized-93982af44e6a1c55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['smiles'],
        num_rows: 999996
    })
    train: Dataset({
        features: ['smiles'],
        num_rows: 8999964
    })
})