In [1]:
import os
import gc
import random
import itertools
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel, T5EncoderModel, get_linear_schedule_with_warmup
import datasets
from datasets import load_dataset, load_metric
import sentencepiece
import argparse
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import AdamW
import pickle
import time
import math
from sklearn.preprocessing import MinMaxScaler
from datasets.utils.logging import disable_progress_bar
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
disable_progress_bar()

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=False)
#     parser.add_argument("--dataset_name", type=str, required=False)
    parser.add_argument("--pretrained_model_name_or_path", type=str, default="sagawa/ZINC-t5", required=False)
    parser.add_argument("--model_name_or_path", type=str, required=False)
    parser.add_argument("--scaler_path", type=str, default="/data2/sagawa/tcrp-regression-model-archive/10-23-1st-new-metric-reactant-product", required=False)
    parser.add_argument("--debug", action='store_true', default=False, required=False)
    parser.add_argument("--batch_size", type=int, default=5, required=False)
    parser.add_argument("--max_len", type=int, default=512, required=False)
    parser.add_argument("--num_workers", type=int, default=1, required=False)
    parser.add_argument("--fc_dropout", type=float, default=0.1, required=False)
    parser.add_argument("--output_dir", type=str, default='./', required=False)
    parser.add_argument("--seed", type=int, default=42, required=False)

    return parser.parse_args()

class CFG():
    data_path='../../all_ord_reaction_uniq_with_attr_v3.tsv'
#     pretrained_model_name_or_path = 'sagawa/ZINC-t5'
    model = 'sagawa/ZINC-t5'
    batch_size = 5 #max_lenを大きくしたらoomしたから15から5に
    seed = 42
    num_workers = 4
    output_dir = './'
    model_name_or_path = '/data2/sagawa/tcrp-regression-model-archive/10-23-1st-new-metric-reactant-product'
    scaler_path = '/data2/sagawa/tcrp-regression-model-archive/10-23-1st-new-metric-reactant-product'

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'

OUTPUT_DIR = CFG.output_dir
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed=CFG.seed)  
    


test_ds = pd.read_csv('../../regression-input-valid.csv')
display(test_ds.head())

# with open(OUTPUT_DIR+'scaler.pkl', 'rb') as f:
#     scaler = pickle.load(f)
with open(CFG.scaler_path + '/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

test_ds = test_ds[['input', 'YIELD']]

test_ds['YIELD'] = scaler.transform(test_ds['YIELD'].values.reshape(-1, 1))

Unnamed: 0,CATALYST,REACTANT,REAGENT,SOLVENT,INTERNAL_STANDARD,NoData,PRODUCT,YIELD,TEMP,input
0,,CCN(CC)CC.CCOC(=O)Cl.NCCC1(O)CCc2ccccc2C12CCCC2,,ClCCl,,,CC#CN1C(=O)C(C)Oc2ccc(-n3c(=O)cc(C(F)(F)F)[nH]...,98.0,,REACTANT:CCN(CC)CC.CCOC(=O)Cl.NCCC1(O)CCc2cccc...
1,[Pd],CC(C)(C)OC(=O)N1Cc2cc([N+](=O)[O-])ccc2C[C@H]1...,,CO,,,c1ccc2sc(-c3nccc4ccccc34)cc2c1,84.0,,REACTANT:CC(C)(C)OC(=O)N1Cc2cc([N+](=O)[O-])cc...
2,,CC(=O)NC[C@H]1CN(c2cc(F)c(N3CCS(=O)(=O)CC3)c(F...,,CC(=O)O,,,CCCCCCCCCCCC(=O)N1CCC[C@H]1C(=O)O,94.0,,REACTANT:CC(=O)NC[C@H]1CN(c2cc(F)c(N3CCS(=O)(=...
3,,C#C[Si](C)(C)C.FC(F)(F)c1ccc(-c2ccc3ncc(I)n3c2...,,,,,COC(=O)c1nc(Br)cc(Br)c1OCc1ccccc1,89.0,,REACTANT:C#C[Si](C)(C)C.FC(F)(F)c1ccc(-c2ccc3n...
4,,CC1CCC(Br)c2ncc(C(=O)O)c(=O)n21.CCCCN.Cl.ClC(C...,,O,,,Cl.Cl.Cl.NCCCC[C@H](N)C(=O)NCC(=O)Nc1ccccc1C(=...,31.0,,REACTANT:CC1CCC(Br)c2ncc(C(=O)O)c(=O)n21.CCCCN...


In [25]:
test_ds = test_ds[:500]

In [31]:
#load tokenizer
try: # load pretrained tokenizer from local directory
    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path+'/tokenizer', return_tensors='pt')
except: # load pretrained tokenizer from huggingface model hub
    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')

def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True, return_attention_mask=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    
    return inputs

class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['input'].values
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        
        return inputs
    

       
class RegressionModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            if 't5' in cfg.pretrained_model_name_or_path:
                self.model = T5EncoderModel.from_pretrained(CFG.pretrained_model_name_or_path, config=self.config)
            else:
                self.model = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path, config=self.config)
        else:
            if True:
                self.model = T5EncoderModel.from_pretrained('sagawa/ZINC-t5', config=self.config)
            else:
                self.model = AutoModel.from_config(self.config)
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
        self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
        self.fc2 = nn.Linear(self.config.hidden_size, 1)
        
    def forward(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        output = self.fc1(self.fc_dropout1(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size))
        output = self.fc2(self.fc_dropout2(output))
        return output
    

    
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions


test_dataset = TestDataset(CFG, test_ds)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []

model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
model.load_state_dict(state)


 

Some weights of the model checkpoint at sagawa/ZINC-t5 were not used when initializing T5EncoderModel: ['decoder.block.11.layer.1.EncDecAttention.k.weight', 'decoder.block.7.layer.1.layer_norm.weight', 'decoder.block.10.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.6.layer.1.EncDecAttention.v.weight', 'decoder.block.6.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.5.layer.0.layer_norm.weight', 'decoder.block.11.layer.0.SelfAttention.k.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.7.layer.0.SelfAttention.q.weight', 'decoder.block.6.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.1.layer.1.EncDecAttention.o.weight', 'decoder.block.6.layer.1.EncDecAttention.o.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.7.layer.1.EncDecAttention.k.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.4.layer.2.

<All keys matched successfully>

In [32]:
prediction = inference_fn(test_loader, model, device)
prediction

  0%|          | 0/100 [00:00<?, ?it/s]

array([[0.63496447],
       [0.88279617],
       [0.6655084 ],
       [0.62741876],
       [0.66845524],
       [0.6833414 ],
       [0.6394496 ],
       [0.59902155],
       [0.5515357 ],
       [0.35360926],
       [0.6612506 ],
       [0.6536778 ],
       [0.17100209],
       [0.76545405],
       [0.55786943],
       [0.56836164],
       [0.64050794],
       [0.507064  ],
       [0.62926114],
       [0.6844653 ],
       [0.8316356 ],
       [0.40217593],
       [0.6495297 ],
       [0.6949899 ],
       [0.5289475 ],
       [0.66726243],
       [0.5060371 ],
       [0.3532765 ],
       [0.5428307 ],
       [0.6304369 ],
       [0.6990107 ],
       [0.37147972],
       [0.5991924 ],
       [0.46930087],
       [0.7791816 ],
       [0.5646428 ],
       [0.44927165],
       [0.72419083],
       [0.7670516 ],
       [0.6665206 ],
       [0.5843389 ],
       [0.6230478 ],
       [0.52603865],
       [0.62091744],
       [0.5787926 ],
       [0.7471688 ],
       [0.5209596 ],
       [0.448

In [9]:
prediction = inference_fn(test_loader, model, device)
prediction

  0%|          | 0/100 [00:00<?, ?it/s]

array([[0.6189771 ],
       [0.59322965],
       [0.5555079 ],
       [0.5918845 ],
       [0.56737405],
       [0.5985358 ],
       [0.58216214],
       [0.60153526],
       [0.679618  ],
       [0.5753029 ],
       [0.62062585],
       [0.5948294 ],
       [0.580694  ],
       [0.5894024 ],
       [0.6379183 ],
       [0.5222249 ],
       [0.5884903 ],
       [0.547188  ],
       [0.58316964],
       [0.68957883],
       [0.61838716],
       [0.564923  ],
       [0.73818123],
       [0.5935827 ],
       [0.6658546 ],
       [0.537824  ],
       [0.66204053],
       [0.7076806 ],
       [0.60698384],
       [0.6153175 ],
       [0.63155526],
       [0.48151118],
       [0.54806495],
       [0.6024049 ],
       [0.6077259 ],
       [0.59931755],
       [0.58332306],
       [0.6408519 ],
       [0.5897757 ],
       [0.56226   ],
       [0.53998065],
       [0.5712919 ],
       [0.59482294],
       [0.669676  ],
       [0.5027402 ],
       [0.5810004 ],
       [0.5921702 ],
       [0.722

Unnamed: 0,input
0,REACTANT:CCN(CC)CC.CCOC(=O)Cl.NCCC1(O)CCc2cccc


In [7]:
from sklearn.preprocessing import MinMaxScaler
df = pd.read_csv(CFG.data)
# if CFG.debug:
#     df = df[:1000]
# df = df.dropna().reset_index(drop=True)
df = df[~df['YIELD'].isna()]
for col in ['CATALYST', 'REACTANT', 'REAGENT', 'SOLVENT', 'INTERNAL_STANDARD', 'NoData','PRODUCT', 'YIELD', 'TEMP']:
    df[col] = df[col].fillna(' ')
# df['input'] = 'REACTANT:' + df['REACTANT'] + 'PRODUCT:' + df['PRODUCT'] + 'CATALYST:' + df['CATALYST'] + 'REAGENT:' + df['REAGENT'] + 'SOLVENT:' + df['SOLVENT'] + 'NoData:' + df['NoData']
df['input'] = 'REACTANT:' + df['REACTANT'] + 'PRODUCT:' + df['PRODUCT'] + 'CATALYST:' + df['CATALYST']
train_ds = df[:int(len(df)*0.8)]
train_ds = train_ds[train_ds['YIELD']<df['YIELD'].quantile(0.999)].reset_index(drop=True)
valid_ds = df[int(len(df)*0.8):].reset_index(drop=True)

scaler = MinMaxScaler()
train_ds['YIELD'] = scaler.fit_transform(train_ds['YIELD'].values.reshape(-1, 1))
valid_ds['YIELD'] = scaler.transform(valid_ds['YIELD'].values.reshape(-1, 1))

In [15]:
valid_ds['YIELD'].describe()

count    136411.000000
mean          0.597094
std           0.255214
min           0.000000
25%           0.431193
50%           0.642202
75%           0.788991
max          20.733945
Name: YIELD, dtype: float64

In [8]:
for i in df['input'].values:
    if ' ' in i:
        print(i)
        break

REACTANT:CC(O)CCc1ccccc1.O=S(=O)(F)c1ccc(Cl)cc1PRODUCT:CC(F)CCc1ccccc1CATALYST: 


In [8]:
df.sort_values('YIELD')

Unnamed: 0,CATALYST,REACTANT,REAGENT,SOLVENT,INTERNAL_STANDARD,NoData,PRODUCT,YIELD,TEMP,input
2189652,CC(C1=C(P(C2CCCCC2)C2CCCCC2)C=C[CH]1)P(C(C)(C)...,C1CCNCC1.O=C(O)Cn1nnc(-c2cc(N3CCC(Oc4cc(F)ccc4...,CC(C)(C)[O-].[Na+],COCCOC,,,COC(=O)CC1CCc2cc(N3CCCCC3)cc3[nH]c(=O)c(=O)n1c23,0.0,80.0,REACTANT:C1CCNCC1.O=C(O)Cn1nnc(-c2cc(N3CCC(Oc4...
2189876,CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2...,CC1(C)OB(c2ccc(N3CCN(S(=O)(=O)c4ccc(Cl)cc4)CC3...,O=P([O-])([O-])[O-].[K+],C1CCOC1.O,CC(C)(C)c1ccc(-c2ccc(C(C)(C)C)cc2)cc1,,COC(=O)CC1CCc2cc(N3CCCCC3)cc3[nH]c(=O)c(=O)n1c23,0.0,100.0,REACTANT:CC1(C)OB(c2ccc(N3CCN(S(=O)(=O)c4ccc(C...
2189877,CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2...,CC1(C)OB(c2cccc(-c3nnn[nH]3)c2)OC1(C)C.Clc1ccc...,O=P([O-])([O-])[O-].[K+],C1CCOC1.O,CC(C)(C)c1ccc(-c2ccc(C(C)(C)C)cc2)cc1,,COC(=O)CC1CCc2cc(N3CCCCC3)cc3[nH]c(=O)c(=O)n1c23,0.0,100.0,REACTANT:CC1(C)OB(c2cccc(-c3nnn[nH]3)c2)OC1(C)...
1629755,COc1ccc(OC)c(P(C23CC4CC(CC(C4)C2)C3)(C23CC4CC(...,CCc1ccc(Cl)cc1.Cc1ccc(N)cc1,CN1CCCN2CCCN=C12.c1ccc2nocc2c1,CS(C)=O,,,Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1,0.0,60.0,REACTANT:CCc1ccc(Cl)cc1.Cc1ccc(N)cc1PRODUCT:Cc...
1139006,,CCCN(C(C)C)[C@H]1COc2cccc(C=O)c2C1.CN.O=C([O-]...,,CO,,,C1=CCCC=C1,0.0,,REACTANT:CCCN(C(C)C)[C@H]1COc2cccc(C=O)c2C1.CN...
...,...,...,...,...,...,...,...,...,...,...
28686,,COC(=O)c1ncn(Cc2cc(C(F)(F)F)cc(C(F)(F)F)c2)c1-...,,CCO,,,CC(C)CN=C1C(c2ccccc2)=C(c2ccccc2)C(c2ccccc2)=C...,1100.0,70.0,REACTANT:COC(=O)c1ncn(Cc2cc(C(F)(F)F)cc(C(F)(F...
2148098,,CCN(CC)CC.COc1ccc2c(c1)CCN(c1nc(Cl)nc(C)c1C)C2...,,CN(C)C=O,,,CC(C)(C)[C@H](N)C(=O)O.CN(C)CC(=O)O.NCC(=O)O.N...,2260.0,,REACTANT:CCN(CC)CC.COc1ccc2c(c1)CCN(c1nc(Cl)nc...
1178920,,CCN(CC)CC.CN(C)C(On1nnc2cccnc21)=[N+](C)C.F[P-...,,CN(C)C=O,,,COc1nc(N2CCCC2)ccc1[N+](=O)[O-],3070.0,,REACTANT:CCN(CC)CC.CN(C)C(On1nnc2cccnc21)=[N+]...
1818600,CC[C@@H]1CN2CC[C@@H]1C[C@@H]2[C@H](Oc1nnc(O[C@...,CCOC(C)=O.COC(=O)/C=C/c1ccccc1Cl.CS(N)(=O)=O.O...,,CC(C)(C)O.O,,,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,7090.0,0.0,REACTANT:CCOC(C)=O.COC(=O)/C=C/c1ccccc1Cl.CS(N...


In [23]:
df['YIELD'].describe()

count    682053.000000
mean         64.337097
std          88.808571
min           0.000000
25%          45.000000
50%          69.000000
75%          86.000000
max       69365.000000
Name: YIELD, dtype: float64

In [9]:
df['YIELD'].quantile(0.999)

110.0

In [3]:
# from transformers import AutoModelForSequenceClassification, T5EncoderModel
# model = T5EncoderModel.from_pretrained(CFG.pretrained_model_name_or_path)
# model
# # model2 = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path, num_labels=1)
# # model2 = model2.encoder

In [4]:
# from transformers import AutoConfig
# config = AutoConfig.from_pretrained('sagawa/ZINC-t5')
# a = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels = 1)
# a

In [3]:
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoConfig, AutoModel, T5EncoderModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
import time
import math

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)



def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.add_special_tokens({'additional_special_tokens': tokenizer.additional_special_tokens + ['CATALYST:', 'REACTANT:', 'REAGENT:', 'SOLVENT:', 'INTERNAL_STANDARD:', 'NoData:','PRODUCT:', 'None']})
CFG.tokenizer = tokenizer
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    
    return inputs

class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['input'].values
        self.labels = df['YIELD'].values
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        
        return inputs, label
    
class RegressionModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            if 't5' in cfg.model:
                self.model = T5EncoderModel.from_pretrained(CFG.model)
            else:
                self.model = AutoModel.from_pretrained(CFG.model)
        else:
            self.model = AutoModel.from_config(self.config)
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        
    def forward(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
#         print(last_hidden_states.shape)
        output = self.fc(self.fc_dropout(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size))
#         print(output.shape)
        return output
    
class AverageMeter(object):
    def __init__(self):
        self.reset()
        
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        
    def update(self, val, n=1):
        self.val = val
        self.sum += val*n
        self.count += n
        self.avg = self.sum/self.count
        

def asMinutes(s):
    m = math.floor(s/60)
    s -= m*60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s/(percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
#         print(y_preds.shape)
#         print(labels)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss/CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
            
    return losses.avg

def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss/CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    return losses.avg
    
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions


def train_loop(train_ds, valid_ds):
    
    train_dataset = TrainDataset(CFG, train_ds)
    valid_dataset = TrainDataset(CFG, valid_ds)
    valid_labels = valid_ds['YIELD'].values
    
    train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    
    model = RegressionModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)], 'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)], 'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if 'model' not in n], 'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters
    
    optimizer_parameters = get_optimizer_params(model, encoder_lr=CFG.lr, decoder_lr=CFG.lr, weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.lr, eps=CFG.eps, betas=CFG.betas)
    
    num_train_steps = int(len(train_ds)/CFG.batch_size*CFG.epochs)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=num_train_steps)
    
    criterion = nn.MSELoss(reduction='mean')
    best_loss = 0
    
    for epoch in range(CFG.epochs):
        start_time = time.time()

        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
        
        # prediction削除
        avg_val_loss = valid_fn(valid_loader, model, criterion, device)
        
        elapsed = time.time() - start_time
        
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
    
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            LOGGER.info(f'Epoch {epoch+1} - Save Lowest Loss: {best_loss:.4f} Model')
            torch.save(model.state_dict(), OUTPUT_DIR+f"{CFG.model}_best.pth")
    
    torch.cuda.empty_cache()
    gc.collect()

            
if __name__ == '__main__':
    train_loop(train_ds, valid_ds)
        
 

Some weights of the model checkpoint at sagawa/ZINC-t5 were not used when initializing T5EncoderModel: ['decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.2.layer.0.SelfAttention.v.weight', 'decoder.block.6.layer.1.EncDecAttention.v.weight', 'decoder.block.6.layer.1.layer_norm.weight', 'decoder.block.3.layer.0.SelfAttention.k.weight', 'decoder.block.4.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.2.layer.0.layer_norm.weight', 'decoder.block.9.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.1.layer_norm.weight', 'decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.5.layer.2.layer_norm.weight', 'decoder.block.10.layer.0.SelfAttention.v.weight', 'decoder.block.8.layer.1.layer_norm.weight', 'lm_head.weight', 'decoder.block.9.layer.1.EncDecAttention.o.weight', 'decoder.block.2.layer.1.EncDecAtten

Epoch: [1][0/36372] Elapsed 0m 0s (remain 532m 45s) Loss: 3682.0491(3682.0491) Grad: 394.8093  LR: 0.00199998  
Epoch: [1][100/36372] Elapsed 0m 30s (remain 180m 57s) Loss: 719.8199(1489.4144) Grad: 184.8497  LR: 0.00199815  
Epoch: [1][200/36372] Elapsed 1m 2s (remain 187m 16s) Loss: 317.4121(1148.1907) Grad: 213.2992  LR: 0.00199632  
Epoch: [1][300/36372] Elapsed 1m 36s (remain 193m 7s) Loss: 765.6782(1004.8354) Grad: 501.0276  LR: 0.00199448  
Epoch: [1][400/36372] Elapsed 2m 11s (remain 197m 14s) Loss: 636.9699(916.7192) Grad: 228.2921  LR: 0.00199265  
Epoch: [1][500/36372] Elapsed 2m 47s (remain 199m 42s) Loss: 807.5559(871.0504) Grad: 675.2719  LR: 0.00199082  
Epoch: [1][600/36372] Elapsed 3m 23s (remain 201m 34s) Loss: 745.6779(842.3986) Grad: 243.9361  LR: 0.00198898  
Epoch: [1][700/36372] Elapsed 3m 59s (remain 202m 52s) Loss: 790.9963(820.4609) Grad: 436.6902  LR: 0.00198715  
Epoch: [1][800/36372] Elapsed 4m 34s (remain 203m 27s) Loss: 573.8173(810.0087) Grad: 276.0745  

KeyboardInterrupt: 

In [4]:
df = pd.read_csv('../../all_ord_reaction_uniq_with_attr_v3.tsv')

df = df[~df['PRODUCT'].isna()]
for col in ['CATALYST', 'REACTANT', 'REAGENT', 'SOLVENT', 'INTERNAL_STANDARD', 'NoData','PRODUCT', 'YIELD', 'TEMP']:
    df[col] = df[col].fillna('None')
df['input'] = 'REACTANT:' + df['REACTANT']  + 'CATALYST:' + df['CATALYST'] + 'REAGENT:' + df['REAGENT'] + 'SOLVENT:' + df['SOLVENT'] + 'NoData:' + df['NoData']
# df['input'] = 'REACTANT:' + df['REACTANT'] + 'PRODUCT:' + df['PRODUCT'] + 'CATALYST:' + df['CATALYST']
train_ds = df[:int(len(df)*0.8)]
valid_ds = df[int(len(df)*0.8):]

train_ds[['input', 'PRODUCT']].to_csv('../../ord-train-debug.csv', index=False)
valid_ds[['input', 'PRODUCT']].to_csv('../../ord-test-debug.csv', index=False)
data_files = {'train': '../../ord-train-debug.csv', 'validation': '../../ord-test-debug.csv'}
dataset = load_dataset('csv', data_files=data_files)
dataset

Using custom data configuration default-3be574e6e217f2e3


Downloading and preparing dataset csv/default to /home/sagawa/.cache/huggingface/datasets/csv/default-3be574e6e217f2e3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...
Dataset csv downloaded and prepared to /home/sagawa/.cache/huggingface/datasets/csv/default-3be574e6e217f2e3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


DatasetDict({
    train: Dataset({
        features: ['input', 'PRODUCT'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['input', 'PRODUCT'],
        num_rows: 20
    })
})

In [None]:
def preprocess_function(examples):
    inputs = examples['input'] + examples['product']
    model_inputs = tokenizer(inputs, max_length=CFG.max_len, truncation=True)
    model_inputs['labels'] = float(examples['yield'])
    return model_inputs

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    labels = labels.reshape(-1, 1)
    
    mse = mean_squared_error(labels, logits)
    
    return {'mse':mse}




def compute_metrics(eval_preds):
    metric = load_metric('sacrebleu')
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)# np.where(条件式, x, y) True=>xi, False=>yiを要素と持つリストを返す
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {'bleu': result['score']}


#load tokenizer
try: # load pretrained tokenizer from local directory
    tokenizer = AutoTokenizer.from_pretrained(os.path.abspath(CFG.pretrained_model_name_or_path), return_tensors='pt')
except: # load pretrained tokenizer from huggingface model hub
    tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name_or_path, return_tensors='pt')
    
if CFG.multitask:
    tokenizer.add_special_tokens({'additional_special_tokens':tokenizer.additional_special_tokens + ['Product:', 'Reactants:']})
else:
    tokenizer.add_special_tokens({'additional_special_tokens':tokenizer.additional_special_tokens + ['Reactants:']})
tokenizer.add_tokens('.')

#load model
if CFG.model == 't5':
    try: # load pretrained model from local directory
        model = AutoModelForSeq2SeqLM.from_pretrained(os.path.abspath(CFG.pretrained_model_name_or_path), from_flax=True)
    except: # load pretrained model from huggingface model hub
        model = AutoModelForSeq2SeqLM.from_pretrained(CFG.pretrained_model_name_or_path, from_flax=True)
    model.resize_token_embeddings(len(tokenizer))
elif CFG.model == 'deberta':
    try: # load pretrained model from local directory
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(os.path.abspath(CFG.pretrained_model_name_or_path), 'roberta-large')
    except: # load pretrained model from huggingface model hub
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(os.path.abspath(CFG.pretrained_model_name_or_path), 'roberta-large')
    model.encoder.resize_token_embeddings(len(tokenizer))
    model.decoder.resize_token_embeddings(len(tokenizer))
    config_encoder = model.config.encoder
    config_decoder = model.config.decoder
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    model.config.decoder_start_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
    load_from_cache_file=False
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

args = Seq2SeqTrainingArguments(
    CFG.model,
    evaluation_strategy=CFG.evaluation_strategy,
    save_strategy=CFG.save_strategy,
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    weight_decay=CFG.weight_decay,
    save_total_limit=CFG.save_total_limit,
    num_train_epochs=CFG.epochs,
    predict_with_generate=True,
    fp16=CFG.fp16,
    disable_tqdm=CFG.disable_tqdm,
    push_to_hub=False,
    load_best_model_at_end=True
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model('./best_model')

In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['product', 'reactant'],
        num_rows: 855625
    })
    test: Dataset({
        features: ['product', 'reactant'],
        num_rows: 106952
    })
    validation: Dataset({
        features: ['product', 'reactant'],
        num_rows: 106952
    })
})

In [9]:
# train = pd.read_csv('../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-train.csv')
# validation = pd.read_csv('../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-valid.csv')
# test = pd.read_csv('../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-test.csv')
from datasets import load_dataset
data_files = {"train": "../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-train.csv", "test": "../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-test.csv", "validation":"../tcrp-test/transformer-chemical-reaction-prediciton/data/all_ord_reaction_uniq_canonicalized-valid.csv"}
ds = load_dataset('csv', data_files=data_files)
ds.to_json("../tcrp-test/transformer-chemical-reaction-prediciton/data/ord_datasets.jsonl")

Using custom data configuration default-b2d5b87c6450700e


Downloading and preparing dataset csv/default to /home/sagawa/.cache/huggingface/datasets/csv/default-b2d5b87c6450700e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/sagawa/.cache/huggingface/datasets/csv/default-b2d5b87c6450700e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

AttributeError: 'DatasetDict' object has no attribute 'to_json'

In [6]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /home/sagawa/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [7]:
from huggingface_hub import create_repo

repo_url = create_repo(name="ord-uniq-canonicalized", repo_type="dataset")
repo_url



'https://huggingface.co/datasets/sagawa/ord-uniq-canonicalized'

In [11]:
ds.push_to_hub(repo_id="ord-uniq-canonicalized")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
data_files = {"train": "../tcrp-test/transformer-chemical-reaction-prediciton/data/pubchem-10m-canonicalized-train.csv", "validation":"../tcrp-test/transformer-chemical-reaction-prediciton/data/pubchem-10m-canonicalized-valid.csv"}
ds = load_dataset('csv', data_files=data_files)
ds

Using custom data configuration default-ae5678ec841ec48f
Reusing dataset csv (/home/sagawa/.cache/huggingface/datasets/csv/default-ae5678ec841ec48f/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['smiles'],
        num_rows: 8999964
    })
    validation: Dataset({
        features: ['smiles'],
        num_rows: 999996
    })
})

In [18]:
repo_url = create_repo(name="pubchem-10m-canonicalized", repo_type="dataset")
repo_url



'https://huggingface.co/datasets/sagawa/pubchem-10m-canonicalized'

In [19]:
ds.push_to_hub(repo_id="pubchem-10m-canonicalized")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
ds['train'][:10]

{'Unnamed: 0': [10942073,
  3929867,
  1828479,
  4329395,
  14351797,
  6552409,
  20973385,
  22448411,
  22636960,
  2282403],
 'text': ['O=C1NCCN1[C@@H]1CCC[NH+](Cc2cccc(O)c2)C1',
  'CC(=O)N1c2ccc(S(=O)(=O)N3CCCC3)cc2C[C@@H]1C(=O)NCCN1CCCCCC1',
  'Cc1nc(-c2ccc(NC(=O)[C@H]3C[C@H]4CC[C@@H]3O4)cc2)cs1',
  'COc1ccc(C(=O)[C@H](C)Sc2nc(-c3cccs3)n[n-]2)cc1OC',
  'CCOC(=O)c1sc(NC(=O)CCCS(=O)(=O)c2ccc(F)cc2)nc1-c1ccccc1',
  'O=C(NC[C@H]1CCCO1)c1ccc2c(c1)ncn2-c1ccccc1',
  'O=C(c1cc(F)c(F)cc1F)N1CC[NH+]([C@H](c2ccccc2)c2ccc(F)cc2)CC1',
  'Cc1c(C(=O)N(Cc2ccccc2)[C@@H](C)CCO)cnn1-c1ccccn1',
  'C[C@@H](Oc1cccc(C#N)c1)C(=O)Nc1cc(C2CCOCC2)n[nH]1',
  'CCCCn1c(S[C@@H](C)C(=O)Nc2ccc(C(C)=O)cc2)n[nH]c1=O']}

In [35]:
train = pd.read_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-train.csv")
train = train.rename(columns={'text':'smiles'}).drop(['Unnamed: 0'], axis=1)
train.to_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-train.csv", index=False)
valid = pd.read_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-valid.csv")
valid = valid.rename(columns={'text':'smiles'}).drop(['Unnamed: 0'], axis=1)
valid.to_csv("../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-valid.csv", index=False)


In [37]:
data_files = {"train": "../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-train.csv", "validation":"../tcrp-test/transformer-chemical-reaction-prediciton/data/ZINC-canonicalized-valid.csv"}
ds = load_dataset('csv', data_files=data_files)
ds

Using custom data configuration default-681997cf8c885a68


Downloading and preparing dataset csv/default to /home/sagawa/.cache/huggingface/datasets/csv/default-681997cf8c885a68/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/sagawa/.cache/huggingface/datasets/csv/default-681997cf8c885a68/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['smiles'],
        num_rows: 20693269
    })
    validation: Dataset({
        features: ['smiles'],
        num_rows: 2299253
    })
})

In [38]:
repo_url = create_repo(name="ZINC-canonicalized", repo_type="dataset")
ds.push_to_hub(repo_id="ZINC-canonicalized")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/3 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
ds = load_dataset('sagawa/pubchem-10m-canonicalized')
ds

Downloading:   0%|          | 0.00/801 [00:00<?, ?B/s]

Using custom data configuration sagawa--pubchem-10m-canonicalized-93982af44e6a1c55


Downloading and preparing dataset csv/default (download: 250.77 MiB, generated: 463.22 MiB, post-processed: Unknown size, total: 713.99 MiB) to /home/sagawa/.cache/huggingface/datasets/sagawa___parquet/sagawa--pubchem-10m-canonicalized-93982af44e6a1c55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/26.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/237M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /home/sagawa/.cache/huggingface/datasets/sagawa___parquet/sagawa--pubchem-10m-canonicalized-93982af44e6a1c55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['smiles'],
        num_rows: 999996
    })
    train: Dataset({
        features: ['smiles'],
        num_rows: 8999964
    })
})