# Define Global Variables & Config Object

In [1]:
# Root folders for the competition input files and for generated submissions.
DATA_PATH, SUBMISSION_PATH = './data/', './submit/'


class ModelCfg:
    """Notebook-wide settings, accessed as class attributes."""

    # --- training hyper-parameters ---
    seed = 42
    epochs = 5
    batch_size = 24
    n_fold = 5

    # --- output locations ---
    MODEL_PATH = './model/'
    CLEAN_DATA_PATH = './cleaned/'
    CONFIG_PATH = f'{MODEL_PATH}model_config.pth'

    # --- backbone; model names are listed at https://huggingface.co/models ---
    MODEL_NAME = 'seyonec/PubChem10M_SMILES_BPE_180k'

    # --- placeholders that later cells overwrite ---
    TOKENIZER = None
    MAX_SEQ_LENGTH = 0

# Import Modules

In [2]:
import numpy as np 
import pandas as pd
import random
import os
import dill
import matplotlib.pyplot as plt
import json
from ast import literal_eval
from tqdm.auto import tqdm
tqdm.pandas()
import re
import sys
import copy
import time
import math
import string
import pickle
import joblib
import itertools
import platform
import collections
import scipy as sp
import gc
import warnings; warnings.filterwarnings("ignore")
from scipy.stats import gmean

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

print('torch version : ' , torch.__version__)
print('tokenizers version : ', tokenizers.__version__)
print('transformers version : ', transformers.__version__)
%env TOKENIZERS_PARALLELISM=true


2022-08-19 05:12:02.801127: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


torch version :  1.11.0
tokenizers version :  0.10.3
transformers version :  4.15.0
env: TOKENIZERS_PARALLELISM=true


# Set Path

In [3]:
# Create the output directories before any later cell writes into them.
# os.makedirs(..., exist_ok=True) also creates missing parent folders and does
# not fail if the directory appears between the existence check and creation.
for _dir in (ModelCfg.MODEL_PATH, ModelCfg.CLEAN_DATA_PATH):
    if not os.path.exists(_dir):
        print(f'create directory.....[{_dir}]')
        os.makedirs(_dir, exist_ok=True)

# Set Processor 

In [4]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device : ', device)

device :  cuda


# User Define Function

- Metric 함수

In [5]:
def MY_RMSELoss(yhat, y):
    """Return the root-mean-square error between ``yhat`` and ``y``, scaled by 100.

    The squared differences are averaged over every element before the square
    root is taken, so the result is a scalar tensor.
    """
    residual = yhat - y
    mean_squared = torch.mean(residual ** 2)
    return 100 * torch.sqrt(mean_squared)

- 재현성 구현 함수

In [6]:
def reset_seeds(seed):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices); also exported as ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and turn off its auto-tuner, which
    # may otherwise select different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed the random number generators once with the configured seed.
reset_seeds(ModelCfg.seed)

# Load Data

In [7]:
def read_data_list():
    """Load the train, test and sample-submission CSV files from ``DATA_PATH``.

    All three files are read first; the shape of each frame is then printed.

    Returns:
        tuple: ``(train, test, sample_submission)`` DataFrames.
    """
    file_names = {
        'train': 'train_set.ReorgE.csv',
        'test': 'test_set.csv',
        'sample_submission': 'sample_submission.csv',
    }
    frames = {
        label: pd.read_csv(f'{DATA_PATH}{file_name}')
        for label, file_name in file_names.items()
    }

    for label, frame in frames.items():
        print(f'{label}.shape: ', frame.shape)

    return frames['train'], frames['test'], frames['sample_submission']

train, test, sample_submission = read_data_list()

train.shape:  (18157, 4)
test.shape:  (457, 2)
sample_submission.shape:  (457, 3)


# Create Tokenizer Object

In [8]:
def make_tokenizer():
    """Load the tokenizer for ``ModelCfg.MODEL_NAME`` and save a copy to ``ModelCfg.MODEL_PATH``.

    As a quick sanity check, the encoding and the token split of one sample
    SMILES string are printed.

    Returns:
        The tokenizer object returned by ``AutoTokenizer.from_pretrained``.
    """
    # ``cache_dir`` (previously misspelled, hence never applied) keeps downloads
    # in the same './cache' folder that ChemoModel uses for the model weights.
    # NOTE(review): ``normalization=True`` is kept unchanged; confirm that this
    # tokenizer actually consumes it.
    tokenizer = AutoTokenizer.from_pretrained(ModelCfg.MODEL_NAME, normalization=True, cache_dir='./cache')
    tokenizer.save_pretrained(ModelCfg.MODEL_PATH)
    print(f'tokenizer object load & save......[{ModelCfg.MODEL_NAME}]')

    # Sample SMILES string used only for the printed preview below.
    example = "Cn1c(=O)c2c(ncn2C)n(C)c1=O"
    tokens = tokenizer(example)            # ids + attention mask
    tokens_ = tokenizer.tokenize(example)  # token strings
    print(tokens)
    print(tokens_)

    return tokenizer

tokenizer = make_tokenizer()
ModelCfg.TOKENIZER = tokenizer

tokenizer object load & save......[seyonec/PubChem10M_SMILES_BPE_180k]
{'input_ids': [0, 315, 21, 71, 263, 51, 13, 71, 22, 71, 12, 358, 22, 39, 13, 82, 12, 39, 13, 71, 21, 33, 51, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['Cn', '1', 'c', '(=', 'O', ')', 'c', '2', 'c', '(', 'ncn', '2', 'C', ')', 'n', '(', 'C', ')', 'c', '1', '=', 'O']


# Preparing Dataset for Torch

- max_length 계산

In [9]:
def cacl_max_lenth(train_data):
    """Compute the sequence length needed to hold the longest SMILES string.

    Every value of ``train_data['SMILES']`` (missing values are treated as an
    empty string) is tokenized with ``ModelCfg.TOKENIZER`` without special
    tokens. The result is the longest token count plus 2.

    Args:
        train_data: DataFrame with a ``SMILES`` column.

    Returns:
        int: Longest tokenized SMILES length + 2 (2 when there are no rows).
    """
    print(f'calculate max sequence length....[{ModelCfg.MODEL_NAME}]')
    # Measure the frame that was passed in rather than the notebook-global
    # ``train``, so the function works for any DataFrame.
    smiles_values = train_data['SMILES'].fillna('').values
    SMILES_lengths = [
        len(ModelCfg.TOKENIZER(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(smiles_values, total=len(smiles_values))
    ]

    # default=0 keeps an empty frame from raising inside max().
    longest = max(SMILES_lengths, default=0)
    # + 2 leaves room for the special tokens added when prepare_input encodes
    # with add_special_tokens=True.
    max_seq_length = longest + 2
    print('SMILES max length : ', longest)
    print('final max data length : ', max_seq_length)

    return max_seq_length

max_seq_length = cacl_max_lenth(train)
ModelCfg.MAX_SEQ_LENGTH = max_seq_length

calculate max sequence length....[seyonec/PubChem10M_SMILES_BPE_180k]


  0%|          | 0/18157 [00:00<?, ?it/s]

SMILES max length :  200
final max data length :  202


- Dataset 준비

In [10]:
def prepare_input(tokenizer, max_seq_length, smiles):
    """Tokenize one SMILES string and convert every field to a ``torch.long`` tensor.

    The tokenizer is asked to add special tokens, truncate, and pad to
    ``max_seq_length``. Each field of its output is then replaced, in place, by
    a tensor, and that same mapping object is returned.
    """
    encoded = tokenizer(
        smiles,
        add_special_tokens=True,
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
        return_offsets_mapping=False,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded

def create_label(Reorg_g, Reorg_ex):
    """Pack the two target values into a float tensor ``[Reorg_g, Reorg_ex]``."""
    targets = [Reorg_g, Reorg_ex]
    return torch.tensor(targets, dtype=torch.float)

class ChemiDataset(Dataset):
    """Torch dataset yielding ``(inputs, label)`` pairs built from a SMILES column.

    NOTE: despite its name, ``is_train=True`` selects the *unlabeled* mode. The
    ``Reorg_g`` / ``Reorg_ex`` targets are read only when ``is_train`` is false
    (the default), so labeled data must be passed with ``is_train=False``.
    """

    def __init__(self, cfg, df, is_train=False):
        """Store ``cfg`` and extract the needed columns of ``df`` as arrays.

        Args:
            cfg: Object exposing ``TOKENIZER`` and ``MAX_SEQ_LENGTH``.
            df: DataFrame with a ``SMILES`` column, plus ``Reorg_g`` and
                ``Reorg_ex`` when ``is_train`` is false.
            is_train: When true, no target columns are read (see class note).
        """
        self.cfg, self.is_train = cfg, is_train
        self.smiles = df['SMILES'].values
        if self.is_train:
            return
        self.Reorg_g = df['Reorg_g'].values
        self.Reorg_ex = df['Reorg_ex'].values

    def __len__(self):
        """Number of SMILES strings held by the dataset."""
        return len(self.smiles)

    def __getitem__(self, item):
        """Return the tokenized inputs and the label for row ``item``.

        The label is the tensor built by ``create_label`` when targets were
        loaded, otherwise an empty NumPy array.
        """
        inputs = prepare_input(self.cfg.TOKENIZER, self.cfg.MAX_SEQ_LENGTH, self.smiles[item])
        if self.is_train:
            label = np.array([])
        else:
            label = create_label(self.Reorg_g[item], self.Reorg_ex[item])
        return inputs, label

In [11]:
# Preview one sample.
# is_train is left at its default (False), so the dataset also returns the
# Reorg_g / Reorg_ex label for each row.
temp_train_dataset = ChemiDataset(ModelCfg, train)
temp_train_loader = DataLoader(temp_train_dataset, batch_size=1, shuffle=False, drop_last=True)
# Pull only the first batch from the loader.
inputs, labels = next(iter(temp_train_loader))
print('inputs : ', inputs)
print('labels : ', labels.shape, labels)

inputs :  {'input_ids': tensor([[  0, 262,  63,  39,  36,  44,  65,  21, 373,  21,  39, 263,  51, 279,
          39,  36,  36,  44, 336,  39,  13, 271, 263,  51,  13,  71,  21,  71,
          12,  39,  13, 307,  12,  17,  82,  22, 276,  22,  13,  71,  21,  39,
           7,  50,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,

In [12]:
# Preview five samples.
# shuffle=True, so the five rows shown are drawn in random order.
temp_train_dataset = ChemiDataset(ModelCfg, train)
temp_train_loader = DataLoader(temp_train_dataset, batch_size=5, shuffle=True, drop_last=True)
# Pull only the first batch from the loader.
inputs, labels = next(iter(temp_train_loader))
print('inputs : ', inputs)
print('labels : ', labels, labels.shape)

inputs :  {'input_ids': tensor([[  0, 265,  21,  ...,   1,   1,   1],
        [  0, 262,  63,  ...,   1,   1,   1],
        [  0, 265,  21,  ...,   1,   1,   1],
        [  0,  51,  33,  ...,   1,   1,   1],
        [  0, 265,  21,  ...,   1,   1,   1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
labels :  tensor([[0.5942, 1.0330],
        [0.8123, 0.4900],
        [0.2900, 0.3791],
        [1.2005, 0.9994],
        [0.8034, 0.6547]]) torch.Size([5, 2])


# Define Model

In [13]:
class ChemoModel(nn.Module):
    """Transformer backbone followed by dropout and a 2-output linear layer.

    ``forward`` applies the head to the backbone's first output (the last hidden
    states) without pooling; callers select the position they need.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the backbone and the regression head.

        Args:
            cfg: Settings object; ``cfg.MODEL_NAME`` names the Hugging Face model.
            config_path: Optional path of a config saved with ``torch.save``.
                When ``None``, the config is fetched for ``cfg.MODEL_NAME``.
            pretrained: If true, load the pretrained weights of
                ``cfg.MODEL_NAME``; otherwise instantiate the architecture from
                the config alone.
        """
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(cfg.MODEL_NAME, output_hidden_states=True)

        if not pretrained:
            self.model = AutoModel.from_config(self.config)
        else:
            self.model = AutoModel.from_pretrained(cfg.MODEL_NAME, config=self.config, cache_dir='./cache')

        hidden_dim = self.config.hidden_size
        self.fc_dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, 2)

    def feature(self, inputs):
        """Run the backbone on ``inputs`` (passed as keyword arguments) and return its first output."""
        return self.model(**inputs)[0]

    def forward(self, inputs):
        """Return the head's output: dropout, then the linear layer, on ``feature(inputs)``."""
        hidden = self.feature(inputs)
        return self.fc(self.fc_dropout(hidden))

In [14]:
def get_optimizer_params(model):
    """Split the model's parameters into a weight-decayed and a non-decayed group.

    A parameter goes to the second group (``weight_decay`` 0.0) when its name
    contains ``'bias'`` or ``'LayerNorm.weight'``; every other parameter goes to
    the first group (``weight_decay`` 0.01).

    Returns:
        list[dict]: Two optimizer parameter groups, decayed group first.
    """
    exempt_markers = ('bias', 'LayerNorm.weight')
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        is_exempt = any(marker in name for marker in exempt_markers)
        (exempt if is_exempt else decayed).append(param)

    return [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': exempt, 'weight_decay': 0.0},
    ]


In [15]:
def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the linear learning-rate schedule (zero warmup steps) for ``optimizer``.

    ``cfg`` is accepted but not used by this function.
    """
    return get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_train_steps,
        num_warmup_steps=0,
    )

# Define Loss Meter Class

In [16]:
class LoseMeter(object):
    """Running tracker for a scalar: latest value, weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

# Define Functions for Training & Inference

In [17]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Run one training epoch and return the batch-size-weighted mean loss.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` batches, where ``inputs``
            is a mapping of tensors.
        model: Network called as ``model(inputs)``; index 0 along dimension 1 of
            its output is compared with ``labels``.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only in the progress-bar caption.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        float: Average loss over the epoch, weighted by batch size.
    """
    model.train()
    # NOTE(review): the gradient scaler is enabled but the forward pass is not
    # wrapped in autocast — confirm whether mixed precision was intended.
    scaler = torch.cuda.amp.GradScaler(enabled=True)
    losses = LoseMeter()
    epoch_display = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    epoch_display.set_description(f'Epoch [{epoch+1}/{ModelCfg.epochs}]')
    for step, (inputs, labels) in epoch_display:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(inputs)
        # Keep only the prediction at the first sequence position.
        y_preds = y_preds[::, 0]
        loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        # get_last_lr() is the supported accessor; calling get_lr() outside of
        # scheduler.step() makes PyTorch emit a warning.
        epoch_display.set_postfix(train_loss=losses.avg, lr=scheduler.get_last_lr()[0])

    return losses.avg

def inference_fn(test_loader, model, device):
    """Predict over ``test_loader`` and return all batch outputs as one NumPy array.

    The model is switched to eval mode and moved to ``device``. For each batch
    the forward pass runs under ``torch.no_grad()``, index 0 along dimension 1
    of the output is kept, and the per-batch results are concatenated.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for batch_inputs, _ in tqdm(test_loader, total=len(test_loader)):
        on_device = {name: tensor.to(device) for name, tensor in batch_inputs.items()}
        with torch.no_grad():
            raw_output = model(on_device)
        first_position = raw_output[:, 0]
        batch_outputs.append(first_position.to('cpu').numpy())

    return np.concatenate(batch_outputs)

# Train Loop Function

In [18]:
def train_loop(train):
    """Fine-tune ``ChemoModel`` on ``train`` and checkpoint the best epoch.

    Saves the model config, trains for ``ModelCfg.epochs`` epochs, and writes
    the model's state dict to ``ModelCfg.MODEL_PATH`` whenever an epoch's
    average training loss is the lowest seen so far.

    Args:
        train: DataFrame with ``SMILES``, ``Reorg_g`` and ``Reorg_ex`` columns.
    """
    # ====================================================
    # data loader
    # ====================================================
    x_train_ = train.copy()
    train_dataset = ChemiDataset(ModelCfg, x_train_)
    train_loader = DataLoader(train_dataset, batch_size=ModelCfg.batch_size, shuffle=True, drop_last=True)

    # ====================================================
    # model
    # ====================================================
    model = ChemoModel(ModelCfg, config_path=None, pretrained=True)
    # NOTE(review): this writes 'config.pth' while ModelCfg.CONFIG_PATH points
    # at 'model_config.pth' — confirm which file name is intended.
    torch.save(model.config, ModelCfg.MODEL_PATH + 'config.pth')
    model.to(device)

    # ====================================================
    # optimizer
    # ====================================================
    optimizer_parameters = get_optimizer_params(model)
    optimizer = AdamW(optimizer_parameters, lr=2e-5, eps=1e-8, betas=(0.9, 0.999))

    # ====================================================
    # scheduler
    # ====================================================
    # The scheduler is stepped once per batch and drop_last=True discards the
    # final partial batch, so count the loader's real batches instead of
    # deriving the step count from len(data) / batch_size.
    num_train_steps = len(train_loader) * ModelCfg.epochs
    scheduler = get_scheduler(ModelCfg, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = MY_RMSELoss
    best_score = np.inf
    for epoch in range(ModelCfg.epochs):
        start_time = time.time()

        # train
        avg_train_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # scoring
        # A one-item progress bar is used only to print a per-epoch summary line.
        epoch_display = tqdm([0], leave=True)
        for _ in epoch_display:
            epoch_display.set_description(f'Epoch [{epoch+1}/{ModelCfg.epochs}]')

            # There is no validation split here: the training loss itself is
            # the model-selection score.
            score = avg_train_loss
            if best_score > score:
                best_score = score
                torch.save({'model': model.state_dict()}, f"{ModelCfg.MODEL_PATH}{ModelCfg.MODEL_NAME.replace('/', '-')}_best.pth")

            elapsed = time.time() - start_time
            epoch_display.set_postfix(avg_train_loss=f'{avg_train_loss:.4f}', time=f'{elapsed/60:.0f}m', Score=f'{score:.4f}')

    torch.cuda.empty_cache()
    gc.collect()

# Training or Inference

In [19]:
# Read the data
print('read data.............')
train, test, sample_submission = read_data_list()

# Create the tokenizer
print('create tokenizer.............')
tokenizer = make_tokenizer()
ModelCfg.TOKENIZER = tokenizer

# Compute the maximum input length for the tokenizer
print('calculate max sequence length.............')
max_seq_length = cacl_max_lenth(train)
ModelCfg.MAX_SEQ_LENGTH = max_seq_length

# Training
print(f'TRAINING [{ModelCfg.MODEL_NAME}] MODEL.............')
train_loop(train)

# Inference
print(f'INFERENCE [{ModelCfg.MODEL_NAME}] MODEL.............')
# Third argument True: ChemiDataset then skips the Reorg_g / Reorg_ex columns.
test_dataset = ChemiDataset(ModelCfg, test, True)
test_loader = DataLoader(test_dataset, batch_size=ModelCfg.batch_size, shuffle=False, drop_last=False)
model = ChemoModel(ModelCfg, config_path=None, pretrained=True)
# Load the checkpoint written by train_loop onto the CPU; inference_fn moves the model to `device`.
state = torch.load(f"{ModelCfg.MODEL_PATH}{ModelCfg.MODEL_NAME.replace('/', '-')}_best.pth", map_location=torch.device('cpu'))
model.load_state_dict(state['model'])
prediction = inference_fn(test_loader, model, device)
# Drop the references to the model and checkpoint, then release cached memory.
del model, state
gc.collect()
torch.cuda.empty_cache()

print(f'[{ModelCfg.MODEL_NAME}] predictions shape : ', np.array(prediction).shape)


read data.............
train.shape:  (18157, 4)
test.shape:  (457, 2)
sample_submission.shape:  (457, 3)
create tokenizer.............
tokenizer object load & save......[seyonec/PubChem10M_SMILES_BPE_180k]
{'input_ids': [0, 315, 21, 71, 263, 51, 13, 71, 22, 71, 12, 358, 22, 39, 13, 82, 12, 39, 13, 71, 21, 33, 51, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['Cn', '1', 'c', '(=', 'O', ')', 'c', '2', 'c', '(', 'ncn', '2', 'C', ')', 'n', '(', 'C', ')', 'c', '1', '=', 'O']
calculate max sequence length.............
calculate max sequence length....[seyonec/PubChem10M_SMILES_BPE_180k]


  0%|          | 0/18157 [00:00<?, ?it/s]

SMILES max length :  200
final max data length :  202
TRAINING [seyonec/PubChem10M_SMILES_BPE_180k] MODEL.............


Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_180k were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/756 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/756 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/756 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/756 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/756 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

INFERENCE [seyonec/PubChem10M_SMILES_BPE_180k] MODEL.............


Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_180k were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/20 [00:00<?, ?it/s]

[seyonec/PubChem10M_SMILES_BPE_180k] predictions shape :  (457, 2)


# Ready for Submission 

In [21]:
# Copy the two predicted columns into the submission template, save it as CSV
# and show the resulting table.
fname = "./Submission_Smiles.csv"
sample_submission[['Reorg_g', 'Reorg_ex']] = prediction
sample_submission.to_csv(fname, index=False)
print("'{}' saved complete.".format(fname))

display(sample_submission)

'./Submission_Smiles.csv' saved complete.


Unnamed: 0,index,Reorg_g,Reorg_ex
0,test_0,0.352623,0.288610
1,test_1,0.844584,0.639280
2,test_2,0.557360,0.570624
3,test_3,0.361972,0.412509
4,test_4,0.332872,0.345510
...,...,...,...
452,test_452,0.274158,0.291941
453,test_453,0.192220,0.193173
454,test_454,0.383139,0.368139
455,test_455,0.384266,0.356970


# End Of Program