In [None]:
!pip install -q transformers
!pip install -q deepchem

[K     |████████████████████████████████| 5.8 MB 29.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 81.7 MB/s 
[K     |████████████████████████████████| 182 kB 95.2 MB/s 
[K     |████████████████████████████████| 693 kB 25.7 MB/s 
[K     |████████████████████████████████| 29.3 MB 1.3 MB/s 
[?25h

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
from deepchem.feat import SmilesTokenizer
import torch
import pandas as pd
from rdkit.Chem import MolFromSmiles, MolToSmiles

# Load the stock 'bert-base-uncased' tokenizer and masked-LM model.
# NOTE(review): `tokenizer` is rebound to a SmilesTokenizer in a later cell and
# `model` is re-created in the checkpoint-loading cell, so these two objects
# look like placeholders — confirm whether this cell is still needed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')



Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# smiles_df = pd.read_csv('./smiles_se.csv')
# Replace the BERT word-piece tokenizer with DeepChem's SMILES tokenizer, built
# from the local vocabulary file './vocab.txt'.
tokenizer = SmilesTokenizer('./vocab.txt')

In [None]:
# Read the raw dump; the pairing cell below treats it as a flat, alternating
# sequence of tokens: cid, SMILES, cid, SMILES, ...
with open('smiles.txt', 'r') as f:
  data = f.read()
# split() with no argument splits on any run of whitespace and drops empty
# tokens, so newlines, a trailing newline or doubled spaces cannot shift the
# cid/SMILES alternation (split(' ') would keep '\n' inside tokens and emit '').
data_list = data.split()

In [None]:
# Tokens alternate "cid SMILES cid SMILES ...": pair every even-positioned
# token with the token that follows it. zip() stops at the shorter slice, so a
# dangling cid without a SMILES at the end of the file is ignored.
smiles = [[cid, smi] for cid, smi in zip(data_list[0::2], data_list[1::2])]

# Passing the column names explicitly keeps the 'cid'/'smiles' columns present
# even when the input file is empty.
smiles_df = pd.DataFrame(smiles, columns=['cid', 'smiles'])
smiles_df.head()

Unnamed: 0,cid,smiles
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,2,CC(=O)OC(CC(=O)O)C[N+](C)(C)C
2,3,C1=CC(C(C(=C1)C(=O)O)O)O
3,4,CC(CN)O
4,5,C(C(=O)COP(=O)(O)O)N


In [None]:
# Sanity check: count rows whose 'smiles' value is null.
sum(smiles_df['smiles'].isnull())

0

In [None]:
# Data augmentation: for every SMILES that RDKit returns a truthy molecule for,
# keep the original string and add `size_per_smiles` randomized re-writings.
size_per_smiles = 4
smiles_list = []
for original in smiles_df['smiles']:
  mol = MolFromSmiles(original)
  if not mol:
    # Falsy parse result: leave this entry out entirely.
    continue
  randomized = [MolToSmiles(mol, doRandom=True) for _ in range(size_per_smiles)]
  smiles_list.extend([original, *randomized])
len(smiles_list)

In [None]:
class SmilesMaskDataset(torch.utils.data.Dataset):
    """Masked-language-model dataset over a sequence of SMILES strings.

    Each item is tokenized on the fly, padded/truncated to ``max_length``, and
    a random subset of its ordinary tokens is replaced by the mask token id.

    Args:
        smiles: indexable collection of SMILES strings.
        tokenizer: callable used to encode one string. When ``None`` (the
            default) the module-level ``tokenizer`` is looked up at access
            time, which is what the rest of the notebook relies on.
        max_length: padded/truncated sequence length passed to the tokenizer.
        mask_prob: per-token probability of being replaced by ``MASK_ID``.

    Each item is a dict of 1-D tensors: every key the tokenizer returned plus
    ``labels`` (a copy of the unmasked ``input_ids``).
    """

    # Hard-coded token ids used by the masking logic.
    # NOTE(review): presumably the [PAD]/[CLS]/[SEP]/[MASK] ids of vocab.txt —
    # verify against the vocabulary file.
    PAD_ID = 0
    CLS_ID = 12
    SEP_ID = 13
    MASK_ID = 14

    def __init__(self, smiles, tokenizer=None, max_length=512, mask_prob=0.15):
        self.smiles = smiles
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mask_prob = mask_prob

    def __getitem__(self, idx):
        # Fall back to the notebook-level tokenizer when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        # Read from self.smiles (not a notebook global) so the dataset always
        # serves the data it was constructed with.
        inputs = encode(self.smiles[idx], return_tensors='pt',
                        max_length=self.max_length, truncation=True,
                        padding='max_length')
        input_ids = inputs['input_ids']

        # Labels are the untouched ids.
        # NOTE(review): every position (padding included) keeps a real label,
        # so the loss is not restricted to masked tokens — confirm intended.
        inputs['labels'] = input_ids.detach().clone()

        # Pick mask_prob of the positions, never touching the special ids.
        rand = torch.rand(input_ids.shape)
        maskable = (
            (input_ids != self.CLS_ID)
            & (input_ids != self.SEP_ID)
            & (input_ids != self.PAD_ID)
        )
        input_ids[(rand < self.mask_prob) & maskable] = self.MASK_ID

        # Drop the leading batch dimension; clone() instead of
        # torch.tensor(tensor) avoids PyTorch's copy-construct UserWarning.
        return {key: val[0].detach().clone() for key, val in inputs.items()}

    def __len__(self):
        return len(self.smiles)

# Wrap the augmented SMILES list built above as a torch Dataset.
dataset = SmilesMaskDataset(smiles_list)

In [None]:
# Mount Google Drive at /content/drive; the checkpoint and result paths under
# ./drive/MyDrive/colab/ used later in this notebook live there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


학습 로그
### 1차
* basemodel: 'bert-base-uncased'
* AdamW lr = 5e-5
* dataset: pcp CIDs 1 to 10000, plus 4 randomized MolToSmiles variants of each molecule: 49970 SMILES in total
* epoch: 4
* loss: 0.205
### 2차
* basemodel: 'bert-base-uncased'
* AdamW lr = 5e-5
* dataset: pcp CIDs 1 to 10000, plus 4 randomized MolToSmiles variants of each molecule: 49970 SMILES in total
* epoch: 4
* loss: 0.00469

In [None]:
from transformers import AdamW

# Resume fine-tuning from a checkpoint stored on Google Drive.
#
# To start from scratch instead, skip the checkpoint and use:
#   optim = AdamW(model.parameters(), lr=1e-6)
#   model = model.to(device)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a GPU runtime can also be opened on a CPU-only runtime.
checkpoint = torch.load("./drive/MyDrive/colab/sm_bert_2_e_08.pt", map_location=device)

# Rebuild the architecture from the base model, then overwrite its weights with
# the fine-tuned ones from the checkpoint.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.load_state_dict(checkpoint['model_state_dict'])
# and move our model over to the selected device
model.to(device)

# The optimizer is created after the model has been moved to `device`.
# NOTE(review): load_state_dict presumably restores the saved hyper-parameters
# as well, which would override lr=1e-6 — confirm the effective learning rate.
optim = AdamW(model.parameters(), lr=1e-6)
optim.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Switch the model to training mode.
model = model.train()

# Batches of 24 dataset items, drawn with shuffle=True.
loader = torch.utils.data.DataLoader(dataset, batch_size=24, shuffle=True)

In [None]:
from tqdm import tqdm  # progress bar around the DataLoader

# Masked-LM fine-tuning: `epochs` passes over `loader`, one optimizer step per
# batch. `epoch` and `loss` stay bound afterwards and are read by the
# checkpoint-saving cells.
model.train()
epochs = 4

for epoch in range(epochs):
    progress = tqdm(loader, leave=True)
    for batch in progress:
        # Clear gradients left over from the previous step.
        optim.zero_grad()
        # Move the three tensors the model consumes onto the training device.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )
        # Forward pass; supplying labels makes the model return the loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Backward pass followed by the parameter update.
        loss.backward()
        optim.step()
        # Show the epoch number and the current batch loss on the bar.
        progress.set_description(f'Epoch {epoch}')
        progress.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[0]) for key, val in inputs.items()}
Epoch 0: 100%|██████████| 2083/2083 [22:16<00:00,  1.56it/s, loss=0.0478]
Epoch 1: 100%|██████████| 2083/2083 [22:16<00:00,  1.56it/s, loss=0.0176]
Epoch 2: 100%|██████████| 2083/2083 [22:16<00:00,  1.56it/s, loss=0.0244]
Epoch 3: 100%|██████████| 2083/2083 [22:17<00:00,  1.56it/s, loss=0.00469]


In [None]:
# Save a resumable checkpoint (model + optimizer state) to the working directory.
# NOTE(review): `epoch` is the index left by the last `for epoch in range(epochs)`
# loop, not a cumulative count across resumed runs, and `loss` is the loss of the
# final batch only.
torch.save({
  'epoch': epoch,
  'model_state_dict': model.state_dict(),
  'optimizer_state_dict': optim.state_dict(),
  'loss': loss,
}, "./sm_bert_2_e_04.pt")

In [None]:
# Save the same checkpoint structure to Google Drive.
# NOTE(review): this is the path the checkpoint-loading cell reads from, so
# running this cell overwrites the checkpoint that the run was resumed from.
torch.save({
  'epoch': epoch,
  'model_state_dict': model.state_dict(),
  'optimizer_state_dict': optim.state_dict(),
  'loss': loss,
}, "./drive/MyDrive/colab/sm_bert_2_e_08.pt")

In [None]:
# Switch the model to evaluation mode before running inference.
model = model.eval()

In [None]:
from transformers import pipeline

In [None]:
# Fill-mask pipeline around the fine-tuned model and the SMILES tokenizer.
# Use GPU 0 when CUDA is available, otherwise -1 (CPU), so the cell also runs
# on a runtime without a GPU instead of failing on a hard-coded device index.
unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer,
                    device=0 if torch.cuda.is_available() else -1)

In [None]:
# Load the table to be embedded; later cells read its 'smiles' column.
sider_df = pd.read_csv('./smiles_se.csv')

In [None]:
# Show the GPU model and current memory usage of this runtime.
!nvidia-smi

Thu Dec 15 00:19:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    52W / 400W |   3680MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Plain Python list of the SMILES strings in sider_df.
# NOTE(review): `smiles` was bound to other objects in earlier cells; from here
# on it refers to this list.
smiles = list(sider_df['smiles'])

In [None]:
# Compute one vector per SMILES in sider_df: the masked-LM logits at the last
# non-padding position of the tokenized sequence.
vectors = []
model.eval()
# no_grad: inference only, so no autograd graph is built for each forward pass.
with torch.no_grad():
  # Iterate the DataFrame column directly so the cell can be re-run safely.
  # The loop variable is deliberately still called `smiles`: a later scratch
  # cell re-tokenizes whatever string this loop leaves in that name.
  for smiles in sider_df['smiles']:
    inputs = tokenizer(smiles, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    input_ids = inputs.input_ids.to(device)
    # Pass the attention mask, as the training loop does, so that the padding
    # added up to max_length is not attended to.
    attention_mask = inputs.attention_mask.to(device)
    output = model(input_ids, attention_mask=attention_mask)
    # Number of attended tokens minus one = index of the last real token.
    last_pos = int(inputs['attention_mask'][0].sum()) - 1
    vec: torch.Tensor = output.logits[0][last_pos]
    vectors.append(vec.tolist())
  # break


In [None]:
# Attach the per-row vectors (one Python list per row) as a new 'vec' column.
sider_df['vec'] = vectors

In [None]:
# Export to CSV on Google Drive.
# NOTE(review): 'vec' holds Python lists, which CSV presumably stores as text —
# confirm consumers can parse them, or rely on the pickle export instead.
sider_df.to_csv('./drive/MyDrive/colab/sider_with_vec.csv')

In [None]:
# Export the same frame, including the 'vec' column, as a pickle on Google Drive.
sider_df.to_pickle('./drive/MyDrive/colab/sider_with_vec_08.pickle')

In [None]:
# Length of a single extracted vector.
len(vectors[0])

30522

In [None]:
# Scratch cell: forward pass on whatever `input_ids` an earlier cell left bound.
output = model(input_ids)

In [None]:
# Scratch check of the position used when extracting a vector.
# `smiles` is whatever that name was last bound to by an earlier cell.
inputs = tokenizer(smiles, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
input_ids = inputs.input_ids.to(device)
output = model(input_ids)
# NOTE(review): this indexes at sum(attention_mask), while the extraction loop
# uses sum(attention_mask) - 1 — confirm which position is intended. Not being
# the last expression of the cell, this value is not displayed either.
output.logits[0][sum(inputs['attention_mask'][0])]

# Sum of the attention mask for the first (only) sequence.
sum(inputs['attention_mask'][0])

tensor(46)

In [None]:
# Logits at index sum(attention_mask).
# NOTE(review): the extraction loop uses sum(attention_mask) - 1 — confirm
# which position is intended.
output.logits[0][sum(inputs['attention_mask'][0])]

tensor([13.4778,  7.5427,  7.2155,  ...,  4.4563,  3.4644, -0.5525],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [None]:
# Try the fill-mask pipeline on a single SMILES string.
# NOTE(review): the literal below contains no mask token, while the recorded
# output shows [MASK] positions — the output may come from a different input;
# confirm.
unmasker("CCOC(=O)C=C1CCP(CC1)C")

[[{'score': 0.35891345143318176,
   'token': 18,
   'token_str': ')',
   'sequence': '[CLS] C C O C ( = O ) C = C 1 ) C [MASK] ( C C 1 ) C [SEP]'},
  {'score': 0.333548903465271,
   'token': 22,
   'token_str': '=',
   'sequence': '[CLS] C C O C ( = O ) C = C 1 = C [MASK] ( C C 1 ) C [SEP]'},
  {'score': 0.19982106983661652,
   'token': 16,
   'token_str': 'C',
   'sequence': '[CLS] C C O C ( = O ) C = C 1 C C [MASK] ( C C 1 ) C [SEP]'},
  {'score': 0.05197840929031372,
   'token': 19,
   'token_str': 'O',
   'sequence': '[CLS] C C O C ( = O ) C = C 1 O C [MASK] ( C C 1 ) C [SEP]'},
  {'score': 0.016875997185707092,
   'token': 23,
   'token_str': 'N',
   'sequence': '[CLS] C C O C ( = O ) C = C 1 N C [MASK] ( C C 1 ) C [SEP]'}],
 [{'score': 0.8842957615852356,
   'token': 16,
   'token_str': 'C',
   'sequence': '[CLS] C C O C ( = O ) C = C 1 [MASK] C C ( C C 1 ) C [SEP]'},
  {'score': 0.027298999950289726,
   'token': 18,
   'token_str': ')',
   'sequence': '[CLS] C C O C ( = O ) C = 

In [None]:
from transformers import AdamW

# Re-create a fresh base model and optimizer; the following cells fill both
# from a saved checkpoint.
# NOTE(review): the model is not moved to `device` in this cell or the ones
# after it — confirm whether that is intended.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
optim = AdamW(model.parameters(), lr=1e-6)

In [None]:
# Load the earlier checkpoint from Google Drive. map_location='cpu' lets a
# checkpoint written on a GPU runtime open on a CPU-only runtime, and matches
# the freshly created model, which has not been moved off the CPU.
checkpoint = torch.load("./drive/MyDrive/colab/sm_bert_e_04.pt", map_location='cpu')

In [None]:
# Restore model weights and optimizer state from the loaded checkpoint, and
# recover the bookkeeping values that were stored alongside them.
model.load_state_dict(checkpoint['model_state_dict'])
optim.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']