In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
from glob import glob
from rdkit import Chem
import torch

In [3]:
# Convert every guest structure file into a SMILES string with RDKit.
# Three views of the result are built for later cells:
#   smiles_list / name_list -- index-aligned lists (SMILES, guest name)
#   smiles_dict             -- guest name -> SMILES
files = sorted(glob('../database/structures/guest/*'))
smiles_list = []
name_list = []
smiles_dict = {}
for f in files:
    # Guest name = file name without directory or extension ('x/abc.pdb' -> 'abc').
    # os.path is used so the result does not depend on the path separator or on
    # the extension being exactly four characters long.
    name = os.path.splitext(os.path.basename(f))[0]
    try:
        mol = Chem.rdmolfiles.MolFromPDBFile(f)
        # RDKit reports an unparsable file by returning None, not by raising.
        smi = Chem.MolToSmiles(mol) if mol is not None else None
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate.
        smi = None
    if smi is None:
        # Report the guest that could not be converted and skip it.
        print(name)
        continue
    smiles_list.append(smi)
    name_list.append(name)
    smiles_dict[name] = smi

# Trailing sentinel entry: an empty SMILES stored under the name 'nan'.
# It is added to the two lists only, not to smiles_dict.
# NOTE(review): presumably a placeholder for records without a guest -- confirm.
smiles_list.append('')
name_list.append('nan')

In [6]:
import json

# Persist the guest-name -> SMILES mapping as a JSON file.
with open("../database/features/guest_smiles.json", "w") as smiles_fh:
    smiles_fh.write(json.dumps(smiles_dict))


### ChemBERTa: guest-molecule embeddings

In [4]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Options handed to simpletransformers. The wrapper is configured as a
# single-output regressor; the other entries are training/evaluation settings.
chembert_args = {
    'evaluate_each_epoch': True,
    'evaluate_during_training_verbose': True,
    'no_save': False,
    'num_train_epochs': 10,
    'regression': True,
    'auto_weights': True,
}

# Load the pretrained SMILES RoBERTa checkpoint through the simpletransformers
# wrapper. Later cells use the wrapper's tokenizer and underlying model.
chembert = ClassificationModel(
    'roberta',
    'seyonec/PubChem10M_SMILES_BPE_396_250',
    num_labels=1,
    args=chembert_args,
)


2024-06-05 16:47:22.423327: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-05 16:47:22.442611: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_396_250 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [5]:
# Keep a direct handle on the model object held by the simpletransformers
# wrapper; later cells read its `.device` and call its `.roberta` submodule.
model = chembert.model

In [6]:
# Tokenise every SMILES string into fixed-length id tensors: inputs longer than
# max_length are truncated, shorter ones are padded up to it.
tokens = chembert.tokenizer(
    smiles_list,
    add_special_tokens=True,
    truncation=True,
    max_length=256,
    padding="max_length",
    return_tensors='pt',
    return_offsets_mapping=False,
)
# return_tensors='pt' already yields torch tensors, so cast/move them with
# Tensor.to() rather than re-wrapping them in torch.tensor(), which copies the
# data and triggers a "copy construct from a tensor" UserWarning.
# list(...) snapshots the items so the mapping can be reassigned while looping.
for k, v in list(tokens.items()):
    tokens[k] = v.to(device=model.device, dtype=torch.long)

  tokens[k] = torch.tensor(v, dtype=torch.long,).to(model.device)


In [7]:
# Run only the RoBERTa encoder (not the classification head) and keep, for each
# input sequence, the hidden state at position 0 as its feature vector.
model.eval()  # inference mode for layers such as dropout
with torch.no_grad():
    encoder_out = model.roberta(
        input_ids=tokens['input_ids'],
        attention_mask=tokens['attention_mask'],
    )
    # [0] is the encoder's sequence output; [:, 0, :] picks the first token of
    # every sequence, giving one row per input SMILES.
    outputs = encoder_out[0][:, 0, :]
    # Tensor.numpy() only works on CPU tensors, so move the result to the CPU
    # first; this is a no-op when the model already runs on the CPU.
    outputs = outputs.detach().cpu().numpy()

In [8]:
# Map each guest name to its feature vector as a plain Python list so the
# result can be written out as JSON.
# zip() silently stops at the shorter input, so fail loudly when the names and
# the feature rows are out of sync (e.g. after re-running an earlier cell).
if len(name_list) != len(outputs):
    raise ValueError(
        f'name_list has {len(name_list)} entries but outputs has {len(outputs)} rows'
    )
smiles_feat = {name: feat.tolist() for name, feat in zip(name_list, outputs)}

In [13]:
import json

# Write the guest-name -> feature-vector mapping to disk as JSON.
chembert_features_path = '../database/features/chembert_guest.json'
with open(chembert_features_path, 'w') as features_fh:
    json.dump(smiles_feat, fp=features_fh)