# Fine-tuning ChemGPT to predict the HOMO–LUMO gap on PCQM4Mv2

In [1]:
import torch
import torch.nn
from tqdm import tqdm
import os
import sys
sys.path.insert(0, '/home/shayan/phoenix/graphite/')
import numpy
import numpy as np
import pandas
from graphite.utilities.miscellaneous import count_parameters
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
from transformers import DataCollatorForLanguageModeling

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both are flattened to
            column vectors, so this function assumes a single regression
            target per example.

    Returns:
        dict: ``"mse"``, ``"rmse"``, ``"mae"``, ``"r2"`` and ``"smape"``
        (symmetric mean absolute percentage error, in percent).
    """
    logits, labels = eval_pred
    # Predictions may arrive wrapped in a tuple when the model returns extra
    # outputs; the first element is taken as the regression output.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Force both arrays to shape (N, 1). Without this, (N,) predictions would
    # broadcast against (N, 1) labels into an (N, N) matrix in the SMAPE term.
    logits = np.asarray(logits).reshape(-1, 1)
    labels = np.asarray(labels).reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    # Derived from the MSE instead of `squared=False`, which newer
    # scikit-learn releases deprecate and then remove.
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)

    # SMAPE: a term whose label and prediction are both zero is a perfect
    # prediction and contributes 0 instead of the NaN produced by 0/0.
    numerator = 2.0 * np.abs(logits - labels)
    denominator = np.abs(labels) + np.abs(logits)
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator, dtype=float),
        where=denominator != 0,
    )
    smape = float(ratio.sum() * 100 / len(labels))

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [2]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# The tokenizer is created inside PCQM4Mv2SMILESDataset, so only the model is
# loaded here. It is requested with a single output label and the
# 'regression' problem type.
model = AutoModelForSequenceClassification.from_pretrained(
    "ncfrey/ChemGPT-19M",
    num_labels=1,
    problem_type='regression',
)
model = model.to(device)

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at ncfrey/ChemGPT-19M and are newly initialized: ['transformer.h.19.attn.attention.bias', 'transformer.h.21.attn.attention.bias', 'transformer.h.15.attn.attention.bias', 'transformer.h.9.attn.attention.bias', 'transformer.h.23.attn.attention.bias', 'transformer.h.5.attn.attention.bias', 'score.weight', 'transformer.h.11.attn.attention.bias', 'transformer.h.17.attn.attention.bias', 'transformer.h.1.attn.attention.bias', 'transformer.h.7.attn.attention.bias', 'transformer.h.3.attn.attention.bias', 'transformer.h.13.attn.attention.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Sanity check: parameter count reported for the loaded model.
count_parameters(model) # 19M

19635968

In [5]:
class PCQM4Mv2SMILESDataset(torch.utils.data.Dataset):
    """One split of PCQM4Mv2 as tokenized SMILES strings with gap labels.

    Each item is a dict with ``input_ids`` (token ids padded/truncated to
    ``max_length``) and ``labels`` (a float32 tensor of shape ``(1,)`` holding
    the ``homolumogap`` value of the molecule).

    Args:
        split: Key into the split dictionary (e.g. ``'train'``, ``'valid'``).
        filepath: CSV file with at least ``smiles`` and ``homolumogap`` columns.
        split_dict_path: ``torch.save``-d mapping from split name to row indices.
        max_length: Length every token sequence is padded/truncated to.

    Side effects:
        If the tokenizer has no pad token, a ``'[PAD]'`` token is registered
        and the module-level ``model`` is modified (see ``__init__``).
    """

    def __init__(self, split, filepath='/data/pcqm4mv2_datahub/datasets/2d/pcqm4m-v2/raw/data.csv.gz',
                 split_dict_path='/data/pcqm4mv2_datahub/datasets/2d/pcqm4m-v2/split_dict.pt',
                 max_length=142):
        super().__init__()
        self.max_length = max_length

        idx = np.asarray(torch.load(split_dict_path)[split])
        # Only the two columns that are actually used are parsed.
        data = pandas.read_csv(filepath, usecols=['smiles', 'homolumogap'])
        # Select rows by position and renumber them 0..n-1. Without the reset
        # the series keep the original CSV row labels, and `series[i]` in
        # __getitem__ would look up the wrong row (or raise KeyError) for any
        # split that does not start at row 0.
        self.smiles = data['smiles'].iloc[idx].reset_index(drop=True)
        self.labels = data['homolumogap'].iloc[idx].reset_index(drop=True)

        tokenizer = AutoTokenizer.from_pretrained("ncfrey/ChemGPT-19M", padding='max_length', truncation=True, max_length=max_length)
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # NOTE(review): this mutates the module-level `model`, so the
            # model cell must have run before a dataset is constructed.
            # NOTE(review): `model.pad_token` looks like a plain attribute with
            # no effect — confirm and remove.
            model.pad_token = '[PAD]'
            # NOTE(review): hard-coded id; confirm it equals
            # tokenizer.pad_token_id for this checkpoint.
            model.config.pad_token_id = 1
        # Always keep the tokenizer. It used to be stored only inside the
        # branch above, so a tokenizer that already had a pad token left the
        # dataset without `self.tokenizer`.
        self.tokenizer = tokenizer

    def tokenize_function(self, item):
        """Tokenize ``item['text']`` and return ``input_ids`` plus ``item['labels']``."""
        # NOTE(review): raw SMILES are fed to the tokenizer; ChemGPT is
        # reportedly pretrained on SELFIES — confirm this is intended.
        encoded = self.tokenizer(item["text"], padding="max_length", truncation=True, max_length=self.max_length)
        # Only the token ids are forwarded; other tokenizer outputs are dropped.
        output = {k: v for k, v in encoded.items() if k in ['input_ids']}
        output['labels'] = item['labels']
        return output

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx`` within the split."""
        item = dict(text=self.smiles[idx])
        item["labels"] = torch.tensor([self.labels[idx]]).float()
        return self.tokenize_function(item)

    def __len__(self):
        """Number of examples in the split."""
        return len(self.labels)
    
        

In [6]:
# Build the training split first, then the validation split.
train_dataset, valid_dataset = [
    PCQM4Mv2SMILESDataset(split_name) for split_name in ('train', 'valid')
]

Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.


In [7]:
# Specify the arguments for the trainer.
training_args = TrainingArguments(
    output_dir ='./bert_outputs',
    num_train_epochs = 30,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = './bert_outputs/logs',
    save_total_limit = 10,
    # Evaluation and checkpointing both happen once per epoch so that the
    # best checkpoint can be reloaded when training finishes.
    load_best_model_at_end = True,
    metric_for_best_model = 'rmse',
    # RMSE is an error: lower is better. Without this flag a custom metric
    # is treated as "greater is better", and the checkpoint with the highest
    # RMSE would be kept as the best model.
    greater_is_better = False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch"
)

# Build the Trainer: fine-tune `model` on the training split and report the
# regression metrics on the validation split.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    compute_metrics = compute_metrics_for_regression,
)

In [8]:
# Scratch check: half the number of training examples.
len(train_dataset) // 2

1689303

In [9]:
# Override the private `_n_gpu` field on the trainer's arguments.
# NOTE(review): presumably forces single-GPU training on a multi-GPU machine;
# this relies on a private attribute — confirm it still works after upgrades.
trainer.args._n_gpu = 1

In [10]:
# Train the model with the arguments configured above (30 epochs requested).
trainer.train()

***** Running training *****
  Num examples = 3378606
  Num Epochs = 30
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1583730
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshayanfazeli[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss



KeyboardInterrupt



In [None]:

# Run evaluation with the trainer (its eval_dataset is the validation split)
# and display the returned result.
trainer.evaluate()