In [None]:
### Raphael Mourad
### Associate Professor
### University Paul Sabatier / INRAE MIAT Lab Toulouse
### 22/01/2024

In [None]:
# Script to fine tune mixtral-dna labeled data.

In [1]:
### LOAD PYTHON MODULES
# Load basic modules
import os
import sys
import time
from os import path
import gc

# Load data and machine learning modules
import numpy as np
import pandas as pd
from random import randrange
from progressbar import ProgressBar

import torch
import triton
import transformers
from torch.utils.data import TensorDataset, DataLoader
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, set_seed, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Print numpy version for compatibility
print(sys.version)
print(np.__version__)
print(triton.__version__)
print(transformers.__version__)
print(torch.cuda.get_device_name(0))

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

  from .autonotebook import tqdm as notebook_tqdm


3.8.18 | packaged by conda-forge | (default, Dec 23 2023, 17:21:28) 
[GCC 12.3.0]
1.23.5
2.2.0
4.37.2
NVIDIA GeForce RTX 3090


In [2]:
### CHECK ENV
print(sys.prefix)

/home/mourad/miniconda3/envs/mistral_dna


In [3]:
### SET DIRECTORY
os.chdir("/home/mourad/Téléchargements/Mistral-chem/")
print(os.getcwd())

/home/mourad/Téléchargements/Mistral-chem


In [4]:
### SPECIFY PARAMETERS
model_name="RaphaelMourad/mixtral-chem-v0.4" 
lora=True

In [5]:
### LOAD FUNCTIONS MODULE
sys.path.append("scriptPython/")
from functions_chem import *

In [6]:
# TRAINING PARAMETERS
data_args=DataArguments()
print(data_args)

model_args=ModelArguments()
model_args.use_lora=lora
print(model_args)

training_args=TrainingArguments
training_args.deepspeed_plugin=None
training_args.run_name=model_name
training_args.model_max_length=1024 # max sequence length (can be increased)
training_args.gradient_accumulation_steps=1
training_args.learning_rate=2e-5
training_args.num_train_epochs=10
training_args.fp16=True 
training_args.save_steps=5000
training_args.evaluation_strategy="epoch"
training_args.warmup_steps=50
training_args.load_best_model_at_end=True
training_args.logging_steps=100000
training_args.find_unused_parameters=False

# Other arguments to add since it was bugging
bs=32
training_args.device=torch.device('cuda:0')
training_args.report_to=["tensorboard"]
training_args.world_size=1
training_args.train_batch_size=bs
training_args.eval_batch_size=bs
training_args.test_batch_size=bs
training_args.batch_size=bs
training_args.num_training_steps=100
training_args.n_gpu=1
training_args.distributed_state=None
training_args.local_rank=-1 # -1
training_args.metric_for_best_model="eval_loss"

training_args.fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False, 'xla_device': 'cpu'}
training_args.lr_scheduler_kwargs={}
training_args

DataArguments(data_path=None, kmer=-1)
ModelArguments(model_name_or_path='facebook/opt-125m', use_lora=True, lora_r=8, lora_alpha=32, lora_dropout=0.05, lora_target_modules='query,value')


functions_chem.TrainingArguments

In [7]:
# CONFIG QUANTIZATION
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
bnb_config

BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [8]:
# CONFIG ACCELERATE
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [9]:
# CONFIG LORA
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )

In [10]:
# LOAD DATA
#labeled_data=pd.read_csv("data/chemistry/finetune/antibiotic_ecoli_growth.csv",sep=',')
# "pIC50" - is a measurement used in pharmacology and drug discovery 
# to assess the potency of a compound in inhibiting a specific biological target or enzyme
chem_data=pd.read_csv("data/chemistry/finetune/SMILES_Big_Data_Set.csv",sep=',')
chem_data["label"]=chem_data["pIC50"]>1
chem_data["label"]=chem_data["label"].astype("int")
print(np.sum(chem_data["label"]))
display(chem_data)

print(np.sum(chem_data["label"]))
print(np.sum(chem_data["label"]==0))
display(chem_data)

traintmp_df, test_df = train_test_split(chem_data, test_size=0.2, random_state=42)

# Make balanced data
#traintmp_df_pos=traintmp_df[traintmp_df.label==1]
#traintmp_df_neg=traintmp_df[traintmp_df.label==0]
#traintmp_df_neg=traintmp_df_neg.sample(len(traintmp_df_pos))
#traintmp_df=pd.concat((traintmp_df_pos,traintmp_df_neg))
#traintmp_df

train_df, valid_df = train_test_split(traintmp_df, test_size=0.2, random_state=42)
print(np.sum(train_df["label"]))
print(np.sum(valid_df["label"]))
print(np.sum(test_df["label"]))

1911


Unnamed: 0,text,pIC50,mol,num_atoms,logP,label
0,O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1,4.26,<rdkit.Chem.rdchem.Mol object at 0x7f59df45bc30>,25,4.15910,1
1,O=c1cc(-c2nc(-c3ccc(-c4cn(CCP(=O)(O)O)nn4)cc3)...,4.34,<rdkit.Chem.rdchem.Mol object at 0x7f59a320c9e0>,36,3.67430,1
2,NC(=O)c1ccc2c(c1)nc(C1CCC(O)CC1)n2CCCO,4.53,<rdkit.Chem.rdchem.Mol object at 0x7f59a320cac0>,23,1.53610,1
3,NCCCn1c(C2CCNCC2)nc2cc(C(N)=O)ccc21,4.56,<rdkit.Chem.rdchem.Mol object at 0x7f59a320cba0>,22,0.95100,1
4,CNC(=S)Nc1cccc(-c2cnc3ccccc3n2)c1,4.59,<rdkit.Chem.rdchem.Mol object at 0x7f59a320c7b0>,21,3.21300,1
...,...,...,...,...,...,...
16082,S=C(NN=C(c1ccccn1)c1ccccn1)Nc1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314ed50>,24,3.21560,0
16083,S=C=NCCCCCCCCCCc1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314edc0>,19,5.45270,0
16084,S=C=NCCCCCCCCc1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314ee30>,17,4.67250,0
16085,S=c1[nH]nc(Cn2ccc3ccccc32)n1-c1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314eea0>,22,3.93289,0


1911
14176


Unnamed: 0,text,pIC50,mol,num_atoms,logP,label
0,O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1,4.26,<rdkit.Chem.rdchem.Mol object at 0x7f59df45bc30>,25,4.15910,1
1,O=c1cc(-c2nc(-c3ccc(-c4cn(CCP(=O)(O)O)nn4)cc3)...,4.34,<rdkit.Chem.rdchem.Mol object at 0x7f59a320c9e0>,36,3.67430,1
2,NC(=O)c1ccc2c(c1)nc(C1CCC(O)CC1)n2CCCO,4.53,<rdkit.Chem.rdchem.Mol object at 0x7f59a320cac0>,23,1.53610,1
3,NCCCn1c(C2CCNCC2)nc2cc(C(N)=O)ccc21,4.56,<rdkit.Chem.rdchem.Mol object at 0x7f59a320cba0>,22,0.95100,1
4,CNC(=S)Nc1cccc(-c2cnc3ccccc3n2)c1,4.59,<rdkit.Chem.rdchem.Mol object at 0x7f59a320c7b0>,21,3.21300,1
...,...,...,...,...,...,...
16082,S=C(NN=C(c1ccccn1)c1ccccn1)Nc1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314ed50>,24,3.21560,0
16083,S=C=NCCCCCCCCCCc1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314edc0>,19,5.45270,0
16084,S=C=NCCCCCCCCc1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314ee30>,17,4.67250,0
16085,S=c1[nH]nc(Cn2ccc3ccccc32)n1-c1ccccc1,0.00,<rdkit.Chem.rdchem.Mol object at 0x7f59a314eea0>,22,3.93289,0


1227
319
365


In [11]:
# LOAD TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side  = 'left'
print(tokenizer)

tokenizer_config.json: 100%|██████████████████████████████| 1.14k/1.14k [00:00<00:00, 123kB/s]
tokenizer.json: 100%|█████████████████████████████████████| 38.2k/38.2k [00:00<00:00, 486kB/s]


PreTrainedTokenizerFast(name_or_path='RaphaelMourad/mixtral-chem-v0.3', vocab_size=1024, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [12]:
# MAKE DATA 
train_encodings = tokenizer(train_df["text"].values.tolist(), return_tensors="pt",padding=True,truncation=True)
val_encodings = tokenizer(valid_df["text"].values.tolist(), return_tensors="pt",padding=True,truncation=True)
test_encodings = tokenizer(test_df["text"].values.tolist(), return_tensors="pt",padding=True,truncation=True)

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_df["label"].values.tolist())
val_dataset = NewsDataset(val_encodings, valid_df["label"].values.tolist())
test_dataset = NewsDataset(test_encodings, test_df["label"].values.tolist())

# define datasets and data collator      
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
### CREATE AND TRAIN MODEL
num_labels=2

model=transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    output_hidden_states=False,
    quantization_config=bnb_config,
    device_map='cuda:0',
)

model.config.pad_token_id = tokenizer.pad_token_id
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model = accelerator.prepare_model(model)

# Setup trainer
trainer = transformers.Trainer(model=model,
                               args=training_args,
                               compute_metrics=compute_metrics,
                               train_dataset=train_dataset, 
                               eval_dataset=val_dataset,
                               data_collator=data_collator,
                              callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
                              )        
trainer.local_rank=training_args.local_rank
trainer.train()

Some weights of MixtralForSequenceClassification were not initialized from the model checkpoint at RaphaelMourad/mixtral-chem-v0.3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
I0000 00:00:1712080012.005774   19817 cpu_client.cc:370] TfrtCpuClient created.
Using auto half precision backend
Currently training with a batch size of: 32
***** Running training *****
  Num examples = 10,295
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Training with DataParallel so batch size has been adjusted to: 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3,220
  Number of trainable parameters = 787,968
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
The following columns in the training set don't have a corresponding argument in `PeftModelForSequenceClassificat

Epoch,Training Loss,Validation Loss,Accuracy,F1,Matthews Correlation,Precision,Recall
1,No log,0.282369,0.885781,0.563208,0.263756,0.839047,0.551296
2,No log,0.216501,0.918026,0.75059,0.555125,0.895393,0.694847
3,No log,0.183907,0.929293,0.800554,0.630609,0.900269,0.748375
4,No log,0.167434,0.935509,0.825423,0.669846,0.902291,0.778836
5,No log,0.15769,0.939005,0.834612,0.68906,0.914727,0.786215
6,No log,0.151123,0.940559,0.840426,0.698386,0.914988,0.79383
7,No log,0.146625,0.94289,0.848186,0.711839,0.917758,0.803234
8,No log,0.144296,0.944833,0.854535,0.722918,0.920008,0.811071
9,No log,0.142761,0.944833,0.854535,0.722918,0.920008,0.811071
10,No log,0.142338,0.944833,0.854535,0.722918,0.920008,0.811071


***** Running Evaluation *****
  Num examples = 2574
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 2574
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 2574
  Batch size = 32
The following columns in the evaluation set don't have 

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 2574
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 2574
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3220, training_loss=0.16700507691187888, metrics={'train_runtime': 840.2538, 'train_samples_per_second': 122.523, 'train_steps_per_second': 3.832, 'total_flos': 2.3131569510144e+16, 'train_loss': 0.16700507691187888, 'epoch': 10.0})

In [14]:
# PREDICT ON TEST DATA
bst=32
pred_test=[]
idx=range(0,len(test_dataset),bst)
pbar = ProgressBar()
for i in pbar(idx):
    batch=test_dataset[i:(i+bst)]
    output = model(batch['input_ids'].cuda())
    pred_test.append(output[0][:,1].detach())
    del output, batch
    gc.collect()

y_pred=torch.concatenate(pred_test,dim=0)
y_pred=torch.sigmoid(y_pred)
y_pred=y_pred.cpu().detach().numpy()

roc_auc_score(test_dataset.labels, y_pred)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100% |#######################################################################################|


0.9341054117511488