In [None]:
### Raphael Mourad
### Associate Professor
### University Paul Sabatier / INRAE MIAT Lab Toulouse
### 22/01/2024

In [None]:
# Script to fine tune mixtral-dna labeled data.

In [1]:
### LOAD PYTHON MODULES
# Load basic modules
import os
import sys
import time
from os import path
import gc

# Load data and machine learning modules
import numpy as np
import pandas as pd
from random import randrange
from progressbar import ProgressBar

import torch
import triton
import transformers
from torch.utils.data import TensorDataset, DataLoader
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, set_seed, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Print numpy version for compatibility
print(sys.version)
print(np.__version__)
print(triton.__version__)
print(transformers.__version__)
print(torch.cuda.get_device_name(0))

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

  from .autonotebook import tqdm as notebook_tqdm


3.8.18 | packaged by conda-forge | (default, Dec 23 2023, 17:21:28) 
[GCC 12.3.0]
1.23.5
2.2.0
4.37.2
NVIDIA GeForce RTX 3090


Process ForkProcess-9:
Process ForkProcess-11:
Process ForkProcess-6:
Process ForkProcess-12:
Process ForkProcess-7:
Process ForkProcess-15:
Process ForkProcess-10:
Process ForkProcess-13:
Process ForkProcess-8:
Process ForkProcess-3:
Process ForkProcess-5:
Process ForkProcess-16:
Process ForkProcess-1:
Process ForkProcess-14:
Process ForkProcess-4:
Process ForkProcess-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/multiprocessing/process.py", line 315, in _boo

  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/concurrent/futures/process.py", line 233, in _process_worker
    call_item = call_queue.get(block=True)
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/concurrent/futures/process.py", line 233, in _process_worker
    call_item = call_queue.get(block=True)
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/multiprocessing/queues.py", line 96, in get
    with self._rlock:
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/multiprocessing/queues.py", line 96, in get
    with self._rlock:
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/multiprocessing/queues.py", line 96, in get
    with self._rlock:
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/multiprocessing/queues.py", line 96, in get
    with self._rlock:
  File "/home/mourad/miniconda3/envs/mistral_dna/lib/python3.8/concurrent/futures/process.py", line 233, in _process_worker
    call_item = call_queue

In [2]:
### CHECK ENV
print(sys.prefix)

/home/mourad/miniconda3/envs/mistral_dna


In [3]:
### SET DIRECTORY
os.chdir("/home/mourad/Téléchargements/Mistral-chem/")
print(os.getcwd())

/home/mourad/Téléchargements/Mistral-chem


In [4]:
### SPECIFY PARAMETERS
model_name="RaphaelMourad/mixtral-chem-v0.1" 
lora=True

In [5]:
### LOAD FUNCTIONS MODULE
sys.path.append("scriptPython/")
from functions_chem import *



In [26]:
# TRAINING PARAMETERS
data_args=DataArguments()
print(data_args)

model_args=ModelArguments()
model_args.use_lora=lora
print(model_args)

training_args=TrainingArguments
training_args.deepspeed_plugin=None
training_args.run_name=model_name
training_args.model_max_length=1024 # max sequence length (can be increased)
training_args.gradient_accumulation_steps=1
training_args.learning_rate=5e-5
training_args.num_train_epochs=10
training_args.fp16=True 
training_args.save_steps=5000
training_args.evaluation_strategy="epoch"
training_args.warmup_steps=50
training_args.load_best_model_at_end=True
training_args.logging_steps=100000
training_args.find_unused_parameters=False

# Other arguments to add since it was bugging
bs=8
training_args.device=torch.device('cuda:0')
training_args.report_to=["tensorboard"]
training_args.world_size=1
training_args.train_batch_size=bs
training_args.eval_batch_size=bs
training_args.test_batch_size=bs
training_args.batch_size=bs
training_args.num_training_steps=100
training_args.n_gpu=1
training_args.distributed_state=None
training_args.local_rank=-1 # -1
training_args.metric_for_best_model="eval_loss"

training_args.fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False, 'xla_device': 'cpu'}
training_args.lr_scheduler_kwargs={}
training_args

DataArguments(data_path=None, kmer=-1)
ModelArguments(model_name_or_path='facebook/opt-125m', use_lora=True, lora_r=8, lora_alpha=32, lora_dropout=0.05, lora_target_modules='query,value')


functions_chem.TrainingArguments

In [27]:
# CONFIG QUANTIZATION
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
bnb_config

BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [28]:
# CONFIG ACCELERATE
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [29]:
# CONFIG LORA
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )

In [35]:
# LOAD DATA
#labeled_data=pd.read_csv("data/chemistry/finetune/antibiotic_ecoli_growth.csv",sep=',')
labeled_data=pd.read_csv("data/chemistry/finetune/tox21_balanced_revised_no_id.csv",sep=',')
print(np.sum(labeled_data["label"]))
print(np.sum(labeled_data["label"]==0))
display(labeled_data)

traintmp_df, test_df = train_test_split(labeled_data, test_size=0.2, random_state=42)

# Make balanced data
traintmp_df_pos=traintmp_df[traintmp_df.label==1]
traintmp_df_neg=traintmp_df[traintmp_df.label==0]
traintmp_df_neg=traintmp_df_neg.sample(len(traintmp_df_pos))
traintmp_df=pd.concat((traintmp_df_pos,traintmp_df_neg))
traintmp_df

train_df, valid_df = train_test_split(traintmp_df, test_size=0.2, random_state=42)
print(np.sum(train_df["label"]))
print(np.sum(valid_df["label"]))
print(np.sum(test_df["label"]))

431
1711


Unnamed: 0,text,label
0,CCCCCCCC/C=C\CCCCCCCC(N)=O,0
1,CCCCCCOC(=O)c1ccccc1,0
2,O=C(c1ccc(Cl)cc1)c1ccc(Cl)cc1,0
3,COc1cc(Cl)c(OC)cc1N,0
4,N[C@H](Cc1c[nH]c2ccccc12)C(=O)O,0
...,...,...
2137,CCCCCCCC(=O)OC,0
2138,CN1Cc2c(N)cccc2C(c2ccccc2)C1,0
2139,CCC(=O)O[C@H]1CC[C@H]2[C@@H]3CCc4cc(O)ccc4[C@H...,1
2140,Cc1ncsc1CCCl,0


256
81
94


In [36]:
# LOAD TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side  = 'left'
print(tokenizer)

loading file tokenizer.json from cache at /home/mourad/.cache/huggingface/hub/models--RaphaelMourad--mixtral-chem-v0.1/snapshots/8488e266c3852ac476bc55be77db219927cbf86a/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/mourad/.cache/huggingface/hub/models--RaphaelMourad--mixtral-chem-v0.1/snapshots/8488e266c3852ac476bc55be77db219927cbf86a/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/mourad/.cache/huggingface/hub/models--RaphaelMourad--mixtral-chem-v0.1/snapshots/8488e266c3852ac476bc55be77db219927cbf86a/tokenizer_config.json


PreTrainedTokenizerFast(name_or_path='RaphaelMourad/mixtral-chem-v0.1', vocab_size=1024, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [37]:
# MAKE DATA FOR GPT NEO
train_encodings = tokenizer(train_df["text"].values.tolist(), return_tensors="pt",padding=True,truncation=True)
val_encodings = tokenizer(valid_df["text"].values.tolist(), return_tensors="pt",padding=True,truncation=True)
test_encodings = tokenizer(test_df["text"].values.tolist(), return_tensors="pt",padding=True,truncation=True)

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_df["label"].values.tolist())
val_dataset = NewsDataset(val_encodings, valid_df["label"].values.tolist())
test_dataset = NewsDataset(test_encodings, test_df["label"].values.tolist())

# define datasets and data collator      
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [38]:
### CREATE AND TRAIN MODEL
num_labels=2

model=transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    output_hidden_states=False,
    quantization_config=bnb_config,
    device_map='cuda:0',
)

model.config.pad_token_id = tokenizer.pad_token_id
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model = accelerator.prepare_model(model)

# Setup trainer
trainer = transformers.Trainer(model=model,
                               args=training_args,
                               compute_metrics=compute_metrics,
                               train_dataset=train_dataset, 
                               eval_dataset=val_dataset,
                               data_collator=data_collator,
                              callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
                              )        
trainer.local_rank=training_args.local_rank
trainer.train()

loading configuration file config.json from cache at /home/mourad/.cache/huggingface/hub/models--RaphaelMourad--mixtral-chem-v0.1/snapshots/8488e266c3852ac476bc55be77db219927cbf86a/config.json
Model config MixtralConfig {
  "_name_or_path": "RaphaelMourad/mixtral-chem-v0.1",
  "architectures": [
    "MixtralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 768,
  "max_position_embeddings": 512,
  "model_type": "mixtral",
  "num_attention_heads": 8,
  "num_experts_per_tok": 1,
  "num_hidden_layers": 8,
  "num_key_value_heads": 8,
  "num_local_experts": 64,
  "output_router_logits": false,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "router_aux_loss_coef": 0.02,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.37.2",
  "use_cache": true,
  "vocab_size": 4096
}

loading weights fi

Epoch,Training Loss,Validation Loss,Accuracy,F1,Matthews Correlation,Precision,Recall
1,No log,0.672723,0.592593,0.58345,0.170138,0.583739,0.58642
2,No log,0.635588,0.62963,0.622145,0.2488,0.622293,0.626543
3,No log,0.607215,0.681481,0.667164,0.334405,0.66774,0.666667
4,No log,0.582703,0.696296,0.684597,0.369274,0.684091,0.685185
5,No log,0.571612,0.696296,0.686384,0.373482,0.685223,0.688272
6,No log,0.562362,0.674074,0.6625,0.325306,0.661731,0.66358
7,No log,0.555253,0.688889,0.679589,0.360427,0.678348,0.682099
8,No log,0.562834,0.666667,0.657573,0.317026,0.656557,0.660494
9,No log,0.559357,0.651852,0.64049,0.281642,0.639676,0.641975
10,No log,0.557784,0.651852,0.64049,0.281642,0.639676,0.641975


***** Running Evaluation *****
  Num examples = 135
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 135
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 135
  Batch size = 8
The following columns in the evaluation set don't have a corr

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 135
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 135
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=680, training_loss=0.6015116523293887, metrics={'train_runtime': 741.9391, 'train_samples_per_second': 7.265, 'train_steps_per_second': 0.917, 'total_flos': 7846409376368640.0, 'train_loss': 0.6015116523293887, 'epoch': 10.0})

In [40]:
# PREDICT ON TEST DATA
bst=32
pred_test=[]
idx=range(0,len(test_dataset),bst)
pbar = ProgressBar()
for i in pbar(idx):
    batch=test_dataset[i:(i+bst)]
    output = model(batch['input_ids'].cuda())
    pred_test.append(output[0][:,1].detach())
    del output, batch
    gc.collect()

y_pred=torch.concatenate(pred_test,dim=0)
y_pred=torch.sigmoid(y_pred)
y_pred=y_pred.cpu().detach().numpy()

roc_auc_score(test_dataset.labels, y_pred)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100% |#####################################################################################################|


0.5101302000635122