In [None]:
### Raphael Mourad
### Associate Professor
### University Paul Sabatier / INRAE MIAT Lab Toulouse
### 22/01/2024

In [None]:
# Script to fine-tune mixtral-dna on labeled data.

In [1]:
### LOAD PYTHON MODULES
# Load basic modules
import gc
import json
import os
import sys
import time
from os import path

# Load data and machine learning modules
import numpy as np
import pandas as pd
from random import randrange
from progressbar import ProgressBar

import torch
import triton
import transformers
from torch.utils.data import TensorDataset, DataLoader
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, set_seed, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
)

# Configure the CUDA caching allocator BEFORE the first CUDA call below:
# the setting is read from the environment, so it is exported ahead of
# torch.cuda.get_device_name(), which queries the GPU.
# NOTE(review): setting it before `import torch` would be safer still — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Print library versions and the GPU name so the run environment is recorded
# alongside the results.
# Legacy note kept from the original: numpy should be 1.19 (and not 1.2) for
# spektral to work; spektral is not imported in this notebook.
print(np.__version__)
print(triton.__version__)
print(transformers.__version__)
print(torch.cuda.get_device_name(0))

  from .autonotebook import tqdm as notebook_tqdm


1.24.4
2.1.0
4.36.2
NVIDIA GeForce RTX 3090


In [2]:
### CHECK ENV
# Show the prefix of the running Python installation to confirm which
# environment the kernel is using.
print(str(sys.prefix))

/home/mourad/miniconda3/envs/mistral-dna


In [3]:
### SET DIRECTORY
# Project root: all relative paths used later ("scriptPython/", "data/GUE/...",
# "results/...") are resolved against it. The location can be overridden with
# the MISTRAL_DNA_DIR environment variable; the default is the original
# hard-coded path, so existing setups behave as before.
PROJECT_DIR = os.environ.get("MISTRAL_DNA_DIR", "/media/mourad/SSD2/MistralDNA")
os.chdir(PROJECT_DIR)
print(os.getcwd())

/media/mourad/SSD2/MistralDNA


In [4]:
### SPECIFY PARAMETERS
# Model identifier; the author lists "DNABERT2" and "mixtral-dna" as options.
model_name = "mixtral-dna"
# LoRA switch, copied into model_args.use_lora in the training-parameters cell.
lora = True

In [5]:
### LOAD FUNCTIONS MODULE
# Make the project-local helper module importable (path is relative to the
# working directory set above) and pull all of its public names into the
# notebook namespace.
# NOTE(review): the names used later in this notebook that are not defined or
# imported anywhere else are DataArguments, ModelArguments, TrainingArguments,
# SupervisedDataset, DataCollatorForSupervisedDataset and compute_metrics —
# presumably they come from this module; importing them explicitly instead of
# with `*` would make that visible. Confirm against scriptPython/functions.py.
sys.path.append("scriptPython/")
from functions import *

In [6]:
# TRAINING PARAMETERS

# Dataset arguments, left at their defaults here; data_path is filled in per
# experiment by the training loop.
data_args = DataArguments()
print(data_args)

# Model arguments; the LoRA switch follows the `lora` flag set earlier.
model_args = ModelArguments()
model_args.use_lora = lora
print(model_args)

# NOTE(review): TrainingArguments is referenced without being called, so
# `training_args` is the class itself and every assignment below sets a class
# attribute rather than configuring an instance. Presumably
# `TrainingArguments(...)` was intended, and the "other arguments" section
# below works around the missing instance — confirm before changing, since the
# Trainer currently relies on these attributes.
training_args = TrainingArguments

# --- Run identity and sequence length
training_args.deepspeed_plugin = None
training_args.run_name = "mixtral-dna"
training_args.model_max_length = 1024  # max sequence length (can be increased)

# --- Optimisation schedule
training_args.gradient_accumulation_steps = 1
training_args.learning_rate = 5e-4
training_args.num_train_epochs = 10
training_args.fp16 = True
training_args.warmup_steps = 50

# --- Checkpointing, evaluation and logging cadence (in steps)
training_args.save_steps = 5000
training_args.eval_steps = 50
training_args.evaluation_strategy = "steps"
training_args.load_best_model_at_end = True
training_args.logging_steps = 100000
training_args.find_unused_parameters = False

# --- Other arguments to add since it was bugging
bs = 1024  # batch size shared by all the batch-size attributes below
training_args.device = torch.device('cuda:0')
training_args.report_to = ["tensorboard"]
training_args.world_size = 1
# training_args.per_device_train_batch_size = bs   (left disabled by the author)
training_args.train_batch_size = bs
training_args.eval_batch_size = bs
training_args.test_batch_size = bs
training_args.batch_size = bs
training_args.num_training_steps = 100
training_args.n_gpu = 1
training_args.distributed_state = None
training_args.local_rank = -1  # -1
training_args.metric_for_best_model = "eval_loss"

training_args.fsdp_config = {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False, 'xla_device': 'cpu'}
training_args.lr_scheduler_kwargs = {}

# Display the resulting object as the cell output.
training_args

DataArguments(data_path=None, kmer=-1)
ModelArguments(model_name_or_path='facebook/opt-125m', use_lora=True, lora_r=8, lora_alpha=32, lora_dropout=0.05, lora_target_modules='query,value')


functions.TrainingArguments

In [7]:
# CONFIG QUANTIZATION
# bitsandbytes settings passed to from_pretrained(quantization_config=...) in
# the training loop.
# NOTE(review): the compute dtype here is bfloat16 while the training
# parameters set fp16=True — confirm the two are meant to differ.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,         # enable double quantization
)

# Display the resolved configuration as the cell output.
bnb_config

BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [8]:
# CONFIG ACCELERATE
# FSDP plugin using full state-dict configs for both the model and the
# optimizer, each with CPU offload enabled and rank0_only disabled.
fsdp_plugin = FullyShardedDataParallelPlugin(
    optim_state_dict_config=FullOptimStateDictConfig(rank0_only=False, offload_to_cpu=True),
    state_dict_config=FullStateDictConfig(rank0_only=False, offload_to_cpu=True),
)

# Accelerator used by the training loop to prepare the model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [9]:
# CONFIG LORA
# LoRA adapter settings for a sequence-classification task; applied to the
# model with get_peft_model() in the training loop.
# NOTE(review): confirm that every name in target_modules (in particular
# "gate_proj") matches a module of the loaded model.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
    ],
)

In [None]:
### CREATE AND TRAIN MODEL
# For every GUE task listed below: load the tokenizer and the quantized base
# model, wrap it with the LoRA adapters from `peft_config`, fine-tune with
# early stopping on the dev split, evaluate on the test split, and write the
# test metrics (one JSON per repeat plus one CSV per task) under
# results/mixtral-dna/GUE/<task>/.

# LOOP OVER DATA
numRepeats=1  # number of independent fine-tuning runs per task

expes=["tf/0","tf/1","tf/2","tf/3","tf/4",
    "prom/prom_300_all","prom/prom_300_notata","prom/prom_300_tata",
    "prom/prom_core_all","prom/prom_core_notata","prom/prom_core_tata",
    "mouse/0","mouse/1","mouse/2","mouse/3","mouse/4",
    "EMP/H3","EMP/H3K14ac","EMP/H3K36me3","EMP/H3K4me1",
    "EMP/H3K4me2","EMP/H3K4me3","EMP/H3K79me3","EMP/H3K9ac","EMP/H4","EMP/H4ac",
    "splice/reconstructed","virus/covid"]

for expe in expes:
    print(expe)

    data_args.data_path="data/GUE/"+expe
    model_args.model_name_or_path="RaphaelMourad/Mistral-DNA-v0.2"
    training_args.output_dir="results/mixtral-dna/GUE/"+expe+"/"

    # The splice task has three classes; every other task is binary.
    num_labels = 3 if expe=="splice/reconstructed" else 2

    # Create the output folders up front (no error if they already exist).
    results_path = os.path.join(training_args.output_dir, "metrics")
    os.makedirs(training_args.output_dir, exist_ok=True)
    os.makedirs(results_path, exist_ok=True)

    # One row of test metrics per repeat; written to CSV after the repeats.
    repeat_results = []

    for k in range(numRepeats):

        # Draw a fresh integer seed for each repeat. randrange() needs an int:
        # a float bound such as 1e8 is rejected by recent Python versions.
        seed = randrange(10**8)
        set_seed(seed)

        # load tokenizer
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            model_max_length=training_args.model_max_length,
            padding_side="right",
            use_fast=True,
            trust_remote_code=True,
        )
        tokenizer.eos_token='[EOS]'
        tokenizer.pad_token = '[PAD]'

        # define datasets and data collator
        train_dataset = SupervisedDataset(tokenizer=tokenizer,
                                          data_path=os.path.join(data_args.data_path, "train.csv"),
                                          kmer=data_args.kmer)
        val_dataset = SupervisedDataset(tokenizer=tokenizer,
                                        data_path=os.path.join(data_args.data_path, "dev.csv"),
                                        kmer=data_args.kmer)
        test_dataset = SupervisedDataset(tokenizer=tokenizer,
                                         data_path=os.path.join(data_args.data_path, "test.csv"),
                                         kmer=data_args.kmer)
        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

        # load model (quantized through bnb_config) with a classification head
        model=transformers.AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            num_labels=num_labels,
            output_hidden_states=False,
            quantization_config=bnb_config,
            device_map='cuda:0',
        )
        # The classification model needs to know which token id is padding.
        model.config.pad_token_id = tokenizer.pad_token_id
        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, peft_config)
        model = accelerator.prepare_model(model)

        # Setup trainer: evaluates on the dev split and stops early after
        # 3 evaluations without improvement.
        trainer = transformers.Trainer(model=model,
                                       args=training_args,
                                       compute_metrics=compute_metrics,
                                       train_dataset=train_dataset,
                                       eval_dataset=val_dataset,
                                       data_collator=data_collator,
                                       callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
                                       )
        trainer.local_rank=training_args.local_rank
        trainer.train()

        # get the evaluation results on the test split from trainer
        results = trainer.evaluate(eval_dataset=test_dataset)
        with open(os.path.join(results_path, "test_results_"+str(k)+".json"), "w") as f:
            json.dump(results, f)
        repeat_results.append({"repeat": k, "seed": seed, **results})

        # Save trainer state.
        # NOTE(review): save_state() does not appear to write model weights;
        # add trainer.save_model() if the fine-tuned adapters must be kept —
        # confirm intent.
        trainer.save_state()

        # Drop the references and release cached GPU memory before the next
        # model is loaded.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()

    # Summary table of the test metrics for this task (one row per repeat).
    metrics = pd.DataFrame(repeat_results)
    metrics.to_csv(os.path.join(training_args.output_dir, "metrics_auroc_aupr.csv"), index=False)
    print(metrics)

loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


splice/reconstructed


loading configuration file results/models/mixtral-dna/config.json
Model config MixtralConfig {
  "_name_or_path": "results/models/mixtral-dna",
  "architectures": [
    "MixtralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 256,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "mixtral",
  "num_attention_heads": 8,
  "num_experts_per_tok": 1,
  "num_hidden_layers": 8,
  "num_key_value_heads": 8,
  "num_local_experts": 64,
  "output_router_logits": false,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "router_aux_loss_coef": 0.02,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 4096

Step,Training Loss,Validation Loss
