In [None]:
### Raphael Mourad
### Associate Professor
### University Paul Sabatier / INRAE MIAT Lab Toulouse
### 13/02/2024

In [1]:
# IMPORT LIBRARIES
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from transformers import AutoTokenizer
from tokenizers import ByteLevelBPETokenizer
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, AutoConfig, AutoModelForPreTraining
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling, TextDataset
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.backends.cudnn.benchmark=True
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32 "

In [3]:
# SET DIRECTORY
os.chdir("/home/mourad/Téléchargements/Mistral-chem/")
print(os.getcwd())

/home/mourad/Téléchargements/Mistral-chem


In [4]:
# TRAIN YOUR OWN TOKENIZER
vocab_size=1024
file_tokenizer="data/tokenizer/mistral-chem-"+str(vocab_size)

if os.path.isfile(file_tokenizer)==False:
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from transformers import PreTrainedTokenizerFast
    from tokenizers.pre_tokenizers import Whitespace
    
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    from tokenizers.trainers import BpeTrainer
    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                        vocab_size=vocab_size, min_frequency=2)

    tokenizer.pre_tokenizer = Whitespace()

    files = ["data/chemistry/zinc/250k_rndm_zinc_drugs_clean_sorted.txt"]
    tokenizer.train(files, trainer)

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    tokenizer.save_pretrained(file_tokenizer)

    tokenizer.tokenize("C1CSCCSCCS1", padding="longest", truncation=True, return_tensors="pt")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.







In [5]:
# LOAD MIXTRAL MODEL CONFIGURATION
config = AutoConfig.from_pretrained("data/models/Mixtral-8x7B-v0.1-chem")
model = AutoModelForCausalLM.from_config(config,attn_implementation="flash_attention_2")
model

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes. No dtype was provided, you should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator.
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes. No dtype was provided, you should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator.


MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(4096, 768)
    (layers): ModuleList(
      (0-7): 8 x MixtralDecoderLayer(
        (self_attn): MixtralFlashAttention2(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): MixtralRotaryEmbedding()
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear(in_features=768, out_features=64, bias=False)
          (experts): ModuleList(
            (0-63): 64 x MixtralBLockSparseTop2MLP(
              (w1): Linear(in_features=768, out_features=768, bias=False)
              (w2): Linear(in_features=768, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=768, bias=False)
              (act_fn

In [6]:
# NUMBER OF MODEL PARAMETERS
pytorch_total_params = sum(p.numel() for p in model.parameters())
print(f"Model size: {pytorch_total_params/1000**2:.1f}M parameters")
print(f"Model size: {pytorch_total_params:.1f} parameters")

Model size: 931.5M parameters
Model size: 931541760.0 parameters


In [7]:
# LOAD BPE LETTER TOKENIZER
tokenizer = AutoTokenizer.from_pretrained("data/tokenizer/mistral-chem-1024/", trust_remote_code=True)
tokenizer.pad_token = '[EOS]'
tokenizer.padding_side  = 'left'
print(tokenizer)

encoding = tokenizer("C1CSCCSCCS1", padding="longest", truncation=True, return_tensors="pt")
print(encoding)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


PreTrainedTokenizerFast(name_or_path='data/tokenizer/mistral-chem-1024/', vocab_size=1024, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[EOS]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
{'input_ids': tensor([[ 63, 486, 179, 348]]), 'token_type_ids': tensor([[0, 0, 0, 0]]),

In [9]:
# LOAD DATA 
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

dataset_text = load_dataset("csv", data_files="data/chemistry/zinc/250k_rndm_zinc_drugs_clean_sorted.csv.gz")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="longest", truncation=True, return_tensors="pt")

dataset = dataset_text.map(tokenize_function, batched=True)
print(dataset["train"])

train_size = int(0.8 * len(dataset["train"]))
test_size = len(dataset["train"]) - train_size
train_set, val_set = torch.utils.data.random_split(dataset["train"], [train_size, test_size])
train_set[1]

Generating train split: 249456 examples [00:00, 1592298.09 examples/s]
Map: 100%|████████████████████████████████████████████████| 249456/249456 [00:11<00:00, 22246.25 examples/s]

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 249456
})





{'text': 'CCC[C@@H]1CCc2nc(NC(=O)CN3C(=O)NC(=O)C3(C)C)sc2C1',
 'input_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  50,
  30,
  22,
  46,
  24,
  32,
  58,
  134,
  6,
  47,
  42,
  27,
  7,
  1016,
  42,
  27,
  7,
  47,
  42,
  27,
  7,
  141,
  6,
  22,
  7,
  22,
  7,
  337,
  63],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [10]:
len(train_set)

199564

In [11]:
# PARAMETERS FOR FINE-TUNING
batchsize=64 # 1024 for 200b v0.1
training_args = TrainingArguments(
        output_dir='./results/models',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=50,
        per_device_train_batch_size=batchsize,
        per_device_eval_batch_size=batchsize,
        learning_rate=1e-3, # 5e-4 for v0.1
        weight_decay=0.01,
        logging_dir='./logs',
        load_best_model_at_end=True,
        fp16=True,
        gradient_accumulation_steps=32,# 50 for v0.1
        report_to=['tensorboard'],
)

print(training_args)



TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=32,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=False,
group_by

I0000 00:00:1707816678.979610   61857 cpu_client.cc:370] TfrtCpuClient created.


In [None]:
# PRETRAIN MODEL
# 24h / 50 epochs for an RTX3090
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=val_set,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
 
print ('Start a trainer...')
# Start training
trainer.train()

Start a trainer...


Epoch,Training Loss,Validation Loss


In [None]:
# Save model
trainer.save_state()