In [None]:
### Raphael Mourad
### Associate Professor
### University Paul Sabatier / INRAE MIAT Lab Toulouse
### 13/02/2024

In [None]:
# IMPORT LIBRARIES
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from transformers import AutoTokenizer
from tokenizers import ByteLevelBPETokenizer
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, AutoConfig, AutoModelForPreTraining
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling, TextDataset
from datasets import load_dataset

In [None]:
import os

# Configure PyTorch's CUDA caching allocator (max_split_size_mb = 32).
# The value, including its trailing space, is kept exactly as authored.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32 "

# Enable cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True

In [None]:
# SET DIRECTORY
# Every relative path used in this notebook ("data/...", "./results/...") is
# resolved against this directory. Set the MISTRAL_CHEM_DIR environment variable
# to run the notebook on another machine; when it is unset, the original
# location is used.
project_dir = os.environ.get("MISTRAL_CHEM_DIR", "/home/mourad/Téléchargements/Mistral-chem/")
os.chdir(project_dir)
print(os.getcwd())

In [None]:
# TRAIN YOUR OWN TOKENIZER
vocab_size=1024
file_tokenizer="data/tokenizer/mistral-chem-"+str(vocab_size)

# `save_pretrained` (end of this cell) writes a *directory* at `file_tokenizer`,
# so the "already trained?" check must look for a directory. Testing with
# `os.path.isfile` never succeeds and retrains the tokenizer on every run.
if not os.path.isdir(file_tokenizer):
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer
    from transformers import PreTrainedTokenizerFast

    # Byte-pair-encoding model; unknown pieces map to "[UNK]".
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # Named `bpe_trainer` so it is not confused with the Hugging Face `Trainer`
    # instance that a later cell binds to `trainer`.
    bpe_trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                             vocab_size=vocab_size, min_frequency=2)

    files = ["data/chemistry/zinc/250k_rndm_zinc_drugs_clean_sorted.txt"]
    tokenizer.train(files, bpe_trainer)

    # Wrap the raw tokenizer so it exposes the transformers tokenizer API and
    # knows which vocabulary entries play the special-token roles.
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    tokenizer.save_pretrained(file_tokenizer)

    # Sanity check: show how an example molecule string is split. A bare
    # expression inside an `if` block is not displayed by the notebook, so the
    # result is printed explicitly.
    print(tokenizer.tokenize("C1CSCCSCCS1"))

In [None]:
# LOAD MIXTRAL MODEL CONFIGURATION
# Only the configuration is read from disk; the model is then instantiated
# from that configuration (`from_config`, not `from_pretrained`).
config = AutoConfig.from_pretrained(
    "data/models/Mixtral-8x7B-v0.1-chem"
)
model = AutoModelForCausalLM.from_config(
    config,
    attn_implementation="flash_attention_2",
)
# Display the module tree of the instantiated model.
model

In [None]:
# NUMBER OF MODEL PARAMETERS
# Sum the element counts of every parameter tensor of the model.
pytorch_total_params = sum(p.numel() for p in model.parameters())
# Rounded size in millions of parameters.
print(f"Model size: {pytorch_total_params/1000**2:.1f}M parameters")
# Exact count: it is an integer, so print it as one (with thousands separators)
# instead of as a float with a trailing ".0".
print(f"Model size: {pytorch_total_params:,d} parameters")

In [None]:
# LOAD BPE LETTER TOKENIZER
tokenizer = AutoTokenizer.from_pretrained("data/tokenizer/mistral-chem-1024/", trust_remote_code=True)
# Padding token: the tokenizer trained in this notebook declares "[PAD]" as its
# pad token and has no "[EOS]" entry among its special tokens, so "[EOS]" must
# not be used for padding. Keep the pad token stored with the tokenizer and
# only fall back to "[PAD]" if none is defined.
# NOTE(review): assumes the tokenizer on disk is the one produced by the
# training cell of this notebook — confirm if it was built elsewhere.
if tokenizer.pad_token is None:
    tokenizer.pad_token = '[PAD]'
# Pad on the left so the real tokens sit at the end of each sequence.
tokenizer.padding_side  = 'left'
print(tokenizer)

# Quick check on one example molecule string.
encoding = tokenizer("C1CSCCSCCS1", padding="longest", truncation=True, return_tensors="pt")
print(encoding)

In [None]:
# LOAD DATA
# Collator for language modelling with masked-LM mode switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Read the gzipped CSV of molecules through the `datasets` CSV loader.
dataset_text = load_dataset(
    "csv",
    data_files="data/chemistry/zinc/250k_rndm_zinc_drugs_clean_sorted.csv.gz",
)

def tokenize_function(examples):
    """Tokenize the 'text' column of a batch of examples.

    The batch is passed to the module-level `tokenizer` with padding to the
    longest sequence, truncation enabled, and PyTorch tensors requested.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )

# Tokenize the whole dataset batch by batch.
dataset = dataset_text.map(tokenize_function, batched=True)
print(dataset["train"])

# 80% / 20% train / validation split. The split is seeded so that every run of
# the notebook trains and evaluates on the same examples; without a generator,
# `random_split` draws a different partition each time.
SPLIT_SEED = 42
n_examples = len(dataset["train"])
train_size = int(0.8 * n_examples)
test_size = n_examples - train_size
train_set, val_set = torch.utils.data.random_split(
    dataset["train"],
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# Show one tokenized training example.
train_set[1]

In [None]:
# Number of examples in the training split.
len(train_set)

In [None]:
# TRAINING PARAMETERS (consumed by the Trainer cell that follows)
batchsize=64 # 1024 for 200b v0.1
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir='./results/models',
    logging_dir='./logs',
    report_to=['tensorboard'],
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Schedule and batch sizes: 64 sequences per step x 32 accumulation steps.
    num_train_epochs=50,
    per_device_train_batch_size=batchsize,
    per_device_eval_batch_size=batchsize,
    gradient_accumulation_steps=32,# 50 for v0.1
    # Optimisation.
    learning_rate=1e-3, # 5e-4 for v0.1
    weight_decay=0.01,
    fp16=True,
)

print(training_args)

In [None]:
# PRETRAIN MODEL
# 24h / 50 epochs for an RTX3090
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Announce, then launch the training loop.
print ('Start a trainer...')
trainer.train()

In [None]:
# Save model
# `save_state()` on its own only records the trainer's state, not the weights.
# Write the model itself to the trainer's output directory first, then the state.
trainer.save_model()
trainer.save_state()