In [1]:
from accelerate import notebook_launcher
from transformers import TrainingArguments
import torch
import os
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from accelerate import Accelerator
from trl import SFTTrainer
import random
import shutil
import tempfile
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem
from rdkit.Chem import Fragments


In [2]:
# Prompt layout used for both training and inference.
#   "prompt"         - format string with {Smiles} and {functional_groups} placeholders;
#                      the expected answer is appended after the trailing "### RESPONSE:\n".
#   "response_split" - marker that separates the prompt from the model's answer.
prompt_template = dict(
    prompt=(
        "Please predict the inhibitory activity for the cytochrome P450 enzyme CYP1A2 based on the provided SMILES and Functional Groups.\n"
        "Assess the activity as either 'active' or 'inactive' where applicable.\n\n"
        "### SMILES:\n{Smiles}\n\n### Functional Groups:\n{functional_groups}\n\n### RESPONSE:\n"
    ),
    response_split="### RESPONSE:",
)

def generate_prompt(instruction, label=None, prompt_template=prompt_template):
    """Render the text prompt for one molecule.

    Args:
        instruction: SMILES string of the molecule.
        label: Optional answer text. When truthy it is appended directly after
            the "### RESPONSE:" section (training mode); otherwise the prompt
            ends right after that header (inference mode).
        prompt_template: Mapping whose "prompt" entry is a format string with
            ``{Smiles}`` and ``{functional_groups}`` placeholders.

    Returns:
        The formatted prompt string.
    """
    # Count the functional groups found in the molecule.
    group_counts = extract_functional_groups(instruction)

    # Only groups with a non-zero count are listed, one "- <name>: present" line each.
    groups_text = "".join(
        f"- {name}: present\n" for name, count in group_counts.items() if count
    )

    # Fill the SMILES and functional-group sections of the template.
    prompt = prompt_template["prompt"].format(
        Smiles=instruction,
        functional_groups=groups_text,
    )

    # Append the expected answer only when one was supplied.
    return prompt + label if label else prompt



In [3]:
# Path the tokenizer is loaded from.
checkpoint =  "/openbayes/input/input2"
# Alternative source for the model weights (a local fine-tuned checkpoint), currently disabled:
#checkpoint1= "./model/llama3_8b_cyp4_epoch_3/"
# Path the model weights are loaded from (currently the same directory as the tokenizer).
checkpoint1 =  "/openbayes/input/input2"
# Load the causal LM in bfloat16 without 8-bit quantisation; device placement is delegated to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(checkpoint1, device_map='auto',torch_dtype=torch.bfloat16, load_in_8bit=False)

# Right-side padding is what the prompt-masking code in this notebook relies on.
# NOTE(review): tokenizer_type='llama' appears to be what triggers the tokenizer-class
# mismatch warning shown in this cell's output - confirm it is needed.
tokenizer = AutoTokenizer.from_pretrained(checkpoint,padding_side='right',model_max_length=400,tokenizer_type='llama')
# Register a dedicated '[PAD]' token as the pad token ...
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# ... and resize the embedding matrix to the new vocabulary size. This must run after
# the pad token has been added. As the last expression of the cell, the resized
# embedding module is displayed as the cell output.
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'LlamaTokenizerFast'.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described 

Embedding(128258, 4096)

In [4]:
def extract_functional_groups(smiles):
    """Count a fixed set of functional groups in a molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        dict mapping a functional-group display name to the number of
        occurrences found in the molecule (0 when absent).

    Raises:
        ValueError: if ``smiles`` is not a string or RDKit cannot parse it.
            Without this check the ``None`` returned by ``MolFromSmiles`` would
            be passed to the fragment counters and fail with an opaque error
            that does not identify the offending input.
    """
    # Non-string input (e.g. NaN from a missing CSV cell) is treated like an unparsable SMILES.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Only functional groups considered relevant to CYP1A2 metabolism are extracted.
    functional_groups = {
        'Carboxylic Acids': Fragments.fr_COO(mol),      # carboxylic acids
        'Esters': Fragments.fr_ester(mol),              # esters
        'Hydroxy Groups': len(mol.GetSubstructMatches(Chem.MolFromSmarts('[OX2H]'))),  # hydroxy groups (SMARTS match)
        # NOTE(review): fr_NH2 presumably counts primary amines only - confirm that
        # the generic "Amines" label is intended.
        'Amines': Fragments.fr_NH2(mol),                # amines
        'Ethers': Fragments.fr_ether(mol),              # ethers
        'Phenols': Fragments.fr_phenol(mol)             # phenols
    }
    return functional_groups


In [5]:
def tokenize(tokenizer, prompt, max_length=256, add_eos_token=False):
    """Tokenize ``prompt`` to a fixed length and attach language-modelling labels.

    Args:
        tokenizer: Callable tokenizer accepting ``truncation``, ``max_length``,
            ``padding`` and ``return_tensors`` keyword arguments and returning a
            mapping with at least ``"input_ids"``.
        prompt: Text to tokenize.
        max_length: Sequences are truncated and padded to exactly this many tokens.
        add_eos_token: Accepted for signature compatibility; it is not read by
            this function.

    Returns:
        The tokenizer output with an extra ``"labels"`` list. Labels equal the
        input ids at real-token positions and -100 at padded positions, so that
        padding does not contribute to the loss. If the tokenizer returns no
        ``"attention_mask"``, labels are a plain copy of the input ids.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_tensors=None)

    input_ids = result["input_ids"]
    if "attention_mask" in result:
        # Because of padding='max_length' the tail of input_ids is pad tokens;
        # mark those positions with -100 instead of treating them as targets.
        result["labels"] = [
            token_id if is_real else -100
            for token_id, is_real in zip(input_ids, result["attention_mask"])
        ]
    else:
        result["labels"] = input_ids.copy()
    return result


def generate_response(row):
    """Build the target answer text for one dataset row.

    Args:
        row: Mapping-like record with a ``'labels'`` entry.

    Returns:
        ``"CYP1A2: active"`` when the label equals 1, ``"CYP1A2: inactive"``
        for any other non-missing label, and an empty string when the label
        is missing (NaN/None).
    """
    label_value = row['labels']

    # A missing label yields no response text at all.
    if pd.isna(label_value):
        return ""

    if label_value == 1:
        activity = 'active'
    else:
        activity = 'inactive'
    return f"CYP1A2: {activity}"

def generate_and_tokenize_prompt(data_point):
    """Turn one dataset row into a tokenized training example.

    The full text (prompt + expected answer) is tokenized, and the labels are
    masked with -100 everywhere except on the answer tokens, so that only the
    answer is used as the training target.

    Args:
        data_point: Record with a ``"Smiles"`` entry and the ``'labels'`` entry
            read by ``generate_response``.

    Returns:
        The tokenized full prompt with its ``"labels"`` replaced by the masked
        label sequence.
    """
    response = generate_response(data_point)
    full_prompt = generate_prompt(
        data_point["Smiles"],
        response,
    )
    tokenized_full_prompt = tokenize(tokenizer, full_prompt)

    user_prompt = generate_prompt(
        data_point["Smiles"],
    )
    tokenized_user_prompt = tokenize(tokenizer, user_prompt)

    # tokenize() pads every sequence to max_length, so len(input_ids) is always
    # max_length and would mask the whole example. Count the real (non-pad)
    # tokens instead to get the true prompt length.
    user_prompt_len = sum(tokenized_user_prompt["attention_mask"])

    # Keep a label only where the position lies after the prompt AND holds a
    # real token; prompt and padding positions become -100.
    # The positional check relies on right-side padding, which is how the
    # tokenizer is configured in this notebook (padding_side='right').
    tokenized_full_prompt["labels"] = [
        token_id if (position >= user_prompt_len and is_real) else -100
        for position, (token_id, is_real) in enumerate(
            zip(
                tokenized_full_prompt["input_ids"],
                tokenized_full_prompt["attention_mask"],
            )
        )
    ]
    return tokenized_full_prompt

In [6]:
import shutil
import tempfile

In [7]:
def training_function(model, tokenizer, data_file="cyp1a2_merged.csv",
                      save_dir="./model/llama3_8b_cyp_epoch_3/", seed=None):
    """Fine-tune ``model`` on the CYP1A2 CSV dataset and save the result.

    Args:
        model: Causal language model to fine-tune.
        tokenizer: Tokenizer matching ``model``; passed to the trainer and
            saved next to the model weights.
        data_file: CSV file with at least the ``Smiles`` and ``labels`` columns.
        save_dir: Directory the fine-tuned model and tokenizer are written to.
        seed: Optional seed for the dataset shuffle. ``None`` (default) keeps
            the shuffle unseeded.

    Returns:
        None. Side effects: reads ``data_file``, trains the model, writes to
        ``save_dir``.
    """
    max_seq_length = 256
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 4
    # Far larger than the number of optimisation steps in this run, so no
    # intermediate checkpoints are expected to be written.
    save_steps = 200000000
    logging_steps = 50
    learning_rate = 2e-5
    max_grad_norm = 0.3
    # NOTE(review): with the plain "constant" scheduler the warmup ratio is
    # presumably ignored; "constant_with_warmup" would apply it - confirm intent.
    warmup_ratio = 0.03
    lr_scheduler_type = "constant"

    # The trainer's output directory is scratch space only. The context manager
    # removes it even when training raises, so failed runs leave nothing behind.
    with tempfile.TemporaryDirectory() as output_dir:
        training_arguments = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            save_steps=save_steps,
            logging_steps=logging_steps,
            learning_rate=learning_rate,
            max_grad_norm=max_grad_norm,
            warmup_ratio=warmup_ratio,
            lr_scheduler_type=lr_scheduler_type,
            fp16=False,
            group_by_length=True,
            num_train_epochs=3
        )

        dataset = load_dataset("csv", data_files={"train": data_file}, split='train')

        # Drop the raw input columns; the mapped function emits the tokenized
        # fields (including a new "labels" list) in their place.
        cols = ['Smiles', 'labels']
        train_dataset = dataset.shuffle(seed=seed).map(
            generate_and_tokenize_prompt, remove_columns=cols
        )
        accelerator = Accelerator()

        # NOTE(review): the dataset is already tokenized and has no "text"
        # column; dataset_text_field is kept as in the original call. The
        # recorded output also shows a deprecation warning recommending
        # SFTConfig for these arguments.
        trainer = SFTTrainer(
            model = model,
            train_dataset = train_dataset,
            dataset_text_field="text",
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            args=training_arguments,
        )

        # NOTE(review): the trainer presumably manages device placement itself,
        # which would make this call redundant - kept to preserve behaviour; confirm.
        model, trainer = accelerator.prepare(model, trainer)

        trainer.train()

        trainer.model.save_pretrained(save_dir)
        # The tokenizer was extended with a '[PAD]' token and the model's
        # embeddings were resized to match, so save the tokenizer alongside the
        # weights to keep the two vocabularies consistent when reloading.
        tokenizer.save_pretrained(save_dir)
# Run fine-tuning with the model and tokenizer loaded earlier in the notebook.
training_function(model,tokenizer)

Map:   0%|          | 0/11357 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss
50,2.4615
100,1.82
150,1.7385
200,1.6417
250,1.6143
300,1.5017
350,1.4115
400,1.4804
450,1.4531
500,1.3852
