In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import pandas as pd
import os
from pathlib import Path


model_name_or_path = "/mnt/nas1/models/llama/merged_models/llama2-7b-ner-chem_gene-e3s10"

# Default placement: let the loader spread the model over the available devices.
device_map = "auto"
# Under a distributed launcher (LOCAL_RANK is set) pin the whole model to this
# process's own device instead. Only the device map is changed here; no
# per-device memory limit is configured.
local_rank_env = os.environ.get('LOCAL_RANK')
if local_rank_env is not None:
    local_rank = int(local_rank_env)
    device_map = {'': local_rank}

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# 4-bit loading is requested through `quantization_config` alone. Passing
# `load_in_4bit=True` as a separate keyword at the same time is redundant and
# is rejected with a ValueError by recent transformers releases.
model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map=device_map,
        torch_dtype=torch.float16,
        # NOTE(review): trust_remote_code allows Python shipped with the
        # checkpoint to run; confirm it is actually required for this model.
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        ),
    )


# Every column is read as a string (dtype=str), so no numeric inference happens.
file = Path('/mnt/nas1/corpus-bio-nlp/NER/PGx_CTD_chem_x_gene.csv')
df_pgx_ctd = pd.read_csv(file, dtype=str)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [02:51<00:00, 57.14s/it]


In [2]:
# Instruction template sent to the model; `{sentence}` is filled per row below.
# NOTE(review): the pieces are joined with no separator, so "Haplotype." runs
# straight into "Chemical includes ..." and "disease." into "The output ...".
# Presumably this mirrors the prompt used for fine-tuning -- confirm before
# inserting spaces.
# Output formats tried previously (kept for reference, not active):
#   '<entity name, entity span, entity type>'
#   '<entity name, starting position of entity name, entity type>'
#   '<entity name, entity type>'
#   '<starting index in sentence, ending index in sentence, entity name, entity type>'
_prompt_parts = [
    "{sentence}\n",
    "---------------\n",
    "please extract all Chemical and Gene in the above text, ",
    "Gene includes gene or protein, excluding Limited variation, Genomic variation, Genomic factor, Haplotype.",
    "Chemical includes chemical and drug, excluding disease.",
    "The output format should be '<leading word in sentence, entity name, trailing word in sentence, entity type>' .",
]
question = "".join(_prompt_parts)

# Keep one row per distinct sentence, then attach the fully rendered prompt.
df_pgx_ctd = (
    df_pgx_ctd
    .drop_duplicates(subset=["sentence"])
    .assign(
        prompt=lambda frame: frame["sentence"].apply(
            lambda text: question.format(sentence=text)
        )
    )
)

In [12]:
def chat_ner(x, llm=None, tok=None, max_new_tokens=500):
    """Generate the NER answer for one prompt with greedy decoding.

    Args:
        x: Prompt text; surrounding whitespace is stripped before use.
        llm: Model whose ``generate`` method is called. Defaults to the
            notebook-level ``model``.
        tok: Tokenizer used to encode the prompt and decode the answer.
            Defaults to the notebook-level ``tokenizer``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens excluded) with every
        ``</s>`` marker removed and surrounding whitespace stripped.
    """
    llm = model if llm is None else llm
    tok = tokenizer if tok is None else tok

    # The <s>/</s> markers are written out literally, so the tokenizer is told
    # not to add its own special tokens.
    # NOTE(review): presumably this matches the fine-tuning template -- confirm.
    text = '<s>{}</s>'.format(x.strip())
    encoded = tok(text, return_tensors="pt", add_special_tokens=False)

    # Send the inputs to the device the model lives on. A bare `.cuda()` picks
    # the default CUDA device, which is wrong when the model has been pinned to
    # another GPU (e.g. via LOCAL_RANK) and fails outright without CUDA.
    target_device = llm.device
    input_ids = encoded.input_ids.to(target_device)
    attention_mask = encoded.attention_mask.to(target_device)

    with torch.no_grad():
        outputs = llm.generate(
            input_ids=input_ids, attention_mask=attention_mask,
            max_new_tokens=max_new_tokens, do_sample=False,
            top_p=1, temperature=1, repetition_penalty=1,
            eos_token_id=tok.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = input_ids.shape[1]
    new_tokens = outputs[0, prompt_len:].tolist()
    response = tok.decode(new_tokens)
    return response.replace('</s>', "").strip()

# Pull the two text columns out as plain lists so later cells can index them.
sentences = df_pgx_ctd["sentence"].tolist()
prompts = df_pgx_ctd["prompt"].tolist()
print(len(prompts))

# Smoke test: show the source sentence and the model's answer for the first
# two prompts only.
for i in (0, 1):
    print(sentences[i])
    input1 = prompts[i]
    r = chat_ner(input1)
    print(r)


1655
Among controls , we found women with the A2/A2 genotype to have elevated levels of estrone ( +14.3 % , P = 0.01 ) , estradiol ( +13.8 % , P = 0.08 ) , testosterone ( +8.6 % , P = 0.34 ) , androstenedione ( +17.1 % , P = 0.06 ) , dehydroepiandrosterone ( +14.4 % , P = 0.02 ) , and dehydroepiandrosterone sulfate ( +7.2 % , P = 0.26 ) compared with women with the A1/A1 genotype .
<of, A2/A2, genotype, Gene>, <of, estrone, (, Chemical)>, <of, estradiol, (, Chemical)>, <of, testosterone, (, Chemical)>, <of, androstenedione, (, Chemical)>, <of, dehydroepiandrosterone, (, Chemical)>, <of, dehydroepiandrosterone sulfate, (, Chemical)>
PACAP -induced expression of the c-fos gene was significantly reduced by pretreatment with a PACAP receptor antagonist , PACAP - ( 6-38 ) - NH2 .
<PACAP, -induced, expression, Gene>, <of, the, c-fos, Gene>, <by, pretreatment, with, a, PACAP, Chemical>, <PACAP, - (, 6-38, Chemical>, <(, 6-38, Chemical>, Chemical>


In [4]:
# Second checkpoint, loaded next to the first one so the outputs can be compared.
model_name_or_path_e3 = "/mnt/nas1/models/llama/merged_models/llama2-7b-ner-chem_gene-e3s11"

tokenizer_e3 = AutoTokenizer.from_pretrained(model_name_or_path_e3)
# 4-bit loading is requested through `quantization_config` alone. Passing
# `load_in_4bit=True` as a separate keyword at the same time is redundant and
# is rejected with a ValueError by recent transformers releases.
# `device_map` is the placement chosen when the first model was loaded.
model_e3 = AutoModelForCausalLM.from_pretrained(
        model_name_or_path_e3,
        device_map=device_map,
        torch_dtype=torch.float16,
        # NOTE(review): trust_remote_code allows Python shipped with the
        # checkpoint to run; confirm it is actually required for this model.
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        ),
    )


Loading checkpoint shards: 100%|██████████| 3/3 [02:38<00:00, 52.87s/it]


<of, estrone, (, Chemical>, <of, estradiol, (, Chemical>, <of, testosterone, (, Chemical>, <of, androstenedione, (, Chemical>, <of, dehydroepiandrosterone, (, Chemical>, <of, dehydroepiandrosterone sulfate, (, Chemical></s>
<of, estrone, (, Chemical>, <of, estradiol, (, Chemical>, <of, testosterone, (, Chemical>, <of, androstenedione, (, Chemical>, <of, dehydroepiandrosterone, (, Chemical>, <of, dehydroepiandrosterone sulfate, (, Chemical>


In [14]:
def chat_ner2(x, llm=None, tok=None, max_new_tokens=500):
    """Generate the NER answer for one prompt with the second checkpoint.

    Args:
        x: Prompt text; surrounding whitespace is stripped before use.
        llm: Model whose ``generate`` method is called. Defaults to the
            notebook-level ``model_e3``.
        tok: Tokenizer used to encode the prompt and decode the answer.
            Defaults to the notebook-level ``tokenizer_e3``, i.e. the
            tokenizer loaded from the same checkpoint as ``model_e3``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens excluded) with every
        ``</s>`` marker removed and surrounding whitespace stripped.
    """
    # Pair the model with its own tokenizer rather than the one that was
    # loaded for the other checkpoint.
    llm = model_e3 if llm is None else llm
    tok = tokenizer_e3 if tok is None else tok

    # The <s>/</s> markers are written out literally, so the tokenizer is told
    # not to add its own special tokens.
    # NOTE(review): presumably this matches the fine-tuning template -- confirm.
    text = '<s>{}</s>'.format(x.strip())
    encoded = tok(text, return_tensors="pt", add_special_tokens=False)

    # Send the inputs to the device the model lives on. A bare `.cuda()` picks
    # the default CUDA device, which is wrong when the model has been pinned to
    # another GPU (e.g. via LOCAL_RANK) and fails outright without CUDA.
    target_device = llm.device
    input_ids = encoded.input_ids.to(target_device)
    attention_mask = encoded.attention_mask.to(target_device)

    with torch.no_grad():
        outputs = llm.generate(
            input_ids=input_ids, attention_mask=attention_mask,
            max_new_tokens=max_new_tokens, do_sample=False,
            top_p=1, temperature=1, repetition_penalty=1,
            eos_token_id=tok.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = input_ids.shape[1]
    new_tokens = outputs[0, prompt_len:].tolist()
    response = tok.decode(new_tokens)
    return response.replace('</s>', "").strip()

# Same two-prompt smoke test as before, this time answered by chat_ner2.
print(len(prompts))
i = 0
while i < 2:
    input1 = prompts[i]
    print(sentences[i])
    r = chat_ner2(input1)
    print(r)
    i += 1

1655
Among controls , we found women with the A2/A2 genotype to have elevated levels of estrone ( +14.3 % , P = 0.01 ) , estradiol ( +13.8 % , P = 0.08 ) , testosterone ( +8.6 % , P = 0.34 ) , androstenedione ( +17.1 % , P = 0.06 ) , dehydroepiandrosterone ( +14.4 % , P = 0.02 ) , and dehydroepiandrosterone sulfate ( +7.2 % , P = 0.26 ) compared with women with the A1/A1 genotype .
<of, estrone, (, Chemical>, <of, estradiol, (, Chemical>, <of, testosterone, (, Chemical>, <of, androstenedione, (, Chemical>, <of, dehydroepiandrosterone, (, Chemical>, <of, dehydroepiandrosterone sulfate, (, Chemical>
PACAP -induced expression of the c-fos gene was significantly reduced by pretreatment with a PACAP receptor antagonist , PACAP - ( 6-38 ) - NH2 .
<of, PACAP, -induced, Gene>, <(, PACAP, - (, Chemical>, <(, PACAP, -, Chemical>, <(, PACAP, -, NH2, Chemical>
