In [1]:
#INstruction finetuning
from transformers import AutoTokenizer, AutoModelForCausalLM,TrainingArguments,Trainer
from peft import LoraConfig,get_peft_model,TaskType
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T'

In [3]:
tokenizer=AutoTokenizer.from_pretrained(model)

if tokenizer.pad_token is None:
    tokenizer.pad_token=tokenizer.eos_token

In [4]:
model_path='./tinyllama-lora/checkpoint-5'

In [5]:
non_instruction_model=AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')

In [6]:
prompt='Clinical trials demonstrated that combining atrovastratin with'


In [7]:
inputs=tokenizer(prompt,return_tensors='pt').to("cuda")

In [8]:
outputs=non_instruction_model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.5,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.1
)

In [9]:
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

Clinical trials demonstrated that combining atrovastratin with a statin significantly reduced the risk of cardiovascular events compared to placebo in patients with dyslipidemia.
Atrovastratin is currently being studied for the prevention of cancer and for the treatment of breast cancer, prostate cancer, ovarian cancer and other cancers. Atrovastratin is also being studied for the prevention of diabetes mellitus and for the treatment of type 2 diabetes.
The


In [10]:
from datasets import load_dataset

dataset=load_dataset('Amod/mental_health_counseling_conversations',split='train')

In [11]:
dataset['Context']

Column(["I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?", "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?", "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to ev

In [12]:
def format(data):
    question=data['Context']
    answer=data['Response']
    data['Text']=f"[INST] {question} [/INST] {answer}"
    return data

In [13]:
formatted_dataset=dataset.map(format)

In [None]:
formatted_dataset

Dataset({
    features: ['Context', 'Response', 'Text'],
    num_rows: 3512
})

In [14]:
import pandas as pd
df=pd.DataFrame(dataset)
df

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...
...,...,...
3507,My grandson's step-mother sends him to school ...,Absolutely not! It is never in a child's best ...
3508,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...
3509,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit..."
3510,I think adult life is making him depressed and...,How do you help yourself to believe you requir...


In [15]:
#loading dataset

from datasets import load_dataset

dataset = load_dataset("csv", data_files="./pharma_instruction_data.csv",split="train")
dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 5
})

In [16]:

def format_example(example):
    prompt = f"### Instruction:\n{example['instruction']}\n### Input:\n{example['input']}\n### Response:\n{example['output']}"
    return {"text": prompt}

In [17]:
dataset = dataset.map(format_example)

In [18]:
dataset['text'][3]

'### Instruction:\nSummarize the key advantages and ongoing research directions for mRNA vaccines.\n### Input:\nThe success of mRNA vaccines against SARS-CoV-2 has opened new pathways for rapid vaccine development. mRNA platforms enable flexible design and quick adaptation to emerging viral variants such as BQ.1 and XBB.1.5. Phase-II clinical trials have shown strong immunogenicity with elevated neutralizing antibody titers and robust CD8⁺ T-cell responses. Ongoing research is exploring thermostable formulations and self-amplifying mRNA constructs to enhance global distribution and cost-efficiency.\n### Response:\nmRNA platforms enable rapid, flexible vaccine design and quick variant updates (e.g., BQ.1, XBB.1.5). Phase-II data show strong immunogenicity with robust neutralizing antibodies and CD8⁺ T-cell responses. Current work targets thermostable and self-amplifying formulations to improve distribution and cost.'

In [19]:

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [20]:
def tokenize_fn(example):
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


tokenized = dataset.map(tokenize_fn, batched=True)


In [None]:
tokenized

Dataset({
    features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})

In [21]:
from peft import LoraConfig,get_peft_model,TaskType

In [22]:
lora_config=LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=16,lora_dropout=0.05,target_modules=["q_proj","v_proj"],bias="none")

In [23]:
model=get_peft_model(non_instruction_model,lora_config)



In [24]:
model.print_trainable_parameters()


trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [25]:
args = TrainingArguments(
    output_dir="./tinyllama-instruction",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=20,
    save_total_limit=1,
    report_to="none"
)

In [26]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
)


The model is already on multiple devices. Skipping the move to device specified in `args`.


In [27]:
trainer.train()
trainer.save_model("/tinyllama-instruction")

Step,Training Loss


In [None]:

prompt = "Explain the mechanism of action of Metformin."

In [None]:
from peft import PeftModel
base_model_name='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T'
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    device_map="auto",
    llm_int8_enable_fp32_cpu_offload=True
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [29]:
model = PeftModel.from_pretrained(
    model,
    "/tinyllama-instruction"
)


In [30]:
model = model.merge_and_unload()




In [31]:
model.save_pretrained("./tinyllama-instruction-final")
tokenizer.save_pretrained("./tinyllama-instruction-final")


('./tinyllama-instruction-final\\tokenizer_config.json',
 './tinyllama-instruction-final\\special_tokens_map.json',
 './tinyllama-instruction-final\\tokenizer.json')

In [37]:
import torch
model = AutoModelForCausalLM.from_pretrained(
    "./tinyllama-instruction-final",
    device_map="auto",
    torch_dtype=torch.float16
)

prompt = "From the passage, extract two benefits and two challenges of AI in pharmaceutical R&D"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.8,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.1
)



In [38]:
print("\nModel Output:\n")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Model Output:

From the passage, extract two benefits and two challenges of AI in pharmaceutical R&D.
2019-05-07 The article presents the advantages of using artificial intelligence (AI) for drug development and the challenges associated with it.
