Prebuilt Data From huggingface data hub

In [58]:
from datasets import Dataset,load_dataset
from peft import LoraConfig, get_peft_model, TaskType,prepare_model_for_kbit_training

In [59]:
dataset=load_dataset('roneneldan/TinyStories' , split='train')

#loading dataset

In [60]:
dataset

Dataset({
    features: ['text'],
    num_rows: 2119719
})

In [61]:
dataset[1]

{'text': 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn.\n\nBeep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after.'}

In [62]:
#using own pdf for domain specific finetuning
#Steps
#Data Collection
# splitting or chunking
# tokenization
# training

import fitz
pages_text = []
def extract_pdf_to_list_fitz(pdf_path):
    doc = fitz.open(pdf_path)
   

    for page in doc:
        text = page.get_text("text").strip()
        if text:
            pages_text.append(text)
    
    doc.close()
    return pages_text

file_path = 'Metformin.pdf'
pdf_content_list_fitz = extract_pdf_to_list_fitz(file_path)
print(f"Total pages extracted with fitz: {len(pdf_content_list_fitz)}")

Total pages extracted with fitz: 1


In [63]:
pages_text

['Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis. \n \nClinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits hepatic HMG-CoA red

In [64]:
#convert to chunks
import re
def chunk_paragraphs(txt):

    paragraph = []
    for block in txt:
        chunks=re.split(r'\n\s*\n', block)
        for chunk in chunks:
            clean=chunk.strip()
            if len(clean)>30:
                paragraph.append(clean)
    return paragraph

para=chunk_paragraphs(pages_text)

In [65]:
para

['Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis.',
 'Clinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits hepatic HMG-CoA redu

In [66]:
#convertin to list
data=[{"text":p} for p in para]

In [67]:
dataset=Dataset.from_list(data)

In [68]:
dataset[0]

{'text': 'Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis.'}

In [69]:
#select the model

base_model='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T'



In [70]:
from transformers import AutoTokenizer,AutoModelForCausalLM,Trainer,TrainingArguments,DataCollatorForLanguageModeling


In [71]:
tokenizer=AutoTokenizer.from_pretrained(base_model)


In [72]:
if tokenizer.pad_token is None:
    tokenizer.pad_token=tokenizer.eos_token

In [73]:
#tokenization function
def tokenize_fn(examples):
    tokens=tokenizer(examples['text'],truncation=True,padding="max_length",max_length=512)
    tokens['labels']=tokens['input_ids'].copy()
    return tokens

In [74]:
tokenized=dataset.map(tokenize_fn,batched=True,remove_columns=['text'])

Map: 100%|██████████| 4/4 [00:00<00:00, 68.31 examples/s]


In [75]:
tokenized[0]

{'input_ids': [1,
  4737,
  689,
  262,
  338,
  697,
  310,
  278,
  1556,
  17644,
  2225,
  23059,
  470,
  284,
  9418,
  24947,
  16808,
  19335,
  293,
  19518,
  29889,
  30166,
  13,
  8011,
  7601,
  13336,
  310,
  3158,
  20789,
  278,
  26229,
  310,
  319,
  3580,
  29899,
  11236,
  630,
  26823,
  19015,
  559,
  29871,
  13,
  29898,
  19297,
  29968,
  511,
  263,
  6555,
  1539,
  19388,
  293,
  1072,
  9183,
  393,
  2504,
  4769,
  3144,
  1682,
  852,
  318,
  415,
  1296,
  322,
  9950,
  1017,
  22193,
  19100,
  333,
  362,
  29871,
  13,
  8000,
  297,
  6335,
  11407,
  540,
  29886,
  2454,
  3144,
  29884,
  535,
  29872,
  6352,
  6656,
  29889,
  30166,
  13,
  18502,
  898,
  967,
  330,
  368,
  19335,
  293,
  2761,
  29892,
  4737,
  689,
  262,
  756,
  1063,
  4318,
  304,
  11157,
  5881,
  29875,
  586,
  6151,
  1070,
  714,
  26807,
  29871,
  13,
  392,
  2479,
  9418,
  29899,
  13453,
  314,
  2922,
  706,
  4426,
  29889,
  30166,
  13,
  35

In [76]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [77]:

lora_config=LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    target_modules=['q_proj','v_proj'],
    lora_dropout=0.05,
    bias='none'
)

In [78]:
non_inst_model_lora = get_peft_model(model, lora_config)

In [79]:

args = TrainingArguments(
    output_dir="./tinyllama-lora",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=20,
    save_total_limit=1,
    optim="paged_adamw_8bit",
    report_to="none"
)

In [80]:
trainer = Trainer(
    model=non_inst_model_lora,
    args=args,
    train_dataset=tokenized
)
   

In [81]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=5, training_loss=9.54721450805664, metrics={'train_runtime': 24.8356, 'train_samples_per_second': 0.805, 'train_steps_per_second': 0.201, 'total_flos': 63629646888960.0, 'train_loss': 9.54721450805664, 'epoch': 5.0})

In [101]:
from peft import get_peft_model

# Assuming non_inst_model_lora is your LoRA-wrapped model
non_inst_model_lora.save_pretrained("./tinyllama-lora")


In [103]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure pad_token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load base model in 8-bit for efficiency
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map={"": "cuda"},  # Start by putting everything on GPU
    llm_int8_enable_fp32_cpu_offload=True  # Offload extra parts to CPU in 32-bit
)

# Load the LoRA adapter you trained
lora_model = PeftModel.from_pretrained(base_model, "./tinyllama-lora")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
lora_model.to(device)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Li

In [105]:
prompt = "Once upon a time in a faraway land,"

inputs = tokenizer(prompt, return_tensors="pt").to(device)


In [107]:
# Generate output
prompt = "Clinical trials demonstrated that combining Atorvastatin with Ezetimibe"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")



In [108]:
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.8,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.1
)
     

In [109]:
print("\nModel Output:\n")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Model Output:

Clinical trials demonstrated that combining Atorvastatin with Ezetimibe significantly improved LDL-C levels and atherosclerosis.
Vasopressin receptor antagonists are in development for the prevention of hyperkalemia due to heart failure and congestive heart failure. Vasopressin receptors antagonist have been tested in 2 clinical trials in patients with severe hyperkalemia (>4 mmol/L). Both drugs, vasopressin receptor antagonist (H
