Prebuilt data from the Hugging Face Hub

In [33]:
from datasets import Dataset,load_dataset

In [34]:
# Load the 'train' split of the TinyStories dataset.
dataset = load_dataset(
    "roneneldan/TinyStories",
    split="train",
)

In [35]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['text'],
    num_rows: 2119719
})

In [36]:
# Peek at a single record to see what the 'text' field looks like.
dataset[1]

{'text': 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn.\n\nBeep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after.'}

In [37]:
# Use our own PDF for domain-specific fine-tuning.
# Steps:
#   1. Data collection
#   2. Splitting / chunking
#   3. Tokenization
#   4. Training

import fitz
def extract_pdf_to_list_fitz(pdf_path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A new list with one stripped text string per page, in page order.
        Pages whose text is empty after stripping are skipped.
    """
    # Build a fresh list on every call. Appending to a module-level list made
    # re-running this cell (or calling the function twice) accumulate
    # duplicate pages.
    extracted = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text("text").strip()
            if text:
                extracted.append(text)
    finally:
        # Release the document even if text extraction raises.
        doc.close()
    return extracted

file_path = 'Metformin.pdf'
pdf_content_list_fitz = extract_pdf_to_list_fitz(file_path)
# Later cells read `pages_text`, so keep that name bound to this run's pages.
pages_text = pdf_content_list_fitz
print(f"Total pages extracted with fitz: {len(pdf_content_list_fitz)}")

Total pages extracted with fitz: 1


In [38]:
# Show the extracted page texts.
pages_text

['Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis. \n \nClinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits hepatic HMG-CoA red

In [39]:
#convert to chunks
import re
def chunk_paragraphs(txt, min_chars=30):
    """Split text blocks into paragraph chunks on blank lines.

    A paragraph boundary is a newline, optional whitespace, then another
    newline. Each chunk is stripped of surrounding whitespace.

    Args:
        txt: A list of text blocks (for example one string per page). A single
            string is also accepted and treated as one block.
        min_chars: Chunks are kept only when their stripped length is strictly
            greater than this value. Defaults to 30.

    Returns:
        A list of stripped paragraph strings, in order of appearance.
    """
    if isinstance(txt, str):
        # Iterating a bare string would walk it character by character and
        # silently produce no chunks, so wrap it as a single block.
        txt = [txt]

    paragraph = []
    for block in txt:
        for chunk in re.split(r'\n\s*\n', block):
            clean = chunk.strip()
            if len(clean) > min_chars:
                paragraph.append(clean)
    return paragraph

# Break the extracted page texts into paragraph-level chunks.
para = chunk_paragraphs(pages_text)

In [40]:
# Show the resulting paragraph chunks.
para

['Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis.',
 'Clinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits hepatic HMG-CoA redu

In [41]:
# Convert each paragraph into a {"text": ...} record (a list of dicts).
data = [{"text": paragraph} for paragraph in para]

In [42]:
# Build a Dataset from the PDF records. This rebinds `dataset`, which
# previously held the TinyStories split.
dataset = Dataset.from_list(data)

In [43]:
# Check the first record of the PDF-derived dataset.
dataset[0]

{'text': 'Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis.'}

In [44]:
# Checkpoint identifier passed to the tokenizer and model loaders.
base_model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"



In [45]:
from transformers import AutoTokenizer,AutoModelForCausalLM,Trainer,TrainingArguments,DataCollatorForLanguageModeling


In [46]:
# Load the tokenizer that matches the selected base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)


In [47]:
# Padding needs a pad token; fall back to the EOS token when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [48]:
#tokenization function
def tokenize_fn(examples):
    """Tokenize ``examples['text']`` and attach causal-LM labels.

    Text is truncated and padded to 512 tokens. Labels mirror ``input_ids``
    except at padded positions (``attention_mask == 0``), which are set to
    -100 so the loss ignores them. Masking by attention mask rather than by
    token id keeps real EOS tokens trainable even when the pad token has been
    aliased to EOS.

    Args:
        examples: A mapping with a 'text' entry holding either one string or a
            list of strings (the batched form used by ``Dataset.map``).

    Returns:
        The tokenizer output with an added 'labels' entry.
    """
    tokens=tokenizer(examples['text'],truncation=True,padding="max_length",max_length=512)

    def _mask_padding(ids, mask):
        # Keep the token id where the mask is set; otherwise use -100.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids=tokens['input_ids']
    attention=tokens['attention_mask']
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        tokens['labels']=[_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)]
    else:
        # Single-example call: a flat list of ids.
        tokens['labels']=_mask_padding(input_ids, attention)
    return tokens

In [49]:
# Tokenize in batches and drop the raw 'text' column.
tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

Map: 100%|██████████| 4/4 [00:00<00:00, 619.70 examples/s]


In [50]:
# Confirm the tokenized columns and row count.
tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [51]:
# Load the pretrained causal language model for the selected checkpoint.
model = AutoModelForCausalLM.from_pretrained(base_model)