_A GPU is required to fine-tune the model with the code below._

# __DATA__

We will use the "Amod/mental_health_counseling_conversations" dataset from Hugging Face to fine-tune our LLM.

In [2]:
from datasets import load_dataset

# Hub id of the counselling-conversations dataset used for fine-tuning.
DATASET_ID = "Amod/mental_health_counseling_conversations"

# Fetch the dataset from the Hugging Face Hub; later cells read its 'train' split.
dataset = load_dataset(DATASET_ID)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 2.82k/2.82k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 4.79M/4.79M [00:02<00:00, 2.19MB/s]
Generating train split: 100%|██████████| 3512/3512 [00:00<00:00, 164227.61 examples/s]


In [None]:
# Display the loaded dataset object to inspect what was downloaded.
dataset

_Data Processing_

In [None]:
import pandas as pd

# Copy the train split into a pandas DataFrame so it can be processed row by row.
train_split = dataset['train']
df = pd.DataFrame(train_split)

# Preview the first few rows.
df.head()

We will use the "microsoft/Phi-3-mini-4k-instruct" model from Hugging Face as the base LLM to fine-tune.

For this model, the recommended prompt template is:

    <|user|>
    How to explain Internet for a medieval knight?<|end|>
    <|assistant|>


Hence, we will format the data in the above template.

In [None]:
# creating a function to format the data

def format_dataset(row):

  '''
  Build one Phi-3 chat-formatted training string from a single row.

  This function operates row-wise (e.g. via DataFrame.apply with axis=1).

  input : row of the dataframe; must provide the 'Context' (user message)
          and 'Response' (assistant reply) fields
  output : formatted string with a user turn followed by an assistant turn.
           Each role tag is followed by a line break and each turn is
           closed with the <|end|> marker, matching the Phi-3 chat template.

  '''

  context = row['Context']
  response = row['Response']

  # The Phi-3 template puts a newline after every role tag, including
  # <|assistant|>; the reply must therefore start on its own line.
  return f"<|user|>\n{context}<|end|>\n<|assistant|>\n{response}<|end|>"

In [None]:
# Run format_dataset on every row and keep the results in a new 'formatted' column.
formatted_rows = df.apply(format_dataset, axis='columns')
df['formatted'] = formatted_rows

In [None]:
# print (rather than a bare expression) so the line breaks in the template are rendered.
print(df.loc[0, 'formatted'])

In [None]:
# Take the formatted column from the DataFrame and attach it to the
# train split of the dataset as a new 'text' column.
formatted_column = df['formatted']
train_with_text = dataset['train'].add_column('text', formatted_column)
dataset['train'] = train_with_text

In [None]:
# Display the dataset again to confirm the 'text' column is now present.
dataset

In [None]:
# Only the 'text' column is used for training, so the raw
# 'Context' and 'Response' columns are dropped from the train split.
raw_columns = ['Context', 'Response']
dataset['train'] = dataset['train'].remove_columns(raw_columns)

In [None]:
# Show one formatted training example (row 2) as a sanity check.
dataset['train'][2]['text']

# __MODEL__

In [None]:
# Load the base model and its tokenizer, with the model weights quantised to 4 bit.

#imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub id of the base model
model_id = "microsoft/Phi-3-mini-4k-instruct"

# BitsAndBytes settings for 4-bit loading:
# NF4 quantisation type, double quantisation, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# A GPU is mandatory for quantisation; without one the load below throws an error,
# so make sure the runtime is connected to a GPU.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Gradient checkpointing trades speed for memory: only a subset of the
# activations is kept during the forward pass and the rest are recomputed
# during the backward pass. Training is slower, but peak GPU memory is lower.
# (Written as a comment, not a trailing string, so the cell echoes no output.)
model.gradient_checkpointing_enable()

In [None]:
# The model was loaded in quantised form, which can make training unstable.
# prepare_model_for_kbit_training() from the peft library readies such a model
# for PEFT (Parameter Efficient Fine Tuning).

# import
from peft import prepare_model_for_kbit_training

# `model` is the 4-bit model loaded above; the prepared model is kept under a new name.
model_peft = prepare_model_for_kbit_training(model)

In [None]:
# LoRA setup: describe the adapter, then wrap the prepared model with it.

# imports
from peft import LoraConfig, get_peft_model

# Adapter hyper-parameters, collected in one place so they are easy to tune.
lora_settings = dict(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
)
lora_config = LoraConfig(**lora_settings)

# Apply the LoRA configuration to the k-bit-prepared model.
model_for_training = get_peft_model(model_peft, lora_config)

__Tokenizing the data__

In [None]:
# creating tokenization function

def tokenize_function(examples):
  '''
  Tokenize the 'text' field of a batch of examples.

  input : batch from Dataset.map(batched=True); examples['text'] is a list of strings
          (a single string also works)
  output : tokenizer encoding (input_ids, attention_mask, ...) truncated to the
           tokenizer's maximum length

  No padding is applied here. In the original per-example map, padding=True
  had no effect (a lone sequence is already the longest in its "batch");
  padding is done per training batch by the data collator instead.
  '''

  return tokenizer(examples['text'], truncation = True)


# Assigning a new variable to dataset['train']

dt = dataset['train']


# mapping the tokenizing function to the dataset
# batched=True tokenizes many rows per call instead of one row at a time.

tokenized_data = dt.map(tokenize_function, batched = True)

# __TRAINING__

In [None]:
# For training we use Trainer from the transformers library,
# configured through TrainingArguments.

# setting an output directory path

output_dir = "./training_results"


# import

from transformers import TrainingArguments

training_args = TrainingArguments(

    bf16 = True,
    do_eval = False,
    learning_rate = 5.0e-06,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,   # 1 x 4 = effective batch of 4 sequences per device
    num_train_epochs=1,                # has no effect while max_steps > 0 (max_steps takes precedence)
    output_dir=output_dir,
    save_total_limit=3,
    logging_steps=1,
    max_steps = 80,                    # training stops after 80 optimizer steps
    optim = "paged_adamw_8bit",
    lr_scheduler_type = "cosine",
    warmup_ratio = 0.03
)



# trainer

# import

from transformers import Trainer, DataCollatorForLanguageModeling

# DataCollatorForLanguageModeling(mlm=False) assembles each training batch for
# causal language modelling: it pads the tokenized examples of the batch to a
# common length and builds `labels` from `input_ids`, with padding positions
# masked out of the loss. It does not augment the data.
trainer = Trainer(

    model = model_for_training,
    train_dataset = tokenized_data,
    args = training_args,
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

In [None]:
# Turn off the model's cache flag before training; gradient checkpointing was
# enabled on the model earlier in this notebook.
# NOTE(review): presumably this avoids the use_cache / gradient-checkpointing incompatibility warning — confirm.
model_for_training.config.use_cache = False

# training the model with the Trainer configured in the previous cell
trainer.train()

## __Saving the model__

In [None]:
# Write the trained PEFT model to ./trained_model.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the adapter weights and config, not the base model — confirm.
model_for_training.save_pretrained("./trained_model")

__Pushing it to the Hugging Face Hub__

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook so the upload below is authenticated.
notebook_login()

In [None]:
# Upload the trained model to the Hugging Face Hub (replace the placeholder with your repo id).
# `token=True` uses the credentials stored by notebook_login(); it replaces the
# deprecated `use_auth_token` argument.
model_for_training.push_to_hub("[YOUR HUGGING FACE REPOSITORY]", token=True)

## __Loading the trained model__

In [None]:
# Hub repository that holds the trained adapter (replace the placeholder with your repo id)
peft_model = "[YOUR HUGGING FACE REPOSITORY]"


#import

from peft import PeftConfig

# The adapter config records which base model the adapter was trained on.
config = PeftConfig.from_pretrained(peft_model)


# model
# Load a fresh copy of the base model, quantised with the same bnb_config used for training.

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)


# tokenizer

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)



# peft model

from peft import PeftModel

# Attach the trained adapter to the base model loaded just above.
# Wrapping `model_for_training` here would be wrong: it is already a PEFT-wrapped
# model from the training session and does not exist after a kernel restart.
model = PeftModel.from_pretrained(model, peft_model)
