<a href="https://colab.research.google.com/github/roshjaison03/roshjaison03-Fine-tuned-Models-using-Unsloth-Framework-/blob/main/Training_of_Bert_base_uncased_with_emotional_response_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install bitsandbytes datasets transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig

# One checkpoint id feeds both the tokenizer and the model so they stay in sync.
checkpoint = "google-bert/bert-base-uncased"

# 4-bit quantization settings handed to bitsandbytes.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # "fp4" is the alternative quantization type
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
)
bnb_config = BitsAndBytesConfig(**quantization_options)

# Tokenizer first, then the masked-LM model loaded with the 4-bit config.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(
    checkpoint,
    device_map="auto",  # let the loader decide device placement
    quantization_config=bnb_config,
)



In [None]:
# `get_peft_model` is called below, so it has to be imported here as well.
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings: rank-16 adapters (scaled by alpha=32) on the
# attention query/key/value projections and on every module named "dense".
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value", "key", "dense"],  # Adjust based on your model
    lora_dropout=0.05,
    bias="none",
    # task_type is deliberately left unset: PEFT's TaskType has no masked-LM
    # entry, and TaskType.CAUSAL_LM would wrap this BERT masked-LM model in
    # PeftModelForCausalLM. Without a task type, get_peft_model returns the
    # generic PeftModel wrapper, which simply forwards to the base model.
)

# Wrap the quantized base model so that only the LoRA adapter weights train.
model = get_peft_model(model, peft_config)



In [None]:
import pandas as pd

# Read the raw CSV into a DataFrame.
df = pd.read_csv("/content/converted_dataset.csv")


def format_row(row):
    """Return the row's ``input`` and ``label`` values, each rendered as a string."""
    return f"{row['input']}", f"{row['label']}"


# Run format_row over every row, then write the two resulting columns back
# over the existing 'input' and 'label' columns.
formatted_columns = df.apply(lambda row: pd.Series(format_row(row)), axis=1)
df[['input', 'label']] = formatted_columns

# Show the first few formatted rows.
print(df[['input', 'label']].head())


First input: I have no energy.  The thought of working out now is out of the question.  I used to bike and play tennis but that seems a lifetime ago.  I would love to feel alive again, but I am just so tired all the time.
First label: Getting back to regular physical activity is really appealing to you but also seems a bit out of reach. Youâ€™re excited to get back to a place where you feel invigorated again.


In [None]:
# Tag pairs that the system prompt refers to.
reasoning_start, reasoning_end = "<start_working_out>", "<end_working_out>"
solution_start, solution_end = "<SOLUTION>", "</SOLUTION>"

# Two-line system instruction; the tags above are interpolated into the text.
system_prompt = (
    "You have to talk like a person where you are trying to understand "
    "emotions of a person understand the cause of it "
    f"{reasoning_start} and {reasoning_end}.\n"
    f"Then, provide your response empathetically {solution_start} and {solution_end}."
)

system_prompt

'You have to talk like a person where you are trying to understand emotions of a person understand the cause of it <start_working_out> and <end_working_out>.\nThen, provide your response empathetically <SOLUTION> and </SOLUTION>.'

In [None]:
def _row_to_example(row):
    """Turn one DataFrame row into a ``{"prompt": [...], "answer": str}`` record.

    The prompt holds the shared system message followed by the row's
    whitespace-stripped ``input``; the answer is the row's ``label`` with
    surrounding spaces, brackets, quotes, commas and newlines stripped.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": row["input"].strip()},
    ]
    return {"prompt": messages, "answer": row["label"].strip(" []',\"\n")}


# One record per row, collected into a plain Python list of dicts.
dataset = df.apply(_row_to_example, axis=1).tolist()

# Preview the first example
print(dataset[0])

{'prompt': [{'role': 'system', 'content': 'You have to talk like a person where you are trying to understand emotions of a person understand the cause of it <start_working_out> and <end_working_out>.\nThen, provide your response empathetically <SOLUTION> and </SOLUTION>.'}, {'role': 'user', 'content': "I will have to look at that.  I've not been in the job market for 25 years.  I thought they may be able to provide some assistance in just where do I begin. 4\nI'm a teacher at a private school.  4"}], 'answer': 'You want to know where to start.  Sounds like you are young.'}


In [None]:
# `Dataset` was used without ever being imported, which raises NameError on a
# fresh kernel; import it in the cell that needs it.
from datasets import Dataset

# Wrap the list of {"prompt", "answer"} records in a Hugging Face Dataset.
train_dataset = Dataset.from_list(dataset)
print(train_dataset[0])

{'prompt': [{'content': 'You have to talk like a person where you are trying to understand emotions of a person understand the cause of it <start_working_out> and <end_working_out>.\nThen, provide your response empathetically <SOLUTION> and </SOLUTION>.', 'role': 'system'}, {'content': "I will have to look at that.  I've not been in the job market for 25 years.  I thought they may be able to provide some assistance in just where do I begin. 4\nI'm a teacher at a private school.  4", 'role': 'user'}], 'answer': 'You want to know where to start.  Sounds like you are young.'}


In [None]:
import warnings

import torch


class MLMDataCollator:
    """Collate chat-style examples into masked-language-modelling batches.

    Each example is a dict with a ``"prompt"`` list of ``{"role", "content"}``
    messages and an ``"answer"`` string. The messages and the answer are
    flattened into one text, tokenized, and then a random subset of the
    ordinary tokens is corrupted so the model has something to predict.

    Args:
        tokenizer: Tokenizer that is callable on a list of strings and exposes
            ``pad_token``, ``eos_token``, ``mask_token_id`` and ``__len__``.
        max_length: Maximum number of tokens per sequence. Clamped (with a
            warning) to ``tokenizer.model_max_length`` when that is smaller.
        mlm_probability: Probability with which each ordinary token is
            selected as a prediction target.

    Raises:
        ValueError: If ``mlm_probability`` is outside ``(0, 1]`` or the
            tokenizer has no mask token.
    """

    IGNORE_INDEX = -100  # label value ignored by PyTorch's cross-entropy loss

    def __init__(self, tokenizer, max_length=512, mlm_probability=0.15):
        if not 0.0 < mlm_probability <= 1.0:
            raise ValueError(
                f"mlm_probability must be in (0, 1], got {mlm_probability!r}"
            )
        if getattr(tokenizer, "mask_token_id", None) is None:
            raise ValueError("MLM training requires a tokenizer with a mask token")

        # Sequences longer than the model's limit cannot be embedded, so never
        # ask the tokenizer for more tokens than the model supports.
        model_limit = getattr(tokenizer, "model_max_length", None)
        if isinstance(model_limit, int) and max_length > model_limit:
            warnings.warn(
                f"max_length={max_length} exceeds the tokenizer's "
                f"model_max_length={model_limit}; using {model_limit} instead."
            )
            max_length = model_limit

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mlm_probability = mlm_probability

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def __call__(self, batch):
        """Tokenize ``batch`` and return masked ``input_ids``, ``attention_mask`` and ``labels``."""
        texts = [
            f"{self._format_prompt(example['prompt'])}\nAnswer: {example['answer']}"
            for example in batch
        ]

        encodings = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
            return_tensors="pt",
        )
        attention_mask = encodings["attention_mask"]

        # Special tokens and padding must never be masked or scored.
        protected = encodings["special_tokens_mask"].bool() | (attention_mask == 0)
        input_ids, labels = self._mask_tokens(encodings["input_ids"], protected)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    def _mask_tokens(self, input_ids, protected):
        """Corrupt a random subset of tokens and build the matching labels.

        Of the selected positions, 80% become the mask token, 10% become a
        random vocabulary id and 10% are left unchanged. Labels keep the
        original id at selected positions and ``IGNORE_INDEX`` everywhere else.
        The caller's ``input_ids`` tensor is not modified.
        """
        input_ids = input_ids.clone()
        labels = input_ids.clone()

        selection_prob = torch.full(labels.shape, self.mlm_probability)
        selection_prob.masked_fill_(protected, 0.0)
        selected = torch.bernoulli(selection_prob).bool()
        labels[~selected] = self.IGNORE_INDEX

        # 80% of the selected positions are replaced by the mask token.
        use_mask = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & selected
        input_ids[use_mask] = self.tokenizer.mask_token_id

        # Half of the remaining selected positions get a random token; the
        # rest keep their original id.
        use_random = (
            torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
            & selected
            & ~use_mask
        )
        random_ids = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        input_ids[use_random] = random_ids[use_random]

        return input_ids, labels

    def _format_prompt(self, prompt_list):
        """Convert prompt list to formatted string."""
        lines = (
            f"{message['role'].capitalize()}: {message['content']}"
            for message in prompt_list
        )
        return "\n".join(lines).strip()



In [None]:
# bert-base-uncased has 512 position embeddings, so sequences must be
# truncated to at most 512 tokens; 1024 would overflow the position table.
data_collator = MLMDataCollator(tokenizer, max_length=512)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="/content/drive/MyDrive/empathetic_model/results",
    logging_dir="/content/drive/MyDrive/empathetic_model/logs",
    # Optimisation: 2 examples per device x 4 accumulation steps per update.
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,  # if on Colab GPU
    # Logging / checkpointing cadence; no evaluation or external reporting.
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="no",
    report_to="none",
    # The custom collator reads the raw "prompt" and "answer" fields, so the
    # Trainer must not drop them.
    remove_unused_columns=False,
)


In [None]:
from transformers import Trainer

# Bring together the LoRA-wrapped model, the training arguments, the dataset
# and the custom collator; the tokenizer is passed as the processing class.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM.
# NOTE(review): the TrainingArguments cell points output_dir/logging_dir at
# /content/drive/MyDrive, so this cell must have run before training starts.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive
