In [1]:
%pip install peft bitsandbytes transformers trl accelerate einops tqdm scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dataclasses import dataclass,field
from typing import Optional

In [3]:
import torch
from datasets import load_dataset,load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    Trainer,
    TrainingArguments,    
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from tqdm.notebook import tqdm

In [5]:
from trl import SFTTrainer

In [6]:
from huggingface_hub import interpreter_login

In [7]:
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



In [8]:
dataset=load_dataset("Amod/mental_health_counseling_conversations",split="train")

In [9]:
dataset

Dataset({
    features: ['Context', 'Response'],
    num_rows: 3512
})

In [10]:
import pandas as pd

In [11]:
df=pd.DataFrame(dataset)

In [12]:
df.head(2)

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."


In [13]:
df.shape

(3512, 2)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3512 non-null   object
 1   Response  3512 non-null   object
dtypes: object(2)
memory usage: 55.0+ KB


In [15]:
def format_row(row)->str:
    question=row['Context']
    answer=row['Response']
    formatted_string=f"[INST] {question} [/INST] [RES] {answer} [/RES]"
    return formatted_string

In [16]:
df['Formatted']=df.apply(format_row,axis=1)

In [17]:
df['Formatted']

0       [INST] I'm going through some things with my f...
1       [INST] I'm going through some things with my f...
2       [INST] I'm going through some things with my f...
3       [INST] I'm going through some things with my f...
4       [INST] I'm going through some things with my f...
                              ...                        
3507    [INST] My grandson's step-mother sends him to ...
3508    [INST] My boyfriend is in recovery from drug a...
3509    [INST] The birth mother attempted suicide seve...
3510    [INST] I think adult life is making him depres...
3511    [INST] I just took a job that requires me to t...
Name: Formatted, Length: 3512, dtype: object

In [18]:
new_df=df.rename(columns={'Formatted':'Text'})

In [19]:
new_df

Unnamed: 0,Context,Response,Text
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb...",[INST] I'm going through some things with my f...
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see...",[INST] I'm going through some things with my f...
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...,[INST] I'm going through some things with my f...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...,[INST] I'm going through some things with my f...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...,[INST] I'm going through some things with my f...
...,...,...,...
3507,My grandson's step-mother sends him to school ...,Absolutely not! It is never in a child's best ...,[INST] My grandson's step-mother sends him to ...
3508,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...,[INST] My boyfriend is in recovery from drug a...
3509,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit...",[INST] The birth mother attempted suicide seve...
3510,I think adult life is making him depressed and...,How do you help yourself to believe you requir...,[INST] I think adult life is making him depres...


In [20]:
new_df=new_df[['Text']]

In [21]:
new_df

Unnamed: 0,Text
0,[INST] I'm going through some things with my f...
1,[INST] I'm going through some things with my f...
2,[INST] I'm going through some things with my f...
3,[INST] I'm going through some things with my f...
4,[INST] I'm going through some things with my f...
...,...
3507,[INST] My grandson's step-mother sends him to ...
3508,[INST] My boyfriend is in recovery from drug a...
3509,[INST] The birth mother attempted suicide seve...
3510,[INST] I think adult life is making him depres...


In [22]:
new_df.head(3)

Unnamed: 0,Text
0,[INST] I'm going through some things with my f...
1,[INST] I'm going through some things with my f...
2,[INST] I'm going through some things with my f...


In [23]:
new_df.head(3)

Unnamed: 0,Text
0,[INST] I'm going through some things with my f...
1,[INST] I'm going through some things with my f...
2,[INST] I'm going through some things with my f...


In [24]:
new_df.to_csv("formatted_data.csv",index=False)

In [25]:
training_dataset=load_dataset("csv",data_files="formatted_data.csv",split="train")

Generating train split: 3512 examples [00:00, 92611.38 examples/s]


In [26]:
training_dataset

Dataset({
    features: ['Text'],
    num_rows: 3512
})

### Fine Tuning Code

In [27]:
base_model="microsoft/phi-2"

In [28]:
new_model="phi-2-metal-health"

In [None]:
tokenizer=AutoTokenizer.from_pretrained(base_model,use_fast=True)
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

bnb_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4but_use_double_quant=False
)

model=AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    revision="refs/pr/23"
)
model.config.use_cache=False
model.config.pretraining_tp=1
model=prepare_model_for_kbit_training(model,use_gradient_checkpointing=True)


Fetching 2 files: 100%|██████████| 2/2 [01:12<00:00, 36.29s/it]
PhiModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.89s/it]
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new fo

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [31]:
training_arguments=TrainingArguments(
    output_dir="./mhGPT",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    # evaluation_strategy="steps",
    # eval_steps=1000,
    logging_steps=15,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_steps=1500,
    warmup_ratio=0.05,
    weight_decay=0.01,
    # max_step=-1
)



In [33]:
peft_config=LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["Wqkv","fc1","fc2"]
)



In [40]:
trainer=SFTTrainer(
    model=model,
    train_dataset=training_dataset,
    peft_config=peft_config,
    # train_dataset_text_field="Text",
    # max_seq_length=690,
    # tokenizer=tokenizer,
    args=training_arguments,
    # tokenizer=tokenizer,
    # packing=False,
    # data_collator=None
)

Adding EOS to train dataset: 100%|██████████| 3512/3512 [00:00<00:00, 72315.94 examples/s]
Tokenizing train dataset:   0%|          | 0/3512 [00:00<?, ? examples/s]


KeyError: 'text'

In [None]:
trainer.train()