In [1]:
from datasets import load_dataset,Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier of the customer-support dataset handed to `load_dataset`.
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

# Only the "train" split is requested; the evaluation set is carved out below.
dataset = load_dataset(path=dataset_name, split="train")
df_train = dataset.to_pandas()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_frames = train_test_split(df_train, test_size=0.2, random_state=42)
train_data, eval_data = split_frames


In [3]:
# Preview the training split (pandas truncates the display of long frames).
train_data

Unnamed: 0,flags,instruction,category,intent,response
3566,BLQ,want help seeing the early termination fee,CANCEL,check_cancellation_fee,To help you understand the early termination f...
14625,BILMQ,can ya help me updating the details included o...,ACCOUNT,edit_account,I'm on it! We're here to assist you with updat...
6599,BLQZ,want assistance checking your reimbursement plicy,REFUND,check_refund_policy,I'm on it! I completely understand your desire...
9532,BIL,how do I speak to an assistant?,CONTACT,contact_human_agent,Always good to connect! I'm attuned to the fac...
24355,BIZ,can I use the {{Account Category}} acclunt?,ACCOUNT,switch_account,Sure! I'm here to assist you in making the mos...
...,...,...,...,...,...
21575,BL,assistance to notify of a problem with registr...,ACCOUNT,registration_problems,We've received your message to us regarding th...
5390,BL,I am trying to see the accepted payment methods,PAYMENT,check_payment_methods,Indeed! I'm here to assist you with viewing ou...
860,BILZ,can you help me camceling the purchase I made?,ORDER,cancel_order,I've been informed that you need assistance wi...
15795,BLQZ,i dont know what ineed to do to download invoi...,INVOICE,get_invoice,I'm not mistaken your uncertainty about how to...


In [4]:
# A single checkpoint id drives both loads so the tokenizer and the model
# always come from the same checkpoint.
model_id = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
def preprocess_function(sample, padding="max_length", max_source_length=256, max_target_length=256):
    """Tokenize instruction/response pairs into seq2seq model inputs.

    Relies on the module-level ``tokenizer``.

    Args:
        sample: Mapping with "instruction" and "response" entries. Each entry
            is either a single string or a list of strings (the latter is what
            ``Dataset.map(..., batched=True)`` passes in).
        padding: Padding strategy forwarded to the tokenizer.
        max_source_length: Truncation length for the tokenized instructions.
        max_target_length: Truncation length for the tokenized responses.

    Returns:
        The tokenizer output for the instructions with an added "labels"
        entry holding the response token ids. Whenever padding is enabled,
        pad token ids in the labels are replaced by -100.
    """
    model_inputs = tokenizer(
        sample["instruction"], max_length=max_source_length, padding=padding, truncation=True
    )
    labels = tokenizer(
        sample["response"], max_length=max_target_length, padding=padding, truncation=True
    )

    label_ids = labels["input_ids"]
    # Mask label padding for every padding strategy that can emit pad tokens
    # (not only "max_length"), so padded positions never carry the pad id.
    if padding not in (False, "do_not_pad"):
        pad_id = tokenizer.pad_token_id
        if isinstance(sample["response"], str):
            # Single, un-batched example: the tokenizer returns a flat id list.
            label_ids = [(tok if tok != pad_id else -100) for tok in label_ids]
        else:
            label_ids = [
                [(tok if tok != pad_id else -100) for tok in seq] for seq in label_ids
            ]

    model_inputs["labels"] = label_ids
    return model_inputs

In [6]:
# Columns removed from the datasets once tokenization has run.
raw_columns = ['flags', 'instruction', 'category', 'intent', 'response']


def _tokenize_frame(frame):
    """Turn a pandas frame into a `Dataset` and tokenize it in batches."""
    return Dataset.from_pandas(frame).map(
        preprocess_function, batched=True, remove_columns=raw_columns
    )


train_tokenized_dataset = _tokenize_frame(train_data)
test_tokenized_dataset = _tokenize_frame(eval_data)
print(f"Keys of tokenized dataset: {list(train_tokenized_dataset.features)}")

Map: 100%|██████████| 21497/21497 [00:19<00:00, 1115.86 examples/s]
Map: 100%|██████████| 5375/5375 [00:04<00:00, 1148.56 examples/s]

Keys of tokenized dataset: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels']





In [7]:

# LoRA adapter settings for the seq2seq model: adapters are attached to the
# modules named "q" and "v", with no bias parameters trained.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

In [8]:
# Wrap the loaded model using the LoRA configuration. `model` is rebound to
# the wrapped model, so re-running this cell without reloading the base model
# would wrap it a second time.
model = get_peft_model(model, lora_config)
# Report the number of trainable parameters versus the total.
model.print_trainable_parameters()

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862


In [9]:
# Same value that `preprocess_function` writes into padded label positions.
label_pad_token_id = -100

collator_options = dict(
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)

In [10]:
# Directory receiving the trainer's checkpoints and logs.
output_dir = "lora-flan-t5-small-chat"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=1e-3,
    logging_strategy="epoch",
    save_strategy="epoch",
    report_to="tensorboard",
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    data_collator=data_collator,
)
# `use_cache` is switched off on the model config before training begins.
model.config.use_cache = False

trainer.train()

# Persist the trained model, the tokenizer and the wrapped base model, in
# that order, all into the same local directory (nothing is pushed to the Hub).
# NOTE(review): the last entry writes the wrapped base model next to the
# adapter and tokenizer files - confirm that sharing one directory is intended.
peft_save_model_id = "lora-flan-t5-small-chat"
for artifact in (trainer.model, tokenizer, trainer.model.base_model):
    artifact.save_pretrained(peft_save_model_id, push_to_hub=False)

  0%|          | 19/5375 [02:58<13:44:06,  9.23s/it]

KeyboardInterrupt: 

# Zero-shot inference

In [15]:
def gen(model, p, maxlen, sample=True):
    """Generate a completion for prompt ``p`` with ``model``.

    Relies on the module-level ``tokenizer``.

    Args:
        model: Model exposing ``generate`` and ``parameters``.
        p: Prompt text (or list of prompt texts) to tokenize.
        maxlen: Maximum number of new tokens to generate.
        sample: If True, sample with temperature 0.1 and top-p 0.95;
            otherwise decode without sampling.

    Returns:
        Tuple ``(toks, texts)``: the tokenized prompt and the list of decoded
        generations with special tokens stripped.
    """
    # The model may no longer be on the CPU (e.g. after training on an
    # accelerator); the inputs must be on the same device as its weights.
    device = next(model.parameters()).device
    toks = tokenizer(p, return_tensors="pt").to(device)

    generation_kwargs = dict(
        max_new_tokens=maxlen,
        do_sample=sample,
        num_return_sequences=1,
        num_beams=1,
    )
    # Sampling-only knobs are passed only when sampling is actually enabled.
    if sample:
        generation_kwargs.update(temperature=0.1, top_p=0.95)

    res = model.generate(**toks, **generation_kwargs)
    return toks, tokenizer.batch_decode(res, skip_special_tokens=True)

In [21]:

# Position (within the training split) of the example used as the prompt.
index = 50

# `train_data` is the pandas training split created earlier in the notebook;
# its "instruction" column holds the text that is fed to the model.
prompt = train_data["instruction"].iloc[index]

tok, res = gen(model, prompt, 200)
# Show the single generated sequence as the cell output.
res[0]