In [1]:
import pandas as pd
from datasets import load_dataset,Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq




In [2]:
# Source data: customer-support query/response pairs read from a local CSV.
# Alternative source, kept for reference (the same kind of data from the Hugging Face Hub):
#   dataset_name = "Kaludi/Customer-Support-Responses"
#   dataset = load_dataset(dataset_name, split='train')
#   df_train = dataset.to_pandas()
df_train = pd.read_csv("Generated-Customer-Support-Data.csv")

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
train_data, eval_data = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Preview the training split produced by train_test_split (pandas truncates the display).
train_data

Unnamed: 0,query,response
3994,I want to change my shipping address.,We'd be happy to help. Can you please provide ...
423,I received a damaged product.,We'd be happy to help. Can you please provide ...
2991,I was charged incorrectly.,Certainly. Please provide your order number an...
1221,I was charged incorrectly.,We apologize for the error. Can you please pro...
506,Can I cancel my order?,"Sure, you can cancel your order if it hasn't b..."
...,...,...
1130,I received a damaged product.,We apologize for the error. Can you please pro...
1294,How do I track my shipment?,We apologize for the inconvenience. Can you pl...
860,I want to change my shipping address.,"Please provide your order number, and we will ..."
3507,I received a damaged product.,No problem. Can you please provide your order ...


In [4]:
# Base checkpoint to fine-tune; the seq2seq model and its tokenizer are both loaded from it.
model_id = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
def preprocess_function(sample, padding="max_length", max_source_length=256, max_target_length=256):
    """Tokenize a batch of query/response pairs for seq2seq fine-tuning.

    Uses the module-level ``tokenizer``.

    Args:
        sample: Batch mapping with "query" and "response" entries (lists of
            strings when called through ``Dataset.map(..., batched=True)``).
        padding: Padding strategy forwarded to the tokenizer for both the
            queries and the responses.
        max_source_length: Truncation/padding length for the tokenized queries.
        max_target_length: Truncation/padding length for the tokenized responses.

    Returns:
        The tokenizer output for the queries with an added "labels" entry
        holding the tokenized responses. When ``padding == "max_length"``,
        pad-token ids in the labels are replaced by -100.
    """
    model_inputs = tokenizer(sample["query"], max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(sample["response"], max_length=max_target_length, padding=padding, truncation=True)
    if padding == "max_length":
        # -100 is the label value the data collator in this notebook also uses for padding,
        # so padded label positions are marked consistently.
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(token if token != pad_id else -100) for token in sequence]
            for sequence in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Both splits are tokenized the same way: batched mapping, dropping the raw text columns
# once they have been converted to token ids.
map_kwargs = dict(batched=True, remove_columns=['query', 'response'])
train_tokenized_dataset = Dataset.from_pandas(train_data).map(preprocess_function, **map_kwargs)
test_tokenized_dataset = Dataset.from_pandas(eval_data).map(preprocess_function, **map_kwargs)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [6]:
# LoRA adapter settings. No target_modules are given, so PEFT picks its default
# modules for this architecture.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # encoder-decoder (seq2seq) language modelling
    r=16,                             # rank of the low-rank update matrices
    lora_alpha=32,                    # LoRA scaling factor
    lora_dropout=0.1,                 # dropout applied inside the adapter layers
    bias="none",                      # leave bias terms untouched
)


In [7]:
# Wrap the base model with the LoRA adapters described by lora_config.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell would
# wrap an already-wrapped model — restart the kernel instead of re-running it.
model = get_peft_model(model, lora_config)
# Report how many parameters are trainable compared to the total.
model.print_trainable_parameters()

trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862


In [8]:
# Padding value for labels; it matches the -100 that preprocess_function writes over
# pad tokens in the tokenized responses.
label_pad_token_id = -100

# Batch collator for seq2seq training, given the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=label_pad_token_id)

In [9]:
# Checkpoints are written under this directory, logs under its "logs" subfolder.
output_dir = "automatic_customer_response"

training_args = Seq2SeqTrainingArguments(
    # where things are written
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    # optimisation schedule
    num_train_epochs=10,
    learning_rate=1e-3,
    # log and checkpoint once per epoch
    logging_strategy="epoch",
    save_strategy="epoch",
    # keep everything local
    push_to_hub=False,
)

In [10]:
import os

from transformers.trainer_utils import get_last_checkpoint

# Trainer for the LoRA-wrapped model on the tokenized training split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_dataset,
)
# Turn the generation cache off on the model config for training.
model.config.use_cache = False

# `resume_from_checkpoint=True` raises a ValueError when the output directory holds no
# checkpoint yet (e.g. the very first run, or after cleaning the folder). Resume only when
# a checkpoint actually exists; otherwise start training from scratch.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: pramodjella1993 (pramodjella1993-swecha). Use `wandb login --relogin` to force relogin


  0%|          | 0/4000 [00:00<?, ?it/s]

{'loss': 0.1302, 'grad_norm': 0.19632676243782043, 'learning_rate': 0.0004, 'epoch': 6.0}
{'loss': 0.1296, 'grad_norm': 0.16812561452388763, 'learning_rate': 0.0003, 'epoch': 7.0}
{'loss': 0.1273, 'grad_norm': 0.1852487325668335, 'learning_rate': 0.0002, 'epoch': 8.0}




{'loss': 0.1263, 'grad_norm': 0.1832023561000824, 'learning_rate': 0.0001, 'epoch': 9.0}




{'loss': 0.125, 'grad_norm': 0.1557779163122177, 'learning_rate': 0.0, 'epoch': 10.0}




{'train_runtime': 761.0422, 'train_samples_per_second': 42.048, 'train_steps_per_second': 5.256, 'train_loss': 0.06384559917449951, 'epoch': 10.0}


TrainOutput(global_step=4000, training_loss=0.06384559917449951, metrics={'train_runtime': 761.0422, 'train_samples_per_second': 42.048, 'train_steps_per_second': 5.256, 'total_flos': 3008070942720000.0, 'train_loss': 0.06384559917449951, 'epoch': 10.0})

In [11]:
from peft import PeftModel,PeftConfig

In [12]:
import os
from getpass import getpass

# SECURITY: an access token must never be hard-coded in a notebook. Any token that was
# committed or shared this way has to be treated as leaked and revoked in the
# Hugging Face account settings.
# Prefer an HF_TOKEN already exported in the environment; otherwise ask for it without
# echoing it, so it never ends up in the saved notebook.
if not os.environ.get("HF_TOKEN"):
    try:
        entered_token = getpass("Hugging Face access token (leave empty to skip): ")
    except (EOFError, NotImplementedError):
        # Non-interactive execution cannot prompt; continue without a token.
        entered_token = ""
    if entered_token:
        os.environ["HF_TOKEN"] = entered_token

In [30]:
from transformers import set_seed

# Load the original (not fine-tuned) model and tokenizer as a baseline for comparison.
original_model_id = model_id
original_model = AutoModelForSeq2SeqLM.from_pretrained(original_model_id).cuda()
original_tokenizer = AutoTokenizer.from_pretrained(original_model_id)

# NOTE(review): preprocess_function tokenizes the raw query with no "Human:/Assistant:"
# template, so this prompt differs from the inputs seen during training — confirm that
# this is intended.
sample = "Human: \n I was charged incorrectly. \nAssistant: "
input_ids = original_tokenizer(sample, return_tensors="pt", truncation=True, max_length=256).input_ids.cuda()

# Nucleus sampling (do_sample + top_p) is stochastic; seed the RNGs so the printed
# comparison is repeatable on Restart & Run All.
set_seed(42)

# Generate output using the original model
original_outputs = original_model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_length=256)

print("Original Model Output:")
# decode() accepts the generated tensor directly; only the first sequence is shown.
print(original_tokenizer.decode(original_outputs[0], skip_special_tokens=True))



Original Model Output:
He has to be on the street.


In [31]:
# Load the LoRA fine tuned model
peft_model_id = "automatic_customer_response/checkpoint-4000"
config = PeftConfig.from_pretrained(peft_model_id)


In [32]:
# Display the loaded adapter configuration (base model name, rank, target modules, ...).
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='google/flan-t5-small', revision=None, task_type='SEQ_2_SEQ_LM', inference_mode=True, r=16, target_modules={'q', 'v'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [33]:
# Reload a fresh base model from the name recorded in the adapter config.
# This rebinds `model`, replacing the in-memory model used for training.
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)




In [34]:
# Reload the tokenizer that belongs to the adapter's base model (rebinds `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [35]:
# Attach the trained LoRA adapter from the checkpoint directory to the freshly loaded
# base model and move the result to the GPU.
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0}).cuda()
# Switch to evaluation mode; as the cell's last expression, the returned module is
# also displayed.
model.eval()


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=384, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=384, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
             

In [36]:

# Output of the LoRA fine-tuned model: generate from the same prompt ids (`input_ids`)
# and with the same sampling settings as for the original model.
outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, max_length=256)


In [37]:

print("LoRA Modified Model Output:")
# Decode the generated batch without special tokens and show its first sequence.
decoded_outputs = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
print(decoded_outputs[0])

LoRA Modified Model Output:
We'd be happy to help. Can you please provide more details about your question?
