In [None]:
pip install transformers torch


In [1]:
pip install transformers datasets


Note: you may need to restart the kernel to use updated packages.


In [6]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, load_metric

# Load the dataset
dataset = load_dataset("Kaludi/Customer-Support-Responses")

# Create train-test split
dataset = dataset["train"].train_test_split(test_size=0.2)  # 80% train, 20% test

# Load the tokenizer and model
model_name = "t5-small"  # You can use 't5-small', 't5-base', 't5-large', etc.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess the data
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["query"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Set up the tokenizer for targets
    labels = tokenizer(examples["response"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Ensure save strategy matches evaluation strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the model
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,3.180034
2,No log,3.079034
3,No log,3.041817


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation results: {'eval_loss': 3.0418174266815186, 'eval_runtime': 0.0616, 'eval_samples_per_second': 243.44, 'eval_steps_per_second': 32.459, 'epoch': 3.0}


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/spiece.model',
 './trained_model/added_tokens.json')

In [19]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("Kaludi/Customer-Support-Responses")

# Split dataset into train and test manually
train_dataset = dataset["train"].train_test_split(test_size=0.2)["train"]
test_dataset = dataset["train"].train_test_split(test_size=0.2)["test"]

# Load tokenizer and model (using t5-base for larger capacity)
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess function with data augmentation
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["query"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Set up the tokenizer for targets
    labels = tokenizer(examples["response"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets with data augmentation
train_tokenized_dataset = train_dataset.map(preprocess_function, batched=True)
test_tokenized_dataset = test_dataset.map(preprocess_function, batched=True)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments with hyperparameter tuning
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,  # Adjusted learning rate
    per_device_train_batch_size=16,  # Increased batch size for larger model
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # Increased epochs for potentially better convergence
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer configuration
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.784467
2,No log,1.4627
3,No log,1.271467
4,No log,1.162289
5,No log,1.123368


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation results: {'eval_loss': 1.1233675479888916, 'eval_runtime': 0.0551, 'eval_samples_per_second': 272.397, 'eval_steps_per_second': 18.16, 'epoch': 5.0}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/spiece.model',
 './fine_tuned_model/added_tokens.json')

In [22]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

def generate_response(query, model_path="./fine_tuned_model"):
    # Load tokenizer and model from the specified directory
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)

    # Tokenize the input query
    inputs = tokenizer("summarize: " + query, return_tensors="pt")

    # Generate response
    outputs = model.generate(inputs.input_ids)

    # Decode and return the response as a string
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage:
query = "I'm having trouble applying a promo code."
response = generate_response(query)
print("Generated Response:")
print(response)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Response:
We're sorry to hear that. Can you please provide your promo code and the promo code
