In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
import json
# Step 1: Data Preparation
# Load the dataset
dataset = load_dataset("Kaludi/Customer-Support-Responses")

# Convert the 'train' split to a pandas DataFrame for easier exploration and
# drop rows with missing values. A new frame is produced rather than mutating
# one in place with `inplace=True`.
df = dataset['train'].to_pandas().dropna()

# Split the dataset into training (80%) and validation (remaining rows) sets.
# The fixed random_state keeps the split reproducible across runs.
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

# Sanity check: the two splits together must account for every cleaned row.
assert len(train_df) + len(val_df) == len(df), "train/validation split lost or duplicated rows"

# Convert back to Hugging Face datasets. After sampling/dropping, the frames
# no longer carry a plain range index; preserve_index=False stops that index
# from being stored as an extra column in the resulting datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)



In [None]:
# Preview the cleaned data: overall shape plus the first few rows, instead of
# dumping the entire DataFrame into the cell output.
print(df.shape)
print(df.head())

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
checkpoint = "Kaludi/chatgpt-gpt4-prompts-bart-large-cnn-samsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

def tokenize_function(examples):
    """Tokenize a batch of query/response pairs for seq2seq training.

    Args:
        examples: Mapping with 'query' and 'response' entries (assumed to be
            lists of strings, as this function is applied with batched=True).

    Returns:
        The tokenizer output for the queries, with an added "labels" entry
        holding the tokenized responses. Padding positions in the labels are
        set to -100 so they are ignored when the loss is computed.
    """
    model_inputs = tokenizer(examples['query'], padding="max_length", truncation=True, max_length=512)

    # `text_target=` tokenizes the responses as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['response'], padding="max_length", truncation=True, max_length=512)

    # Labels are padded up to max_length. Left as pad token ids, every padded
    # position would count towards the training loss, so mask them with -100.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        model_inputs["labels"] = labels["input_ids"]
    else:
        model_inputs["labels"] = [
            [token if token != pad_id else -100 for token in sequence]
            for sequence in labels["input_ids"]
        ]
    return model_inputs

# Run the preprocessing function over both splits, one batch at a time
map_options = {"batched": True}
train_dataset = train_dataset.map(tokenize_function, **map_options)
val_dataset = val_dataset.map(tokenize_function, **map_options)

# Hyperparameters for the fine-tuning run, gathered in one place
hyperparameters = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 8,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**hyperparameters)

# Build the Trainer from the model, the arguments, and the tokenized splits
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Kick off fine-tuning
trainer.train()


In [None]:
input_queries = [
    "I want to change my shipping address.",
    "Do you have a recycling program?",
    # Add more queries as needed
]

# Put the model in evaluation mode so dropout is disabled during generation.
model.eval()

# Tokenize input queries. Pad only up to the longest query in the batch
# (instead of always 512 tokens) and move the tensors to the model's device:
# after training the model may live on a GPU, while the tokenizer creates its
# tensors on the CPU, and generate() needs both on the same device.
tokenized_inputs = tokenizer(
    input_queries,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
).to(model.device)

# Generate responses
generated_outputs = model.generate(
    input_ids=tokenized_inputs["input_ids"],
    attention_mask=tokenized_inputs["attention_mask"],
    max_length=50,  # Adjust as needed
    num_beams=5,    # Adjust as needed
    early_stopping=True
)

# Decode and print responses
decoded_responses = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
for query, response in zip(input_queries, decoded_responses):
    print(f"Query: {query}")
    print(f"Response: {response}")
    print()