In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

In [44]:
# data = {
#     "text": [
#         "Customer: Hello, I would like to book an appointment.\nAgent: Sure, I can help with that. When would you like to schedule it?\nCustomer: How about tomorrow at 3 PM?\nAgent: Tomorrow at 3 PM is available. I'll book it for you.\nCustomer: Thank you!\nAgent: You're welcome!",
#         "Customer: I need to reschedule my appointment.\nAgent: Sure, when would you like to reschedule it to?\nCustomer: Can we do it next Monday?\nAgent: Next Monday is available. Your appointment has been rescheduled.",
#         "Customer: Can you tell me my appointment time?\nAgent: Sure, your appointment is scheduled for next Tuesday at 2 PM."
#     ]
# }
# dataset = Dataset.from_dict(data)
# dataset

In [45]:
dataset = load_dataset('csv', data_files={'train': 'hvac_support.csv'})

In [46]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Call_ID', 'Question', 'Answer', 'Context'],
        num_rows: 100
    })
})

In [50]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['Context'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [51]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [52]:
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

In [53]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [55]:
trainer.train()

  0%|          | 0/150 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [28]:
model.save_pretrained('./fine-tuned-gpt2')
tokenizer.save_pretrained('./fine-tuned-gpt2')

('./fine-tuned-gpt2\\tokenizer_config.json',
 './fine-tuned-gpt2\\special_tokens_map.json',
 './fine-tuned-gpt2\\vocab.json',
 './fine-tuned-gpt2\\merges.txt',
 './fine-tuned-gpt2\\added_tokens.json')

In [30]:
tuned_tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-gpt2')
tuned_model = GPT2LMHeadModel.from_pretrained('./fine-tuned-gpt2')

In [31]:
def generate_response(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

In [32]:
input_text = "Customer: Hello, I would like to book an appointment."
response = generate_response(input_text)
response

'Customer: Hello, I would like to book an appointment. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a woman. I am a'