## 1. Prepare Your Custom Dataset

In [34]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import pandas as pd
# from datasets import Dataset

# Load the dataset
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    return df

# Prepare the dataset for GPT-2
def prepare_dataset(df, tokenizer):
    # Concatenate question and response as a single text
    df['text'] = df['question'] + tokenizer.eos_token + df['response']
    # Tokenize the text
    tokenized_data = df['text'].apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=128))
    # Convert to Dataset object from the datasets library
    return Dataset.from_pandas(pd.DataFrame(tokenized_data.tolist()))

# Load the dataset
file_path = "dataset/chatbot-dialogs1.csv"
df = load_dataset(file_path)

## 2. Fine-Tune the Model

In [20]:
import transformers
import torch
import accelerate

print("Transformers version:", transformers.__version__)
print("Torch version:", torch.__version__)
print("Accelerate version:", accelerate.__version__)


Transformers version: 4.44.0
Torch version: 2.4.0+cu124
Accelerate version: 0.33.0


In [33]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Padding token
tokenizer.pad_token = tokenizer.eos_token

# Sample a fraction of the dataset for quick testing
sample_df = df.sample(frac=0.1, random_state=42)  # 10% of the data

# Prepare the dataset
train_dataset = prepare_dataset(df, tokenizer)

# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1, # number of epochs
    per_device_train_batch_size=2,  # Adjust batch size if needed1
    save_steps=10_000,  # Save model more frequently
    save_total_limit=2,
    prediction_loss_only=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


Step,Training Loss


KeyboardInterrupt: 

## 3. Predict with the Fine-Tuned Model

In [6]:
# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Function to predict the response
def predict_response(question, model, tokenizer, max_length=50):
    input_ids = tokenizer.encode(question, return_tensors='pt')
    outputs = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,  # Controls randomness: lower is more deterministic, higher is more random
        top_k=50,         # Limits sampling to the top k tokens
        top_p=0.9,        # Nucleus sampling: selects tokens with cumulative probability up to p
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example usage
new_question = "How are you?"
predicted_response = predict_response(new_question, fine_tuned_model, fine_tuned_tokenizer)
print(f"Question: {new_question}\nResponse: {predicted_response}")


Question: How are you?
Response: How are you?
are you?	i'm a student at the school.
i'm a student at the school.	i'm a student at the school.
i'm a student at the school.	i'm a student at


## (Optional) Generate Text with the Fine-Tuned Model

In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Function to generate text with attention mask and improved diversity
def generate_text(prompt, model, tokenizer, max_length=50, temperature=0.7, top_k=50, top_p=0.9):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)
    
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,  # Controls randomness: lower is more deterministic, higher is more random
        top_k=top_k,              # Limits sampling to the top k tokens
        top_p=top_p,              # Nucleus sampling: selects tokens with cumulative probability up to p
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
prompt = "Your custom prompt here"
generated_text = generate_text(prompt, fine_tuned_model, fine_tuned_tokenizer)
print(f"Generated text: {generated_text}")




Generated text: Your custom prompt here?
i'm not sure.	i'm not sure.
i'm not sure.	i'm not sure.
i'm not sure.	what's the matter with that?
what's the matter with that
