In [None]:
# Step 1: Install necessary libraries
!pip install transformers datasets



In [None]:
# Step 2: Import libraries
import json
import math

import torch
from datasets import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

In [None]:
# Step 3: Load and format your JSON dataset
# Read the file as UTF-8 explicitly so non-ASCII text decodes the same way on
# every platform instead of depending on the locale's default encoding.
with open('dataset.json', encoding='utf-8') as f:
    data = json.load(f)

# Each record's "He" field becomes the prompt and its "She" field the response.
formatted_data = [{"prompt": item["He"], "response": item["She"]} for item in data]

# Dataset.from_dict takes one list per column, so split the records into columns.
dataset = Dataset.from_dict({
    "prompt": [item["prompt"] for item in formatted_data],
    "response": [item["response"] for item in formatted_data],
})

print(dataset)

Dataset({
    features: ['prompt', 'response'],
    num_rows: 2318
})


In [None]:
# Step 4: Initialize the tokenizer
# model_name is the checkpoint identifier; the same name is used again when
# the model itself is loaded, keeping tokenizer and model in sync.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [None]:
# Step 5: Tokenize the dataset
# Reuse the EOS token as the padding token so that the padding requested
# below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize one batch of prompt/response pairs as single sequences.

    Each prompt is joined to its response with a single space, and the joined
    texts are tokenized with truncation enabled and padding="max_length".

    Args:
        examples: Batch mapping with parallel 'prompt' and 'response' lists.

    Returns:
        The tokenizer's output for the joined texts.
    """
    joined_texts = []
    for prompt_text, response_text in zip(examples['prompt'], examples['response']):
        joined_texts.append(prompt_text + " " + response_text)
    return tokenizer(joined_texts, truncation=True, padding="max_length")


# batched=True hands tokenize_function lists of examples rather than one row.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/2318 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'response', 'input_ids', 'attention_mask'],
    num_rows: 2318
})


In [None]:
# Step 6: Set up training arguments
# fp16 mixed precision needs a CUDA device; requesting it on a CPU-only
# runtime makes TrainingArguments raise, so enable it only when a GPU is visible.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Adjust based on available memory
    per_device_eval_batch_size=2,   # Adjust based on available memory
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=use_fp16,  # Mixed precision training, only when CUDA is available
)


In [None]:

# Step 7: Initialize the model
# Load the pretrained weights for the same checkpoint name the tokenizer was
# created from; return_dict=True is passed explicitly.
model = GPT2LMHeadModel.from_pretrained(model_name, return_dict=True)

In [None]:
# Step 8: Build the data collator, create the Trainer, and fine-tune

# Causal language modelling: with mlm=False the collator derives the labels
# from the input ids (padding positions are ignored in the loss).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Create Trainer and start training
# NOTE: evaluation runs on the training data itself, so the reported
# validation loss is not a held-out measurement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # Use the same tokenized dataset for evaluation
    compute_metrics=None,  # Disable computing metrics during training
    tokenizer=tokenizer,  # Pass the tokenizer for padding
    data_collator=data_collator,  # Pass the data collator
)

trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Step 9: Evaluate the model
eval_results = trainer.evaluate()

# eval_loss is the mean cross-entropy; perplexity is its exponential, so
# report both rather than labelling the raw loss as perplexity.
eval_loss = eval_results['eval_loss']
print(f"Eval loss: {eval_loss:.4f}")
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:

# Step 10: Save the model
# Write the model and the tokenizer into one directory so both can later be
# loaded from the same path.
save_dir = "fine-tuned-gpt2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
