In [1]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install transformers datasets accelerate


from google.colab import drive
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import json
import os

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113


In [2]:
# Mount Google Drive at /content/drive; later cells read the dataset from and write the
# fine-tuned model to paths under /content/drive/MyDrive/GPT.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Name of the pretrained checkpoint shared by the model and the tokenizer
model_name = "distilgpt2"

# Load the pretrained causal language model
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load the matching tokenizer and reuse its end-of-sequence token as the padding token,
# so that padded batches (padding="max_length" below) can be produced
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Function to load data from a JSON file
def load_data(file_path):
    """Load a JSON Lines file (one JSON document per line) into a list.

    The file is read line by line, so only real newlines separate records.
    Blank or whitespace-only lines are skipped instead of being passed to
    ``json.loads`` (which would raise on an empty string).

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterating the file splits on newlines only; str.splitlines() would also
        # split on characters such as U+2028 that may appear raw inside JSON strings.
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data

# Read the raw records from Google Drive
data = load_data('/content/drive/MyDrive/GPT/questions_with_answers.json')

# Normalize data: make sure both optional keys exist on every record (None when absent)
for item in data:
    for optional_key in ('migrated_to', 'posted_by_collectives'):
        item.setdefault(optional_key, None)

# Go through a pandas DataFrame to build the Hugging Face Dataset
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# Function to tokenize and encode the dataset
def tokenize_function(examples):
    """Tokenize the ``title`` field and build language-modelling labels.

    Titles are truncated/padded to 128 tokens. Labels are a copy of
    ``input_ids`` in which every padding position (``attention_mask == 0``)
    is replaced by -100, the label value the loss ignores. Because the pad
    token is the same as the end-of-sequence token, padding cannot be
    recognised by token id; the attention mask is used instead.

    Args:
        examples: A mapping with a ``title`` entry, either a list of strings
            (as passed by ``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The tokenizer output with an added ``labels`` entry shaped like
        ``input_ids``.
    """
    def mask_padding(ids, mask):
        # Keep real tokens as targets; exclude padding from the loss.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    tokenized_outputs = tokenizer(examples['title'], padding="max_length", truncation=True, max_length=128)
    input_ids = tokenized_outputs["input_ids"]
    attention_mask = tokenized_outputs["attention_mask"]

    if input_ids and isinstance(input_ids[0], int):
        # A single (un-batched) example: input_ids is a flat list of token ids.
        labels = mask_padding(input_ids, attention_mask)
    else:
        labels = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]

    tokenized_outputs["labels"] = labels
    return tokenized_outputs

# Tokenize the dataset
# Drop every original column after tokenization. The column list is taken from the dataset
# itself so it also covers any column that exists on the Dataset but not on the DataFrame.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Split the dataset into training and validation sets.
# A fixed seed keeps the 90/10 split identical across re-runs; without it, a run resumed from
# a checkpoint would get a different split and could train on former validation examples.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split['train']
val_dataset = split['test']

# Define training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the best one when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Bind the model, the arguments and both dataset splits into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Check for the last checkpoint in the output directory
# transformers is already a dependency of this notebook; imported next to its only use.
from transformers.trainer_utils import get_last_checkpoint

# Resume only from an actual `checkpoint-<step>` sub-folder of the output directory.
# The output directory itself is not a checkpoint, and it may not exist yet on a fresh
# runtime, so guard the lookup instead of listing it unconditionally.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    # Returns None when the directory holds no checkpoint folders.
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

# Start training, resuming from the last checkpoint when one was found (None = start fresh)
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save the fine-tuned model to Google Drive. The Trainer above was created without a
# tokenizer, so the tokenizer is saved explicitly next to the weights; this lets the folder
# be reloaded on its own with `from_pretrained`.
final_model_dir = "/content/drive/MyDrive/GPT/distilgpt2-finetuned"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/8320 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
