In [1]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer
from datasets import load_dataset

# Load dataset (example: IMDb sentiment analysis dataset)
dataset = load_dataset("imdb")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = dataset['train'].map(tokenize_function, batched=True)
eval_dataset = dataset['test'].map(tokenize_function, batched=True)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save model checkpoints
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size per device during training
    per_device_eval_batch_size=16,   # Batch size per device during evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every X steps
    evaluation_strategy="epoch",     # Evaluate every epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()



KeyboardInterrupt



In [21]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Define a simple dataset
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        # Flatten tensors to avoid having a batch dimension in the dictionary values
        item = {key: val.squeeze(0) for key, val in encoding.items()}  # remove batch dimension
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Adjust num_labels based on your task

# Sample data
texts = ["I love programming", "I hate bugs"]
labels = [1, 0]

# Create dataset and trainer
train_dataset = SimpleDataset(texts, labels, tokenizer)
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3, per_device_train_batch_size=8)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Fine-tune the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=3, training_loss=0.671242872873942, metrics={'train_runtime': 31.6205, 'train_samples_per_second': 0.19, 'train_steps_per_second': 0.095, 'total_flos': 1578666332160.0, 'train_loss': 0.671242872873942, 'epoch': 3.0})

In [33]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

# Define a simple dataset
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
        # Flatten tensors to avoid batch dimension issues
        item = {key: val.squeeze(0) for key, val in encoding.items()}  # remove batch dimension
        item['labels'] = item['input_ids'].clone()  # Set labels as the input_ids
        return item

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

model = GPT2LMHeadModel.from_pretrained('gpt2')

# Sample data
texts = ["Once upon a time", "In a land far away"]

# Create dataset and trainer
train_dataset = SimpleDataset(texts, tokenizer)
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3, per_device_train_batch_size=2)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Train the model
trainer.train()


Step,Training Loss


TrainOutput(global_step=3, training_loss=6.210239410400391, metrics={'train_runtime': 46.0045, 'train_samples_per_second': 0.13, 'train_steps_per_second': 0.065, 'total_flos': 1567752192000.0, 'train_loss': 6.210239410400391, 'epoch': 3.0})