In [None]:
import outlines
import os

# Build an OpenAI-backed outlines model; the API key is read from the
# environment (a missing OPENAI_API_KEY raises KeyError here).
model = outlines.models.openai(
    "gpt-4o-mini",
    api_key=os.environ["OPENAI_API_KEY"],
)

# generate.choice receives the model plus the list of candidate answers.
generator = outlines.generate.choice(
    model,
    ["Yellow", "Blue", "Green", "Red"],
)

# Ask a single question and show whatever the generator returns.
color = generator("What is the closest color to Indigo? ")
print(color)

### Create Custom GPT Model

In [1]:
# Imports and Setup
import gc

import matplotlib.pyplot as plt
import psutil
import torch
from datasets import Dataset
from IPython.display import clear_output
from tqdm.notebook import tqdm
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

# Pick the compute device: prefer CUDA, then Apple MPS, and fall back to CPU.
# (Previously only MPS was probed, so CUDA machines silently ran on CPU.)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # getattr guard: torch builds without an MPS backend module still work.
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: mps


In [2]:

# Load and prepare tokenizer
# Fetch the tokenizer published under the "gpt2" identifier.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token; the tokenization cell
# pads every sample with padding="max_length", which needs a pad token set.
tokenizer.pad_token = tokenizer.eos_token

In [3]:

# Create custom config
# Start from the "gpt2" configuration and override its size-related fields.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    # n_positions is the GPT2Config field that sizes the position embeddings,
    # i.e. the maximum sequence length. 128 matches the max_length used when
    # tokenizing, so longer inputs are not supported by this model.
    n_positions=128,
    # Kept from the original cell for older transformers releases.
    # NOTE(review): recent releases appear to ignore n_ctx for GPT-2 - confirm
    # against the installed version.
    n_ctx=128,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # hidden_size / num_attention_heads / num_hidden_layers are the generic
    # aliases GPT2Config maps to n_embd / n_head / n_layer.
    hidden_size=128,
    num_attention_heads=4,
    num_hidden_layers=4,
)

# Built from the config alone (no from_pretrained call on the model), then
# moved to the selected device.
model = GPT2LMHeadModel(config).to(device)

In [4]:
# Load dataset
def load_dataset(file_path):
    """Read a UTF-8 text file into a Dataset with one "text" sample per line.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping (for example a trailing blank line) are skipped so they
    do not become empty training samples.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``Dataset`` with a single "text" column.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly so the raw file is not held in memory
        # twice (once by readlines(), once by the stripped list).
        stripped_lines = (line.strip() for line in f)
        texts = [text for text in stripped_lines if text]
    return Dataset.from_dict({"text": texts})

dataset = load_dataset("adult_train.json")
print(f"Dataset size: {len(dataset)} samples")

Dataset size: 32561 samples


In [5]:
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the "text" field of a batch.

    Every sample is truncated and padded to exactly 128 tokens using the
    notebook-level ``tokenizer``.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

# Map over the dataset in batches of 1000 and drop the original columns so
# only the tokenizer's outputs remain; `desc` labels the progress bar.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1000,
    remove_columns=dataset.column_names,
    desc="Tokenizing dataset",
)

print(f"Tokenized dataset size: {len(tokenized_dataset)} samples")


Tokenizing dataset:   0%|          | 0/32561 [00:00<?, ? examples/s]

Tokenized dataset size: 32561 samples


In [None]:
class LossCallback(TrainerCallback):
    """Trainer callback that records the training loss and redraws a loss curve.

    Subclassing ``TrainerCallback`` supplies no-op handlers for every Trainer
    event this class does not override (e.g. ``on_init_end``). A plain class
    defining only ``on_log`` fails as soon as the Trainer dispatches any
    other event to it.
    """

    def __init__(self):
        self.training_loss = []  # logged loss values, in logging order
        self.step = []           # global step at which each loss was logged

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the logged loss (if any) and refresh the plot.

        Log events without a 'loss' entry, or with ``logs`` left as None,
        are ignored. Only the local main process records and plots.
        """
        if not logs or 'loss' not in logs:
            return
        if state.is_local_process_zero:
            self.training_loss.append(logs['loss'])
            self.step.append(state.global_step)
            self.plot_loss()

    def plot_loss(self):
        """Replace the cell output with a fresh plot of loss against step."""
        clear_output(wait=True)
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(self.step, self.training_loss)
        ax.set_title('Training Loss Curve')
        ax.set_xlabel('Step')
        ax.set_ylabel('Loss')
        plt.show()
        # This runs on every log event; close the figure explicitly so
        # figures cannot pile up on backends that keep them open after show().
        plt.close(fig)

loss_callback = LossCallback()

In [8]:

# Collator for language modeling with masked-LM disabled (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    # Where outputs and logs are written
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",  # Disable wandb logging
    # Schedule and optimisation settings
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# Wire the model, data, collator and the loss-plotting callback together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)


In [9]:

# Train model
print("Starting training...")
trainer.train()
print("Training completed!")

# Draw the loss curve once more from the values the callback collected
loss_callback.plot_loss()

# Save the model and the tokenizer into the same directory
checkpoint_dir = "./checkpoint_final"
model.save_pretrained(checkpoint_dir)
tokenizer.save_pretrained(checkpoint_dir)
print("Model saved!")


Starting training...


  0%|          | 0/6108 [00:00<?, ?it/s]

{'loss': 10.8604, 'grad_norm': 3.997952461242676, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 10.8512, 'grad_norm': 3.983393907546997, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 10.823, 'grad_norm': 4.075135231018066, 'learning_rate': 3e-06, 'epoch': 0.01}
{'loss': 10.7818, 'grad_norm': 4.1127729415893555, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 10.7235, 'grad_norm': 4.1194000244140625, 'learning_rate': 5e-06, 'epoch': 0.02}
{'loss': 10.6477, 'grad_norm': 3.9749879837036133, 'learning_rate': 6e-06, 'epoch': 0.03}
{'loss': 10.5579, 'grad_norm': 3.5390474796295166, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.03}
{'loss': 10.4589, 'grad_norm': 2.7842977046966553, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.04}
{'loss': 10.3572, 'grad_norm': 2.2537550926208496, 'learning_rate': 9e-06, 'epoch': 0.04}
{'loss': 10.2636, 'grad_norm': 2.0942373275756836, 'learning_rate': 1e-05, 'epoch': 0.05}
{'loss': 10.1772, 'grad

In [None]:

# Memory cleanup
# Drop the notebook-level references to the large training objects.
# globals().pop(..., None) is used instead of `del` so the cell can be re-run
# without raising NameError once the names are already gone.
for _name in ("model", "trainer", "tokenized_dataset", "dataset"):
    globals().pop(_name, None)
gc.collect()

# Empty the accelerator cache for whichever backend is available. Training in
# this notebook ran on MPS, which the CUDA-only check never covered.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif hasattr(torch, "mps") and torch.backends.mps.is_available():
    # hasattr guard: older torch builds have no torch.mps module.
    torch.mps.empty_cache()
print(f"Current memory usage: {psutil.virtual_memory().percent}%")