### You may need to run one of the following commands first; alternatively, add `torch` and `torchvision` to requirements.txt
- `pip3 install torch torchvision`
- `conda install pytorch torchvision -c pytorch`



In [11]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch

In [None]:
# Step 1: Load the dataset
# Fetch the dataset by its identifier; the result is indexed by split name below.
dataset = load_dataset("codeparrot/codeparrot-clean")

# Step 2: Split the dataset into training and validation sets
# Hold out 10% of the "train" split for evaluation. The seed is pinned so that
# re-running this cell reproduces the same partition; without it every run
# reshuffles, and examples can move between the training and evaluation sets.
split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

In [None]:
# Choose the compute device: prefer the MPS backend, then CUDA, and fall back
# to the CPU when neither accelerator is reported as available.
if torch.backends.mps.is_available():
    _backend = "mps"
    _banner = "Using MPS (Metal Performance Shaders) device"
elif torch.cuda.is_available():
    _backend = "cuda"
    # Report the name of the first CUDA device (index 0).
    _banner = f"Using CUDA device: {torch.cuda.get_device_name(0)}"
else:
    _backend = "cpu"
    _banner = "Using CPU device"

device = torch.device(_backend)
print(_banner)


In [None]:
# Step 3: Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)

# GPT-2's tokenizer defines no padding token, so any later step that pads a
# batch would raise. Fall back to the end-of-sequence token, but only when no
# padding token is configured, so an explicit choice is never overridden.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 4: Tokenize the datasets with truncation and max_length
def tokenize_function(examples):
    """Tokenize the ``content`` field of a batch of examples.

    Args:
        examples: Mapping that holds the raw text under the ``"content"`` key.

    Returns:
        The tokenizer's output for that text, truncated to
        ``tokenizer.model_max_length`` tokens and left unpadded (padding is
        deferred to whatever batches the examples later).
    """
    return tokenizer(
        examples["content"],
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding=False,
    )

def _tokenize_split(split):
    """Map ``tokenize_function`` over *split* in batches, dropping the ``content`` column."""
    return split.map(tokenize_function, batched=True, remove_columns=["content"])

# Tokenize the training split first, then the evaluation split, identically.
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_eval_dataset = _tokenize_split(eval_dataset)