### You may need to run one of the following commands; otherwise, add these packages to requirements.txt
- pip3 install torch torchvision
- conda install pytorch torchvision -c pytorch



In [None]:
!pip install ipykernel accelerate torch torchvision transformers datasets

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
import multiprocessing

In [None]:
# Notebook-wide switches
make_sample = True  # work on a random subset instead of the full corpus
fraction = 0.025  # share of the data, given as a fraction (0.025 == 2.5 %)
tokenize_parallize = True  # tokenize with several worker processes

num_cores = multiprocessing.cpu_count()
print(f"Number of CPU cores: {num_cores}")

# Pick the compute device: MPS is probed first, CUDA only if MPS is
# unavailable, and the CPU is the fallback.
mps_ready = torch.backends.mps.is_available()
cuda_ready = (not mps_ready) and torch.cuda.is_available()

if mps_ready:
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders) device")
elif cuda_ready:
    device = torch.device("cuda")
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU device")

In [4]:
# Step 1: Load the dataset
# Loads "codeparrot/codeparrot-clean" through the Hugging Face `datasets`
# library; the code below reads its "train" split via dataset["train"].
dataset = load_dataset("codeparrot/codeparrot-clean")

def sample_dataset(dataset, fraction, seed=42):
    """Return a reproducible random subset of ``dataset``.

    Args:
        dataset: Object that supports ``len()`` and ``select(indices)``.
        fraction: Share of the rows to keep, between 0 and 1 inclusive.
            The row count is ``int(len(dataset) * fraction)``.
        seed: Seed for the private random generator. The default of 42
            reproduces the indices drawn by earlier versions of this helper.

    Returns:
        The result of ``dataset.select`` on the drawn indices. The indices
        are passed in draw order, not sorted.

    Raises:
        ValueError: If ``fraction`` lies outside the range [0, 1].
    """
    import random

    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    total_samples = len(dataset)
    num_samples = int(total_samples * fraction)

    # A private generator keeps the draw reproducible without reseeding the
    # process-wide `random` state that other code may rely on.
    rng = random.Random(seed)
    indices = rng.sample(range(total_samples), num_samples)
    return dataset.select(indices)

# Build disjoint training and evaluation sets from the "train" split.
full_train_split = dataset["train"]

if make_sample:
    # Draw one random pool (10 % for training + 1 % for evaluation) and carve
    # it into two non-overlapping parts. Two separate sample_dataset() calls
    # would both start from the same seed, so the evaluation rows would
    # overlap the training rows.
    sampled_pool = sample_dataset(full_train_split, fraction=0.11)
    eval_size = int(len(full_train_split) * 0.01)
    # sample_dataset selects rows in random draw order, so the leading
    # eval_size rows of the pool are themselves a random subset.
    actual_eval_data_set = sampled_pool.select(range(eval_size))
    actual_train_data_set = sampled_pool.select(range(eval_size, len(sampled_pool)))
else:
    # train_test_split returns a DatasetDict with "train" and "test" entries;
    # split once and hand each name its own part rather than the whole dict.
    full_split = full_train_split.train_test_split(test_size=0.1, seed=42)
    actual_train_data_set = full_split["train"]
    actual_eval_data_set = full_split["test"]

In [None]:
# Load the fast GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
# Set the pad_token to eos_token: the GPT-2 tokenizer does not define a
# padding token by default, so the end-of-sequence token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

# Step 4: Tokenize the datasets with truncation and max_length
def tokenize_function(examples):
    """Tokenize the ``"content"`` field of a batch of examples.

    Sequences are truncated to the tokenizer's ``model_max_length`` and are
    not padded here.

    Args:
        examples: Mapping that holds the source text under ``"content"``.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    source_texts = examples["content"]
    length_limit = tokenizer.model_max_length
    encoded = tokenizer(
        source_texts,
        padding=False,
        max_length=length_limit,
        truncation=True,
    )
    return encoded

import multiprocessing
num_cores = multiprocessing.cpu_count()
# Keep two cores free for other processes, but never go below one worker.
num_proc = num_cores - 2 if num_cores > 2 else 1


def parallel_tokenize(tokenize_function, num_proc):
    """Tokenize the training and evaluation sets with worker processes.

    Args:
        tokenize_function: Batched mapping function applied to each dataset.
        num_proc: Number of processes handed to ``map``.

    Returns:
        A ``(tokenized_train_dataset, tokenized_eval_dataset)`` tuple with the
        raw ``"content"`` column removed from both.
    """
    map_options = {
        "batched": True,
        "num_proc": num_proc,  # Number of processes to use
        "remove_columns": ["content"],
    }

    tokenized_train_dataset = actual_train_data_set.map(
        tokenize_function, **map_options
    )
    # The evaluation set must come from the evaluation subset; mapping the
    # training subset here would make "evaluation" a copy of the training data.
    tokenized_eval_dataset = actual_eval_data_set.map(
        tokenize_function, **map_options
    )

    return tokenized_train_dataset, tokenized_eval_dataset

def non_parallel_tokenize(tokenize_function):
    """Tokenize the training and evaluation sets in the current process.

    Args:
        tokenize_function: Batched mapping function applied to each dataset.

    Returns:
        A ``(tokenized_train_dataset, tokenized_eval_dataset)`` tuple with the
        raw ``"content"`` column removed from both.
    """
    map_options = {"batched": True, "remove_columns": ["content"]}
    train_tokens = actual_train_data_set.map(tokenize_function, **map_options)
    eval_tokens = actual_eval_data_set.map(tokenize_function, **map_options)
    return train_tokens, eval_tokens

# Run the multi-process tokenizer when tokenize_parallize is set, otherwise
# the single-process one; both return a (train, eval) pair.
tokenized_train_dataset, tokenized_eval_dataset = (
    parallel_tokenize(tokenize_function, num_proc)
    if tokenize_parallize
    else non_parallel_tokenize(tokenize_function)
)

In [7]:
# Step 7: Load the pre-trained model
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Move the model to the device selected in the setup cell (MPS, CUDA or CPU).
# Recomputing the device here as CUDA-or-CPU would silently drop the MPS
# choice that was reported when the notebook started.
model.to(device)
# Gradient checkpointing is switched on for the model before training.
model.gradient_checkpointing_enable()

In [8]:
# Collator that assembles tokenized examples into batches for the Trainer;
# mlm=False selects causal (next-token) language modeling rather than
# masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Step 10: Define training arguments
training_args = TrainingArguments(
    # Output location and checkpoint retention
    output_dir="./codeparrot-finetuned",
    overwrite_output_dir=True,
    save_steps=200,
    save_total_limit=2,
    # Schedule and batching: 4 examples per device, accumulated over 2 steps
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    # Optimizer and precision
    optim="adamw_torch",
    fp16=False,
    # Evaluation is switched off, so no eval_steps value is configured
    evaluation_strategy="no",
    # Data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=False,
    # Logging
    logging_dir="./logs",
    logging_steps=50,
)

In [None]:
# Step 11: Initialize the Trainer with the model, its configuration, the
# collator and the tokenized datasets
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# Step 12: Start training
trainer.train()