Loading the datasets with the Hugging Face `datasets` library in Python and creating a `DatasetDict` that contains the training and validation datasets.

In [4]:
from datasets import load_dataset, DatasetDict

# Number of examples sampled from each split. Smaller values (for example
# 10_000 / 100) are handy for a quick end-to-end run of the notebook.
N_TRAIN_SAMPLES = 50_000
N_VALID_SAMPLES = 500

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Shuffle before selecting so that each subset is a sample of the whole split
# rather than its first N rows.
raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(N_TRAIN_SAMPLES)),
        "valid": ds_valid.shuffle().select(range(N_VALID_SAMPLES)),
    }
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 50000
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 500
    })
})

Accessing the first example in the training dataset, showing only the first 200 characters of each field.

In [5]:
# Fetch the first training row once instead of re-indexing the dataset for
# every key, then print at most the first 200 characters of each field.
first_example = raw_datasets["train"][0]
for key, value in first_example.items():
    print(f"{key.upper()}: {value[:200]}")

REPO_NAME: alvarofierroclavero/scikit-learn
PATH: sklearn/datasets/tests/test_mldata.py
COPIES: 384
SIZE: 5221
CONTENT: """Test functionality of mldata fetching utilities."""

import os
import shutil
import tempfile
import scipy as sp

from sklearn import datasets
from sklearn.datasets import mldata_filename, fetch_mld
LICENSE: bsd-3-clause


Now that we have a dataset, we need to prepare the texts so they’re in a format suitable for pretraining.

In [6]:
from transformers import AutoTokenizer

context_length = 128 # maximum number of tokens per chunk passed to the tokenizer below
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Tokenize the "content" field of the first two training examples to inspect
# how long files are split into chunks.
outputs = tokenizer(
    raw_datasets["train"][:2]["content"], # source text of the first two training examples
    truncation=True, # truncate sequences longer than max_length
    max_length=context_length,  # upper bound on the length of each returned chunk
    return_overflowing_tokens=True,  # return the truncated remainder as further chunks (see the printed chunk mapping)
    return_length=True,  # also return the length of every chunk under the "length" key
)

# Number of chunks, the length of each chunk, and the index of the example
# each chunk came from.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 24
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 75, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 91]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Defining a function `tokenize(element)` that tokenizes a batch of examples with the tokenizer and keeps only the chunks that are exactly `context_length` tokens long.

In [None]:
def tokenize(element):
    """Split the ``content`` of a batch into full-length token chunks.

    ``element`` is a batch with a ``"content"`` entry. Each text is tokenized
    into chunks of at most ``context_length`` tokens; chunks shorter than
    ``context_length`` are discarded.

    Returns a dict with a single ``"input_ids"`` key holding the kept chunks.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep only the chunks that fill the whole context window.
    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}


# Tokenize both splits in batches. All original columns are removed, leaving
# only the "input_ids" produced by tokenize(); the number of rows changes
# because every example yields a varying number of chunks.
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets # display the resulting tokenized DatasetDict

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1369559
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 13049
    })
})

We have the dataset ready, so we can now set up the model.

We initialize a GPT-2 model using the same configuration as the small GPT-2 model: we load the pretrained configuration, make sure that the model vocabulary size matches the tokenizer size, and pass the bos and eos (beginning and end of sequence) token IDs:

In [8]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Load the configuration of the "gpt2" checkpoint (a config object, not a
# tokenizer) and override the fields listed below.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer), # match the model vocabulary to the tokenizer size
    # NOTE(review): recent transformers releases appear to use `n_positions`
    # rather than `n_ctx` for GPT-2's context size - confirm this still has an effect.
    n_ctx=context_length, # intended as the maximum number of tokens in the input sequence
    bos_token_id=tokenizer.bos_token_id, # beginning-of-sequence token of the tokenizer
    eos_token_id=tokenizer.eos_token_id, # end-of-sequence token of the tokenizer
)

understanding the size of the GPT-2 model instantiated

In [9]:
# Build a GPT-2 language-modeling model from `config` alone; no checkpoint is loaded here.
model = GPT2LMHeadModel(config)
# Total number of elements over all parameter tensors of the model.
model_size = sum(t.numel() for t in model.parameters())
# Report the size in millions of parameters.
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


setting up a data collator that will take care of creating the batches. We can use the DataCollatorForLanguageModeling collator

In [10]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token
# (presumably this tokenizer defines no pad token of its own - TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: no masked-language-modeling; the collator batches the sequences
# and adds a "labels" entry next to "input_ids" (see the shapes printed below).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

This creates a data collator for language modeling. The DataCollatorForLanguageModeling class helps in batching and padding sequences for language modeling tasks. By setting mlm=False, you indicate that you're not performing masked language modeling (MLM), which means you're not masking tokens for prediction during training.

The code below processes a batch of tokenized sequences from the training dataset and then prints out the shapes of the processed tensors. 

In [11]:
# Collate the first 5 tokenized training examples into one batch ...
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
# ... and print the shape of every tensor in that batch.
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


Now we have everything in place to actually train our model, we start training by logging in to Hugging Face.

In [12]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook (shows a login widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

configuring the training arguments and the Trainer. 

In [13]:
from transformers import Trainer, TrainingArguments

# Training configuration: output directory, batch sizes, evaluation / logging /
# saving cadence, gradient accumulation, epochs, learning-rate schedule,
# mixed precision and upload to the Hub.
args = TrainingArguments(
    output_dir="codeparrot-ds_new",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=2_000, # 5_000,
    logging_steps=2_000, # 5_000,
    gradient_accumulation_steps=8, # 32 * 8 = 256 sequences per device per optimizer update
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=2_000, # 5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model, #The pre-configured GPT-2 language model to be trained.
    tokenizer=tokenizer, #The tokenizer associated with the model.
    args=args, #The training arguments specifying the training configuration.
    data_collator=data_collator, #The data collator used for batching and padding the training data.
    train_dataset=tokenized_datasets["train"], #The tokenized training dataset.
    eval_dataset=tokenized_datasets["valid"], #The tokenized validation dataset.
    # use_wandb=False
)

solving wandb issues

In [14]:
import os
# Tell Weights & Biases the notebook name and switch it off through its
# environment variables before training starts.
os.environ['WANDB_NOTEBOOK_NAME'] = 'Hugging face training.ipynb'
os.environ['WANDB_MODE'] = 'disabled'
# Shell equivalent: WANDB_MODE=disabled

In [15]:
# Run the training configured in `args`; the returned TrainOutput is shown below.
trainer.train()



Step,Training Loss,Validation Loss
2000,3.1646,1.994213




TrainOutput(global_step=2675, training_loss=2.8233961101781544, metrics={'train_runtime': 21345.7567, 'train_samples_per_second': 64.161, 'train_steps_per_second': 0.125, 'total_flos': 8.9463713513472e+16, 'train_loss': 2.8233961101781544, 'epoch': 1.0})

pushing the model and tokenizer to the Hub:

In [16]:
# Upload the training output to the Hub; the resulting commit info is shown below.
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1707180862.oisit-selab3:   0%|          | 0.00/5.33k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shradha01/codeparrot-ds_new/commit/3479b00d0ea06cc35b3f5cc6a9eba1c1213a2643', commit_message='End of training', commit_description='', oid='3479b00d0ea06cc35b3f5cc6a9eba1c1213a2643', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# from huggingface_hub import HfApi
# api = HfApi()

# # Upload all the content from the local folder to your remote Space.
# # By default, files are uploaded at the root of the repo
# api.upload_folder(
#     folder_path="/home/user1-selab3/shradha_test/roberta/codeparrot-ds_new",
#     repo_id="shradha01/codeparrot-ds_new",
#     repo_type="model",
# )

Checking how well the trained model actually works by wrapping it in a text-generation pipeline.

In [17]:
import torch
from transformers import pipeline 
#The pipeline function allows you to easily use pre-trained models for various tasks, including text generation.

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Text-generation pipeline backed by the model that was pushed to the Hub above.
pipe = pipeline(
    "text-generation", model="shradha01/codeparrot-ds_new", device=device
)

config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

example code of creating a scatter plot:

In [18]:
import numpy as np
import matplotlib.pyplot as plt

# Prompt ending in a comment that asks for a scatter plot; the model is
# expected to continue the code from there.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
# Generate one continuation and print prompt + completion.
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
x1, y1 = np.meshgrid(x, y


creating a DataFrame from two arrays:

In [19]:
# Prompt ending in a comment that asks for a DataFrame built from x and y.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
# Generate one continuation and print prompt + completion.
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
df = pd.DataFrame({'x': x, 'y': y


In [20]:
# Prompt ending in a comment that asks for the mean income per profession.
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
# Generate one continuation and print prompt + completion.
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
df.loc[df


In [21]:
# Prompt ending in a comment that asks for a random forest fit.
# NOTE: unlike the other prompts there is no backslash after the opening
# quotes, so this prompt starts with a newline (visible in the output below).
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
# Generate one continuation and print prompt + completion.
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
iris = datasets.load_iris()
X, y = iris.


Tokens can have a whitespace prefix, so we’ll also check for those versions in the tokenizer vocabulary. To verify that it works, we’ll add one test token which should be split into multiple tokens:

In [22]:
from transformers import AutoTokenizer

# Load the code tokenizer from its pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Keywords to look up, each in a bare and a whitespace-prefixed variant.
# "testtest" is a probe that is expected to split into several tokens.
candidate_keywords = (
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
)

# IDs of the keywords that map to exactly one token; the rest are reported.
keytoken_ids = []
for keyword in candidate_keywords:
    encoded = tokenizer([keyword]).input_ids[0]
    if len(encoded) != 1:
        print(f"Keyword has not single token: {keyword}")
        continue
    keytoken_ids.append(encoded[0])

Keyword has not single token: testtest


writing a custom loss function that takes the input sequence, the logits, and the key tokens we just selected as inputs.

In [23]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    """Next-token cross-entropy in which samples containing key tokens weigh more.

    Args:
        inputs: integer token IDs, indexed as ``(batch, sequence)``.
        logits: model outputs, indexed as ``(batch, sequence, vocab)``.
        keytoken_ids: token IDs whose occurrences raise a sample's weight.
            May be empty, in which case every sample gets weight ``alpha``.
        alpha: global scale applied to all sample weights.

    Returns:
        A scalar tensor: the mean over the batch of
        ``per-sample mean loss * alpha * (1 + number of key tokens in the sample)``.
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Per-token loss. `reduction="none"` is the supported spelling of the
    # deprecated `reduce=False`.
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Count key-token occurrences per sample (over the unshifted inputs).
    if len(keytoken_ids) == 0:
        # torch.stack() rejects an empty list; no key tokens means zero hits.
        weights = torch.zeros(
            inputs.size(0), dtype=loss_per_sample.dtype, device=inputs.device
        )
    else:
        weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
            dim=[0, 2]
        )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

Before we can start training with this awesome new loss function, we need to prepare dataloaders to load the data in batches.

Next, we group the parameters so that the optimizer knows which ones will get an additional weight decay. 

Dataloaders allow you to iterate over datasets in mini-batches, which is crucial for training deep learning models. These data loaders will be used during the training loop to provide batches of data to the model for training and evaluation.

In [24]:
from torch.utils.data.dataloader import DataLoader

tokenized_datasets.set_format("torch") # items will be returned as PyTorch tensors
# Batches of 32 sequences; only the training loader shuffles, the validation
# loader keeps the dataset order.
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

Group the model parameters by whether weight decay should be applied to them. This is useful for customizing the weight-decay behavior of different parameter groups in the optimizer: it allows finer control over regularization during training, which can be beneficial for model performance and generalization.

In [25]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=("bias", "LayerNorm.weight")):
    """Split the model's parameters into weight-decay and no-weight-decay groups.

    Args:
        model: object exposing ``named_parameters()``.
        no_decay: substrings; a parameter whose name contains any of them is
            placed in the group without weight decay. Any iterable of strings
            is accepted (the default is a tuple so it cannot be mutated
            between calls).

    Returns:
        A two-element list of optimizer parameter groups: first the parameters
        decayed with the module-level ``weight_decay``, then those with 0.0.
    """
    # NOTE(review): GPT-2 appears to name its layer norms ln_1 / ln_2 / ln_f,
    # so "LayerNorm.weight" may match nothing for this model - confirm and
    # extend `no_decay` if those weights should be excluded too.
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(pattern in name for pattern in no_decay):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

evaluating the model regularly on the validation set during training

In [26]:
def evaluate():
    """Compute the mean loss and perplexity over the validation dataloader.

    Uses the notebook-level ``model``, ``eval_dataloader`` and ``accelerator``.

    Returns:
        ``(mean_loss, perplexity)`` as Python floats. Both are ``inf`` when no
        loss was collected; perplexity is ``inf`` when ``exp`` overflows.
    """
    import math

    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
        # The batch loss is expected to be a scalar; give it a length-1 shape
        # so the gathered values can be concatenated by torch.cat below
        # (torch.cat rejects zero-dimensional tensors).
        losses.append(accelerator.gather(outputs.loss.reshape(1)))

    if not losses:
        return float("inf"), float("inf")

    loss = torch.mean(torch.cat(losses)).item()
    # math.exp raises OverflowError for huge inputs, whereas torch.exp silently
    # returns inf, which made the original except branch unreachable.
    try:
        perplexity = math.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss, perplexity

With the evaluate() function we can report loss and perplexity at regular intervals. Next, we redefine our model to make sure we train from scratch again:

creating an instance of the GPT-2 model with the specified configuration.

In [27]:
from transformers import GPT2LMHeadModel
# Fresh model built from `config`, so the next training run starts from scratch.
model = GPT2LMHeadModel(config)

Defining our optimizer, using the function from before to split the parameters for weight decay. The optimizer updates the model parameters during training from the gradients computed during backpropagation, and applies weight decay only to the parameter group configured for it.

In [28]:
from torch.optim import AdamW

# AdamW over the two parameter groups (with / without weight decay), learning rate 5e-4.
optimizer = AdamW(get_grouped_params(model), lr=5e-4)

preparing the model, optimizer, and dataloaders so we can start training:

In [29]:
from accelerate import Accelerator

accelerator = Accelerator() # created with default settings; set up for distributed and mixed-precision training

# Hand model, optimizer and both dataloaders to the accelerator and keep
# working with the objects it returns from here on.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

Now that we have sent our train_dataloader to accelerator.prepare(), we can use its length to compute the number of training steps. Remember that we should always do this after preparing the dataloader, as that method will change its length.

 setting up a learning rate scheduler using the get_scheduler() function from the Transformers library

In [30]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader) # number of batches per epoch (length of the prepared dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch # total over all epochs

# NOTE(review): the training loop further down calls lr_scheduler.step() only
# once every `gradient_accumulation_steps` batches, so the scheduler takes
# fewer steps than `num_training_steps` - confirm whether the count should be
# divided accordingly.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

As noted above, we compute the number of training steps from the length of train_dataloader only after sending it to accelerator.prepare(), because that method will change its length. For the learning rate we use a classic linear schedule that decays from the configured learning rate to 0.

In [33]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub again (shows a login widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "codeparrot-ds-accelerate"
# Full Hub repository id for the model name (see the value displayed below).
repo_name = get_full_repo_name(model_name)
repo_name

'shradha01/codeparrot-ds-accelerate'

In [37]:
# Local directory that holds the clone of the Hub repository; the training
# loop saves the model and tokenizer here before pushing.
output_dir = "codeparrot-ds-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/user1-selab3/shradha_test/roberta/codeparrot-ds-accelerate is already a clone of https://huggingface.co/shradha01/codeparrot-ds-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


Before training, running a quick test to see if the evaluation function works properly:

In [1]:
def evaluate():
    """Compute the mean loss and perplexity over the validation dataloader.

    Uses the notebook-level ``model``, ``eval_dataloader`` and ``accelerator``.

    Returns:
        ``(mean_loss, perplexity)`` as Python floats. Both are ``inf`` when no
        loss was collected; perplexity is ``inf`` when ``exp`` overflows.
    """
    import math

    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        loss = outputs.loss
        if loss is not None:
            # Add a leading dimension so the gathered losses can be concatenated.
            losses.append(accelerator.gather(loss[None]))

    if not losses:
        return float("inf"), float("inf")

    mean_loss = torch.mean(torch.cat(losses)).item()
    # math.exp raises OverflowError for huge inputs and returns a plain float,
    # so the fallback no longer ends in `.item()` being called on a float.
    try:
        perplexity = math.exp(mean_loss)
    except OverflowError:
        perplexity = float("inf")
    return mean_loss, perplexity

Finally, we evaluate the model on the evaluation set with our new evaluate() function:

In [None]:
# Sanity check before training: returns (loss, perplexity) on the validation set.
evaluate()

In the training loop we iterate over the dataloader and pass the batches to the model. 

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8  # batches whose gradients are accumulated per optimizer update
eval_steps = 5_000  # evaluate and checkpoint every this many optimizer updates
log_every = 100  # print training progress every this many batches

model.train()
completed_steps = 0  # optimizer updates performed so far, across epochs
for epoch in range(num_train_epochs):
    # The bar is created once per epoch, so its total is the number of batches
    # in one epoch.
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=len(train_dataloader)
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % log_every == 0:
            # `loss` has not been divided for accumulation yet, so it is
            # reported as is.
            accelerator.print(
                {
                    "batches": step,
                    "steps": completed_steps,
                    "loss/train": loss.item(),
                }
            )
        # Scale so that the accumulated gradient is an average over the
        # accumulated batches rather than a sum.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()  # evaluate() switched the model to eval mode
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )
# NOTE(review): a checkpoint is only written every
# eval_steps * gradient_accumulation_steps batches and nothing is saved after
# the loop ends - confirm whether a final save/push is wanted.

In [46]:
import torch
from transformers import pipeline

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE(review): this loads "codeparrot-ds_new" (the Trainer run), not the
# "codeparrot-ds-accelerate" repository used by the custom loop - confirm this
# is the intended model.
pipe = pipeline(
    "text-generation", model="shradha01/codeparrot-ds_new", device=device
)

In [47]:
# Prompt ending in a comment that asks for a scatter plot; the model is
# expected to continue the code from there.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
# Generate one continuation and print prompt + completion.
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
fig, ax = plt.subplots(1, 2, sharex
