using the filtered dataset from  the Hub by downloading

In [2]:
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("code_search_net", "python", split="train")
ds_valid = load_dataset("code_search_net", "python", split="validation")

raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(10000)),
        "valid": ds_valid.shuffle().select(range(100))
    }
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 10000
    })
    valid: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 100
    })
})

looking at an example from the dataset.Just showing the first 200 characters of each field:

In [3]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")

REPOSITORY_NAME: sernst/cauldron
FUNC_PATH_IN_REPOSITORY: cauldron/session/display/__init__.py
FUNC_NAME: code_block
WHOLE_FUNC_STRING: def code_block(
        code: str = None,
        path: str = None,
        language_id: str = None,
        title: str = None,
        caption: str = None
):
    """
    Adds a block of syntax highli
LANGUAGE: python
FUNC_CODE_STRING: def code_block(
        code: str = None,
        path: str = None,
        language_id: str = None,
        title: str = None,
        caption: str = None
):
    """
    Adds a block of syntax highli
FUNC_CODE_TOKENS: ['def', 'code_block', '(', 'code', ':', 'str', '=', 'None', ',', 'path', ':', 'str', '=', 'None', ',', 'language_id', ':', 'str', '=', 'None', ',', 'title', ':', 'str', '=', 'None', ',', 'caption', ':', 'str', '=', 'None', ')', ':', 'environ', '.', 'abort_thread', '(', ')', 'r', '=', '_get_report', '(', ')', 'r', '.', 'append_body', '(', 'render', '.', 'code_block', '(', 'block', '=', 'code', ',', 'path', 

Now that we have a dataset, we need to prepare the texts so they’re in a format suitable for pretraining.

In [4]:
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

outputs = tokenizer(
    raw_datasets["train"][:2]["func_code_string"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 4
Input chunk lengths: [128, 128, 23, 91]
Chunk mapping: [0, 0, 0, 1]


In [5]:
def tokenize(element):
    outputs = tokenizer(
        element["func_code_string"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 14686
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 159
    })
})

we have the dataset ready, now setting up the model!

 initializing a GPT-2 model. using the same configuration for our model as for the small GPT-2 model, so we load the pretrained configuration, make sure that the tokenizer size matches the model vocabulary size and pass the bos and eos (beginning and end of sequence) token IDs:

In [6]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

loading a new model

In [7]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


setting up a data collator that will take care of creating the batches. We can use the DataCollatorForLanguageModeling collator

In [8]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


Now we have everything in place to actually train our model, we start training by logging in to Hugging Face.

In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

configuring the training arguments and the Trainer. 

In [11]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codesearchnet-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=2_000, # 5_000,
    logging_steps=2_000, # 5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000, # 1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=2_000, # 5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    # use_wandb=False
)

In [12]:
import os
os.environ['WANDB_NOTEBOOK_NAME'] = 'Hugging face training.ipynb'
os.environ['WANDB_MODE'] = 'disabled'
#  WANDB_MODE=disabled

In [13]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=28, training_loss=10.007647923060826, metrics={'train_runtime': 221.4201, 'train_samples_per_second': 66.326, 'train_steps_per_second': 0.126, 'total_flos': 936470642688000.0, 'train_loss': 10.007647923060826, 'epoch': 0.97})

pushing the model and tokenizer to the Hub:

In [14]:
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

events.out.tfevents.1707236441.oisit-selab3:   0%|          | 0.00/4.89k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/shradha01/codesearchnet-ds/commit/34655d6fa532516c2e64cefe11dc31fcdaff5af7', commit_message='End of training', commit_description='', oid='34655d6fa532516c2e64cefe11dc31fcdaff5af7', pr_url=None, pr_revision=None, pr_num=None)

checking how well the trained model actually works and  wrapping up the model in a text generation pipeline

In [15]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model="shradha01/codesearchnet-ds", device=device
)

config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

creating a scatter plot:

In [19]:
import numpy as np
import matplotlib.pyplot as plt

txt = """\
def _create_function(name, doc=""): ""Create a PySpark function by its name"" 
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


def _create_function(name, doc=""): ""Create a PySpark function by its name"" 
.__,,._ friendsunmasked__:
      .__j__.__,,_____


creating a DataFrame from two arrays:

In [20]:
txt = """\
def greet():
    print('Hello World!')

# call the function
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


def greet():
    print('Hello World!')

# call the function
.,,.__/,___,______,_,_us,.____luster___


Tokens can have a whitespace prefix, so we’ll also check for those versions in the tokenizer vocabulary. To verify that it works, we’ll add one test token which should be split into multiple tokens:

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

Keyword has not single token: testtest


writing a custom loss function that takes the input sequence, the logits, and the key tokens we just selected as inputs.

In [22]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

Before we can start training with this awesome new loss function, we need to prepare a dataloaders to load the data in batches.

Next, we group the parameters so that the optimizer knows which ones will get an additional weight decay. 

In [23]:
from torch.utils.data.dataloader import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

In [24]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

evaluating the model regularly on the validation set during training

In [25]:
# def evaluate():
#     model.eval()
#     losses = []
#     for step, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(batch["input_ids"], labels=batch["input_ids"])

#         losses.append(accelerator.gather(outputs.loss))
#     loss = torch.mean(torch.cat(losses))
#     try:
#         perplexity = torch.exp(loss)
#     except OverflowError:
#         perplexity = float("inf")
#     return loss.item(), perplexity.item()

With the evaluate() function we can report loss and perplexity at regular intervals. Next, we redefine our model to make sure we train from scratch again:

In [26]:
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel(config)

defining our optimizer, using the function from before to split the parameters for weight decay:

In [27]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

preparing the model, optimizer, and dataloaders so we can start training:

In [28]:
from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

Now that we have sent our train_dataloader to accelerator.prepare(), we can use its length to compute the number of training steps. Remember that we should always do this after preparing the dataloader, as that method will change its length.

In [29]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

In [30]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "codesearchnet-ds-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'shradha01/codesearchnet-ds-accelerate'

In [None]:
output_dir = "codesearchnet-ds-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

Before training, running a quick test to see if the evaluation function works properly:

Finally, we evaluate the model on the evaluation set with our new evaluate() function:

In [43]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        loss = outputs.loss
        if loss is not None:
            losses.append(accelerator.gather(loss[None]))

    if losses:
        loss_tensor = torch.cat(losses)
        mean_loss = torch.mean(loss_tensor).item()
        try:
            perplexity = torch.exp(torch.tensor(mean_loss))  # Convert mean_loss to tensor
        except OverflowError:
            perplexity = float("inf")
        return mean_loss, perplexity.item()  # Ensure perplexity is returned as a float
    else:
        return float("inf"), float("inf")

In [44]:
evaluate()

(11.04960823059082, 62919.30078125)

In [45]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5_000

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    #"samples": step * samples_per_step,
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )

  0%|          | 0/459 [00:00<?, ?it/s]



{'steps': 12, 'loss/train': 80.63597869873047}
{'steps': 24, 'loss/train': 80.8174057006836}
{'steps': 37, 'loss/train': 76.69839477539062}
{'steps': 49, 'loss/train': 72.14698791503906}


In [47]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model="shradha01/codesearchnet-ds", device=device
)

In [48]:
txt = """\
def _create_function(name, doc=""): ""Create a PySpark function by its name""
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


def _create_function(name, doc=""): ""Create a PySpark function by its name""
_reminder_,__._______Recs.,..___.____._


: 