# Training a causal language model from scratch (PyTorch)

In [1]:
from transformers import AutoTokenizer, GPT2LMHeadModel

In [2]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from transformers import pipeline
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator,notebook_launcher
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from evaluate import load
import evaluate

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
ds_train = load_dataset("code_search_net", "java", split="train")
ds_test = load_dataset("code_search_net", "java", split="test")
ds_valid = load_dataset("code_search_net", "java", split="validation")
raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(4000)), # "train": ds_train,  # .shuffle().select(range(50000)),
        "test": ds_test.shuffle().select(range(500)),
        "valid": ds_valid.shuffle().select(range(500)) # "valid": ds_valid,  # .shuffle().select(range(500))
    }
)
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
raw_datasets

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 500
    })
    valid: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 500
    })
})

In [5]:
len(tokenizer)

50000

In [6]:
tokenizer

GPT2TokenizerFast(name_or_path='huggingface-course/code-search-net-tokenizer', vocab_size=50000, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
print(raw_datasets["test"][0]["whole_func_string"])

public long rawPoll(final RawBlockHandler rawBlockHandler, final int blockLengthLimit)
    {
        long bytesConsumed = 0;
        for (final Image image : images)
        {
            bytesConsumed += image.rawPoll(rawBlockHandler, blockLengthLimit);
        }

        return bytesConsumed;
    }


In [8]:
valid_dataset = raw_datasets["test"]

for i in range(20):
    print(f"Index {i}: {valid_dataset['whole_func_string'][i]}")

Index 0: public long rawPoll(final RawBlockHandler rawBlockHandler, final int blockLengthLimit)
    {
        long bytesConsumed = 0;
        for (final Image image : images)
        {
            bytesConsumed += image.rawPoll(rawBlockHandler, blockLengthLimit);
        }

        return bytesConsumed;
    }
Index 1: @Override
	public final boolean serialize(final JsonContext jsonContext, final T value) {
		if (jsonContext.pushValue(value)) {
			// prevent circular dependencies
			return false;
		}

		serializeValue(jsonContext, value);

		jsonContext.popValue();

		return true;
	}
Index 2: public List<MFile> readFilesFromIndex(IndexReader indexReader) throws IOException {
    List<MFile> result = new ArrayList<>(100);
    if (index == null) return result;

    indexReader.readMFiles(index, result);
    return result;
  }
Index 3: protected void throwableOnManagementMethodInvocation(Throwable t) throws ProfileImplementationException, InvalidStateException, ManagementException {
      

In [9]:
import jsonlines

valid_dataset = raw_datasets["test"]

# Specify the file path where you want to save the output
output_file_path = "whole_func_strings_new.txt"

# Open the file in write mode with newline=''
with jsonlines.open(output_file_path, mode="w") as writer:
    # Iterate over the whole_func_string values in the valid dataset
    for value in valid_dataset['whole_func_s                         tring']:
        # Write each value to the file
        #data = {'whole_func_string': value}
        writer.write(value.strip('"'))
        # print(value)
        # print("*" * 50)

print(f"Whole_func_string values from the valid dataset saved to {output_file_path}")

Whole_func_string values from the valid dataset saved to whole_func_strings_new.txt


In [2]:
test_input = '"abc'
test_input.replace('"', '')

In [10]:
import jsonlines

valid_dataset = raw_datasets["test"]

# Specify the file path where you want to save the output
output_file_path = "whole_func_strings_new1.txt"

# Open the file in write mode with newline=''
with jsonlines.open(output_file_path, mode="w") as writer:
    # Iterate over the whole_func_string values in the valid dataset
    for value in valid_dataset['whole_func_string']:
        # Remove leading and trailing double quotes
        value_stripped = value.strip('"')
        # Write the processed value to the file
        writer.write(value_stripped.replace('"', '') + "\n")  # Add a newline after each string if desired

print(f"Whole_func_string values from the valid dataset saved to {output_file_path}")


Whole_func_string values from the valid dataset saved to whole_func_strings_new1.txt


In [11]:
with open("/home/user1-selab3/shradha_test/roberta/whole_func_strings_new.txt", "r") as f:
    text = f.read()

In [12]:
text



In [13]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:1000]}")

REPOSITORY_NAME: EsotericSoftware/kryonet
FUNC_PATH_IN_REPOSITORY: src/com/esotericsoftware/kryonet/rmi/ObjectSpace.java
FUNC_NAME: ObjectSpace.addConnection
WHOLE_FUNC_STRING: public void addConnection (Connection connection) {
		if (connection == null) throw new IllegalArgumentException("connection cannot be null.");

		synchronized (connectionsLock) {
			Connection[] newConnections = new Connection[connections.length + 1];
			newConnections[0] = connection;
			System.arraycopy(connections, 0, newConnections, 1, connections.length);
			connections = newConnections;
		}

		connection.addListener(invokeListener);

		if (TRACE) trace("kryonet", "Added connection to ObjectSpace: " + connection);
	}
LANGUAGE: java
FUNC_CODE_STRING: public void addConnection (Connection connection) {
		if (connection == null) throw new IllegalArgumentException("connection cannot be null.");

		synchronized (connectionsLock) {
			Connection[] newConnections = new Connection[connections.length + 1];
			newCo

In [14]:
outputs = tokenizer(
    raw_datasets["train"][:2]["whole_func_string"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 3
Input chunk lengths: [128, 1, 59]
Chunk mapping: [0, 0, 1]


In [15]:
def tokenize(element):
    outputs = tokenizer(
        element["whole_func_string"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
print(f'[DBG] tokenized_datasets: {tokenized_datasets}')
print(f'[DBG] len(tokenizer): {len(tokenizer)}')
print(f'[DBG] tokenizer.bos_token_id: {tokenizer.bos_token_id}')
print(f'[DBG] tokenizer.eos_token_id: {tokenizer.eos_token_id}')

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

[DBG] tokenized_datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3023
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 492
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 290
    })
})
[DBG] len(tokenizer): 50000
[DBG] tokenizer.bos_token_id: 0
[DBG] tokenizer.eos_token_id: 0


In [16]:
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [17]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())  # num of elements
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


In [18]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [19]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [20]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# n_gpu = torch.cuda.device_count()
# device, n_gpu

In [21]:
keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    idss = tokenizer([keyword]).input_ids
    idsss = tokenizer(keyword)
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword} {len(ids)} {ids} {tokenizer.tokenize(keyword)}")
keytoken_ids

Keyword has not single token: testtest 2 [1824, 1824] ['test', 'test']


[8436, 4289, 1201, 2770, 5431, 2564, 2604, 2110, 2872, 4969]

In [22]:
def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [23]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [24]:
# def evaluate():
#     model.eval()
#     losses = []
#     for step, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(batch["input_ids"], labels=batch["input_ids"])

#         losses.append(accelerator.gather(outputs.loss))
#     # loss = torch.mean(torch.cat(losses))
#     loss = torch.mean(torch.stack(losses))
#     try:
#         perplexity = torch.exp(loss)
#     except OverflowError:
#         perplexity = float("inf")
#     return loss.item(), perplexity.item()
# 
# Updated by "https://huggingface.co/blog/codeparrot"


# def evaluate():
#     model.eval()
#     losses = []
#     for step, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(batch["input_ids"], labels=batch["input_ids"])
#         loss = outputs.loss.repeat(batch_sz) # <===== Added.
#         losses.append(accelerator.gather(loss))
#     loss = torch.mean(torch.cat(losses))
#     # loss = torch.mean(torch.stack(losses))
#     try:
#         perplexity = torch.exp(loss)
#     except OverflowError:
#         perplexity = float("inf")
#     return loss.item(), perplexity.item()

In [25]:
# import torch
# from tqdm.notebook import tqdm

# def evaluate():
#     model.eval()
#     losses = []
#     accuracies = []
#     entropies = []
#     total_correct = 0
#     total_samples = 0
#     total_entropy = 0
    
#     for step, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(batch["input_ids"], labels=batch["input_ids"])
#         loss = outputs.loss.repeat(batch_sz) # <===== Added.
#         losses.append(accelerator.gather(loss))
        
#         # Calculate accuracy
#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=-1)
#         labels = batch["input_ids"]
#         correct = (predictions == labels).sum().item()
#         total_correct += correct
#         total_samples += labels.numel()
        
#         # Calculate entropy
#         softmax_probs = torch.nn.functional.softmax(logits, dim=-1)
#         entropy = -torch.sum(softmax_probs * torch.log(softmax_probs), dim=-1)
#         total_entropy += entropy.sum().item()
        
#     loss = torch.mean(torch.cat(losses))
#     accuracy = total_correct / total_samples
#     entropy = total_entropy / total_samples
    
#     try:
#         perplexity = torch.exp(loss)
#     except OverflowError:
#         perplexity = float("inf")
        
#     return loss.item(), perplexity.item(), accuracy, entropy

In [26]:
model = GPT2LMHeadModel(config)
# model.to(device)

In [27]:
# optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [28]:
# # accelerator = Accelerator(fp16=True)
# accelerator = Accelerator(mixed_precision="fp16")

# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, eval_dataloader
# )

In [29]:
# num_train_epochs = 1
# num_update_steps_per_epoch = len(train_dataloader)
# num_training_steps = num_train_epochs * num_update_steps_per_epoch

# lr_scheduler = get_scheduler(
#     name="linear",
#     optimizer=optimizer,
#     num_warmup_steps=1_000,
#     num_training_steps=num_training_steps,
# )

In [30]:
model_name = "codeparrot-ds-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'shradha01/codeparrot-ds-accelerate'

In [31]:
output_dir = "codeparrot-ds-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/user1-selab3/shradha_test/roberta/codeparrot-ds-accelerate is already a clone of https://huggingface.co/shradha01/codeparrot-ds-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


In [32]:
# evaluate()

In [33]:
# """
# Version 2 without weight
# """
# from tqdm.notebook import tqdm

# gradient_accumulation_steps = 8
# eval_steps = 100 # 5_000

# model.train()
# completed_steps = 0
# for epoch in range(num_train_epochs):
#     for step, batch in tqdm(
#         enumerate(train_dataloader, start=1), total=num_training_steps
#     ):
#         batch["input_ids"].to(device)
#         # Consider "weight"
#         # logits = model(batch["input_ids"]).logits
#         # loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
#         # ########################################################################
#         # Do not consider "weight"
#         loss = model(batch["input_ids"], labels=batch["input_ids"]).loss
#         n_gpu = torch.cuda.device_count()
#         samples_per_step = batch_sz * n_gpu
#         if step % 100 == 0:
#             accelerator.print(
#                 {
#                     "lr": lr_scheduler.get_last_lr()[0],
#                     "samples": step * samples_per_step,
#                     "steps": completed_steps,
#                     "loss/train": loss.item() * gradient_accumulation_steps,
#                 }
#             )
#         loss = loss / gradient_accumulation_steps
#         accelerator.backward(loss)
#         if step % gradient_accumulation_steps == 0:
#             accelerator.clip_grad_norm_(model.parameters(), 1.0)
#             optimizer.step()
#             lr_scheduler.step()
#             optimizer.zero_grad()
#             completed_steps += 1
#         if (step % (eval_steps * gradient_accumulation_steps)) == 0:
#             eval_loss, perplexity = evaluate()
#             accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
#             model.train()
#             accelerator.wait_for_everyone()
#             unwrapped_model = accelerator.unwrap_model(model)
#             unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
#             if accelerator.is_main_process:
#                 tokenizer.save_pretrained(output_dir)
#                 repo.push_to_hub(
#                     commit_message=f"Training in progress step {step}", blocking=False
#                 )

In [34]:
import os

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [35]:
def training_function():
    model = GPT2LMHeadModel(config)

    optimizer = AdamW(get_grouped_params(model), lr=5e-4)

    accelerator = Accelerator(mixed_precision="fp16")

    batch_sz = 32

    tokenized_datasets.set_format("torch")
    train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_sz, shuffle=True)
    eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=batch_sz)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    num_train_epochs = 1
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=1_000,
        num_training_steps=num_training_steps,

    )

    def evaluate():
        model.eval()
        losses = []
        total_correct = 0
        total_samples = 0
        total_entropy = 0

        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(batch["input_ids"], labels=batch["input_ids"])
            loss = outputs.loss.repeat(batch_sz) # <===== Added.
            losses.append(accelerator.gather(loss))
            
            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1).to('cuda')
            # print(predictions.shape)
            # print("&&&")
            labels = batch["input_ids"].to('cuda')
            # print(predictions)
            # print(labels.shape)
            #if predictions == labels:
            # print (predictions)
            # print (predictions.numel())
            # print("&&&")
            # print (labels)
            correct = (predictions == labels).sum().item()
            total_correct += correct
            total_samples += labels.numel()
            # Calculate accuracy
            # logits = outputs.logits
            # predictions = torch.argmax(logits, dim=-1).cpu().numpy()
            # labels = batch["input_ids"].cpu().numpy()
            # correct = np.sum(predictions == labels)
            # total_correct += correct
            # total_samples += labels.size

            # # Print predictions and labels
            # print("Predictions:", predictions)
            # print("Labels:", labels)

            # Convert tensors to numpy arrays
            # predictions = predictions.cpu().numpy()
            # labels = labels.cpu().numpy()
            # print(labels)

            # # Post-processing predictions (e.g., extracting predicted class labels)
            # predicted_labels = predictions.argmax(axis=1)
            # print(predicted_labels)

            # # Compute accuracy score
            # accuracy = accuracy_score(labels, predicted_labels)
            # print("Accuracy Score:", accuracy)

            #Calculate entropy
            softmax_probs = torch.nn.functional.softmax(logits, dim=-1)
            entropy = -torch.sum(softmax_probs * torch.log(softmax_probs), dim=-1)
            total_entropy += entropy.sum().item()

        loss = torch.mean(torch.cat(losses))
        accuracy = total_correct/ total_samples
        entropy = total_entropy / total_samples

        try:
            perplexity = torch.exp(torch.tensor(loss))
        except OverflowError:
            perplexity = float("inf")

        return loss.item(), perplexity.item(), accuracy, entropy
        
    gradient_accumulation_steps = 1
    eval_steps = 20 # 5_000

    model.train()
    completed_steps = 0
    for epoch in range(num_train_epochs):
        for step, batch in tqdm(
            enumerate(train_dataloader, start=1), total=num_training_steps
        ):
            logits = model(batch["input_ids"]).logits
            loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
            if step % 20 == 0:
                accelerator.print(
                    {
                        # "lr": get_lr(),
                        # "samples": step * samples_per_step,
                        "steps": completed_steps,
                        "loss/train": loss.item() * gradient_accumulation_steps,
                    }
                )
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                completed_steps += 1
            if (step % (eval_steps * gradient_accumulation_steps)) == 0:
                # eval_loss, perplexity = evaluate()
                # accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
                eval_loss, perplexity, accuracy, entropy = evaluate()
                accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity, "accuracy": accuracy, "entropy": entropy})
                model.train()
                accelerator.wait_for_everyone()
                unwrapped_model = accelerator.unwrap_model(model)
                unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
                if accelerator.is_main_process:
                    tokenizer.save_pretrained(output_dir)
                    repo.push_to_hub(
                        commit_message=f"Training in progress step {step}", blocking=False
                    )
                accelerator.print(f'epoch {epoch}: accuracy - {100 * accuracy:.2f}%')
notebook_launcher(training_function, num_processes= 2)

Launching training on 2 GPUs.


 40%|███▉      | 19/48 [00:04<00:05,  4.95it/s]

{'steps': 19, 'loss/train': 11.842835426330566}


  perplexity = torch.exp(torch.tensor(loss))
  perplexity = torch.exp(torch.tensor(loss))


{'loss/eval': 9.677013397216797, 'perplexity': 15946.798828125, 'accuracy': 0.048486328125, 'entropy': 10.569938468933106}


Several commits (73) will be pushed upstream.


epoch 0: accuracy - 4.85%


 81%|████████▏ | 39/48 [00:38<00:01,  4.62it/s]

{'steps': 39, 'loss/train': 9.094144821166992}


  perplexity = torch.exp(torch.tensor(loss))
  perplexity = torch.exp(torch.tensor(loss))


{'loss/eval': 9.078493118286133, 'perplexity': 8764.7490234375, 'accuracy': 0.01962890625, 'entropy': 10.327276420593261}


Several commits (74) will be pushed upstream.


epoch 0: accuracy - 1.96%


100%|██████████| 48/48 [01:11<00:00,  1.49s/it]
100%|██████████| 48/48 [01:11<00:00,  1.49s/it]


In [36]:
# from transformers import pipeline

# model_checkpoint = "shradha01/codeparrot-ds-accelerate"

# code_example = "public static Quaterniond lerp(Quaterniond a,"
# text_generation = pipeline('text-generation', model=model_checkpoint, tokenizer=tokenizer)

# outputs = text_generation(code_example)
# print(outputs)

# # pipe = pipeline('text-generation', model=model_checkpoint, tokenizer = tokenizer)
# # pipe('public Quaterniond sub(float x,')

In [38]:
model_checkpoint = "shradha01/codeparrot-ds-accelerate"
text_generation = pipeline('text-generation', model=model_checkpoint, tokenizer=tokenizer)

code_example = "static public BigDecimal scalePrec(final BigDecimal x, int d) {\n"
ground_truth = ["static public BigDecimal scalePrec(final BigDecimal x, int d) {\n        return x.setScale(d + x.scale());\n\n\n    }"]

outputs = text_generation(code_example)
print(outputs)

generated_text = outputs[0]['generated_text']
print(generated_text)
generated_text_list = [generated_text]
print (generated_text_list)
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=generated_text_list, references=ground_truth)
print(results)
# bleu_score = sentence_bleu(ground_truth, generated_text, smoothing_function=SmoothingFunction().method1)
# print("BLEU Score:", bleu_score)

rouge = evaluate.load('rouge')

results = rouge.compute(predictions=generated_text_list, references=ground_truth)
print(results)

from evaluate import load
exact_match_metric = load("exact_match")
results = exact_match_metric.compute(predictions=generated_text_list, references=ground_truth)
print(results)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[{'generated_text': 'static public BigDecimal scalePrec(final BigDecimal x, int d) {\n {ipv.\n            if\t\n        {.get(..get.. memoryview_..get..get\t.. {.stochastics..'}]
static public BigDecimal scalePrec(final BigDecimal x, int d) {
 {ipv.
            if	
        {.get(..get.. memoryview_..get..get	.. {.stochastics..
['static public BigDecimal scalePrec(final BigDecimal x, int d) {\n {ipv.\n            if\t\n        {.get(..get.. memoryview_..get..get\t.. {.stochastics..']
{'bleu': 0.3053166338162007, 'precisions': [0.3902439024390244, 0.3, 0.28205128205128205, 0.2631578947368421], 'brevity_penalty': 1.0, 'length_ratio': 1.4642857142857142, 'translation_length': 41, 'reference_length': 28}
{'rouge1': 0.5625, 'rouge2': 0.5333333333333333, 'rougeL': 0.5625, 'rougeLsum': 0.5625}
{'exact_match': 0.0}


In [None]:
# model_checkpoint = "shradha01/codeparrot-ds-accelerate"
# text_generation = pipeline('text-generation', model=model_checkpoint, tokenizer=tokenizer)

# code_example = "public static Quaterniond lerp(Quaterniond a,"
# ground_truth = ["public static Quaterniond lerp(Quaterniond a, Quaterniond b, double percent) {\n        return a.mul(1 - percent).add(b.mul(percent));\n    }"]

# generated_text = ["public static Quaterniond lerp(Quaterniond a, Quaterniond b, double percent) {(`): {_....\t.(.(. = wit((Sparsity.get.(...get((.(.(."]

# bleu = evaluate.load("bleu")
# results = bleu.compute(predictions=generated_text, references=ground_truth)
# print(results)

# rouge = evaluate.load('rouge')

# results = rouge.compute(predictions=generated_text, references=ground_truth)
# print(results)

{'bleu': 0.3053462099672868, 'precisions': [0.44, 0.2857142857142857, 0.2708333333333333, 0.2553191489361702], 'brevity_penalty': 1.0, 'length_ratio': 1.3888888888888888, 'translation_length': 50, 'reference_length': 36}
{'rouge1': 0.6060606060606061, 'rouge2': 0.5806451612903226, 'rougeL': 0.6060606060606061, 'rougeLsum': 0.6060606060606061}


In [None]:
# input_examples = ["public Quaterniond sub(float x,"]
# reference_outputs = ["public Quaterniond sub(float x, float y, float z, float w) {return sub((double) x, (double) y, (double) z, (double) w);}"]
# # Generate outputs using the pipeline
# generated_outputs = [text_generation(input_example, max_length=50, do_sample=False)[0]['generated_text'] 
#                      for input_example in input_examples]
# generated_outputs = ["public Quaterniond sub(float x, float y, float z, float w)"]
# # Calculate accuracy
# rrectly_generated = sum(1 for gen_output, ref_output in zip(generated_outputs, reference_outputs) if gen_output == ref_output)
# total_outputs = len(reference_outputs)
# accuracy = (correctly_generated / total_outputs) * 100
# accuracy


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


0.0

In [None]:
# import evaluate
# from datasets import load_metric

# def training_function():
#     model = GPT2LMHeadModel(config)

#     optimizer = AdamW(get_grouped_params(model), lr=5e-4)

#     accelerator = Accelerator(mixed_precision="fp16")

#     batch_sz = 32

#     tokenized_datasets.set_format("torch")
#     train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_sz, shuffle=True)
#     eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=batch_sz)

#     model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#         model, optimizer, train_dataloader, eval_dataloader
#     )

#     num_train_epochs = 1
#     num_update_steps_per_epoch = len(train_dataloader)
#     num_training_steps = num_train_epochs * num_update_steps_per_epoch

#     lr_scheduler = get_scheduler(
#         name="linear",
#         optimizer=optimizer,
#         num_warmup_steps=1_000,
#         num_training_steps=num_training_steps,

#     )

#     def evaluate():
#         model.eval()
#         losses = []
#         total_correct = 0
#         total_samples = 0
#         total_entropy = 0
#         accuracy_metric = load_metric("accuracy")
#         #acc = []

#         for step, batch in enumerate(eval_dataloader):
#             with torch.no_grad():
#                 outputs = model(batch["input_ids"], labels=batch["input_ids"])
#             loss = outputs.loss.repeat(batch_sz) # <===== Added.
#             losses.append(accelerator.gather(loss))
            
#             # Calculate accuracy
#             logits = outputs.logits
#             predictions = torch.argmax(logits, dim=-1)
#             print(predictions.shape)
#             print("&&&")
#             labels = batch["input_ids"]
#             print(predictions)
#             print(labels.shape)
#             if predictions == labels:
#                 print 

#             correct = (predictions == labels).sum().item()
#             total_correct += correct
#             total_samples += labels.numel()
#             # batch_accuracy = correct/ labels.numel()
#             # acc.append(batch_accuracy)

#             # accuracy_preds = predictions == batch["input_ids"]
#             # total_correct += accuracy_preds.long().sum().item()
#             # total_samples += accuracy_preds.shape[0]
            
#             #Calculate entropy
#             softmax_probs = torch.nn.functional.softmax(logits, dim=-1)
#             entropy = -torch.sum(softmax_probs * torch.log(softmax_probs), dim=-1)
#             total_entropy += entropy.sum().item()

#             # Update accuracy metric
#             accuracy_metric.add_batch(predictions=predictions.cpu(), references=labels.cpu())
            
#         loss = torch.mean(torch.cat(losses))
#         accuracy = total_correct/ total_samples
#         #accuracy = sum(acc) / len(acc)
#         #accuracy = metric.compute()["accuracy"]
#         entropy = total_entropy / total_samples

#         try:
#             perplexity = torch.exp(torch.tensor(loss))
#         except OverflowError:
#             perplexity = float("inf")

#         eval_metric = accuracy_metric.compute()["accuracy"]

#         return loss.item(), perplexity.item(), accuracy, entropy, eval_metric
        
#     gradient_accumulation_steps = 1
#     eval_steps = 20 # 5_000

#     model.train()
#     completed_steps = 0
#     for epoch in range(num_train_epochs):
#         for step, batch in tqdm(
#             enumerate(train_dataloader, start=1), total=num_training_steps
#         ):
#             logits = model(batch["input_ids"]).logits
#             loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
#             if step % 20 == 0:
#                 accelerator.print(
#                     {
#                         # "lr": get_lr(),
#                         # "samples": step * samples_per_step,
#                         "steps": completed_steps,
#                         "loss/train": loss.item() * gradient_accumulation_steps,
#                     }
#                 )
#             loss = loss / gradient_accumulation_steps
#             accelerator.backward(loss)
#             if step % gradient_accumulation_steps == 0:
#                 accelerator.clip_grad_norm_(model.parameters(), 1.0)
#                 optimizer.step()
#                 lr_scheduler.step()
#                 optimizer.zero_grad()
#                 completed_steps += 1
#             if (step % (eval_steps * gradient_accumulation_steps)) == 0:
#                 # eval_loss, perplexity = evaluate()
#                 # accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
#                 eval_loss, perplexity, accuracy, entropy = evaluate()
#                 accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity, "accuracy": accuracy, "entropy": entropy})
#                 model.train()
#                 accelerator.wait_for_everyone()
#                 unwrapped_model = accelerator.unwrap_model(model)
#                 unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
#                 if accelerator.is_main_process:
#                     tokenizer.save_pretrained(output_dir)
#                     # repo.push_to_hub(
#                     #     commit_message=f"Training in progress step {step}", blocking=False
#                     # )
#                 accelerator.print(f'epoch {epoch}: accuracy - {100 * accuracy:.2f}%')
# notebook_launcher(training_function, num_processes= 2)

In [1]:
with open("/home/user1-selab3/shradha_test/roberta/whole_func_strings_new.txt", "r") as f:
    text = f.read()

In [2]:
text

