In [22]:
import torch
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [24]:
dataset = load_dataset("code_search_net", "python",cache_dir="./Datasets")

In [25]:
dataset.column_names

{'train': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url'],
 'test': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url'],
 'validation': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url']}

In [26]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base", cache_dir="./Models")
model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base", cache_dir="./Models").to(device)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def tokenize_code(code_snippet, tokenizer):
    return tokenizer(
        code_snippet, max_length=512, truncation=True, padding='max_length', return_tensors="pt"
    )

In [28]:
def preprocess_data(batch, tokenizer):
    batch["input_ids"] = tokenize_code(batch["func_code_string"], tokenizer)["input_ids"]
    batch["labels"] = tokenize_code(batch["func_documentation_string"], tokenizer)["input_ids"]
    return batch

In [29]:
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000)).map(lambda x: preprocess_data(x, tokenizer), batched=True)
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(200)).map(lambda x: preprocess_data(x, tokenizer), batched=True)

In [30]:
training_args = TrainingArguments(
    output_dir="./codebert-finetuned",    # output directory
    num_train_epochs=3,                   # number of epochs
    per_device_train_batch_size=16,       # batch size
    per_device_eval_batch_size=16,        # eval batch size
    warmup_steps=500,                     # warmup steps
    weight_decay=0.01,                    # weight decay
    logging_dir="./logs",                 # logging directory
    logging_steps=10,
    evaluation_strategy="steps",          # Evaluation after every logging step
    save_total_limit=2,                   # Keep only last two checkpoints
    save_steps=500,                       # Save model every 500 steps
    report_to="none",
    fp16=True                     # No reports (e.g., to wandb)
)



In [31]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [32]:
trainer.train()

  0%|          | 0/189 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  5%|▌         | 10/189 [04:44<1:24:23, 28.29s/it]

{'loss': 18.0067, 'grad_norm': 62.93367385864258, 'learning_rate': 7.000000000000001e-07, 'epoch': 0.16}


                                                  
  5%|▌         | 10/189 [06:35<1:24:23, 28.29s/it]

{'eval_loss': 17.82748794555664, 'eval_runtime': 110.4363, 'eval_samples_per_second': 1.811, 'eval_steps_per_second': 0.118, 'epoch': 0.16}


 11%|█         | 20/189 [11:19<1:24:03, 29.84s/it]

{'loss': 17.4566, 'grad_norm': 55.4626579284668, 'learning_rate': 1.7000000000000002e-06, 'epoch': 0.32}


                                                  
 11%|█         | 20/189 [13:05<1:24:03, 29.84s/it]

{'eval_loss': 17.223665237426758, 'eval_runtime': 105.609, 'eval_samples_per_second': 1.894, 'eval_steps_per_second': 0.123, 'epoch': 0.32}


 16%|█▌        | 30/189 [17:47<1:18:11, 29.51s/it]

{'loss': 16.6815, 'grad_norm': 49.75236129760742, 'learning_rate': 2.6e-06, 'epoch': 0.48}


                                                  
 16%|█▌        | 30/189 [19:34<1:18:11, 29.51s/it]

{'eval_loss': 16.183364868164062, 'eval_runtime': 106.9426, 'eval_samples_per_second': 1.87, 'eval_steps_per_second': 0.122, 'epoch': 0.48}


 21%|██        | 40/189 [24:16<1:12:59, 29.39s/it]

{'loss': 15.336, 'grad_norm': 48.57144546508789, 'learning_rate': 3.6e-06, 'epoch': 0.63}


                                                  
 21%|██        | 40/189 [26:01<1:12:59, 29.39s/it]

{'eval_loss': 14.338003158569336, 'eval_runtime': 104.5645, 'eval_samples_per_second': 1.913, 'eval_steps_per_second': 0.124, 'epoch': 0.63}


 26%|██▋       | 50/189 [30:41<1:07:53, 29.31s/it]

{'loss': 13.4126, 'grad_norm': 45.878501892089844, 'learning_rate': 4.6e-06, 'epoch': 0.79}


                                                  
 26%|██▋       | 50/189 [32:27<1:07:53, 29.31s/it]

{'eval_loss': 12.026623725891113, 'eval_runtime': 106.3166, 'eval_samples_per_second': 1.881, 'eval_steps_per_second': 0.122, 'epoch': 0.79}


 32%|███▏      | 60/189 [37:08<1:02:51, 29.24s/it]

{'loss': 10.7985, 'grad_norm': 40.57843780517578, 'learning_rate': 5.600000000000001e-06, 'epoch': 0.95}


                                                  
 32%|███▏      | 60/189 [38:53<1:02:51, 29.24s/it]

{'eval_loss': 9.35397720336914, 'eval_runtime': 105.5608, 'eval_samples_per_second': 1.895, 'eval_steps_per_second': 0.123, 'epoch': 0.95}


 37%|███▋      | 70/189 [43:21<57:33, 29.02s/it]  

{'loss': 7.8631, 'grad_norm': 36.99539566040039, 'learning_rate': 6.6e-06, 'epoch': 1.11}


                                                
 37%|███▋      | 70/189 [45:06<57:33, 29.02s/it]

{'eval_loss': 6.722663402557373, 'eval_runtime': 105.1723, 'eval_samples_per_second': 1.902, 'eval_steps_per_second': 0.124, 'epoch': 1.11}


 42%|████▏     | 80/189 [49:48<53:50, 29.63s/it]  

{'loss': 5.2735, 'grad_norm': 33.96503829956055, 'learning_rate': 7.6e-06, 'epoch': 1.27}


                                                
 42%|████▏     | 80/189 [51:35<53:50, 29.63s/it]

{'eval_loss': 4.225109577178955, 'eval_runtime': 106.6414, 'eval_samples_per_second': 1.875, 'eval_steps_per_second': 0.122, 'epoch': 1.27}


 48%|████▊     | 90/189 [57:00<1:06:47, 40.48s/it]

{'loss': 3.0534, 'grad_norm': 14.488544464111328, 'learning_rate': 8.599999999999999e-06, 'epoch': 1.43}


                                                  
 48%|████▊     | 90/189 [1:00:20<1:06:47, 40.48s/it]

{'eval_loss': 2.609644889831543, 'eval_runtime': 200.4419, 'eval_samples_per_second': 0.998, 'eval_steps_per_second': 0.065, 'epoch': 1.43}


 53%|█████▎    | 100/189 [1:08:46<1:18:43, 53.07s/it]

{'loss': 1.8573, 'grad_norm': 6.039247989654541, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.59}


                                                     
 53%|█████▎    | 100/189 [1:11:58<1:18:43, 53.07s/it]

{'eval_loss': 2.1470143795013428, 'eval_runtime': 192.1054, 'eval_samples_per_second': 1.041, 'eval_steps_per_second': 0.068, 'epoch': 1.59}


 58%|█████▊    | 110/189 [1:17:25<41:00, 31.15s/it]   

{'loss': 1.9605, 'grad_norm': 9.008535385131836, 'learning_rate': 1.06e-05, 'epoch': 1.75}


                                                   
 58%|█████▊    | 110/189 [1:19:09<41:00, 31.15s/it]

{'eval_loss': 1.9461963176727295, 'eval_runtime': 104.2092, 'eval_samples_per_second': 1.919, 'eval_steps_per_second': 0.125, 'epoch': 1.75}


 63%|██████▎   | 120/189 [1:23:41<32:48, 28.53s/it]  

{'loss': 1.6761, 'grad_norm': 10.840803146362305, 'learning_rate': 1.16e-05, 'epoch': 1.9}


                                                   
 63%|██████▎   | 120/189 [1:25:24<32:48, 28.53s/it]

{'eval_loss': 1.9073281288146973, 'eval_runtime': 102.8442, 'eval_samples_per_second': 1.945, 'eval_steps_per_second': 0.126, 'epoch': 1.9}


 69%|██████▉   | 130/189 [1:29:43<27:04, 27.54s/it]  

{'loss': 1.505, 'grad_norm': 13.738783836364746, 'learning_rate': 1.2600000000000001e-05, 'epoch': 2.06}


                                                   
 69%|██████▉   | 130/189 [1:31:25<27:04, 27.54s/it]

{'eval_loss': 1.7985397577285767, 'eval_runtime': 102.5091, 'eval_samples_per_second': 1.951, 'eval_steps_per_second': 0.127, 'epoch': 2.06}


 74%|███████▍  | 140/189 [1:35:57<23:12, 28.41s/it]

{'loss': 1.4622, 'grad_norm': 3.4580392837524414, 'learning_rate': 1.3600000000000002e-05, 'epoch': 2.22}


                                                   
 74%|███████▍  | 140/189 [1:37:39<23:12, 28.41s/it]

{'eval_loss': 1.6921563148498535, 'eval_runtime': 102.2037, 'eval_samples_per_second': 1.957, 'eval_steps_per_second': 0.127, 'epoch': 2.22}


 79%|███████▉  | 150/189 [1:42:15<18:52, 29.04s/it]

{'loss': 1.3532, 'grad_norm': 3.9545507431030273, 'learning_rate': 1.4599999999999999e-05, 'epoch': 2.38}


                                                   
 79%|███████▉  | 150/189 [1:43:58<18:52, 29.04s/it]

{'eval_loss': 1.6555218696594238, 'eval_runtime': 102.867, 'eval_samples_per_second': 1.944, 'eval_steps_per_second': 0.126, 'epoch': 2.38}


 85%|████████▍ | 160/189 [1:48:29<13:43, 28.41s/it]

{'loss': 1.3292, 'grad_norm': 6.181820392608643, 'learning_rate': 1.56e-05, 'epoch': 2.54}


                                                   
 85%|████████▍ | 160/189 [1:50:11<13:43, 28.41s/it]

{'eval_loss': 1.5807119607925415, 'eval_runtime': 102.1535, 'eval_samples_per_second': 1.958, 'eval_steps_per_second': 0.127, 'epoch': 2.54}


 90%|████████▉ | 170/189 [1:54:42<08:58, 28.36s/it]

{'loss': 1.1491, 'grad_norm': 2.728649377822876, 'learning_rate': 1.66e-05, 'epoch': 2.7}


                                                   
 90%|████████▉ | 170/189 [1:56:24<08:58, 28.36s/it]

{'eval_loss': 1.5609850883483887, 'eval_runtime': 102.2638, 'eval_samples_per_second': 1.956, 'eval_steps_per_second': 0.127, 'epoch': 2.7}


 95%|█████████▌| 180/189 [2:00:55<04:15, 28.34s/it]

{'loss': 1.1541, 'grad_norm': 3.3209993839263916, 'learning_rate': 1.76e-05, 'epoch': 2.86}


                                                   
 95%|█████████▌| 180/189 [2:02:37<04:15, 28.34s/it]

{'eval_loss': 1.5190330743789673, 'eval_runtime': 102.3456, 'eval_samples_per_second': 1.954, 'eval_steps_per_second': 0.127, 'epoch': 2.86}


100%|██████████| 189/189 [2:06:30<00:00, 40.16s/it]

{'train_runtime': 7590.0833, 'train_samples_per_second': 0.395, 'train_steps_per_second': 0.025, 'train_loss': 6.4713023755916215, 'epoch': 3.0}





TrainOutput(global_step=189, training_loss=6.4713023755916215, metrics={'train_runtime': 7590.0833, 'train_samples_per_second': 0.395, 'train_steps_per_second': 0.025, 'total_flos': 789796389888000.0, 'train_loss': 6.4713023755916215, 'epoch': 3.0})

In [33]:
model.save_pretrained("./codebert-finetuned")
tokenizer.save_pretrained("./codebert-finetuned")

('./codebert-finetuned\\tokenizer_config.json',
 './codebert-finetuned\\special_tokens_map.json',
 './codebert-finetuned\\vocab.json',
 './codebert-finetuned\\merges.txt',
 './codebert-finetuned\\added_tokens.json')

In [34]:
def generate_documentation(code_snippet, model, tokenizer):
    inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True).to(device)
    print(inputs)
    outputs = model.generate(inputs["input_ids"], max_length=150, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [35]:
code_example = """
def add_numbers(a, b):
    return a + b
"""

In [50]:
documentation = generate_documentation(code_example, model, tokenizer)
print("Generated Documentation:", documentation)

TypeError: The current model class (RobertaForMaskedLM) is not compatible with `.generate()`, as it doesn't have a language model head. Classes that support generation often end in one of these names: ['ForCausalLM', 'ForConditionalGeneration', 'ForSpeechSeq2Seq', 'ForVision2Seq'].

In [59]:
from transformers import RobertaTokenizer, RobertaForCausalLM
import torch

# Define the path to your saved model directory
saved_model_path = './codebert-finetuned-t5base/'

# Load the tokenizer and the fine-tuned model from the saved checkpoint
tokenizer = RobertaTokenizer.from_pretrained(saved_model_path)
model = RobertaForCausalLM.from_pretrained(saved_model_path, is_decoder=True)

# Set the model to eval mode for inference
model.eval()

# Ensure to use the correct device (GPU/TPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example Python code to generate documentation for



RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [60]:
code_snippet = "def add_numbers(a, b):\n    return a + b"

# Tokenize the input code
inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate documentation using the fine-tuned model
with torch.no_grad():
    outputs = model.generate(inputs['input_ids'], max_length=300, num_beams=5, early_stopping=True)

# Decode the output to get the generated documentation
generated_doc = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Documentation:\n", generated_doc)

KeyboardInterrupt: 

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import shutil
import os

# Define device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Delete the existing cache directory to avoid loading corrupted files
cache_dir = "./Models"
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)

# Re-download the tokenizer and model, forcing a fresh download
tokenizer = T5Tokenizer.from_pretrained("Salesforce/codet5-base", cache_dir=cache_dir, force_download=True)
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base", cache_dir=cache_dir, force_download=True).to(device)


# Test the model and tokenizer by encoding and decoding
test_input = "def add_numbers(a, b): return a + b"
inputs = tokenizer(test_input, return_tensors="pt").to(device)
outputs = model.generate(inputs.input_ids)

# Decode the generated output
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_code)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'T5Tokenizer'.


TypeError: not a string