In [1]:
import torch
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
dataset = load_dataset("code_search_net", "python",cache_dir="./Datasets")

In [4]:
dataset.column_names

{'train': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url'],
 'test': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url'],
 'validation': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url']}

In [5]:
tokenizer = T5Tokenizer.from_pretrained("Salesforce/codet5-base", cache_dir="./Models", force_download=True)
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base", cache_dir="./Models", force_download=True).to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'T5Tokenizer'.


TypeError: not a string

In [7]:
def preprocess_data(batch, tokenizer):
    batch["input_ids"] = tokenizer(batch["func_code_string"], max_length=512, truncation=True, padding='max_length', return_tensors="pt")["input_ids"]
    batch["labels"] = tokenizer(batch["func_documentation_string"], max_length=512, truncation=True, padding='max_length', return_tensors="pt")["input_ids"]
    return batch

In [8]:
# Larger training dataset
train_dataset = dataset["train"].shuffle(seed=42).select(range(5000)).map(lambda x: preprocess_data(x, tokenizer), batched=True)
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(1000)).map(lambda x: preprocess_data(x, tokenizer), batched=True)

In [None]:
model

In [None]:
training_args = TrainingArguments(
    output_dir="./codet5-finetuned/results",    # Output directory
    num_train_epochs=20,                        # Increase the number of epochs
    learning_rate=5e-5,                         # Adjusted learning rate
    per_device_train_batch_size=8,              # Batch size for training
    per_device_eval_batch_size=8,               # Batch size for evaluation
    warmup_steps=1000,                          # Number of warmup steps
    weight_decay=0.01,                          # Weight decay
    logging_dir="./logs",                       # Logging directory
    logging_steps=100,                          # Log every 100 steps
    evaluation_strategy="steps",                # Evaluate at each logging step
    save_total_limit=2,                         # Limit to 2 saved checkpoints
    save_steps=1000,                            # Save model every 1000 steps
    report_to="none",                           # Disable external logging
    fp16=True                                  # Mixed precision training
)

In [None]:
# Setup Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=None  # You can add a custom metric for logging if needed
)

In [None]:
trainer.train()

In [None]:
# Save the model and tokenizer after training
model.save_pretrained("./codet5-finetuned")
tokenizer.save_pretrained("./codet5-finetuned")

In [17]:
def generate_documentation(code_snippet, model, tokenizer):
    # Tokenize the input code snippet
    inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Generate documentation
    outputs = model.generate(inputs["input_ids"], max_length=150, num_beams=5, early_stopping=True)
    
    # Decode the generated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [15]:
code_example = """
def add_numbers(a, b):
    return a + b
"""

In [None]:
documentation = generate_documentation(code_example, model, tokenizer)
print("Generated Documentation:", documentation)

In [None]:
model