#### 1. Imports

In [1]:
import optuna
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertModel, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os
import accelerate
import csv
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check if CUDA (GPU) is available and if so, set the device to GPU
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu"  

device = torch.device(dev)

#### 2. Generate examples for fine-tuning CodeBERT (from POS)

In [4]:
# TODO : Add fine tuning examples here (use POS classes to begin with)

# Labels are 0: Application, 1: Utility, 2: Entity

def process_file(filepath, label):
    with open(filepath, 'r') as f:
        for line in f:
            class_labels[line.strip()] = label

class_labels = {}
process_file("ground_truths/v_imen/pos/classes/application.txt", 0)
process_file("ground_truths/v_imen/pos/classes/utility.txt", 1)
process_file("ground_truths/v_imen/pos/classes/entity.txt", 2)

root_folder = 'src_code/pos/src_code_formatted/'

def read_java_file(file_path):
    with open(file_path, encoding="ISO-8859-1", errors="ignore") as java_file:
        return java_file.read()

class_code = {file.replace(".java", ""): read_java_file(os.path.join(root_folder, file))
                for file in os.listdir(root_folder)}

# Put data in the below format by combining class_code and class_labels based on key if label exists
# examples = [
#     {"text": "<your Java class code here>", "label": 0},
#     {"text": "<another Java class code here>", "label": 1},
#     ...]


examples = []
for key in class_code.keys():
    if key in class_labels.keys():
        examples.append({"text": class_code[key], "label": class_labels[key]})

# Print first 5 examples
print(examples[:5])

[{'text': 'package com.rafsan.inventory.controller.admin;\n\nimport com.rafsan.inventory.entity.Invoice;\nimport com.rafsan.inventory.entity.Product;\nimport com.rafsan.inventory.model.InvoiceModel;\nimport com.rafsan.inventory.model.ProductModel;\nimport java.net.URL;\nimport java.text.DateFormatSymbols;\nimport java.util.Locale;\nimport java.util.ResourceBundle;\nimport javafx.animation.TranslateTransition;\nimport javafx.collections.FXCollections;\nimport javafx.collections.ObservableList;\nimport javafx.event.ActionEvent;\nimport javafx.fxml.FXML;\nimport javafx.fxml.FXMLLoader;\nimport javafx.fxml.Initializable;\nimport javafx.scene.Node;\nimport javafx.scene.Parent;\nimport javafx.scene.Scene;\nimport javafx.scene.chart.BarChart;\nimport javafx.scene.chart.CategoryAxis;\nimport javafx.scene.chart.LineChart;\nimport javafx.scene.chart.PieChart;\nimport javafx.scene.chart.XYChart;\nimport javafx.scene.control.Button;\nimport javafx.scene.image.Image;\nimport javafx.scene.input.Mous

#### 3. Fine-tuning and optimization

In [10]:
# Implement a PyTorch Dataset
class CodingDataset(Dataset):
    def __init__(self, examples, tokenizer):
        self.examples = examples
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.examples)
        
    def __getitem__(self, idx):
        example = self.examples[idx]
        encoding = self.tokenizer(example['text'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        encoding = {key: torch.flatten(value) for key, value in encoding.items()}  # Flatten tensors
        encoding['labels'] = torch.tensor(example['label'])
        return encoding

In [11]:
# Split your data into train and validation sets
train_examples, val_examples = train_test_split(examples, test_size=0.1)

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base", force_download=False)
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=3, force_download=False)
model = model.to(device)

best_params = None

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Define objective function for optuna to optimize
def objective(trial):
    # Define hyperparameters for this trial
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)  # Learning rate
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64])  # Batch size

    # Create data loaders
    train_dataset = CodingDataset(train_examples, tokenizer)
    val_dataset = CodingDataset(val_examples, tokenizer)

    # Specify the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=trial.suggest_int('num_train_epochs', 1, 10), # tune hyperparameter here
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        logging_dir='./logs',
    )

    # Training
    trainer = Trainer(
        model=model,
        args=training_args,  
        train_dataset=train_dataset,         
        eval_dataset=val_dataset             
    )

    trainer.train()

    # Evaluate the model on the validation set
    eval_result = trainer.evaluate()
        
    # Optuna seeks for the minimum so return loss as it is
    return eval_result["eval_loss"]

In [None]:
# TODO : Eventually, add argument to .py script to run optuna if desired
if False:
    # Create a study to run the hyperparameter optimization
    study = optuna.create_study(direction="minimize")

    # Run the optimization
    study.optimize(objective, n_trials=10)

    # Print the results
    best_params = study.best_params
    print(f"Best hyperparameters: {best_params}")

In [13]:
def custom_collate_fn(batch):
    keys = batch[0].keys()
    output_batch = {key: torch.stack([item[key] for item in batch]) for key in keys}
    return output_batch

In [14]:
# Trial 4 finished with value: 0.0006419435958378017 and parameters: {'lr': 0.0002536818790618518, 'batch_size': 16, 'num_train_epochs': 8}. 
# Best is trial 4 with value: 0.0006419435958378017.

if best_params is None:
    best_params = {'lr': 0.0002536818790618518, 'batch_size': 16, 'num_train_epochs': 8}

# Fine tuning the model with the best parameters
dataset = CodingDataset(examples, tokenizer)
dataloader = DataLoader(dataset, batch_size=best_params['batch_size'], collate_fn=custom_collate_fn)

# We'll use Adam as our optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=best_params['lr'])

# Put your model into training mode
model.train()

# Training loop
epochs = best_params['num_train_epochs'] # for fine tune, we also use best epochs
for epoch in range(epochs):
    for idx, batch in enumerate(dataloader):
        batch = {key: value.to(device) for key, value in batch.items()} # moving batch data to the device
        optimizer.zero_grad()  # Reset gradients
        outputs = model(**batch)  # Forward pass
        loss = outputs.loss  # Calculate the loss from the outputs
        loss.backward()  # Backpropagation
        optimizer.step()  # Adjust model weights based on gradients

    print("Training completed.")

Training completed.
Training completed.
Training completed.
Training completed.
Training completed.
Training completed.
Training completed.
Training completed.


In [15]:
# Save the model
model.save_pretrained("./codebert_finetuned")
tokenizer.save_pretrained("./codebert_finetuned")

('./codebert_finetuned\\tokenizer_config.json',
 './codebert_finetuned\\special_tokens_map.json',
 './codebert_finetuned\\vocab.json',
 './codebert_finetuned\\merges.txt',
 './codebert_finetuned\\added_tokens.json',
 './codebert_finetuned\\tokenizer.json')

#### 4. Embedding generation

In [6]:
# Original method for embedding generation
def generate_embeddings_for_java_file(code, model, tokenizer):
    '''Generate embeddings for the provided java file.'''
    
    # Tokenize the code
    all_code_tokens = tokenizer.tokenize(code)

    # Initialize an empty list to store the embeddings
    embeddings_for_file = []

    # Process the tokens in chunks of maximum length 510 (to account for [CLS] and [SEP])
    chunk_size = 510

    for n in range(0, len(all_code_tokens), chunk_size):
        chunk_code_tokens = all_code_tokens[n:n+chunk_size]

        # Add CLS (start) and SEP (end) tokens to the chunk tokens
        tokens = [tokenizer.cls_token] + chunk_code_tokens + [tokenizer.sep_token]

        # Convert the tokens to input IDs and create a PyTorch tensor
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)  # add batch dimension and move to device

        # Generate embeddings using the model
        with torch.no_grad():
            outputs = model(input_tensor)
        
        # Retrieve the [CLS] token's embeddings (the first token) from the outputs
        cls_embedding = outputs.last_hidden_state[0][0].cpu().numpy()

        # Append this embedding to our embeddings list
        embeddings_for_file.append(cls_embedding)

    # Compute the mean of all embeddings for this file
    mean_of_embeddings = np.mean(embeddings_for_file, axis=0)

    return mean_of_embeddings

# Other method for generating embeddings (remove if not needed)
# def generate_embeddings_for_java_file(code, model, tokenizer):
#     encoding = tokenizer(code, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
#     encoding = {key: value.to(device) for key, value in encoding.items()} 
#     with torch.no_grad():  # Set torch to not compute gradients for inference
#         outputs = model(**encoding)
#     embeddings = outputs.last_hidden_state[0].mean(dim=0)  # Take the mean of the embeddings across all tokens
#     print(embeddings[:5].cpu().numpy())  # print first 5 values
#     return embeddings.cpu().numpy()

In [8]:
# Load the fine-tuned model and tokenizer
model = AutoModel.from_pretrained("./codebert_finetuned")
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained("./codebert_finetuned", force_download=False)  # Use the fine-tuned tokenizer

# TODO : Generate embeddings for all projects, save them to csv file and use them in the next step
for project in ['jforum']:
    root_folder = 'src_code/' + project + '/src_code_formatted/'

    def read_java_file(file_path):
        with open(file_path, encoding="ISO-8859-1", errors="ignore") as java_file:
            return java_file.read()

    class_code = {file.replace(".java", ""): read_java_file(os.path.join(root_folder, file))
                    for file in os.listdir(root_folder) if file.endswith('.java')}

    class_embeddings = {}
    for class_name, code in class_code.items():
        class_embeddings[class_name] = generate_embeddings_for_java_file(code, model, tokenizer)

    # Save the embeddings to csv file
    with open(project + '_finetuned_codebert_embeddings.csv', 'w') as f:
        writer = csv.writer(f, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

        for class_name, embedding in class_embeddings.items():
            writer.writerow([class_name, embedding])

OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./codebert_finetuned.