#### 1. Imports

In [14]:
import optuna
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os
import accelerate # leave here
import numpy as np
from utils import load_class_code_from_directory, save_embeddings_to_csv
from generate_embeddings import generate_embeddings_for_java_file

ImportError: cannot import name 'save_embeddings_to_csv' from 'utils' (c:\Users\bianc\Documents\GitHub\MicroserviceML\utils.py)

In [4]:
# Check if CUDA (GPU) is available and if so, set the device to GPU
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu"  

device = torch.device(dev)

#### 2. Generate examples for fine-tuning CodeBERT (from POS)

In [5]:
# TODO : Add fine tuning examples here (use POS classes to begin with)

# Labels are 0: Application, 1: Utility, 2: Entity

def process_file(filepath, label):
    with open(filepath, 'r') as f:
        for line in f:
            class_labels[line.strip()] = label

class_labels = {}
process_file("ground_truths/v_imen/pos/classes/application.txt", 0)
process_file("ground_truths/v_imen/pos/classes/utility.txt", 1)
process_file("ground_truths/v_imen/pos/classes/entity.txt", 2)

root_folder = 'src_code/pos/src_code_formatted/'

def read_java_file(file_path):
    with open(file_path, encoding="ISO-8859-1", errors="ignore") as java_file:
        return java_file.read()

class_code = {file.replace(".java", ""): read_java_file(os.path.join(root_folder, file))
                for file in os.listdir(root_folder)}

# Put data in the below format by combining class_code and class_labels based on key if label exists
# examples = [
#     {"text": "<your Java class code here>", "label": 0},
#     {"text": "<another Java class code here>", "label": 1},
#     ...]


examples = []
for key in class_code.keys():
    if key in class_labels.keys():
        examples.append({"text": class_code[key], "label": class_labels[key]})

# Print first 5 examples
print(examples[:5])

[{'text': 'package com.rafsan.inventory.controller.admin;\n\nimport com.rafsan.inventory.entity.Invoice;\nimport com.rafsan.inventory.entity.Product;\nimport com.rafsan.inventory.model.InvoiceModel;\nimport com.rafsan.inventory.model.ProductModel;\nimport java.net.URL;\nimport java.text.DateFormatSymbols;\nimport java.util.Locale;\nimport java.util.ResourceBundle;\nimport javafx.animation.TranslateTransition;\nimport javafx.collections.FXCollections;\nimport javafx.collections.ObservableList;\nimport javafx.event.ActionEvent;\nimport javafx.fxml.FXML;\nimport javafx.fxml.FXMLLoader;\nimport javafx.fxml.Initializable;\nimport javafx.scene.Node;\nimport javafx.scene.Parent;\nimport javafx.scene.Scene;\nimport javafx.scene.chart.BarChart;\nimport javafx.scene.chart.CategoryAxis;\nimport javafx.scene.chart.LineChart;\nimport javafx.scene.chart.PieChart;\nimport javafx.scene.chart.XYChart;\nimport javafx.scene.control.Button;\nimport javafx.scene.image.Image;\nimport javafx.scene.input.Mous

#### 3. Fine-tuning and optimization

In [6]:
# Implement a PyTorch Dataset
class CodingDataset(Dataset):
    def __init__(self, examples, tokenizer):
        self.examples = examples
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.examples)
        
    def __getitem__(self, idx):
        example = self.examples[idx]
        encoding = self.tokenizer(example['text'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        encoding = {key: torch.flatten(value) for key, value in encoding.items()}  # Flatten tensors
        encoding['labels'] = torch.tensor(example['label'])
        return encoding

In [7]:
# Split your data into train and validation sets
train_examples, val_examples = train_test_split(examples, test_size=0.1)

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base", force_download=False)
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=3, force_download=False)
model = model.to(device)

best_params = None

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Define objective function for optuna to optimize
def objective(trial):
    # Define hyperparameters for this trial
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)  # Learning rate
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64])  # Batch size

    # Create data loaders
    train_dataset = CodingDataset(train_examples, tokenizer)
    val_dataset = CodingDataset(val_examples, tokenizer)

    # Specify the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=trial.suggest_int('num_train_epochs', 1, 10), # tune hyperparameter here
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        logging_dir='./logs',
    )

    # Training
    trainer = Trainer(
        model=model,
        args=training_args,  
        train_dataset=train_dataset,         
        eval_dataset=val_dataset             
    )

    trainer.train()

    # Evaluate the model on the validation set
    eval_result = trainer.evaluate()
        
    # Optuna seeks for the minimum so return loss as it is
    return eval_result["eval_loss"]

In [9]:
# TODO : Eventually, add argument to .py script to run optuna if desired
if False:
    # Create a study to run the hyperparameter optimization
    study = optuna.create_study(direction="minimize")

    # Run the optimization
    study.optimize(objective, n_trials=10)

    # Print the results
    best_params = study.best_params
    print(f"Best hyperparameters: {best_params}")

In [10]:
def custom_collate_fn(batch):
    keys = batch[0].keys()
    output_batch = {key: torch.stack([item[key] for item in batch]) for key in keys}
    return output_batch

In [11]:
# Trial 4 finished with value: 0.0006419435958378017 and parameters: {'lr': 0.0002536818790618518, 'batch_size': 16, 'num_train_epochs': 8}. 
# Best is trial 4 with value: 0.0006419435958378017.

if best_params is None:
    best_params = {'lr': 0.0002536818790618518, 'batch_size': 16, 'num_train_epochs': 8}

# Fine tuning the model with the best parameters
dataset = CodingDataset(examples, tokenizer)
dataloader = DataLoader(dataset, batch_size=best_params['batch_size'], collate_fn=custom_collate_fn)

# We'll use Adam as our optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=best_params['lr'])

# Put your model into training mode
model.train()

# Training loop
epochs = best_params['num_train_epochs'] # for fine tune, we also use best epochs
for epoch in range(epochs):
    for idx, batch in enumerate(dataloader):
        batch = {key: value.to(device) for key, value in batch.items()} # moving batch data to the device
        optimizer.zero_grad()  # Reset gradients
        outputs = model(**batch)  # Forward pass
        loss = outputs.loss  # Calculate the loss from the outputs
        loss.backward()  # Backpropagation
        optimizer.step()  # Adjust model weights based on gradients

    print("Training completed.")

Training completed.
Training completed.
Training completed.
Training completed.
Training completed.
Training completed.
Training completed.
Training completed.


In [12]:
# Save the model
model.save_pretrained("./codebert_finetuned")
tokenizer.save_pretrained("./codebert_finetuned")

('./codebert_finetuned\\tokenizer_config.json',
 './codebert_finetuned\\special_tokens_map.json',
 './codebert_finetuned\\vocab.json',
 './codebert_finetuned\\merges.txt',
 './codebert_finetuned\\added_tokens.json',
 './codebert_finetuned\\tokenizer.json')

#### 4. Embedding generation

In [13]:
# Load the fine-tuned model and tokenizer
model = AutoModel.from_pretrained("./codebert_finetuned")
tokenizer = AutoTokenizer.from_pretrained("./codebert_finetuned", force_download=False)  # Use the fine-tuned tokenizer

model = model.to(device)
version = 'v_team'

# TODO : Generate embeddings for all systems, save them to csv file and use them in the next step
for system in ['jforum']:
    class_code = load_class_code_from_directory(system)

    class_embeddings = {}
    for class_name, code in class_code.items():
        class_embeddings[class_name] = generate_embeddings_for_java_file(code, model, tokenizer, device)

    save_embeddings_to_csv(version, system, 'ft_codebert', class_embeddings)

Some weights of RobertaModel were not initialized from the model checkpoint at ./codebert_finetuned and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


NameError: name 'generate_embeddings_for_java_file' is not defined