In [None]:
import os
import sys
import warnings
from IPython.display import clear_output


# Process-wide setup applied before the heavier imports further down.
# Put TOKENIZERS_PARALLELISM="false" into this process's environment.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
# Make the parent directory importable as well.
sys.path += [".."]
# Hide FutureWarning messages.
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [None]:

# ! pip install datasets
# ! pip install scikit-learn
# ! pip install transformers
# ! pip install "accelerate>=0.26.0"
# ! pip install torch
# ! pip install tabulate
# ! pip install torch torchvision torchaudio
# ! pip install transformers[torch]
# ! pip install transformers accelerate -U


In [None]:
import pandas as pd
import json
from tabulate import tabulate
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import torch
from torch.nn import BCEWithLogitsLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


# from ...utils.smells_labels import LABELS
# Code-smell categories handled by this notebook. The position of a name in
# this list is the index it gets in the label mapping built from it.
LABELS = ["LongMethod", "FeatureEnvy", "SwitchStatement", "LongParametersList"]

# Wipe whatever this cell has written to its output area so far.
clear_output()

In [None]:
import json
from tabulate import tabulate

# Location of the dataset file, relative to the notebook's working directory.
dataset_path = "./../../datasets/04_fontana/output/fontana_mld_sc.json"

# Decode explicitly as UTF-8. Without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII
# characters in the file on some systems.
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Number of loaded records.
display(len(dataset))

# Preview of the first five records as a grid table.
print(
    tabulate(
        dataset[:5],
        headers="keys",
        tablefmt="grid",
        stralign="left",
        showindex=True,
        maxcolwidths=75
    )
)

In [None]:
from transformers import AutoTokenizer, AutoModel
import random
import numpy as np

# Seed every random number generator available to this notebook, not just
# NumPy's. Python's `random` module and PyTorch each keep their own generator,
# so anything drawing from them (for example freshly initialised layers created
# before a Trainer applies its own seed) would otherwise vary between runs.
SEED = 123456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


# Checkpoint and tokenisation settings (hard-coded for now).
MODEL_NAME = "answerdotai/ModernBERT-base"  # TODO: Retrieve from config.json
PADDING = "max_length"  # TODO: Retrieve from config.json
TRUNCATION = True  # TODO: Retrieve from config.json
MAX_LENGTH = 512  # TODO: Retrieve from config.json

# Tokenizer loaded for the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(batch):
    """Tokenize the ``source_code`` entry of a batch.

    Args:
        batch: Mapping whose ``"source_code"`` value is handed to the tokenizer
            (presumably a list of strings when called from a batched
            ``map`` -- TODO confirm).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    snippets = batch["source_code"]
    # Padding, truncation and the length limit come from the module-level settings.
    encode_options = {
        "padding": PADDING,
        "truncation": TRUNCATION,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(snippets, **encode_options)

In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Fraction of the records that goes into the "test" split.
TEST_SIZE = 0.3  # TODO: Retrieve from config.json 0.25 0.3

# Split with a fixed random_state so the partition is repeatable.
train_data, test_data = train_test_split(
    dataset,
    random_state=42,
    test_size=TEST_SIZE,
)

# Wrap both partitions as Hugging Face datasets under "train" / "test".
hf_datasets = DatasetDict(
    {
        split_name: Dataset.from_list(split_rows)
        for split_name, split_rows in (("train", train_data), ("test", test_data))
    }
)

display(hf_datasets)

# Preview the first training record as a grid table.
train_preview = hf_datasets["train"][:1]
print(
    tabulate(
        train_preview,
        headers="keys",
        tablefmt="grid",
        stralign="left",
        showindex=True,
        maxcolwidths=80,
    )
)

In [None]:

#! Tokenize every split, passing records to tokenize_function in batches
tokenized_datasets = hf_datasets.map(tokenize_function, batched=True)

# Drop whatever the mapping step printed before showing the result.
clear_output()

display(tokenized_datasets)

# Preview the first tokenized training record as a grid table.
tokenized_preview = tokenized_datasets["train"][:1]
print(
    tabulate(
        tokenized_preview,
        headers="keys",
        tablefmt="grid",
        stralign="left",
        showindex=True,
        maxcolwidths=27,  # 112 // 4
    )
)

In [None]:
# from utils.smells_labels import LABELS

#! Create a LabelIndexMap
# Label name -> position of that name in LABELS.
label_index_map = dict(zip(LABELS, range(len(LABELS))))


def encode_labels(batch):
    """Attach a multi-hot ``labels`` vector to a record.

    Args:
        batch: Mapping with a ``"code_smells"`` iterable of label names. Names
            that are not keys of ``label_index_map`` are ignored.

    Returns:
        The same mapping, with ``batch["labels"]`` set to a list of
        ``len(LABELS)`` integers: 1 at the index of every label that is
        present, 0 elsewhere.
    """
    # Indices of the known labels carried by this record.
    present = {
        label_index_map[smell]
        for smell in batch["code_smells"]
        if smell in label_index_map
    }
    batch["labels"] = [int(position in present) for position in range(len(LABELS))]
    return batch


#! Encode Labels
encoded_datasets = tokenized_datasets.map(encode_labels)

clear_output()

#! Show the label -> index mapping, then the encoded dataset
display(label_index_map)
print(encoded_datasets)


#! Drop the raw columns (code_smells, source_code)
raw_columns = ["code_smells", "source_code"]
encoded_datasets = encoded_datasets.remove_columns(raw_columns)


#! Convert to PyTorch Format
tensor_columns = ["input_ids", "attention_mask", "labels"]
encoded_datasets.set_format(type="torch", columns=tensor_columns)

#! Display Final Dataset
display(encoded_datasets)

In [None]:
from torch.nn import BCEWithLogitsLoss, Module
from transformers import AutoModelForSequenceClassification

class MultiLabelClassifier(Module):
    """Sequence-classification model paired with a BCE-with-logits loss.

    Wraps the model returned by
    ``AutoModelForSequenceClassification.from_pretrained`` and computes the
    loss itself whenever labels are supplied.
    """

    def __init__(self, model_name, num_labels):
        """Load the wrapped model and create the loss function.

        Args:
            model_name: Checkpoint identifier forwarded to ``from_pretrained``.
            num_labels: Forwarded to ``from_pretrained`` as ``num_labels``.
        """
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )
        self.loss_fct = BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and, if labels are given, compute the loss.

        Args:
            input_ids: Passed to the wrapped model as ``input_ids``.
            attention_mask: Passed to the wrapped model as ``attention_mask``.
            labels: Optional targets; cast to float before the loss is applied.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"`` (the wrapped model's ``.logits``).
        """
        logits = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        # The labels never reach the wrapped model; the loss is computed here.
        loss = self.loss_fct(logits, labels.float()) if labels is not None else None
        return {"loss": loss, "logits": logits}

In [None]:
!pip install wandb


In [None]:
import os
# Put WANDB_DISABLED="true" into this process's environment.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# MODEL_NAME = "microsoft/codebert-base"  # TODO: Retrieve from config.json
# Problem-type string for building a stock Hugging Face config/model
# (`problem_type=PROBLEM_TYPE`); the custom classifier below does not read it.
PROBLEM_TYPE = "multi_label_classification"

#! Custom Classifier
# Only the custom classifier is instantiated: `model` is consumed once (by the
# Trainer), so also loading a stock AutoModelForSequenceClassification into the
# same name first would just be an extra checkpoint load that is thrown away.
model = MultiLabelClassifier(
    MODEL_NAME,
    num_labels=len(LABELS),  # *@ Number of Categories
)

def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute a BCE-with-logits loss for one batch.

    Shaped like an override of ``Trainer.compute_loss``.
    NOTE(review): in the visible code this function is defined at module level
    and is not attached to any Trainer.

    Args:
        self: Unused; present so the function can serve as a method.
        model: Callable invoked as ``model(**inputs)`` once the labels have
            been removed from ``inputs``.
        inputs: Mutable mapping holding a ``"labels"`` entry plus the model's
            keyword arguments. The ``"labels"`` entry is popped (the mapping is
            modified in place).
        return_outputs: When true, return ``(loss, outputs)`` instead of just
            the loss.
        num_items_in_batch: Accepted and ignored, so the function also matches
            the signature used by newer ``Trainer`` versions.

    Returns:
        The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is true.

    Raises:
        KeyError: If ``inputs`` has no ``"labels"`` entry.
    """
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    # The model may return a mapping (MultiLabelClassifier.forward returns a
    # plain dict) or an object exposing `.logits`; support both.
    logits = outputs["logits"] if isinstance(outputs, dict) else outputs.logits
    loss = BCEWithLogitsLoss()(logits, labels.float())
    return (loss, outputs) if return_outputs else loss

# training_arguments = TrainingArguments(
#     output_dir="./../output/result",
#     logging_dir="./../output/logs",
#     logging_steps=10,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=6,  # ! 10
#     weight_decay=0.01,
#     load_best_model_at_end=True
# )


# trainer = Trainer(
#     model=model,
#     args=training_arguments,
#     train_dataset=encoded_datasets["train"],
#     eval_dataset=encoded_datasets["test"]
# )

# trainer.train()


# Training configuration: one example per device step, evaluation and a
# checkpoint after every epoch, at most two checkpoints kept on disk, and
# load_best_model_at_end switched on.
training_args = TrainingArguments(
    output_dir="./../output/result",
    logging_dir="./../output/logs",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=1e-4,
    num_train_epochs=4,
    # bf16=True, # bfloat16 training
    optim="adamw_torch_fused", # improved optimizer
    # logging & evaluation strategies
    logging_strategy="steps",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    weight_decay=0.01
)

# Trainer over the encoded train/test splits. No compute_metrics is passed at
# construction time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["test"],
    # compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Intended probability cut-off for positive predictions.
# NOTE(review): confirm which code actually reads this constant.
PREDICTION_THRESHOLD = 0.5

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, hamming_loss
import numpy as np

# NOTE(review): compute_metrics() takes a parameter that is also named `pred`,
# which shadows this module-level list inside the function.
pred = []

def compute_metrics(pred, threshold=None):
    """Compute multi-label evaluation metrics from raw logits.

    Args:
        pred: Anything that unpacks into ``(logits, labels)``. ``logits`` must
            support NumPy arithmetic; ``labels`` is passed to scikit-learn
            unchanged.
        threshold: Probability above which a label counts as predicted.
            Defaults to the module-level ``PREDICTION_THRESHOLD`` so the
            notebook has a single place to tune the cut-off.

    Returns:
        dict with the keys ``accuracy``, ``macro_precision``, ``macro_recall``,
        ``macro_f1`` and ``hamming_loss``.
    """
    if threshold is None:
        threshold = PREDICTION_THRESHOLD

    logits, labels = pred

    # Element-wise sigmoid for multi-label outputs. For very negative logits
    # np.exp(-logits) overflows to inf; 1 / (1 + inf) is still the correct 0.0,
    # so only the overflow warning is silenced.
    with np.errstate(over="ignore"):
        probs = 1 / (1 + np.exp(-logits))

    # Binarise the probabilities at the chosen threshold.
    preds = (probs > threshold).astype(int)

    # Macro-averaged scores; zero_division=0 scores undefined cases as 0.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )

    acc = accuracy_score(labels, preds)
    hamming = hamming_loss(labels, preds)

    return {
        'accuracy': acc,
        'macro_precision': precision,
        'macro_recall': recall,
        'macro_f1': f1,
        'hamming_loss': hamming,
    }

#! Evaluation Method
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     print(logits)
#     predictions = torch.sigmoid(
#         torch.tensor(logits)
#     ) > PREDICTION_THRESHOLD
#     return {
#         "f1": f1_score(labels, predictions, average="micro"),
#         "precision": precision_score(labels, predictions, average="micro"),
#         "recall": recall_score(labels, predictions, average="micro"),
#         "accuracy": accuracy_score(labels, predictions)  # ! Not Very Useful
#     }


# Attach the metric function to the existing trainer, then evaluate.
setattr(trainer, "compute_metrics", compute_metrics)

metrics = trainer.evaluate()

# Show the metrics returned by the evaluation run.
display(metrics)

# precision_recall_fscore_support(LABELS, pred, average=None)


In [None]:
# import pandas as pd

# experiences_file = "./../output/codebert_experiences.csv"

# experience_schema = {
#     "id": [],
#     "epoch": [],
#     "learning_rate": [],
#     "loss_value": [],
#     "precision": [],
#     "f1_score": [],
#     "recall": [],
#     "accuracy": []
# }

# experiences_df = None

# try:
#     experiences_df = pd.read_csv(experiences_file)
# except:
#     experiences_df = pd.DataFrame(experience_schema)

# recent_experience = {
#     "id": experiences_df["id"].max() + 1 if not experiences_df.empty else 1,
#     "epoch": metrics["epoch"],
#     "learning_rate": 2e-5,
#     "loss_value": metrics["eval_loss"],
#     "precision": metrics["eval_precision"],
#     "f1_score": metrics["eval_f1"],
#     "recall": metrics["eval_recall"],
#     "accuracy": metrics["eval_accuracy"],
# }

# # TODO Compare If Experience Already Exists

# experiences_df = pd.concat(
#     [experiences_df, pd.DataFrame([recent_experience])], ignore_index=True)

# experiences_df.to_csv(experiences_file, index=False)

In [None]:
# from transformers import AutoConfig

# LAST_MODEL_NAME = "../output/last_model"
# BEST_MODEL_NAME = "../output/best_model"

# config = AutoConfig.from_pretrained(
#     MODEL_NAME,
#     num_labels=len(LABELS),
#     problem_type=PROBLEM_TYPE
# )

# config.save_pretrained(LAST_MODEL_NAME)

# trainer.save_model(LAST_MODEL_NAME)

# tokenizer.save_pretrained(LAST_MODEL_NAME)

# # TODO: If The Best, Override the best_model Folder

# # config.save_pretrained(BEST_MODEL_NAME)

# # trainer.save_model(BEST_MODEL_NAME)

# # tokenizer.save_pretrained(BEST_MODEL_NAME)

In [None]:
# from transformers import pipeline

# classifier = pipeline(
#     "text-classification",
#     model=LAST_MODEL_NAME,
#     tokenizer=tokenizer,
#     return_all_scores=True
# )

# sample = "public class Example { private int x; }"

# prediction = classifier(sample)

# display(prediction)

In [None]:
# from transformers import AutoTokenizer, AutoModel
# import torch
# import pandas as pd
# import json

# dataset_path = "./../../../datasets/output/smells_dataset.json"

# with open(dataset_path, "r") as f:
#     dataset = json.load(f)

# MODEL_NAME = "microsoft/codebert-base"

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModel.from_pretrained(MODEL_NAME)

# schema = {
#     "id": []
# }

# for i in range(1, 769):
#     feature = f"f{i:03}"
#     schema[feature] = []

# schema["isGodClass"] = []
# schema["isDataClass"] = []

# for i in range(len(dataset)):
#     instance = dataset[i]
#     code = instance["source_code"]

#     # Use a different variable name for tokenized output
#     tokens = tokenizer(
#         code,
#         return_tensors="pt",
#         padding=True,
#         truncation=True,
#         max_length=512
#     )

#     # Get the embeddings
#     with torch.no_grad():
#         outputs = model(**tokens)

#     # Extract Embeddings (CodeBERT Features)
#     embedding_vector = outputs.last_hidden_state[:, 0, :].squeeze()

#     schema["id"].append(i + 1)

#     for i in range(1, 769):
#         feature = f"f{i:03}"
#         schema[feature].append(embedding_vector[i-1].item())

#     schema["isGodClass"].append(
#         1 if "GodClass" in instance["code_smells"] else 0)
#     schema["isDataClass"].append(
#         1 if "DataClass" in instance["code_smells"] else 0)


# embedded_smells_datasets_csv = "./../output/embedded_smells_dataset.csv"
# embedded_smells_datasets_xlsx = "./../output/embedded_smells_dataset.xlsx"

# embedded_smells_df = pd.DataFrame(schema)
# embedded_smells_df.to_csv(embedded_smells_datasets_csv, index=False)
# embedded_smells_df.to_excel(
#     embedded_smells_datasets_xlsx, index=False, engine="openpyxl")

# display(embedded_smells_df)