# Phase 1: Classification (assign Utility, Application, or Entity Tag)

## 1.1 Create Embeddings

In [66]:
# Run configuration. `version` and `system` are passed to the project helpers
# below, and all three values are combined into the embeddings CSV filename.
version = "v_team" # TODO: change as needed
system = "pos" # TODO: change as needed
model_type = "albert" # one of: codebert, ft_codebert, hugging-face, roberta, albert

In [67]:
from transformers import AutoTokenizer, AutoModel, AlbertTokenizer, AlbertModel, RobertaModel, RobertaTokenizer, BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from utils import load_class_code_from_directory, load_data_from_csv, save_embeddings_to_csv, process_files
from embeddings import generate_embeddings_for_java_file

In [68]:
# Run on the first CUDA GPU when PyTorch reports one as available; otherwise
# fall back to the CPU. `dev` keeps the plain device string, `device` the
# torch.device object built from it.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [69]:
# Lookup table of supported model types:
#   model_type -> (tokenizer class, model class, checkpoint, extra from_pretrained kwargs)
# "albert" additionally needs the sentencepiece package (pip3 install sentencepiece).
supported_models = {
    "codebert": (AutoTokenizer, AutoModel, "microsoft/codebert-base", {"force_download": False}),
    "ft_codebert": (AutoTokenizer, AutoModel, "./codebert_finetuned", {"force_download": False}),
    "hugging-face": (BertTokenizer, BertModel, "bert-base-uncased", {}),
    "roberta": (RobertaTokenizer, RobertaModel, "roberta-base", {}),
    "albert": (AlbertTokenizer, AlbertModel, "albert-base-v2", {}),
}

# Reject unknown model types before anything is loaded.
if model_type not in supported_models:
    raise NameError("model type not supported")

tokenizer_cls, model_cls, checkpoint, load_kwargs = supported_models[model_type]
tokenizer = tokenizer_cls.from_pretrained(checkpoint, **load_kwargs)
model = model_cls.from_pretrained(checkpoint, **load_kwargs)

# Place the model on the selected device (GPU if available, else CPU).
model = model.to(device)

In [70]:
# Labels are 0: Application, 1: Utility, 2: Entity
# process_files is a project helper from utils; judging by the printed output
# it returns a dict mapping fully-qualified class names to an integer label.
class_labels = process_files(version, system)
print(class_labels)

{'net.jforum.actions.extensions.TopicWatchExtension': 0, 'net.jforum.actions.interceptors.ControllerSecurityInterceptor': 0, 'net.jforum.actions.interceptors.ExtendsAnnotationInterceptor': 0, 'net.jforum.actions.interceptors.MethodSecurityInterceptor': 0, 'net.jforum.actions.interceptors.SecurityInterceptor': 0, 'net.jforum.actions.interceptors.SessionManagerInterceptor': 0, 'net.jforum.api.JForumExecutionContext': 0, 'net.jforum.controllers.AvatarAdminController': 0, 'net.jforum.controllers.BadWordAdminController': 0, 'net.jforum.controllers.BanlistAdminController': 0, 'net.jforum.controllers.CategoryAdminController': 0, 'net.jforum.controllers.ConfigController': 0, 'net.jforum.controllers.ForumAdminController': 0, 'net.jforum.controllers.ForumController': 0, 'net.jforum.controllers.GroupAdminController': 0, 'net.jforum.controllers.HibernateStatisticsController': 0, 'net.jforum.controllers.LuceneAdminController': 0, 'net.jforum.controllers.MessageController': 0, 'net.jforum.controller

In [71]:
# Load the source of every class for the configured system, then embed each
# one with the selected model/tokenizer on the chosen device.
class_code = load_class_code_from_directory(system)
class_embeddings = {
    name: generate_embeddings_for_java_file(source, model, tokenizer, device)
    for name, source in class_code.items()
}

# Persist the embeddings together with their labels as CSV.
save_embeddings_to_csv(version, system, model_type, class_embeddings, class_labels)

KeyboardInterrupt: 

## 1.2 Train ML models

In [None]:
# Load class names, labels and embeddings from the embeddings CSV
# (presumably the file written by save_embeddings_to_csv in section 1.1 -- confirm).
filename = f"{version}_{system}_{model_type}_embeddings.csv"
class_names, labels, embeddings = load_data_from_csv(filename)

# Split embeddings, labels and class names in one call so the three stay
# aligned row by row. The previous per-name lookup
# (labels[class_names.index(name)]) was quadratic and returned the label of the
# first match whenever a class name occurred more than once. The split itself
# is unchanged: it depends only on the sample count, test_size and random_state.
Xtrain, Xtest, ytrain, ytest, names_train, names_test = train_test_split(
    embeddings, labels, class_names, test_size=0.3, random_state=0
)

# Keep the labels as plain lists; ytrain may be appended to below.
ytrain = list(ytrain)
ytest = list(ytest)

Xtrain = np.array(Xtrain)
Xtest = np.array(Xtest)

# Ensure that there's at least one instance of the "Utility" label in the training data
# NOTE: when this branch runs, every Utility sample was assigned to the test
# split, so the sample copied here is also part of the test data.
# labels.index(1) raises ValueError if the dataset has no Utility sample at all.
if 1 not in ytrain:
    utility_index = labels.index(1)
    Xtrain = np.append(Xtrain, [embeddings[utility_index]], axis=0)
    ytrain.append(1)

print(Xtrain)
print(len(Xtest))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
def generate_classification_report(y_true, y_pred):
    """Print a classification report that uses readable class names.

    Only labels that occur in ``y_true`` or ``y_pred`` are named, so the
    list of target names always matches the labels present in the data.

    Args:
        y_true: Ground-truth integer labels.
        y_pred: Predicted integer labels.

    Raises:
        KeyError: If a label other than -1, 0, 1 or 2 is encountered.
    """
    names_by_label = {-1: "None", 0: "Application", 1: "Utility", 2: "Entity"}

    # np.unique yields the distinct labels from both inputs in ascending order.
    observed_labels = np.unique(np.concatenate((y_true, y_pred)))
    display_names = [names_by_label[label] for label in observed_labels]

    report = classification_report(y_true, y_pred, target_names=display_names)
    print(report)

## Decision Tree

In [None]:
# Shallow decision tree (max depth 2) trained on the embedding vectors.
# random_state is fixed so that repeated runs build the same tree and report
# the same metrics, matching the seeded train_test_split and LogisticRegression
# used elsewhere in this notebook.
decision_tree_classifier = DecisionTreeClassifier(max_depth=2, random_state=0).fit(Xtrain, ytrain)
decision_tree_predictions = decision_tree_classifier.predict(Xtest)

# Evaluate on the held-out split: accuracy, confusion matrix, per-class report.
decision_tree_accuracy = accuracy_score(ytest, decision_tree_predictions)
decision_tree_confusion_matrix = confusion_matrix(ytest, decision_tree_predictions)
print(decision_tree_accuracy)
print(decision_tree_confusion_matrix)
generate_classification_report(ytest, decision_tree_predictions)

## SVM

In [None]:
# Support vector classifier with a linear kernel and C=2.
svm_classifier = SVC(C=2, kernel='linear')
svm_classifier.fit(Xtrain, ytrain)
svm_predictions = svm_classifier.predict(Xtest)

# Evaluate on the held-out split: accuracy, confusion matrix, per-class report.
svm_accuracy = accuracy_score(ytest, svm_predictions)
svm_confusion_matrix = confusion_matrix(ytest, svm_predictions)
print(svm_accuracy, svm_confusion_matrix, sep="\n")
generate_classification_report(ytest, svm_predictions)

## KNN

In [None]:
# k-nearest-neighbours classifier voting over the 5 closest training samples.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(Xtrain, ytrain)
knn_predictions = knn_classifier.predict(Xtest)

# Evaluate on the held-out split: accuracy, confusion matrix, per-class report.
knn_accuracy = accuracy_score(ytest, knn_predictions)
knn_confusion_matrix = confusion_matrix(ytest, knn_predictions)
print(knn_accuracy, knn_confusion_matrix, sep="\n")
generate_classification_report(ytest, knn_predictions)

## Logistic Regression

In [None]:
# Logistic regression with a fixed seed (random_state=0).
logistic_regression_classifier = LogisticRegression(random_state=0)
logistic_regression_classifier.fit(Xtrain, ytrain)
logistic_regression_predictions = logistic_regression_classifier.predict(Xtest)

# Evaluate on the held-out split: accuracy, confusion matrix, per-class report.
logistic_regression_accuracy = accuracy_score(ytest, logistic_regression_predictions)
logistic_regression_confusion_matrix = confusion_matrix(ytest, logistic_regression_predictions)
print(logistic_regression_accuracy, logistic_regression_confusion_matrix, sep="\n")
generate_classification_report(ytest, logistic_regression_predictions)

## Gaussian NB

In [None]:
# Gaussian naive Bayes with scikit-learn's default settings.
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(Xtrain, ytrain)
naive_bayes_predictions = naive_bayes_classifier.predict(Xtest)

# Evaluate on the held-out split: accuracy, confusion matrix, per-class report.
naive_bayes_accuracy = accuracy_score(ytest, naive_bayes_predictions)
naive_bayes_confusion_matrix = confusion_matrix(ytest, naive_bayes_predictions)
print(naive_bayes_accuracy, naive_bayes_confusion_matrix, sep="\n")
generate_classification_report(ytest, naive_bayes_predictions)