# Phase 1: Classification (assign Utility, Application, or Entity Tag)

## 1.1 Create Embeddings

In [1]:
version = "v_team" # TODO changer
system = "jforum" # TODO changer
model_type = "codebert" # or ft_codebert

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from utils import load_class_code_from_directory, load_data_from_csv, save_embeddings_to_csv
from generate_embeddings import generate_embeddings_for_java_file

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check if CUDA (GPU) is available and if so, set the device to GPU
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu"  

device = torch.device(dev)

In [4]:
# Select the model and tokenizer
if (model_type == "codebert"):
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base",force_download=False)
    model = AutoModel.from_pretrained("microsoft/codebert-base",force_download=False)
elif (model_type == "ft_codebert"):
    tokenizer = AutoTokenizer.from_pretrained("./codebert_finetuned",force_download=False)
    model = AutoModel.from_pretrained("./codebert_finetuned",force_download=False)
else:
    raise NameError("model type not supported")

# Move the model to the GPU if available
model = model.to(device)

In [5]:
# Labels are 0: Application, 1: Utility, 2: Entity

# Get the manually typed classes
def process_file(filepath, label):
    with open(filepath, 'r') as f:
        for line in f:
            class_labels[line.strip()] = label

class_labels = {}
process_file("ground_truths/" + version + "/" + system + "/classes/application.txt", 0)
process_file("ground_truths/" + version + "/" + system + "/classes/utility.txt", 1)
process_file("ground_truths/" + version + "/" + system + "/classes/entity.txt", 2)

print(class_labels)

{'net.jforum.actions.extensions.TopicWatchExtension': 0, 'net.jforum.actions.interceptors.ControllerSecurityInterceptor': 0, 'net.jforum.actions.interceptors.ExtendsAnnotationInterceptor': 0, 'net.jforum.actions.interceptors.MethodSecurityInterceptor': 0, 'net.jforum.actions.interceptors.SecurityInterceptor': 0, 'net.jforum.actions.interceptors.SessionManagerInterceptor': 0, 'net.jforum.api.JForumExecutionContext': 0, 'net.jforum.controllers.AvatarAdminController': 0, 'net.jforum.controllers.BadWordAdminController': 0, 'net.jforum.controllers.BanlistAdminController': 0, 'net.jforum.controllers.CategoryAdminController': 0, 'net.jforum.controllers.ConfigController': 0, 'net.jforum.controllers.ForumAdminController': 0, 'net.jforum.controllers.ForumController': 0, 'net.jforum.controllers.GroupAdminController': 0, 'net.jforum.controllers.HibernateStatisticsController': 0, 'net.jforum.controllers.LuceneAdminController': 0, 'net.jforum.controllers.MessageController': 0, 'net.jforum.controller

In [6]:
# For each class in class_code, generate embeddings and add to class_embeddings dictionary
class_embeddings = {}
class_code = load_class_code_from_directory(system)
for class_name, code in class_code.items():
    class_embeddings[class_name] = generate_embeddings_for_java_file(code, model, tokenizer, device)

# Write embeddings to csv file
save_embeddings_to_csv(version, system, model_type, class_embeddings, class_labels)

TypeError: generate_embeddings_for_java_file() missing 1 required positional argument: 'device'

## 1.2 Train ML models

In [30]:
filename = f"{version}_{system}_{model_type}_embeddings.csv"
class_names, labels, embeddings = load_data_from_csv(filename)

Xtrain, Xtest, names_train, names_test = train_test_split(embeddings, class_names, test_size=0.3, random_state=0)

ytrain = [labels[class_names.index(name)] for name in names_train]
ytest = [labels[class_names.index(name)] for name in names_test]

Xtrain = np.array(Xtrain)
Xtest = np.array(Xtest)

# Ensure that there's at least one instance of the "Utility" label in the training data
if 1 not in ytrain:
    utility_index = labels.index(1)
    Xtrain = np.append(Xtrain, [embeddings[utility_index]], axis=0)
    ytrain.append(1)

print(Xtrain)
print(len(Xtest))

[[-0.18812622 -0.07322095 -0.04142916 ... -0.12509549 -0.6195657
   0.5668942 ]
 [-0.18266301 -0.02652007 -0.01852127 ... -0.1292542  -0.59713876
   0.54415834]
 [-0.18029451  0.13444571  0.01210325 ... -0.23672481 -0.49645227
   0.4131651 ]
 ...
 [-0.18025655 -0.03189673 -0.04783064 ... -0.14247401 -0.59984833
   0.55347806]
 [-0.24345882 -0.04653927 -0.02923113 ... -0.13501357 -0.6327489
   0.47451097]
 [-0.20725228  0.25259238 -0.01621119 ... -0.21184425 -0.46678564
   0.45599747]]
17


In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [51]:
def generate_classification_report(y_true, y_pred):
    # Identify unique labels in both true labels and predictions
    unique_labels = np.unique(np.concatenate((y_true, y_pred)))

    # Map unique labels to their corresponding names
    label_names_map = {-1: "None", 0: "Application", 1: "Utility", 2: "Entity"}
    dynamic_label_names = [label_names_map[label] for label in unique_labels]

    # Generate and print the classification report
    print(classification_report(y_true, y_pred, target_names=dynamic_label_names))

## Decision Tree

In [52]:
decision_tree_classifier = DecisionTreeClassifier(max_depth=2).fit(Xtrain, ytrain)
decision_tree_predictions = decision_tree_classifier.predict(Xtest)
decision_tree_accuracy = accuracy_score(ytest, decision_tree_predictions)
decision_tree_confusion_matrix = confusion_matrix(ytest, decision_tree_predictions)
print(decision_tree_accuracy)
print(decision_tree_confusion_matrix)
generate_classification_report(ytest, decision_tree_predictions)

1.0
[[8 0]
 [0 9]]
              precision    recall  f1-score   support

 Application       1.00      1.00      1.00         8
      Entity       1.00      1.00      1.00         9

    accuracy                           1.00        17
   macro avg       1.00      1.00      1.00        17
weighted avg       1.00      1.00      1.00        17



## SVM

In [53]:
svm_classifier = SVC(kernel='linear', C=2).fit(Xtrain, ytrain)
svm_predictions = svm_classifier.predict(Xtest)
svm_accuracy = accuracy_score(ytest, svm_predictions)
svm_confusion_matrix = confusion_matrix(ytest, svm_predictions)
print(svm_accuracy)
print(svm_confusion_matrix)
generate_classification_report(ytest, svm_predictions)

0.9411764705882353
[[7 1 0]
 [0 0 0]
 [0 0 9]]
              precision    recall  f1-score   support

 Application       1.00      0.88      0.93         8
     Utility       0.00      0.00      0.00         0
      Entity       1.00      1.00      1.00         9

    accuracy                           0.94        17
   macro avg       0.67      0.62      0.64        17
weighted avg       1.00      0.94      0.97        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## KNN

In [54]:
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(Xtrain, ytrain)
knn_predictions = knn_classifier.predict(Xtest)
knn_accuracy = accuracy_score(ytest, knn_predictions)
knn_confusion_matrix = confusion_matrix(ytest, knn_predictions)
print(knn_accuracy)
print(knn_confusion_matrix)
generate_classification_report(ytest, knn_predictions)

0.8235294117647058
[[8 0]
 [3 6]]
              precision    recall  f1-score   support

 Application       0.73      1.00      0.84         8
      Entity       1.00      0.67      0.80         9

    accuracy                           0.82        17
   macro avg       0.86      0.83      0.82        17
weighted avg       0.87      0.82      0.82        17



## LogisticRegression

In [55]:
logistic_regression_classifier = LogisticRegression(random_state=0).fit(Xtrain, ytrain)
logistic_regression_predictions = logistic_regression_classifier.predict(Xtest)
logistic_regression_accuracy = accuracy_score(ytest, logistic_regression_predictions)
logistic_regression_confusion_matrix = confusion_matrix(ytest, logistic_regression_predictions)
print(logistic_regression_accuracy)
print(logistic_regression_confusion_matrix)
generate_classification_report(ytest, logistic_regression_predictions)

1.0
[[8 0]
 [0 9]]
              precision    recall  f1-score   support

 Application       1.00      1.00      1.00         8
      Entity       1.00      1.00      1.00         9

    accuracy                           1.00        17
   macro avg       1.00      1.00      1.00        17
weighted avg       1.00      1.00      1.00        17



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Gaussian NB

In [56]:
naive_bayes_classifier = GaussianNB().fit(Xtrain, ytrain)
naive_bayes_predictions = naive_bayes_classifier.predict(Xtest)
naive_bayes_accuracy = accuracy_score(ytest, naive_bayes_predictions)
naive_bayes_confusion_matrix = confusion_matrix(ytest, naive_bayes_predictions)
print(naive_bayes_accuracy)
print(naive_bayes_confusion_matrix)
generate_classification_report(ytest, naive_bayes_predictions)

1.0
[[8 0]
 [0 9]]
              precision    recall  f1-score   support

 Application       1.00      1.00      1.00         8
      Entity       1.00      1.00      1.00         9

    accuracy                           1.00        17
   macro avg       1.00      1.00      1.00        17
weighted avg       1.00      1.00      1.00        17

