In [101]:
import os
import pickle
from tqdm import tqdm, trange
from dataclasses import dataclass
import pandas as pd
import json
from dataclasses import dataclass
from typing import Optional
from drl_patches.sparse_autoencoders.get_vectorizer import load_tfidf_vectorizer
from drl_patches.sparse_autoencoders.getting_experiment_config import (
    load_training_indexes,
)
import torch
from drl_patches.logger import logger
from drl_patches.sparse_autoencoders.classical_data_mining import get_metrics


In [102]:
@dataclass
class Dataset:
    dataset_path: str
    training_idx_path: str

GBUG_DATASET = Dataset(
    dataset_path="../artifacts/gbug-java.csv",
    training_idx_path="../artifacts/gbug-java_train_indexes.json",
)

DEFECT_DATASET = Dataset(
    dataset_path="../artifacts/defects4j.csv",
    training_idx_path="../artifacts/defects4j_train_indexes.json",
)

HUMAN_DATASET = Dataset(
    dataset_path="../artifacts/humaneval.csv",
    training_idx_path="../artifacts/humaneval_train_indexes.json",
)

In [103]:

def get_testing_dataset(dataset: Dataset):
    """
    Get the testing dataset path and the training indexes path.
    """
    df = pd.read_csv(dataset.dataset_path)
    print(df.shape)
    with open( dataset.training_idx_path, "r") as f:
        training_indices = json.load(f)
    train_df = df.iloc[training_indices]
    test_df = df.drop(train_df.index)

    return train_df, test_df



In [None]:
@dataclass
class SAE_ACTIVATIONS:
    feature_diff_path: str
    model_path: str
    layer: int
    top_k: int
    dataset: Dataset
    gbug_feature_safe_path: str
    gbug_feature_vuln_path: str
    defects_feature_safe_path: str
    defects_feature_vuln_path: str
    humaneval_feature_safe_path: str
    humaneval_feature_vuln_path: str

SAE_ACTIVATIONS_GBUG_LAYER_1 = SAE_ACTIVATIONS(
    feature_diff_path="../gpt2_gbug-java/layer1/feature_importance_diff.jsonl",
    gbug_feature_safe_path="../gpt2_gbug-java/layer1/feature_importance_safe.jsonl",
    gbug_feature_vuln_path="../gpt2_gbug-java/layer1/feature_importance_vuln.jsonl",
    defects_feature_safe_path="../gpt2_defects4j/layer1/feature_importance_safe.jsonl",
    defects_feature_vuln_path="../gpt2_defects4j/layer1/feature_importance_vuln.jsonl",
    humaneval_feature_safe_path="../gpt2_humaneval/layer1/feature_importance_safe.jsonl",
    humaneval_feature_vuln_path="../gpt2_humaneval/layer1/feature_importance_vuln.jsonl",
    model_path = "models/gbug_decision_tree_layer10_k_76.pt",
    layer = 10,
    top_k = 76,
    dataset = GBUG_DATASET,
)


In [None]:

def read_jsonl_file(jsonl_path):
    with open(jsonl_path, "r") as f:
        for line in f:
            yield json.loads(line)

def get_feature_diff_path(dataset: Dataset, training_indexes):
    """
    Get the feature diff path for the given dataset.
    """
    import warnings
    warnings.filterwarnings("ignore")
    diff_data = list(read_jsonl_file(dataset.feature_diff_path))
    diff_df = pd.DataFrame(diff_data)

    columns = diff_df.columns.to_list()
    columns.remove("values")

    diff_df.drop(columns=columns, inplace=True)

    for i in trange(len(diff_df["values"][0])):
        diff_df[f"feature_{i}"] = diff_df["values"].apply(lambda x: x[i])

    diff_df.drop(columns=["values"], inplace=True)

    
    return diff_df



def get_most_important_features(train_df_diff, n=100):
    features = train_df_diff.sum(axis=0).sort_values(ascending=False).index[1 : n + 1]
    return features


In [None]:
OUR_CONFIG = SAE_ACTIVATIONS_GBUG_LAYER_1

In [None]:
_, test_df = get_testing_dataset(OUR_CONFIG.dataset)

with open(OUR_CONFIG.dataset.training_idx_path, "r") as f:
    training_indices = json.load(f)

train_df_diff = get_feature_diff_path(
    SAE_ACTIVATIONS_GBUG_LAYER_1, training_indices
)

In [None]:
top_k = get_most_important_features(train_df_diff, n=OUR_CONFIG.top_k)

In [None]:
def get_vuln_safe_data(vuln_jsonl_path, safe_jsonl_path, train_indexes):
    vuln_data = list(read_jsonl_file(vuln_jsonl_path))
    safe_data = list(read_jsonl_file(safe_jsonl_path))
    vuln_df = pd.DataFrame(vuln_data)
    safe_df = pd.DataFrame(safe_data)
    vuln_df.drop(columns=["labels", "model", "plot_type"], inplace=True)
    vuln_df["vuln"] = 1

    safe_df.drop(columns=["labels", "model", "plot_type"], inplace=True)
    safe_df["vuln"] = 0

    for i in trange(len(vuln_df["values"][0])):
        vuln_df[f"feature_{i}"] = vuln_df["values"].apply(lambda x: x[i])
        safe_df[f"feature_{i}"] = safe_df["values"].apply(lambda x: x[i])

    safe_df_train = safe_df.loc[train_indexes]
    safe_df_test = safe_df.drop(train_indexes)

    vuln_df_train = vuln_df.loc[train_indexes]
    vuln_df_test = vuln_df.drop(train_indexes)

    df_train = pd.concat([safe_df_train, vuln_df_train])
    df_test = pd.concat([safe_df_test, vuln_df_test])

    df_train = df_train.sample(frac=1).reset_index(drop=True)
    df_test = df_test.sample(frac=1).reset_index(drop=True)
    df_train.drop(columns=["values"], inplace=True)
    df_test.drop(columns=["values"], inplace=True)

    return df_train, df_test


In [None]:
def evaluate_clf(clf, test_df):
    """
    Evaluate the model on the test dataset.
    """
    #bug_id	func_before	func_after
    tp = 0
    fp = 0
    fn = 0
    tn = 0

    for i, row in tqdm(test_df.iterrows()):
        X = row[top_k.tolist()]
        y = row["vuln"]

        # Get the prediction
        pred = clf.predict([X])

        # Update the confusion matrix
        if pred == 1 and y == 1:
            tp += 1
        elif pred == 1 and y == 0:
            fp += 1
        elif pred == 0 and y == 1:
            fn += 1
        elif pred == 0 and y == 0:
            tn += 1


    # Calculate the accuracy
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    # Calculate the precision
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    # Calculate the recall
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    # Calculate the F1 score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "fp": fp,
        "fn": fn,
        "tn": tn
    }

In [None]:
with open(SAE_ACTIVATIONS_GBUG_LAYER_1.model_path, "rb") as f:
    clf = pickle.load(f)

# Original (GBUG)
_, df_test = get_vuln_safe_data(
    OUR_CONFIG.gbug_feature_vuln_path,
    OUR_CONFIG.gbug_feature_safe_path,
    training_indices,
)
# filter columns based on top_k
df_test_filtered = df_test[
    top_k.tolist() + ["vuln"]
]

results = evaluate_clf(clf, df_test_filtered)
results

In [None]:
# Defects4J
_, df_test = get_vuln_safe_data(
    OUR_CONFIG.defects_feature_vuln_path,
    OUR_CONFIG.defects_feature_safe_path,
    training_indices,
)
# filter columns based on top_k
df_test_filtered = df_test[
    top_k.tolist() + ["vuln"]
]

results = evaluate_clf(clf, df_test_filtered)
results

In [None]:
# Defects4J
_, df_test = get_vuln_safe_data(
    OUR_CONFIG.humaneval_feature_vuln_path,
    OUR_CONFIG.humaneval_feature_safe_path,
    training_indices,
)
# filter columns based on top_k
df_test_filtered = df_test[
    top_k.tolist() + ["vuln"]
]

results = evaluate_clf(clf, df_test_filtered)
results

# Baselines Transferability

In [105]:
@dataclass
class BaselineClassifier:
    path: str
    tfidf_vectorizer_path: str
    base_dataset: Dataset
    input_size: int

DEFECTS4J_KNN_BASELINE = BaselineClassifier(
    path="../ole/defects4j_knn_k_5000.pt",
    tfidf_vectorizer_path = "../artifacts/vectorizer.pkl",
    base_dataset=DEFECT_DATASET,
    input_size=5000,
)
GBUG_KNN_BASELINE = BaselineClassifier(
    path="../ole/gbug_knn_k_5000.pt",
    tfidf_vectorizer_path = "../artifacts/vectorizer.pkl",
    base_dataset=GBUG_DATASET,
    input_size=5000,
)
HUMANEVAL_KNN_BASELINE = BaselineClassifier(
    path="../ole/human_knn_k_5000.pt",
    tfidf_vectorizer_path = "../artifacts/vectorizer.pkl",
    base_dataset=HUMAN_DATASET,
    input_size=5000,
)
DEFECTS4J_RF_BASELINE = BaselineClassifier(
    path="../ole/defects4j_random_forest_k_5000.pt",
    tfidf_vectorizer_path = "../artifacts/vectorizer.pkl",
    base_dataset=DEFECT_DATASET,
    input_size=5000,
)
GBUG_RF_BASELINE = BaselineClassifier(
    path="../ole/gbug_random_forest_k_5000.pt",
    tfidf_vectorizer_path = "../artifacts/vectorizer.pkl",
    base_dataset=GBUG_DATASET,
    input_size=5000,
)
HUMANEVAL_RF_BASELINE = BaselineClassifier(
    path="../ole/human_random_forest_k_5000.pt",
    tfidf_vectorizer_path = "../artifacts/vectorizer.pkl",
    base_dataset=HUMAN_DATASET,
    input_size=5000,
)

def load_baseline_classifier(classifier: BaselineClassifier):
    with open(classifier.path, "rb") as f:
        clf = pickle.load(f)
    return clf
@dataclass
class Results:
    precision: float
    recall: float
    accuracy: float
    f1: float


@dataclass
class TransferabilityPerformance:
    on_defects: Optional[int] = None
    on_humaneval: Optional[int] = None
    on_gbug: Optional[int] = None

def calculate_f1_shift(
    results: Results,
    baseline_results: Results,
) -> dict:
    """
    Calculate the shift in perfoamcene in terms of F1 score % changed
    """
    f1_shift = (results.f1 - baseline_results.f1) / baseline_results.f1 * 100
    
    return f1_shift

def test_baseline_classifier(clf, 
                             vectorizer, 
                             df, 
                             train_indexes,
                             before_func_col="func_before", 
                             after_func_col="func_after") -> Results:
    """
    Test the baseline classifier on the test dataset.
    """

    df["tokenized_before"] = df[before_func_col].progress_apply(
        lambda x: vectorizer.transform([x]).toarray()[0]
    )
    df["tokenized_after"] = df[after_func_col].progress_apply(
        lambda x: vectorizer.transform([x]).toarray()[0]
    )
    # Pad to 5000 tokens
    df["tokenized_before"] = df["tokenized_before"].apply(
        lambda x: x[:5000] + [0] * (5000 - len(x)) if len(x) < 5000 else x[:5000]
    )
    df["tokenized_after"] = df["tokenized_after"].apply(
        lambda x: x[:5000] + [0] * (5000 - len(x)) if len(x) < 5000 else x[:5000]
    )

    df_test = df.drop(train_indexes)

    df_classical_test = pd.DataFrame()
    for row in df_test.iterrows():
        row = row[1]
        df_classical_test = pd.concat(
            [
                df_classical_test,
                pd.DataFrame(
                    {"tokens": [row["tokenized_before"].tolist()], "vuln": 1}, index=[0]
                ),
            ]
        )
        df_classical_test = pd.concat(
            [
                df_classical_test,
                pd.DataFrame(
                    {"tokens": [row["tokenized_after"].tolist()], "vuln": 0}, index=[0]
                ),
            ]
        )

    X_test = df_classical_test["tokens"].values.tolist()
    X_test = [torch.tensor(x) for x in X_test]
    y_test = df_classical_test["vuln"]

    # Get the prediction
    y_pred = clf.predict(X_test)
    precision, recall, accuracy, f1 = get_metrics(y_pred, y_test)
    logger.info(
        "Classification report:",
        precision=precision,
        recall=recall,
        accuracy=accuracy,
        f1=f1,
    )
    return Results(
        precision=precision,
        recall=recall,
        accuracy=accuracy,
        f1=f1,
    )
    



In [106]:

results_gbug_knn = test_baseline_classifier(
    load_baseline_classifier(GBUG_KNN_BASELINE),
    load_tfidf_vectorizer(GBUG_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(GBUG_DATASET.dataset_path),
    load_training_indexes(GBUG_DATASET.training_idx_path),
)

results_gbug_knn_defects = test_baseline_classifier(
    load_baseline_classifier(GBUG_KNN_BASELINE),
    load_tfidf_vectorizer(GBUG_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(DEFECT_DATASET.dataset_path),
    load_training_indexes(DEFECT_DATASET.training_idx_path),
)

results_gbug_knn_human = test_baseline_classifier(
    load_baseline_classifier(GBUG_KNN_BASELINE),
    load_tfidf_vectorizer(GBUG_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(HUMAN_DATASET.dataset_path),
    load_training_indexes(HUMAN_DATASET.training_idx_path),
)


transf_knn_gbug_on_defects= calculate_f1_shift(
    results_gbug_knn_defects,
    results_gbug_knn,
)
transf_knn_gbug_on_human= calculate_f1_shift(
    results_gbug_knn_human,
    results_gbug_knn,
)

transf_knn_gbug = TransferabilityPerformance(
    on_defects=transf_knn_gbug_on_defects,
    on_humaneval=transf_knn_gbug_on_human,
)

100%|██████████| 148/148 [00:00<00:00, 3945.50it/s]
100%|██████████| 148/148 [00:00<00:00, 4841.34it/s]

[2m2025-04-21 19:23:47[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.6666666666666666[0m [36mf1[0m=[35m0.6774193548387096[0m [36mprecision[0m=[35m0.65625[0m [36mrecall[0m=[35m0.7[0m



100%|██████████| 465/465 [00:00<00:00, 6626.66it/s]
100%|██████████| 465/465 [00:00<00:00, 6679.70it/s]


[2m2025-04-21 19:23:47[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5[0m [36mf1[0m=[35m0.4431137724550898[0m [36mprecision[0m=[35m0.5[0m [36mrecall[0m=[35m0.3978494623655914[0m


100%|██████████| 162/162 [00:00<00:00, 7042.24it/s]
100%|██████████| 162/162 [00:00<00:00, 7635.00it/s]

[2m2025-04-21 19:23:47[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.48484848484848486[0m [36mf1[0m=[35m0.43333333333333324[0m [36mprecision[0m=[35m0.48148148148148145[0m [36mrecall[0m=[35m0.3939393939393939[0m





In [115]:
### Random Forest from Gbug
results_gbug_rf = test_baseline_classifier(
    load_baseline_classifier(GBUG_RF_BASELINE),
    load_tfidf_vectorizer(GBUG_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(GBUG_DATASET.dataset_path),
    load_training_indexes(GBUG_DATASET.training_idx_path),
)

results_gbug_rf_defects = test_baseline_classifier(
    load_baseline_classifier(GBUG_RF_BASELINE),
    load_tfidf_vectorizer(GBUG_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(DEFECT_DATASET.dataset_path),
    load_training_indexes(DEFECT_DATASET.training_idx_path),
)

results_gbug_rf_human = test_baseline_classifier(
    load_baseline_classifier(GBUG_KNN_BASELINE),
    load_tfidf_vectorizer(GBUG_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(HUMAN_DATASET.dataset_path),
    load_training_indexes(HUMAN_DATASET.training_idx_path),
)

transf_rf_gbug_on_defects= calculate_f1_shift(
    results_gbug_rf_defects,
    results_gbug_rf,
)
transf_rf_gbug_on_human= calculate_f1_shift(
    results_gbug_rf_human,
    results_gbug_rf,
)

transf_rf_gbug = TransferabilityPerformance(
    on_defects=transf_rf_gbug_on_defects,
    on_humaneval=transf_rf_gbug_on_human,
)

100%|██████████| 148/148 [00:00<00:00, 4733.72it/s]
100%|██████████| 148/148 [00:00<00:00, 5198.84it/s]

[2m2025-04-21 19:24:42[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.75[0m [36mf1[0m=[35m0.7058823529411764[0m [36mprecision[0m=[35m0.8571428571428571[0m [36mrecall[0m=[35m0.6[0m



100%|██████████| 465/465 [00:00<00:00, 7168.78it/s]
100%|██████████| 465/465 [00:00<00:00, 7024.37it/s]


[2m2025-04-21 19:24:42[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5161290322580645[0m [36mf1[0m=[35m0.1176470588235294[0m [36mprecision[0m=[35m0.6666666666666666[0m [36mrecall[0m=[35m0.06451612903225806[0m


100%|██████████| 162/162 [00:00<00:00, 3930.32it/s]
100%|██████████| 162/162 [00:00<00:00, 6022.35it/s]


[2m2025-04-21 19:24:42[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.48484848484848486[0m [36mf1[0m=[35m0.43333333333333324[0m [36mprecision[0m=[35m0.48148148148148145[0m [36mrecall[0m=[35m0.3939393939393939[0m


In [116]:
results_defects_knn = test_baseline_classifier(
    load_baseline_classifier(DEFECTS4J_KNN_BASELINE),
    load_tfidf_vectorizer(DEFECTS4J_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(DEFECT_DATASET.dataset_path),
    load_training_indexes(DEFECT_DATASET.training_idx_path),
)
results_defects_knn_gbug = test_baseline_classifier(
    load_baseline_classifier(DEFECTS4J_KNN_BASELINE),
    load_tfidf_vectorizer(DEFECTS4J_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(GBUG_DATASET.dataset_path),
    load_training_indexes(GBUG_DATASET.training_idx_path),
)
results_defects_knn_human = test_baseline_classifier(
    load_baseline_classifier(DEFECTS4J_KNN_BASELINE),
    load_tfidf_vectorizer(DEFECTS4J_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(HUMAN_DATASET.dataset_path),
    load_training_indexes(HUMAN_DATASET.training_idx_path),
)
transf_knn_defects_on_gbug= calculate_f1_shift(
    results_defects_knn_gbug,
    results_defects_knn,
)
transf_knn_defects_on_human= calculate_f1_shift(
    results_defects_knn_human,
    results_defects_knn,
)

transf_knn_defects = TransferabilityPerformance(
    on_defects=transf_knn_defects_on_gbug,
    on_humaneval=transf_knn_defects_on_human,
)


100%|██████████| 465/465 [00:00<00:00, 6626.30it/s]
100%|██████████| 465/465 [00:00<00:00, 6975.46it/s]


[2m2025-04-21 19:25:39[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5376344086021505[0m [36mf1[0m=[35m0.35820895522388063[0m [36mprecision[0m=[35m0.5853658536585366[0m [36mrecall[0m=[35m0.25806451612903225[0m


100%|██████████| 148/148 [00:00<00:00, 5247.67it/s]
100%|██████████| 148/148 [00:00<00:00, 4808.94it/s]


[2m2025-04-21 19:25:39[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5333333333333333[0m [36mf1[0m=[35m0.43999999999999995[0m [36mprecision[0m=[35m0.55[0m [36mrecall[0m=[35m0.36666666666666664[0m


100%|██████████| 162/162 [00:00<00:00, 7323.37it/s]
100%|██████████| 162/162 [00:00<00:00, 7360.02it/s]


[2m2025-04-21 19:25:39[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5454545454545454[0m [36mf1[0m=[35m0.4642857142857143[0m [36mprecision[0m=[35m0.5652173913043478[0m [36mrecall[0m=[35m0.3939393939393939[0m


In [117]:
results_defects_rf = test_baseline_classifier(
    load_baseline_classifier(DEFECTS4J_RF_BASELINE),
    load_tfidf_vectorizer(DEFECTS4J_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(DEFECT_DATASET.dataset_path),
    load_training_indexes(DEFECT_DATASET.training_idx_path),
)
results_defects_rf_gbug = test_baseline_classifier(
    load_baseline_classifier(DEFECTS4J_RF_BASELINE),
    load_tfidf_vectorizer(DEFECTS4J_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(GBUG_DATASET.dataset_path),
    load_training_indexes(GBUG_DATASET.training_idx_path),
)
results_defects_rf_human = test_baseline_classifier(
    load_baseline_classifier(DEFECTS4J_RF_BASELINE),
    load_tfidf_vectorizer(DEFECTS4J_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(HUMAN_DATASET.dataset_path),
    load_training_indexes(HUMAN_DATASET.training_idx_path),
)
transf_rf_defects_on_gbug= calculate_f1_shift(
    results_defects_rf_gbug,
    results_defects_rf,
)
transf_rf_defects_on_human= calculate_f1_shift(
    results_defects_rf_human,
    results_defects_rf,
)

transf_rf_defects = TransferabilityPerformance(
    on_defects=transf_rf_defects_on_gbug,
    on_humaneval=transf_rf_defects_on_human,
)

100%|██████████| 465/465 [00:00<00:00, 6360.91it/s]
100%|██████████| 465/465 [00:00<00:00, 6919.94it/s]


[2m2025-04-21 19:26:15[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.521505376344086[0m [36mf1[0m=[35m0.41830065359477125[0m [36mprecision[0m=[35m0.5333333333333333[0m [36mrecall[0m=[35m0.34408602150537637[0m


100%|██████████| 148/148 [00:00<00:00, 5642.17it/s]
100%|██████████| 148/148 [00:00<00:00, 5108.06it/s]


[2m2025-04-21 19:26:15[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5333333333333333[0m [36mf1[0m=[35m0.36363636363636365[0m [36mprecision[0m=[35m0.5714285714285714[0m [36mrecall[0m=[35m0.26666666666666666[0m


100%|██████████| 162/162 [00:00<00:00, 8152.59it/s]
100%|██████████| 162/162 [00:00<00:00, 3012.11it/s]


[2m2025-04-21 19:26:15[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5[0m [36mf1[0m=[35m0.6117647058823529[0m [36mprecision[0m=[35m0.5[0m [36mrecall[0m=[35m0.7878787878787878[0m


In [118]:
results_humaneval_knn = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_KNN_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(HUMAN_DATASET.dataset_path),
    load_training_indexes(HUMAN_DATASET.training_idx_path),
)
results_humaneval_knn_gbug = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_KNN_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(GBUG_DATASET.dataset_path),
    load_training_indexes(GBUG_DATASET.training_idx_path),
)
results_humaneval_knn_defects = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_KNN_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(DEFECT_DATASET.dataset_path),
    load_training_indexes(DEFECT_DATASET.training_idx_path),
)
transf_knn_humaneval_on_gbug= calculate_f1_shift(
    results_humaneval_knn_gbug,
    results_humaneval_knn,
)
transf_knn_humaneval_on_defects= calculate_f1_shift(
    results_humaneval_knn_defects,
    results_humaneval_knn,
)

transf_knn_humaneval = TransferabilityPerformance(
    on_defects=transf_knn_humaneval_on_defects,
    on_gbug=transf_knn_humaneval_on_gbug,
)


100%|██████████| 162/162 [00:00<00:00, 6077.89it/s]
100%|██████████| 162/162 [00:00<00:00, 7935.41it/s]

[2m2025-04-21 19:26:38[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5757575757575758[0m [36mf1[0m=[35m0.5625[0m [36mprecision[0m=[35m0.5806451612903226[0m [36mrecall[0m=[35m0.5454545454545454[0m



100%|██████████| 148/148 [00:00<00:00, 5185.38it/s]
100%|██████████| 148/148 [00:00<00:00, 5131.41it/s]


[2m2025-04-21 19:26:38[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.48333333333333334[0m [36mf1[0m=[35m0.5373134328358209[0m [36mprecision[0m=[35m0.4864864864864865[0m [36mrecall[0m=[35m0.6[0m


100%|██████████| 465/465 [00:00<00:00, 6854.45it/s]
100%|██████████| 465/465 [00:00<00:00, 6708.23it/s]


[2m2025-04-21 19:26:39[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5053763440860215[0m [36mf1[0m=[35m0.5[0m [36mprecision[0m=[35m0.5054945054945055[0m [36mrecall[0m=[35m0.4946236559139785[0m


In [112]:
results_humaneval_knn = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_KNN_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(HUMAN_DATASET.dataset_path),
    load_training_indexes(HUMAN_DATASET.training_idx_path),
)
results_humaneval_knn_gbug = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_KNN_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(GBUG_DATASET.dataset_path),
    load_training_indexes(GBUG_DATASET.training_idx_path),
)
results_humaneval_knn_defects = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_KNN_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_KNN_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(DEFECT_DATASET.dataset_path),
    load_training_indexes(DEFECT_DATASET.training_idx_path),
)
transf_knn_humaneval_on_gbug= calculate_f1_shift(
    results_humaneval_knn_gbug,
    results_humaneval_knn,
)
transf_knn_humaneval_on_defects= calculate_f1_shift(
    results_humaneval_knn_defects,
    results_humaneval_knn,
)
transf_knn_humaneval = TransferabilityPerformance(
    on_defects=transf_knn_humaneval_on_defects,
    on_gbug=transf_knn_humaneval_on_gbug,
)


100%|██████████| 162/162 [00:00<00:00, 7209.61it/s]
100%|██████████| 162/162 [00:00<00:00, 7751.90it/s]


[2m2025-04-21 19:23:50[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5757575757575758[0m [36mf1[0m=[35m0.5625[0m [36mprecision[0m=[35m0.5806451612903226[0m [36mrecall[0m=[35m0.5454545454545454[0m


100%|██████████| 148/148 [00:00<00:00, 4932.51it/s]
100%|██████████| 148/148 [00:00<00:00, 5297.24it/s]


[2m2025-04-21 19:23:50[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.48333333333333334[0m [36mf1[0m=[35m0.5373134328358209[0m [36mprecision[0m=[35m0.4864864864864865[0m [36mrecall[0m=[35m0.6[0m


100%|██████████| 465/465 [00:00<00:00, 6772.59it/s]
100%|██████████| 465/465 [00:00<00:00, 6714.54it/s]


[2m2025-04-21 19:23:50[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5053763440860215[0m [36mf1[0m=[35m0.5[0m [36mprecision[0m=[35m0.5054945054945055[0m [36mrecall[0m=[35m0.4946236559139785[0m


In [113]:
results_humaneval_rf = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_RF_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(HUMAN_DATASET.dataset_path),
    load_training_indexes(HUMAN_DATASET.training_idx_path),
)
results_humaneval_rf_gbug = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_RF_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(GBUG_DATASET.dataset_path),
    load_training_indexes(GBUG_DATASET.training_idx_path),
)
results_humaneval_rf_defects = test_baseline_classifier(
    load_baseline_classifier(HUMANEVAL_RF_BASELINE),
    load_tfidf_vectorizer(HUMANEVAL_RF_BASELINE.tfidf_vectorizer_path),
    pd.read_csv(DEFECT_DATASET.dataset_path),
    load_training_indexes(DEFECT_DATASET.training_idx_path),
)
transf_rf_humaneval_on_gbug= calculate_f1_shift(
    results_humaneval_rf_gbug,
    results_humaneval_rf,
)
transf_rf_humaneval_on_defects= calculate_f1_shift(
    results_humaneval_rf_defects,
    results_humaneval_rf,
)
transf_rf_humaneval = TransferabilityPerformance(
    on_defects=transf_rf_humaneval_on_defects,
    on_gbug=transf_rf_humaneval_on_gbug,
)

  0%|          | 0/162 [00:00<?, ?it/s]

100%|██████████| 162/162 [00:00<00:00, 7087.49it/s]
100%|██████████| 162/162 [00:00<00:00, 7137.37it/s]

[2m2025-04-21 19:23:50[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5606060606060606[0m [36mf1[0m=[35m0.4912280701754386[0m [36mprecision[0m=[35m0.5833333333333334[0m [36mrecall[0m=[35m0.42424242424242425[0m



100%|██████████| 148/148 [00:00<00:00, 5375.54it/s]
100%|██████████| 148/148 [00:00<00:00, 2934.24it/s]


[2m2025-04-21 19:23:51[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.5[0m [36mf1[0m=[35m0.16666666666666669[0m [36mprecision[0m=[35m0.5[0m [36mrecall[0m=[35m0.1[0m


100%|██████████| 465/465 [00:00<00:00, 6698.65it/s]
100%|██████████| 465/465 [00:00<00:00, 7032.25it/s]


[2m2025-04-21 19:23:51[0m [[32m[1minfo     [0m] [1mClassification report:        [0m [36maccuracy[0m=[35m0.4946236559139785[0m [36mf1[0m=[35m0.11320754716981131[0m [36mprecision[0m=[35m0.46153846153846156[0m [36mrecall[0m=[35m0.06451612903225806[0m


In [114]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set(style="whitegrid")

fig, ax = plt.subplots(2, 3, figsize=(15, 7))
fig.suptitle("Transferability Performance of KNN and RF Classifiers", fontsize=16)

# Gbug KNN
ax[0, 0].barh(
    ["Defects4J", "HumanEval"],
    [transf_knn_gbug.on_defects, transf_knn_gbug.on_humaneval],
)
ax[0, 0].set_title("Gbug KNN")
ax[0, 0].set_xlabel("F1 Shift (%)")
ax[0, 0].set_xlim(-100, 100)

# Defects4J KNN
ax[0, 1].barh(
    ["Gbug", "HumanEval"],
    [transf_knn_defects.on_defects, transf_knn_defects.on_humaneval],
)
ax[0, 1].set_title("Defects4J KNN")
ax[0, 1].set_xlabel("F1 Shift (%)")
ax[0, 1].set_xlim(-100, 100)

# HumanEval KNN
ax[0, 2].barh(
    ["Gbug", "Defects4J"],
    [transf_knn_humaneval.on_gbug, transf_knn_humaneval.on_defects],
)
ax[0, 2].set_title("HumanEval KNN")
ax[0, 2].set_xlabel("F1 Shift (%)")
ax[0, 2].set_xlim(-100, 100)

# Gbug RF
ax[1, 0].barh(
    ["Defects4J", "HumanEval"],
    [transf_rf_gbug.on_defects, transf_rf_gbug.on_humaneval],
)
ax[1, 0].set_title("Gbug RF")
ax[1, 0].set_xlabel("F1 Shift (%)")
ax[1, 0].set_xlim(-100, 100)

# Defects4J RF
ax[1, 1].barh(
    ["Gbug", "HumanEval"],
    [transf_rf_defects.on_defects, transf_rf_defects.on_humaneval],
)
ax[1, 1].set_title("Defects4J RF")
ax[1, 1].set_xlabel("F1 Shift (%)")
ax[1, 1].set_xlim(-100, 100)

# HumanEval RF
ax[1, 2].barh(
    ["Gbug", "Defects4J"],
    [transf_rf_humaneval.on_gbug, transf_rf_humaneval.on_defects],
)
ax[1, 2].set_title("HumanEval RF")
ax[1, 2].set_xlabel("F1 Shift (%)")
ax[1, 2].set_xlim(-100, 100)

plt.tight_layout()
plt.subplots_adjust(top=0.88)
plt.show()

# Save the figure
fig.savefig("transferability_performance_horizontal.png", dpi=300)
