In [None]:
!pip install scikit-multilearn


In [None]:
import numpy as np
import random
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import json

# --- Configuration ------------------------------------------------------------
# Everything a reader might want to tune lives here, defined exactly once.
dataset_path = "./../../../datasets/04_fontana/output/fontana_mld_sc.json"

MODEL_NAME = "microsoft/codebert-base"  # TODO: Retrieve from config.json
PADDING = "max_length"  # TODO: Retrieve from config.json
TRUNCATION = True  # TODO: Retrieve from config.json
MAX_LENGTH = 512  # TODO: Retrieve from config.json

# Lower-case names are the ones read by the embedding loop further down.
max_length = MAX_LENGTH
# NOTE: the sliding-window loop advances by (max_length - 2 - stride) tokens,
# so this value acts as the overlap between consecutive windows.
stride = 256

# Seed every RNG that is imported in this notebook, not only NumPy.
np.random.seed(123456)
random.seed(123456)
torch.manual_seed(123456)

# --- Data ---------------------------------------------------------------------
# JSON is UTF-8 by specification; be explicit so that the platform default
# encoding cannot break on non-ASCII characters.
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# --- Model --------------------------------------------------------------------
# Load the tokenizer and the weights once. The previous version loaded both
# twice, which doubled start-up time and kept two copies of the model in memory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()  # inference only: make sure dropout is disabled

# Kept as an alias so that any code still referring to the old name keeps working.
MODEL_CODE_BERT = model

# --- Output schema --------------------------------------------------------------
# Column-oriented accumulator: one list per output column, filled row by row by
# the embedding loop. f001..f768 hold the components of the embedding vector.
schema = {
    "id": []
}

for i in range(1, 769):
    feature = f"f{i:03}"
    schema[feature] = []

# One binary label column per code smell.
schema["isFeatureEnvy"] = []
schema["isLongMethod"] = []
schema["isLongParametersList"] = []
schema["isSwitchStatement"] = []

In [None]:
# Embed every dataset instance with the model and append one row per instance
# to `schema` (id, 768 embedding components, four binary smell labels).
#
# method == "FIRST": embed only the first `max_length` tokens (truncating) and
#                    use the CLS vector.
# method == "SW":    split the full token sequence into overlapping windows,
#                    embed each window and average the CLS vectors.

# Start from empty columns so that re-running this cell does not append a second
# copy of every row.
for column_values in schema.values():
    column_values.clear()

method = "SW"

# Layout of one window: [CLS] + up to `effective_window` tokens + [SEP] + padding.
effective_window = max_length - 2  # for CLS and SEP
# Consecutive windows share `stride` tokens, so each window starts
# `effective_window - stride` tokens after the previous one.
window_step = effective_window - stride
if window_step <= 0:
    raise ValueError("stride must be smaller than max_length - 2")

for j in range(len(dataset)):
    print(j)
    instance = dataset[j]
    code = instance["source_code"]

    final_embedding = None

    if method == "FIRST":
        # Truncate to the model's maximum length and return tensors so the
        # encoding can be passed to the model directly.
        inputs = tokenizer(
            code,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        with torch.no_grad():
            outputs = model(**inputs)

        # CLS vector of the single (truncated) sequence.
        final_embedding = outputs.last_hidden_state[0, 0, :]

    elif method == "SW":
        # Tokenize the whole snippet without special tokens, truncation or
        # padding: special tokens and padding are added per window below.
        # Padding here would leak pad tokens into the windows of short inputs.
        encoding = tokenizer(
            code,
            add_special_tokens=False,
            padding=False,
            truncation=False
        )

        input_ids = encoding['input_ids']

        windows = []
        # max(..., 1) guarantees one (empty) window even when the snippet
        # tokenizes to nothing, so the mean below is always defined.
        for start in range(0, max(len(input_ids), 1), window_step):
            chunk = input_ids[start: start + effective_window]
            # Add special tokens: CLS at beginning, SEP at end.
            window_ids = [tokenizer.cls_token_id] + chunk + \
                [tokenizer.sep_token_id]
            real_length = len(window_ids)
            pad_length = max_length - real_length
            # Pad to a consistent length and mask the padding by position
            # rather than by comparing token ids.
            window_ids = window_ids + [tokenizer.pad_token_id] * pad_length
            window_mask = [1] * real_length + [0] * pad_length
            windows.append((window_ids, window_mask))
            if start + effective_window >= len(input_ids):
                break

        print(" ==========> ", len(windows))

        # One forward pass per window keeps peak memory independent of the
        # snippet length.
        allEmbeddings = []
        for window_ids, window_mask in windows:
            inputs = {
                # add batch dimension
                "input_ids": torch.tensor([window_ids], dtype=torch.long),
                "attention_mask": torch.tensor([window_mask], dtype=torch.long)
            }
            with torch.no_grad():
                outputs = model(**inputs)
            # CLS vector of this window.
            allEmbeddings.append(outputs.last_hidden_state[0, 0, :])

        # The instance embedding is the mean over all of its windows.
        mean_embedding = torch.mean(torch.stack(allEmbeddings), dim=0)
        final_embedding = mean_embedding

    else:
        raise ValueError(f"Unknown embedding method: {method!r}")

    # 1-based instance id, taken from the dataset index (not a window index).
    schema["id"].append(j + 1)

    embedding_values = final_embedding.tolist()
    for k in range(1, 769):
        feature = f"f{k:03}"
        schema[feature].append(embedding_values[k - 1])

    # Binary label per smell: 1 when the smell name is contained in
    # instance["code_smells"].
    smells = instance["code_smells"]
    for column, smell in (
        ("isFeatureEnvy", "FeatureEnvy"),
        ("isLongMethod", "LongMethod"),
        ("isLongParametersList", "LongParametersList"),
        ("isSwitchStatement", "SwitchStatement"),
    ):
        schema[column].append(1 if smell in smells else 0)


# Turn the column-wise accumulator into a table, persist it in both formats and
# show it in the notebook.
embedded_smells_df = pd.DataFrame(schema)

# CSV copy.
embedded_smells_datasets_csv = "./../output/embedded_smells_dataset_fontana.csv"
embedded_smells_df.to_csv(embedded_smells_datasets_csv, index=False)

# Excel copy (written through openpyxl).
embedded_smells_datasets_xlsx = "./../output/embedded_smells_dataset_fontana.xlsx"
embedded_smells_df.to_excel(
    embedded_smells_datasets_xlsx,
    engine="openpyxl",
    index=False,
)

display(embedded_smells_df)

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
import csv
from sklearn.utils import shuffle
from itertools import product
from skmultilearn.model_selection import iterative_train_test_split
from IPython.display import clear_output


# Load the pre-computed class embeddings, shuffle them and carve out a 70/30
# train/validation split for the random forest grid search.
np.random.seed(123456)

# Input table: an "id" column, the embedding features and two binary labels.
embedded_smells_datasets_csv = "./../../embeddings/modern_bert/mlcq_class_max_embedding.csv"

# Destination and column layout of the grid search log.
csv_file = 'mlcq_rf_results.csv'
csv_columns = [
    'n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf',
    'max_features',
    'train_accuracy', 'val_accuracy', 'precision', 'recall', 'f1_score',
]

# Read and shuffle the rows with a fixed seed in one chain.
df = (
    pd.read_csv(embedded_smells_datasets_csv)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Experiments that used to sit here as commented-out code and are not active:
#   * balancing the classes by down-sampling the rows that carry neither label;
#   * re-drawing the split until both label counts in the training set fell
#     between 210 and 230.

display(df)

# Targets are the two smell flags; every remaining column except "id" is a feature.
labels = df[["isGodClass", "isDataClass"]]
features = df.drop(columns=["id", *labels.columns])

# random_state=None: per the scikit-learn documentation this draws from NumPy's
# global RNG, which is the one seeded at the top of this cell.
features, labels = shuffle(features, labels, random_state=None)

X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=None,
)

# Drop the (large) table preview before the grid search starts printing.
clear_output()
# Exhaustive grid search over random forest hyperparameters. Every combination
# is fitted on the training split, scored on the validation split and logged as
# one row of `csv_file`.
param_grid = {
    'n_estimators': [150],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['log2'],
}

# Cartesian product of the value lists, one dict per experiment.
experiments = [
    dict(zip(param_grid, combination))
    for combination in product(*param_grid.values())
]

# Averaging options shared by precision, recall and F1.
weighted = {'average': 'weighted', 'zero_division': 0}

# Best validation accuracy seen so far; only improvements are printed.
initial_accuracy = 0

with open(csv_file, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()

    for params in experiments:
        # The grid keys are exactly the constructor arguments, so they can be
        # unpacked directly. One forest is trained per label.
        base_rf = RandomForestClassifier(**params, random_state=42)
        multi_rf = MultiOutputClassifier(base_rf)
        multi_rf.fit(X_train, y_train)

        y_train_pred = multi_rf.predict(X_train)
        y_val_pred = multi_rf.predict(X_val)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        val_accuracy = accuracy_score(y_val, y_val_pred)
        precision = precision_score(y_val, y_val_pred, **weighted)
        recall = recall_score(y_val, y_val_pred, **weighted)
        f1 = f1_score(y_val, y_val_pred, **weighted)

        if val_accuracy > initial_accuracy:
            print(val_accuracy)
            initial_accuracy = val_accuracy

        # Hyperparameters first, then the scores; the writer orders the
        # columns according to `csv_columns`.
        writer.writerow({
            **params,
            'train_accuracy': train_accuracy,
            'val_accuracy': val_accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
        })

print(f"Grid search complete. Results saved in {csv_file}")