In [None]:
pip install datasets evaluate

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import evaluate
import numpy as np
import torch
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
# Read the tab-separated dataset; a backslash is used as the escape character inside fields
df = pd.read_csv(
    "dataset.tsv",
    escapechar="\\",
    sep="\t",
)

In [None]:
# First rows of the raw data; a bare last expression gets the notebook's rich table display
df.head()

In [None]:
# info() writes its report itself and returns None, so it must not be wrapped in print()
df.info()

In [None]:
# Summary statistics of the raw data, shown with the notebook's rich table display
df.describe()

In [None]:
# Column dtypes of the raw data
df.dtypes

In [None]:

# Drop the rows whose refactoring_type is "no_refactoring", "Extract Variable" or "Move Method".
# .copy() turns the filtered result into an independent frame, so the column assignments made
# in later cells do not raise pandas' SettingWithCopyWarning.
df = df[~df["refactoring_type"].isin(["no_refactoring", "Extract Variable", "Move Method"])].copy()


In [None]:

# Build the label vocabulary: class names in sorted order, each mapped to its position
labels = sorted(df["refactoring_type"].unique())
label2id = dict(zip(labels, range(len(labels))))
id2label = {idx: name for name, idx in label2id.items()}

# Integer target column derived from the class name
df["label"] = df["refactoring_type"].map(label2id)


In [None]:
# First rows after filtering, now including the integer "label" column
df.head()

In [None]:

# Show the class-name -> integer-id mapping built from the remaining refactoring types
print("Updated list of labels:", label2id)


In [None]:

# Load the tokenizer that belongs to the "microsoft/codebert-base" checkpoint
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")


In [None]:

# Sliding Window
def sliding_window_tokenization(code, tokenizer, max_length=256, stride=128):
    """Cut ``code`` into overlapping token windows and return them joined as one string.

    The text is tokenized once, windows of at most ``max_length`` tokens are taken
    every ``stride`` tokens, each window is converted back to text, and the window
    texts are joined with a single space.

    Args:
        code: Source text to process. Anything that is not a ``str`` (for example
            ``None`` or a pandas NaN) yields ``""`` so that missing rows can be
            dropped by an empty-value filter afterwards instead of crashing here.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``.
        max_length: Maximum number of tokens per window.
        stride: Distance in tokens between the starts of consecutive windows.

    Returns:
        The window texts joined by ``" "``; ``""`` for empty or missing input.

    NOTE(review): the windows are concatenated into a single string. A caller that
    re-tokenizes this string with truncation to ``max_length`` tokens will in
    practice only keep the first window — confirm that this is intended.
    """
    if not isinstance(code, str):
        return ""

    tokens = tokenizer.tokenize(code)
    segments = []

    for start in range(0, len(tokens), stride):
        window = tokens[start : start + max_length]
        segments.append(tokenizer.convert_tokens_to_string(window))
        # This window already reaches the last token: any further window would be
        # entirely contained in it, so stop instead of emitting duplicates.
        if start + max_length >= len(tokens):
            break

    return " ".join(segments)


In [None]:

# Run the sliding-window preprocessing over every entry of the "code" column
df["processed_code"] = df["code"].apply(sliding_window_tokenization, args=(tokenizer,))


In [None]:

# Keep only the rows whose processed code is present and not whitespace-only
df = df.loc[
    lambda frame: frame["processed_code"].notna()
    & (frame["processed_code"].str.strip() != "")
]


In [None]:

# Wrap the filtered DataFrame in a `datasets.Dataset` so it can be mapped and split in the cells below
dataset = Dataset.from_pandas(df)

# Encoding step applied to the preprocessed code
def tokenize_function(examples):
    """Encode the "processed_code" entries of ``examples`` with the notebook's tokenizer.

    Every sequence is truncated to, and padded up to, exactly 256 tokens.
    NOTE(review): 256 is the budget chosen in this notebook; whether the checkpoint
    itself accepts longer inputs is not shown here — confirm before changing it.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples["processed_code"], **encoding_options)


In [None]:

# Encode the whole dataset with tokenize_function (batched=True is passed to Dataset.map)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Hold out 20% of the examples as the test split; the seed is fixed at 42
split_ds = tokenized_dataset.train_test_split(seed=42, test_size=0.2)
train_ds, test_ds = split_ds["train"], split_ds["test"]


In [None]:

# 📌 11. Loading CodeBERT
# The dropout rates are handed to from_pretrained so that they are part of the configuration
# the network is built from. Assigning model.config.hidden_dropout_prob /
# attention_probs_dropout_prob after the model exists does not change the dropout layers
# that were already created, so the previous post-hoc assignment had no effect on training.
model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.4,  # dropout on hidden states
    attention_probs_dropout_prob=0.4,  # dropout on attention probabilities
)


In [None]:

# Metric objects for accuracy and F1, loaded in that order
accuracy_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)


In [None]:
# Precision and recall, reported alongside accuracy and F1.
# `evaluate`, `numpy` and `confusion_matrix` are already imported in the import cell at the top
# of the notebook, and the accuracy / F1 metrics are loaded in the previous cell, so neither
# the imports nor those two loads are repeated here.
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")


In [None]:

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into the metrics reported during evaluation.

    The predicted class is the arg-max over the last axis of the logits.

    Returns:
        A dict with the keys "accuracy", "f1", "precision" and "recall"; the last
        three are computed with ``average="weighted"``.
    """
    scores, references = eval_pred
    predictions = np.argmax(scores, axis=-1)

    results = {
        "accuracy": accuracy_metric.compute(
            predictions=predictions, references=references
        )["accuracy"],
    }

    # The remaining metrics share the same call shape and only differ in their name
    weighted_metrics = (
        ("f1", f1_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
    )
    for key, metric in weighted_metrics:
        results[key] = metric.compute(
            predictions=predictions, references=references, average="weighted"
        )[key]

    return results


In [None]:

# Fine-tuning configuration
training_args = TrainingArguments(
    output_dir="./codebert_results",
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.06,
    weight_decay=0.05,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation, checkpointing and logging cadence
    # NOTE(review): newer transformers releases may spell this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=100,
    # Model selection: "f1" is the key returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [None]:

# Assemble the Trainer from the model, its configuration, the data splits and the metric function.
# NOTE(review): test_ds is the evaluation set used during training (and therefore for
# load_best_model_at_end) and is also the set passed to trainer.predict later on, so the
# final scores are measured on data that took part in checkpoint selection — confirm this
# is acceptable or introduce a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)


In [None]:

# Run fine-tuning with the model, data and arguments configured on `trainer`
trainer.train()


In [None]:
# Write the model and the tokenizer into the same directory
# NOTE(review): with load_best_model_at_end=True this is presumably the best checkpoint by f1 — confirm
trainer.save_model("./codebert_refactor_suggester")
tokenizer.save_pretrained("./codebert_refactor_suggester")


In [None]:
# Predict on the held-out split and take the arg-max class per example
preds_output = trainer.predict(test_ds)
y_pred = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# Pass the full list of class ids explicitly: without `labels=`, confusion_matrix only covers
# the classes that occur in y_true or y_pred, so a class missing from the test split would
# make the matrix smaller than (and misaligned with) the tick labels used for plotting.
class_ids = sorted(id2label)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
# Class names in the same order as the matrix rows / columns
labels = [id2label[class_id] for class_id in class_ids]

In [None]:
# Draw the confusion matrix as an annotated heatmap (y axis: true label, x axis: predicted label)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()