In [11]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    f1_score, 
    precision_score, 
    recall_score, 
    accuracy_score, 
    classification_report
)
import pickle
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.impute import SimpleImputer

In [12]:
data = pd.read_pickle("/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/metrics/merged/merged_metrics_class.pkl")
data.head()

Unnamed: 0,sample_id,severity,file,class,type,cbo,cboModified,fanin,fanout,wmc,...,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty
0,5553,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,IAopReferenceModel,interface,9,9,0,9,16,...,0,0,0,0,0,0,0,34,1,0
1,9341,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,CachePolicyConfOther,class,7,7,0,7,2,...,1,0,1,1,0,0,0,60,1,0
2,12232,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,DruidPooledCallableStatement,class,23,24,1,23,232,...,8,0,7,2,0,0,0,90,1,0
3,10234,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,GatewayHub$Gateway$GatewayEndpoint,innerclass,3,3,0,3,6,...,3,0,3,0,0,0,0,46,9,0
4,10234,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,GatewayHub$Gateway,innerclass,9,9,0,9,18,...,9,0,9,1,0,2,0,114,9,0


In [13]:
data['label'] = np.where(data.severity == 'none', 0, 1)
data.head()

Unnamed: 0,sample_id,severity,file,class,type,cbo,cboModified,fanin,fanout,wmc,...,mathOperationsQty,variablesQty,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty,label
0,5553,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,IAopReferenceModel,interface,9,9,0,9,16,...,0,0,0,0,0,0,34,1,0,1
1,9341,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,CachePolicyConfOther,class,7,7,0,7,2,...,0,1,1,0,0,0,60,1,0,1
2,12232,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,DruidPooledCallableStatement,class,23,24,1,23,232,...,0,7,2,0,0,0,90,1,0,1
3,10234,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,GatewayHub$Gateway$GatewayEndpoint,innerclass,3,3,0,3,6,...,0,3,0,0,0,0,46,9,0,1
4,10234,critical,/Users/mac/Desktop/Code_Smell_Detection/datase...,GatewayHub$Gateway,innerclass,9,9,0,9,18,...,0,9,1,0,2,0,114,9,0,1


In [14]:

# Prepare the data
X = data.drop(columns=['sample_id', 'file', 'class', 'type', 'severity', 'label']).values  # Drop non-feature columns
y = data['label'].values  # Use 'severity' as the label
    # Define class weights
class_weights = {0: 1, 1: 4}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

    # Define classifiers with hyperparameter grids
classifiers = {
        "Random Forest": {
            "model": RandomForestClassifier(random_state=42, class_weight=class_weights),
            "params": {
                "n_estimators": [50, 100, 200],
                "max_depth": [5,10, 20, None],
                "min_samples_split": [2, 5, 10]
            }
        },
        "Bagging": {
            "model": BaggingClassifier(estimator=RandomForestClassifier(random_state=42, class_weight=class_weights), random_state=42),
            "params": {
                "n_estimators": [10, 50, 100],
                "max_samples": [0.5, 0.7, 1.0]
            }
        },
        "XGBoost": {
            "model": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
            "params": {
                "n_estimators": [50, 100, 200],
                "max_depth": [3, 6, 10],
                "learning_rate": [0.01, 0.1, 0.3]
            }
        },
        "SVM": {
            "model": SVC(probability=True, class_weight=class_weights, random_state=42),
            "params": {
                "C": [0.1, 1, 10],
                "kernel": ["linear", "rbf", "poly"],
                "gamma": ["scale", "auto"]
            }
        }
    }

In [15]:
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Dictionary to store cross-validation results
results = {}

# Perform cross-validation for each classifier
for clf_name, clf in classifiers.items():
    metrics_summary = {
        "f1_micro": [],
        "f1_macro": [],
        "accuracy": [],
        "precision_macro": [],
        "recall_macro": []
    }
    
    for train_idx, val_idx in skf.split(X_train, y_train):
        # Split the training data into training and validation folds
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
        
        # Train the classifier
        clf.fit(X_train_fold, y_train_fold)
        
        # Make predictions on the validation set
        y_pred = clf.predict(X_val_fold)
        
        # Calculate metrics
        metrics_summary["f1_micro"].append(f1_score(y_val_fold, y_pred, average='micro'))
        metrics_summary["f1_macro"].append(f1_score(y_val_fold, y_pred, average='macro'))
        metrics_summary["accuracy"].append(accuracy_score(y_val_fold, y_pred))
        metrics_summary["precision_macro"].append(precision_score(y_val_fold, y_pred, average='macro'))
        metrics_summary["recall_macro"].append(recall_score(y_val_fold, y_pred, average='macro'))
    
    # Store mean metrics for the classifier
    results[clf_name] = {metric: np.mean(scores) for metric, scores in metrics_summary.items()}

# Print cross-validation results
print("\nCross-Validation Results:")
for clf_name, metrics in results.items():
    print(f"\n{clf_name} Metrics:")
    for metric, value in metrics.items():
        print(f"{metric.replace('_', ' ').capitalize()}: {value:.4f}")


Cross-Validation Results:

SVM Metrics:
F1 micro: 0.7717
F1 macro: 0.6401
Accuracy: 0.7717
Precision macro: 0.6272
Recall macro: 0.7032

Random Forest Metrics:
F1 micro: 0.8888
F1 macro: 0.6850
Accuracy: 0.8888
Precision macro: 0.8119
Recall macro: 0.6475

Bagging Metrics:
F1 micro: 0.8893
F1 macro: 0.6689
Accuracy: 0.8893
Precision macro: 0.8433
Recall macro: 0.6308

XGBoost Metrics:
F1 micro: 0.8875
F1 macro: 0.7159
Accuracy: 0.8875
Precision macro: 0.7788
Recall macro: 0.6859


In [16]:
# Final evaluation on the test set
final_results = {}

for clf_name, clf in classifiers.items():
    # Train the classifier on the entire training set
    clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred_test = clf.predict(X_test)
    
    # Calculate test metrics
    f1_micro_test = f1_score(y_test, y_pred_test, average='micro')
    f1_macro_test = f1_score(y_test, y_pred_test, average='macro')
    precision_test = precision_score(y_test, y_pred_test, average='macro')
    recall_test = recall_score(y_test, y_pred_test, average='macro')
    accuracy_test = accuracy_score(y_test, y_pred_test)
    report_test = classification_report(y_test, y_pred_test)
    
    # Save final metrics for the classifier
    final_results[clf_name] = {
        "f1_micro": f1_micro_test,
        "f1_macro": f1_macro_test,
        "precision_macro": precision_test,
        "recall_macro": recall_test,
        "accuracy": accuracy_test,
        "classification_report": report_test
    }

# Print final test results
print("\nFinal Test Results:")
for clf_name, metrics in final_results.items():
    print(f"\n{clf_name} Metrics:")
    for metric, value in metrics.items():
        if metric != "classification_report":
            print(f"{metric.capitalize()}: {value:.4f}")
    print(f"Classification Report:\n{metrics['classification_report']}")



Final Test Results:

SVM Metrics:
F1_micro: 0.7731
F1_macro: 0.6452
Precision_macro: 0.6301
Recall_macro: 0.7102
Accuracy: 0.7731
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.80      0.86       858
           1       0.33      0.62      0.43       138

    accuracy                           0.77       996
   macro avg       0.63      0.71      0.65       996
weighted avg       0.85      0.77      0.80       996


Random Forest Metrics:
F1_micro: 0.8916
F1_macro: 0.7241
Precision_macro: 0.7966
Recall_macro: 0.6877
Accuracy: 0.8916
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       858
           1       0.68      0.41      0.51       138

    accuracy                           0.89       996
   macro avg       0.80      0.69      0.72       996
weighted avg       0.88      0.89      0.88       996


Bagging Metrics:
F1_micro: 0.8815
F1_macro: 0.67