# Import packages

In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, balanced_accuracy_score, recall_score, precision_score, accuracy_score, roc_auc_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# ML Feature engineering

In [2]:
# Remove irrelevant features and select important features
def Feature_Importance(df):
  print('Step: Running Feature_Importance')
  labels = df['fe'].values
  features = df.drop(['fe'],axis=1).values
  feature_names = list(df.drop(['fe'],axis=1).columns)
  # labels = df['is_long_method'].values
  # features = df.drop(['is_long_method'], axis='columns').values
  # feature_names = list(df.drop(['is_long_method'], axis='columns').columns)
  print(feature_names)

  model = lgb.LGBMRegressor(verbose = -1)
  model.fit(features, labels)

  feature_importances = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})
  feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)

  # Normalize & sort
  feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
  feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])
  feature_importances = feature_importances.sort_values('cumulative_importance')

  # Only keep important features with cumulative importance scores >= 90%
  cumulative_importance=0.90
  record_low_importance = feature_importances[feature_importances['cumulative_importance'] > cumulative_importance]
  to_drop = list(record_low_importance['feature'])
  print('Step: Finishing Feature_Importance, features to drop:', to_drop)
  return to_drop

In [7]:
# Remove redundant features
def Feature_Redundancy_Pearson(df):
  print('Step: Running Feature_Redundancy_Pearson')
  # Remove features with the redundancy > 99%
  correlation_threshold=0.95
  features = df.drop(['fe'],axis=1)
  # features = df.drop(['is_long_method'], axis='columns')
  corr_matrix = features.corr().abs()

  # Extract the upper triangle of the correlation matrix
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))

  # Find columns with correlation greater than 0.9
  to_drop = [column for column in upper.columns if any(upper[column] > correlation_threshold)]
  print('Step: Finishing Feature_Redundancy_Pearson, features to drop:', to_drop)
  return to_drop

In [8]:
# Perform feature engineering
def Auto_Feature_Engineering(df):
    print('Step: Running Auto_Feature_Engineering')
    dropped_features = Feature_Importance(df)
    # print(dropped_features)
    df = df.drop(columns = dropped_features)

    dropped_features = Feature_Redundancy_Pearson(df)
    # print(dropped_features)
    df = df.drop(columns = dropped_features)
    print('Step: Finishing Auto_Feature_Engineering')
    return df

# Data preparation

In [9]:
df = pd.read_csv('/content/defects_smells.csv')
df = df.iloc[:,11:] # remove first 11 columns
df = df.iloc[:,:-14] # remove all labels except fe
df = Auto_Feature_Engineering(df)
print(df.columns)
print(df.head())

Step: Running Auto_Feature_Engineering
Step: Running Feature_Importance
['CC', 'CCL', 'CCO', 'CI', 'CLC', 'CLLC', 'LDC', 'LLDC', 'LCOM5', 'NL', 'NLE', 'WMC', 'CBO', 'CBOI', 'NII', 'NOI', 'RFC', 'AD', 'CD', 'CLOC', 'DLOC', 'PDA', 'PUA', 'TCD', 'TCLOC', 'DIT', 'NOA', 'NOC', 'NOD', 'NOP', 'LLOC', 'LOC', 'NA', 'NG', 'NLA', 'NLG', 'NLM', 'NLPA', 'NLPM', 'NLS', 'NM', 'NOS', 'NPA', 'NPM', 'NS', 'TLLOC', 'TLOC', 'TNA', 'TNG', 'TNLA', 'TNLG', 'TNLM', 'TNLPA', 'TNLPM', 'TNLS', 'TNM', 'TNOS', 'TNPA', 'TNPM', 'TNS']
Step: Finishing Feature_Importance, features to drop: ['NOA', 'NOC', 'PDA', 'LDC', 'TCLOC', 'LLDC', 'CCO', 'TNLS', 'TNLG', 'NOP', 'TNS', 'NLPA', 'TNLPA', 'NOD', 'CCL', 'CI']
Step: Running Feature_Redundancy_Pearson
Step: Finishing Feature_Redundancy_Pearson, features to drop: ['CLC', 'CLLC', 'TCD', 'NOS', 'NPA', 'TLLOC', 'TLOC', 'TNA', 'TNLA', 'TNOS', 'TNPA']
Step: Finishing Auto_Feature_Engineering
Index(['CC', 'LCOM5', 'NL', 'NLE', 'WMC', 'CBO', 'CBOI', 'NII', 'NOI', 'RFC',
       'A

# Read and split data

In [10]:
X, y = df.drop(['fe'],axis=1), df['fe']
# X, y = df.drop(['is_long_method'], axis=1), df['is_long_method']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, train_size = 0.3, shuffle=True,random_state = 42, stratify=y)

# Techniques

## Random forest params

In [11]:
# Define the objective function
def objective(params):
    params = {
        "n_estimators": int(params['n_estimators']),
        "max_depth": int(params['max_depth']),
        "max_features": int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    model = RandomForestClassifier( **params)
    mcc = cross_val_score(model, Xtrain, ytrain, scoring='matthews_corrcoef', cv=StratifiedKFold(n_splits=3)).mean()
    return {'loss':-mcc, 'status': STATUS_OK }

# Define the hyperparameter configuration space
space = {
    "n_estimators": hp.quniform('n_estimators', 10, 100, 1),
    "max_depth": hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 64, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("RF: Hyperopt estimated optimum {}".format(best))

100%|██████████| 20/20 [01:42<00:00,  5.14s/trial, best loss: -0.6571828012529234]
RF: Hyperopt estimated optimum {'criterion': 0, 'max_depth': 26.0, 'max_features': 19.0, 'min_samples_leaf': 2.0, 'min_samples_split': 5.0, 'n_estimators': 79.0}


In [12]:
criterion = {0: 'gini', 1: 'entropy'}
trainedRF = RandomForestClassifier(criterion = criterion[best['criterion']],max_depth = int(best['max_depth']), max_features = int(best['max_features']),
                                   n_estimators = int(best['n_estimators']), min_samples_leaf = int(best['min_samples_leaf']), min_samples_split = int(best['min_samples_split'])
                                   ).fit(Xtrain, ytrain)
y_pred = trainedRF.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.7950778503264692
MCC: 0.7084318237375808
Recall: 0.7788437884378844
Precision: 0.8120030777122339
Accuracy: 0.8778443113772455
AUC: 0.8499920932506798
Balanced Accuracy: 0.8499920932506797


## Extra trees classifier

In [13]:
# Define the objective function
def objective(params):
    params = {
        "n_estimators": int(params['n_estimators']),
        "max_depth": int(params['max_depth']),
        "max_features": int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    model = ExtraTreesClassifier( **params)
    mcc = cross_val_score(model, Xtrain, ytrain, scoring='matthews_corrcoef', cv=StratifiedKFold(n_splits=3)).mean()
    return {'loss':-mcc, 'status': STATUS_OK }

best_ET_models = []
for i in range(5):
  # Define the hyperparameter configuration space
  space = {
      "n_estimators": hp.quniform('n_estimators', 10, 100, 1),
      "max_depth": hp.quniform('max_depth', 5, 50, 1),
      "max_features":hp.quniform('max_features', 1, 64, 1),
      "min_samples_split":hp.quniform('min_samples_split',2,11,1),
      "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
      "criterion":hp.choice('criterion',['gini','entropy'])
  }

  best = fmin(fn=objective,
              space=space,
              algo=tpe.suggest,
              max_evals=60)
  best_ET_models.append(best)
  print("RF: Hyperopt estimated optimum {}".format(best))

print(best_ET_models)

100%|██████████| 60/60 [01:56<00:00,  1.95s/trial, best loss: -0.6608833725248277]
RF: Hyperopt estimated optimum {'criterion': 1, 'max_depth': 29.0, 'max_features': 33.0, 'min_samples_leaf': 3.0, 'min_samples_split': 5.0, 'n_estimators': 74.0}
100%|██████████| 60/60 [01:59<00:00,  1.99s/trial, best loss: -0.6623925279393786]
RF: Hyperopt estimated optimum {'criterion': 0, 'max_depth': 23.0, 'max_features': 13.0, 'min_samples_leaf': 2.0, 'min_samples_split': 6.0, 'n_estimators': 80.0}
100%|██████████| 60/60 [01:24<00:00,  1.41s/trial, best loss: -0.6632533379972263]
RF: Hyperopt estimated optimum {'criterion': 1, 'max_depth': 44.0, 'max_features': 15.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 65.0}
100%|██████████| 60/60 [01:55<00:00,  1.93s/trial, best loss: -0.6690169497777125]
RF: Hyperopt estimated optimum {'criterion': 0, 'max_depth': 42.0, 'max_features': 14.0, 'min_samples_leaf': 1.0, 'min_samples_split': 3.0, 'n_estimators': 83.0}
100%|██████████| 60/

In [None]:
criterion = {0: 'gini', 1: 'entropy'}
trained_ET_models = []
for model in best_ET_models:
  trainedET = ExtraTreesClassifier(criterion = criterion[model['criterion']],max_depth = int(model['max_depth']), max_features = int(model['max_features']),
                                    n_estimators = int(model['n_estimators']), min_samples_leaf = int(model['min_samples_leaf']), min_samples_split = int(model['min_samples_split'])
                                    ).fit(Xtrain, ytrain)
  y_pred = trainedET.predict(Xtest)
  trained_ET_models.append(trainedET)

  print(trainedET)

  f1 = f1_score(ytest, y_pred)
  mcc = matthews_corrcoef(ytest, y_pred)
  recall = recall_score(ytest, y_pred)
  precision = precision_score(ytest, y_pred)
  accuracy = accuracy_score(ytest, y_pred)
  auc = roc_auc_score(ytest, y_pred)
  balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

  print(f"F1 score: {f1}")
  print(f"MCC: {mcc}")
  print(f"Recall: {recall}")
  print(f"Precision: {precision}")
  print(f"Accuracy: {accuracy}")
  print(f"AUC: {auc}")
  print(f"Balanced Accuracy: {balanced_accuracy}")

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy', max_depth=40, max_features=12,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=72, n_jobs=None, oob_score=False,
                     random_state=None, verbose=0, warm_start=False)
F1 score: 0.8047919293820933
MCC: 0.7229470348617137
Recall: 0.7849938499384994
Precision: 0.825614489003881
Accuracy: 0.8841317365269461
AUC: 0.8562408733285826
Balanced Accuracy: 0.8562408733285827
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy', max_depth=49, max_features=14,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=10, mi

## KNN params

In [None]:
space = {'n_neighbors': hp.choice('n_neighbors', np.arange(1, 10+1, dtype=int)), 'metric': hp.choice('metric',['euclidean', 'manhattan']), 'weights': hp.choice('weights',['uniform', 'distance'])}

def objective(space):
    model = KNeighborsClassifier(n_neighbors=space["n_neighbors"], weights=space["weights"], metric=space["metric"])
    accuracy = cross_val_score(model, Xtrain, ytrain, cv = 3).mean()
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 10,
            trials= trials)
best

100%|██████████| 10/10 [00:14<00:00,  1.48s/trial, best loss: -0.8265500247274762]


{'metric': 1, 'n_neighbors': 7, 'weights': 1}

In [None]:
metric = {0: 'euclidean', 1: 'manhattan'}
weights = {0: 'uniform', 1: 'distance'}
n_neighbors = {0:1, 1:2, 2:3, 3:4, 4:5, 5:6, 6:7, 7:8, 8:9, 9:10}

trainedKNN = KNeighborsClassifier(n_neighbors = n_neighbors[best['n_neighbors']], weights = weights[best['weights']], metric = metric[best['metric']]).fit(Xtrain, ytrain)
y_pred = trainedKNN.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.7476379910492292
MCC: 0.6390348674489124
Recall: 0.7397293972939729
Precision: 0.7557175169640613
Accuracy: 0.8480538922155688
AUC: 0.817578523283888
Balanced Accuracy: 0.817578523283888


## Decision trees params

In [None]:
# Define the objective function
def objective(params):
    params = {
        "max_depth": int(params['max_depth']),
        "max_features": int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    model = DecisionTreeClassifier( **params)
    accuracy = cross_val_score(model, Xtrain, ytrain, scoring='matthews_corrcoef', cv=StratifiedKFold(n_splits=3)).mean()
    return {'loss':-accuracy, 'status': STATUS_OK }

# Define the hyperparameter configuration space
space = {
    "max_depth": hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 64, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("DTs: Hyperopt estimated optimum {}".format(best))

100%|██████████| 20/20 [00:04<00:00,  4.92trial/s, best loss: -0.5568533679799742]
DTs: Hyperopt estimated optimum {'criterion': 1, 'max_depth': 32.0, 'max_features': 62.0, 'min_samples_leaf': 4.0, 'min_samples_split': 5.0}


In [None]:
criterion = {0: 'gini', 1: 'entropy'}
trainedDT = DecisionTreeClassifier(criterion = criterion[best['criterion']],max_depth = int(best['max_depth']), max_features = int(best['max_features']),
                                   min_samples_leaf = int(best['min_samples_leaf']), min_samples_split = int(best['min_samples_split'])).fit(Xtrain, ytrain)
y_pred = trainedDT.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.6843349508523081
MCC: 0.5486441844282879
Recall: 0.6765067650676507
Precision: 0.6923464249748238
Accuracy: 0.8101047904191617
AUC: 0.7725191167995598
Balanced Accuracy: 0.7725191167995596


## Stacking classifier

In [None]:
# cls1 = ('RF', trainedRF)
# cls3 = ('ET-base', ExtraTreesClassifier())
# cls4 = ('RF-base', RandomForestClassifier())
predictors = []
for i in range(len(trained_ET_models)):
  model_name = 'ET' + str(i)
  predictors.append((model_name, trained_ET_models[i]))

clf = StackingClassifier(estimators=predictors, final_estimator=GradientBoostingClassifier(), cv='prefit')

In [None]:
trainedCLF = clf.fit(Xtrain, ytrain)

y_pred = trainedCLF.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.8047919293820933
MCC: 0.7229470348617137
Recall: 0.7849938499384994
Precision: 0.825614489003881
Accuracy: 0.8841317365269461
AUC: 0.8562408733285826
Balanced Accuracy: 0.8562408733285827


# Santos et. al paper

In [None]:
# !pip install pycaret
from pycaret.classification import *
from imblearn.ensemble import BalancedRandomForestClassifier

s = setup(data=df, target='fe', feature_selection=True,
          remove_multicollinearity=True,
          multicollinearity_threshold=0.85,
          feature_selection_method='classic',
          fix_imbalance=True, fold=10, fix_imbalance_method='deep_forest')

NameError: ignored

In [None]:
top5_models = compare_models(n_select=5, sort='f1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8843,0.9451,0.7801,0.8298,0.8041,0.7222,0.7229,2.077
rf,Random Forest Classifier,0.8827,0.9425,0.7747,0.8289,0.8007,0.7178,0.7187,2.093
xgboost,Extreme Gradient Boosting,0.8677,0.9291,0.7525,0.801,0.7757,0.682,0.6829,1.257
lightgbm,Light Gradient Boosting Machine,0.8621,0.9285,0.7395,0.7939,0.7654,0.668,0.6691,0.859
gbc,Gradient Boosting Classifier,0.8519,0.9159,0.7186,0.7781,0.7469,0.6425,0.6437,1.963
ada,Ada Boost Classifier,0.8415,0.9048,0.7107,0.7546,0.7318,0.6194,0.6202,1.311
knn,K Neighbors Classifier,0.8378,0.8862,0.7055,0.7473,0.7257,0.6107,0.6113,0.339
dt,Decision Tree Classifier,0.8266,0.7969,0.7208,0.7127,0.7166,0.5918,0.5918,0.397
svm,SVM - Linear Kernel,0.8197,0.0,0.6084,0.7569,0.6697,0.5487,0.5579,0.501
lr,Logistic Regression,0.8259,0.8932,0.5604,0.8091,0.6617,0.5497,0.5668,1.134


Processing:   0%|          | 0/69 [00:00<?, ?it/s]