# Import packages

In [87]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, balanced_accuracy_score, recall_score, precision_score, accuracy_score, roc_auc_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Read data and split

In [9]:
def read_data(filename):

    data=pd.read_csv(filename)
    features=data.drop(['fe'], axis='columns')
    label=data['fe']

    test_features=features.values.reshape(-1,61)
    test_label=label.values.reshape(-1,1)

    return test_features, test_label.ravel()

# AutoML Feature engineering

In [72]:
# Remove irrelevant features and select important features
def Feature_Importance(df):
  print('Step: Running Feature_Importance')
  labels = df['fe'].values
  features = df.drop(['fe'],axis=1).values
  feature_names = list(df.drop(['fe'],axis=1).columns)
  print(feature_names)

  model = lgb.LGBMRegressor(verbose = -1)
  model.fit(features, labels)

  feature_importances = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})
  feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)

  # Normalize & sort
  feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
  feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])
  feature_importances = feature_importances.sort_values('cumulative_importance')

  # Only keep important features with cumulative importance scores >= 90%
  cumulative_importance=0.90
  record_low_importance = feature_importances[feature_importances['cumulative_importance'] > cumulative_importance]
  to_drop = list(record_low_importance['feature'])
  print('Step: Finishing Feature_Importance, features to drop:', to_drop)
  return to_drop

In [78]:
# Remove redundant features
def Feature_Redundancy_Pearson(df):
  print('Step: Running Feature_Redundancy_Pearson')
  # Remove features with the redundancy > 90%
  correlation_threshold=0.90
  features = df.drop(['fe'],axis=1)
  corr_matrix = features.corr().abs()

  # Extract the upper triangle of the correlation matrix
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))

  # Find columns with correlation greater than 0.9
  to_drop = [column for column in upper.columns if any(upper[column] > correlation_threshold)]
  print('Step: Finishing Feature_Redundancy_Pearson, features to drop:', to_drop)
  return to_drop

In [79]:
# Perform feature engineering
def Auto_Feature_Engineering(df):
    print('Step: Running Auto_Feature_Engineering')
    dropped_features = Feature_Importance(df)
    # print(dropped_features)
    df = df.drop(columns = dropped_features)

    dropped_features = Feature_Redundancy_Pearson(df)
    # print(dropped_features)
    df = df.drop(columns = dropped_features)
    print('Step: Finishing Auto_Feature_Engineering')
    return df

# Data preparation

In [80]:
df = pd.read_csv('/content/defects_smells.csv')
df = df.iloc[:,11:] # remove first 11 columns
df = df.iloc[:,:-14] # remove all labels except fe
df = Auto_Feature_Engineering(df)
print(df.columns)
print(df.head())

Step: Running Auto_Feature_Engineering
Step: Running Feature_Importance
['CC', 'CCL', 'CCO', 'CI', 'CLC', 'CLLC', 'LDC', 'LLDC', 'LCOM5', 'NL', 'NLE', 'WMC', 'CBO', 'CBOI', 'NII', 'NOI', 'RFC', 'AD', 'CD', 'CLOC', 'DLOC', 'PDA', 'PUA', 'TCD', 'TCLOC', 'DIT', 'NOA', 'NOC', 'NOD', 'NOP', 'LLOC', 'LOC', 'NA', 'NG', 'NLA', 'NLG', 'NLM', 'NLPA', 'NLPM', 'NLS', 'NM', 'NOS', 'NPA', 'NPM', 'NS', 'TLLOC', 'TLOC', 'TNA', 'TNG', 'TNLA', 'TNLG', 'TNLM', 'TNLPA', 'TNLPM', 'TNLS', 'TNM', 'TNOS', 'TNPA', 'TNPM', 'TNS']
Step: Finishing Feature_Importance, features to drop: ['NOA', 'NOC', 'PDA', 'LDC', 'TCLOC', 'LLDC', 'CCO', 'TNLS', 'TNLG', 'NOP', 'TNS', 'NLPA', 'TNLPA', 'NOD', 'CCL', 'CI']
Step: Running Feature_Redundancy_Pearson
Step: Finishing Feature_Redundancy_Pearson, features to drop: ['CLC', 'CLLC', 'RFC', 'TCD', 'LLOC', 'LOC', 'NLPM', 'NOS', 'NPA', 'NPM', 'TLLOC', 'TLOC', 'TNA', 'TNG', 'TNLA', 'TNLM', 'TNLPM', 'TNM', 'TNOS', 'TNPA', 'TNPM']
Step: Finishing Auto_Feature_Engineering
Index(['CC'

# Read and split data

In [81]:
X, y = df.drop(['fe'],axis=1), df['fe']
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, train_size = 0.3, shuffle=True,random_state = 42, stratify=y)

# Techniques

## Random forest params

In [109]:
# Define the objective function
def objective(params):
    params = {
        "n_estimators": int(params['n_estimators']),
        "max_depth": int(params['max_depth']),
        "max_features": int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    model = RandomForestClassifier( **params)
    accuracy = cross_val_score(model, Xtrain, ytrain, scoring='matthews_corrcoef', cv=StratifiedKFold(n_splits=3)).mean()
    return {'loss':-accuracy, 'status': STATUS_OK }

# Define the hyperparameter configuration space
space = {
    "n_estimators": hp.quniform('n_estimators', 10, 100, 1),
    "max_depth": hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 64, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("RF: Hyperopt estimated optimum {}".format(best))

100%|██████████| 20/20 [00:59<00:00,  2.98s/trial, best loss: -0.6509108367182556]
RF: Hyperopt estimated optimum {'criterion': 1, 'max_depth': 43.0, 'max_features': 9.0, 'min_samples_leaf': 4.0, 'min_samples_split': 3.0, 'n_estimators': 94.0}


In [110]:
criterion = {0: 'gini', 1: 'entropy'}
trainedRF = RandomForestClassifier(criterion = criterion[best['criterion']],max_depth = int(best['max_depth']), max_features = int(best['max_features']),
                                   n_estimators = int(best['n_estimators']), min_samples_leaf = int(best['min_samples_leaf']), min_samples_split = int(best['min_samples_split'])
                                   ).fit(Xtrain, ytrain)
y_pred = trainedRF.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.7782469609724888
MCC: 0.6879041515654386
Recall: 0.7480934809348093
Precision: 0.8109333333333333
Accuracy: 0.8702844311377246
AUC: 0.8359079561747741
Balanced Accuracy: 0.8359079561747742


## KNN params

In [100]:
space = {'n_neighbors': hp.choice('n_neighbors', np.arange(1, 10+1, dtype=int)), 'metric': hp.choice('metric',['euclidean', 'manhattan']), 'weights': hp.choice('weights',['uniform', 'distance'])}

def objective(space):
    model = KNeighborsClassifier(n_neighbors=space["n_neighbors"], weights=space["weights"], metric=space["metric"])
    accuracy = cross_val_score(model, Xtrain, ytrain, cv = 3).mean()
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 10,
            trials= trials)
best

100%|██████████| 10/10 [00:04<00:00,  2.42trial/s, best loss: -0.8321394226985418]


{'metric': 1, 'n_neighbors': 9, 'weights': 1}

In [101]:
metric = {0: 'euclidean', 1: 'manhattan'}
weights = {0: 'uniform', 1: 'distance'}
n_neighbors = {0:1, 1:2, 2:3, 3:4, 4:5, 5:6, 6:7, 7:8, 8:9, 9:10}

trainedKNN = KNeighborsClassifier(n_neighbors = n_neighbors[best['n_neighbors']], weights = weights[best['weights']], metric = metric[best['metric']]).fit(Xtrain, ytrain)
y_pred = trainedKNN.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.7379255740300871
MCC: 0.6382354444329864
Recall: 0.6878228782287823
Precision: 0.7959009393680615
Accuracy: 0.8513473053892215
AUC: 0.8053423159298835
Balanced Accuracy: 0.8053423159298834


## Decision trees params

In [111]:
# Define the objective function
def objective(params):
    params = {
        "max_depth": int(params['max_depth']),
        "max_features": int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    model = DecisionTreeClassifier( **params)
    accuracy = cross_val_score(model, Xtrain, ytrain, scoring='matthews_corrcoef', cv=StratifiedKFold(n_splits=3)).mean()
    return {'loss':-accuracy, 'status': STATUS_OK }

# Define the hyperparameter configuration space
space = {
    "max_depth": hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 64, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['gini','entropy'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("DTs: Hyperopt estimated optimum {}".format(best))

100%|██████████| 20/20 [00:02<00:00,  7.56trial/s, best loss: -0.5649960034451268]
DTs: Hyperopt estimated optimum {'criterion': 1, 'max_depth': 5.0, 'max_features': 36.0, 'min_samples_leaf': 10.0, 'min_samples_split': 7.0}


In [112]:
criterion = {0: 'gini', 1: 'entropy'}
trainedDT = DecisionTreeClassifier(criterion = criterion[best['criterion']],max_depth = int(best['max_depth']), max_features = int(best['max_features']),
                                   min_samples_leaf = int(best['min_samples_leaf']), min_samples_split = int(best['min_samples_split'])).fit(Xtrain, ytrain)
y_pred = trainedDT.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.7246740220661987
MCC: 0.6078089804554989
Recall: 0.7109471094710947
Precision: 0.7389414472002046
Accuracy: 0.8356287425149701
AUC: 0.8005515536597002
Balanced Accuracy: 0.8005515536597001


## Stacking classifier

In [119]:
cls1 = ('RF', trainedRF)
cls2 = ('DT', trainedDT)
cls3 = ('KNN', trainedKNN)
predictors = [cls1, cls2, cls3]
clf = StackingClassifier(estimators=predictors, final_estimator=RandomForestClassifier())

In [120]:
trainedCLF = clf.fit(Xtrain, ytrain)

y_pred = trainedCLF.predict(Xtest)

f1 = f1_score(ytest, y_pred)
mcc = matthews_corrcoef(ytest, y_pred)
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)
auc = roc_auc_score(ytest, y_pred)
balanced_accuracy = balanced_accuracy_score(ytest, y_pred)

print(f"F1 score: {f1}")
print(f"MCC: {mcc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"Balanced Accuracy: {balanced_accuracy}")

F1 score: 0.7653695018380022
MCC: 0.6678314663326316
Recall: 0.7426814268142682
Precision: 0.7894874476987448
Accuracy: 0.8614520958083832
AUC: 0.8280378624119754
Balanced Accuracy: 0.8280378624119754
