In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree

In [2]:
# Location of the CSV holding one row per feature model: its structural
# metrics plus the maintainability label in the last column.
DATASET_PATH = './files/dataset.csv'

dataset = pd.read_csv(DATASET_PATH)

# Preview the first rows to sanity-check the parsed columns.
dataset.head()

Unnamed: 0,Feature Model,NF,NM,NTop,NLeaf,DTMax,CogC,FEX,FoC,SCDF,MCDF,RDen,RoV,NVC,NGOr,NGXOr,Maintainability
0,RaaS,32,2,19,28,3,3,28,0.53125,0,0,0.0,4.857143,125829120.0,3,0,moderate
1,AvionFEatures,10,4,5,7,3,2,7,0.1,0,0,0.0,2.2,12.0,2,0,verygood
2,Bike Shop,21,3,5,15,5,5,27,0.095238,9,3,0.0,2.272727,70.0,1,4,good
3,Jetbrains,33,13,3,25,4,2,28,0.272727,3,0,2.0,3.4,4884.0,0,2,moderate
4,Ubuntu,11,3,4,7,3,3,7,0.090909,0,0,1.0,1.857143,12.0,0,3,verygood


In [3]:
# Candidate feature subsets, keyed by the technique that produced them.
# Columns are addressed by name instead of position so the selection keeps
# working (or fails loudly with a KeyError) if the CSV column order changes.
FEATURE_SETS = {
    # Select From Model (previously iloc columns [1, 2, 3, 4, 5, 6, 8, 9, 11])
    "select_from_model": ["NF", "NM", "NTop", "NLeaf", "DTMax", "CogC", "FoC", "SCDF", "RDen"],
    # Correlation Spearman Mean (previously iloc columns [1, 4, 5, 6, 7, 13])
    "correlation_spearman_mean": ["NF", "NLeaf", "DTMax", "CogC", "FEX", "NVC"],
    # Full Dataset: everything between the model-name column and the label
    "full_dataset": list(dataset.columns[1:-1]),
}

# Switch this key to evaluate a different subset.
FEATURE_SET = "select_from_model"
TARGET_COLUMN = "Maintainability"  # last column of the dataset

X = dataset[FEATURE_SETS[FEATURE_SET]]
y = dataset[TARGET_COLUMN]

X.head()

Unnamed: 0,NF,NM,NTop,NLeaf,DTMax,CogC,FoC,SCDF,RDen
0,32,2,19,28,3,3,0.53125,0,0.0
1,10,4,5,7,3,2,0.1,0,0.0
2,21,3,5,15,5,5,0.095238,9,0.0
3,33,13,3,25,4,2,0.272727,3,2.0
4,11,3,4,7,3,3,0.090909,0,1.0


In [4]:
# NOTE(review): this silences *every* warning for the rest of the kernel
# session, not only the ones emitted during scoring. Kept to preserve the
# notebook's existing output; consider narrowing it to a specific category.
warnings.filterwarnings('ignore')


def report_cv_scores(label, estimator, features, target, scoring=None, cv=10):
    """Cross-validate ``estimator`` and print a summary of the fold scores.

    Parameters
    ----------
    label : str
        Heading printed above the summary (e.g. ``"Accuracy"``).
    estimator : estimator object
        Passed unchanged to ``cross_val_score``.
    features, target
        Feature matrix and labels passed to ``cross_val_score``.
    scoring : str or None, default None
        Scoring name forwarded to ``cross_val_score``; ``None`` uses the
        estimator's own ``score`` method.
    cv : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    numpy.ndarray
        The per-fold scores, so callers can keep them for later use.
    """
    scores = cross_val_score(estimator, features, target, cv=cv, scoring=scoring)

    print(f"\n>> {label}")
    print("Scores:", list(scores))
    print("Min:", np.min(scores))
    print("Max:", np.max(scores))
    print("Mean:", np.mean(scores))
    print("Standard Deviation:", np.std(scores))

    return scores


clf = DecisionTreeClassifier(random_state=0, max_depth=5)
pipeline = make_pipeline(clf)
# Fit on the full data set. The pipeline wraps `clf` itself, and the later
# tree-export cell reads the fitted tree from `clf`, so this call must stay.
pipeline.fit(X, y)

# One cross-validated report per metric; the score arrays keep their
# original names so any later cell that reads them still works.
scores_accuracy = report_cv_scores("Accuracy", pipeline, X, y)
scores_precision = report_cv_scores("Precision", pipeline, X, y, scoring="precision_weighted")
scores_recall = report_cv_scores("Recall", pipeline, X, y, scoring="recall_weighted")
scores_f1 = report_cv_scores("F1", pipeline, X, y, scoring="f1_weighted")
scores_roc_auc = report_cv_scores("ROC_AUC", pipeline, X, y, scoring="roc_auc_ovo_weighted")


>> Accuracy
Scores: [0.7428571428571429, 0.8285714285714286, 0.8235294117647058, 0.8235294117647058, 0.7352941176470589, 0.8823529411764706, 0.7352941176470589, 0.9411764705882353, 0.8235294117647058, 0.7647058823529411]
Min: 0.7352941176470589
Max: 0.9411764705882353
Mean: 0.8100840336134454
Standard Deviation: 0.06407638792774241

>> Precision
Scores: [0.8018253968253969, 0.8222222222222223, 0.8712121212121212, 0.8676470588235294, 0.7156862745098039, 0.8860294117647058, 0.8014705882352942, 0.9527310924369746, 0.8172268907563025, 0.6409897292250233]
Min: 0.6409897292250233
Max: 0.9527310924369746
Mean: 0.8177040786011374
Standard Deviation: 0.0839225543083284

>> Recall
Scores: [0.7428571428571429, 0.8285714285714286, 0.8235294117647058, 0.8235294117647058, 0.7352941176470589, 0.8823529411764706, 0.7352941176470589, 0.9411764705882353, 0.8235294117647058, 0.7647058823529411]
Min: 0.7352941176470589
Max: 0.9411764705882353
Mean: 0.8100840336134454
Standard Deviation: 0.064076387927742

In [5]:
# Render the fitted tree as indented text rules. Feature names are taken
# from the columns of X (the frame the tree was trained on) rather than a
# hand-maintained list, so the labels cannot drift out of sync when a
# different feature subset is selected.
tree_text_rep = export_text(clf, feature_names=list(X.columns))

print(tree_text_rep)

|--- DTMax <= 3.50
|   |--- NLeaf <= 19.50
|   |   |--- CogC <= 5.50
|   |   |   |--- class: verygood
|   |   |--- CogC >  5.50
|   |   |   |--- CogC <= 6.50
|   |   |   |   |--- class: good
|   |   |   |--- CogC >  6.50
|   |   |   |   |--- class: verygood
|   |--- NLeaf >  19.50
|   |   |--- NLeaf <= 29.50
|   |   |   |--- NF <= 30.00
|   |   |   |   |--- NF <= 27.50
|   |   |   |   |   |--- class: good
|   |   |   |   |--- NF >  27.50
|   |   |   |   |   |--- class: bad
|   |   |   |--- NF >  30.00
|   |   |   |   |--- class: moderate
|   |   |--- NLeaf >  29.50
|   |   |   |--- NTop <= 2.50
|   |   |   |   |--- class: verygood
|   |   |   |--- NTop >  2.50
|   |   |   |   |--- NM <= 8.00
|   |   |   |   |   |--- class: bad
|   |   |   |   |--- NM >  8.00
|   |   |   |   |   |--- class: verybad
|--- DTMax >  3.50
|   |--- NF <= 32.50
|   |   |--- NF <= 21.50
|   |   |   |--- FoC <= 0.27
|   |   |   |   |--- NTop <= 2.50
|   |   |   |   |   |--- class: good
|   |   |   |   |--- NTop 