In [1]:
import pandas as pd
from scipy.io import arff
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import confusion_matrix
import seaborn as sn
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.xmeans import xmeans, splitting_type
from imblearn.over_sampling import SMOTE
from implicitModel import Implicit_ME


In [2]:
def DecisionTree(X_train,y_train,X_test,y_test):
    clf = DecisionTreeClassifier(criterion = "entropy", splitter = "random").fit(X_train,y_train)
    y_pred_test = clf.predict(X_test)
    
    #matrices
    fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_test)
    auc = metrics.roc_auc_score(y_test,  y_pred_test)
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": auc,
      "Prediction": y_pred_test
      
    }
    return result

In [3]:
def SVM(X_train,y_train,X_test,y_test):
    clf = svm.SVC().fit(X_train,y_train)
    y_pred_test = clf.predict(X_test)
    
    #matrices
    fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_test)
    auc = metrics.roc_auc_score(y_test,  y_pred_test)
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": auc,
      "Prediction": y_pred_test
      
    }
    return result

In [4]:
def KNN(X_train,y_train,X_test,y_test):
    clf = KNeighborsClassifier(n_neighbors=3).fit(X_train,y_train)
    y_pred_test = clf.predict(X_test)
    
    #matrices
    fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_test)
    auc = metrics.roc_auc_score(y_test,  y_pred_test)
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": auc,
      "Prediction": y_pred_test
      
    }
    return result

In [5]:
def Logistic(X_train,y_train,X_test,y_test):
    clf = LogisticRegression(max_iter=100,penalty = 'none').fit(X_train,y_train)
    y_pred_test = clf.predict(X_test)
    
    #matrices
    fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_test)
    auc = metrics.roc_auc_score(y_test,  y_pred_test)
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": auc,
      "Prediction": y_pred_test
      
    }
    return result

In [6]:
def gate_ntw(X_train,y_train,X_test,y_test):
    gateDT = DecisionTreeClassifier(criterion = "entropy", splitter = "best").fit(X_train,y_train)
    gateSVM = svm.SVC(probability=True).fit(X_train,y_train)
    gateKNN = KNeighborsClassifier(n_neighbors=3).fit(X_train,y_train)
    gateLOG = LogisticRegression(max_iter=10000).fit(X_train,y_train)
    voting_clf = VotingClassifier(
    estimators=[('DecisionTree',gateDT), ('SVM',gateSVM),('KNN',gateKNN),('Logistic',gateLOG)],voting='soft')
    voting_clf.fit(X_train, y_train)
    final_predictions = voting_clf.predict(X_test)
    print("Accuracy of Mixture of Experts:",metrics.accuracy_score(y_test, final_predictions))
    print("F1 score", f1_score(y_test,final_predictions, average='macro'))
    print("AUC score",  metrics.roc_auc_score(y_test,  final_predictions))
    

In [7]:
data = arff.loadarff('../dataSet/PROMISE/ivy-2.0.arff')
df = pd.DataFrame(data[0])
df.head()


Unnamed: 0,wmc,dit,noc,cbo,rfc,lcom,ca,ce,npm,lcom3,...,dam,moa,mfa,cam,ic,cbm,amc,max_cc,avg_cc,defects
0,28.0,1.0,0.0,32.0,82.0,374.0,26.0,7.0,24.0,1.012346,...,0.166667,0.0,0.0,0.226337,0.0,0.0,31.642857,23.0,2.6786,2.0
1,6.0,1.0,2.0,3.0,7.0,3.0,2.0,1.0,4.0,0.6,...,1.0,0.0,0.0,0.444444,0.0,0.0,4.666667,1.0,0.6667,0.0
2,4.0,2.0,0.0,5.0,6.0,4.0,1.0,4.0,2.0,0.666667,...,1.0,1.0,0.5,0.5,0.0,0.0,4.0,1.0,0.5,0.0
3,4.0,1.0,0.0,9.0,4.0,6.0,9.0,0.0,4.0,2.0,...,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,1.0,1.0,0.0
4,1.0,1.0,0.0,8.0,1.0,0.0,6.0,2.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [8]:
df.isnull().any().sum()


0

In [9]:
from sklearn.preprocessing import StandardScaler  
X= df.iloc[ : , :-1].values
y= df['defects'].apply(lambda x : 1 if(x > 0) else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
st_x= StandardScaler()  
X_train= st_x.fit_transform(X_train) 
X_test= st_x.transform(X_test)  
X_train

array([[ 2.71722263,  0.11685527,  0.43092059, ...,  0.89066565,
         1.36993607,  1.01450847],
       [-0.43639588,  0.89867264, -0.28558051, ..., -0.32190722,
        -0.57795532, -0.51855932],
       [ 1.38299941,  1.68049002, -0.28558051, ...,  0.11204913,
        -0.33446889, -0.27030934],
       ...,
       [-0.61833541, -0.66496211, -0.28558051, ...,  1.05488953,
         0.88296322,  2.83281549],
       [-0.37574937, -0.66496211, -0.28558051, ...,  0.64209333,
        -0.57795532, -0.68401794],
       [-0.25445635, -0.66496211, -0.28558051, ...,  3.60451314,
         0.15250395,  0.35031563]])

In [10]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(X_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [11]:
sel_.get_support()
X_train = sel_.transform(X_train)
X_test = sel_.transform(X_test)
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [17]:
experts = []
experts.append(DecisionTree)
experts.append(KNN)
experts.append(SVM)
experts.append(Logistic)
Implicit_ME(X_train,y_train,X_test,y_test,2,experts,gate_ntw)

Accuracy of Mixture of Experts: 0.8450704225352113
F1 score 0.5342874180083482
AUC score 0.5313620071684588


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
