In [61]:
# Imports required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#Module-level score lists; nb_func appends one value per cross-validation fold
accuracy, precision, recall = [], [], []

#Columns shared by all three Python datasets (label column excluded)
_python_common = [
    'complexity', 'token_count', 'loc', 'comments', 'multi', 'blank', 'code_comment',
    'h1', 'h2', 'N1', 'N2', 'vocabulary', 'length', 'calculated_length',
    'volume', 'difficulty', 'effort', 'time', 'bugs',
]
#Additional columns read for Python_02 and Python_03
_python_change = ['change_type', 'added', 'removed']
#Additional columns read for Python_03 only
_python_dynamic = ['introspection', 'object_changes', 'code_generation', 'library_loading']
#Label column of the Python datasets
_python_label = ['defect']

#Columns to read from Python_01 dataset
p1 = _python_common + _python_label

#Columns to read from Python_02 dataset
p2 = _python_common + _python_change + _python_label

#Columns to read from Python_03 dataset
p3 = _python_common + _python_change + _python_dynamic + _python_label

#Columns to read from CM1 dataset
c1 = [
    'loc', 'v(g)',
    'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
    'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
    'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd',
    'branchCount',
    'defects',
]

#Columns to read from KC3 dataset
k3 = [
    "loc_blank", "loc_code_and_comment", "loc_comments",
    "cyclomatic_complexity", "loc_executable",
    "halstead_content", "halstead_difficulty", "halstead_effort",
    "halstead_error_est", "halstead_length", "halstead_level",
    "halstead_prog_time", "halstead_volumn",
    "num_operands", "num_operators",
    "num_unique_operands", "num_unique_operators",
    "number_of_lines", "loc_total",
    "defects",
]

In [63]:
#Function to perform classification algorithms
def nb_func(df, algo, k=10, n_splits=10, n_repeats=5, random_state=None):
    """Plot a chi-squared feature ranking and cross-validate ``algo`` on ``df``.

    The last column of ``df`` is used as the label and all other columns as
    features. The ``k`` highest chi-squared scores are shown as a bar chart,
    then ``algo`` is fitted and scored with repeated k-fold cross-validation.
    The mean accuracy, precision and recall of *this call only* are printed
    through ``evaluation_func``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column.
    algo : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    k : int, default 10
        Number of top-scoring features to plot; capped at the number of
        feature columns.
    n_splits : int, default 10
        Number of folds per repetition.
    n_repeats : int, default 5
        Number of times the k-fold split is repeated.
    random_state : int or None, default None
        Seed forwarded to ``RepeatedKFold``; pass an int for repeatable splits.

    Returns
    -------
    None
        Results are plotted and printed. The per-fold scores are also
        appended to the module-level ``accuracy``, ``precision`` and
        ``recall`` lists.
    """
    X = df.iloc[:, :-1]  # features
    y = df.iloc[:, -1]   # labels
    clf = algo

    # Chi-squared ranking of the features, shown as a bar chart.
    # NOTE(review): this ranking is only plotted; the classifier below is still
    # trained on every column of X. Confirm whether training on the selected
    # features was intended.
    k = min(k, X.shape[1])
    bestfit = SelectKBest(score_func=chi2, k=k).fit(X, y)
    featureScores = pd.DataFrame({'feature': list(X.columns), 'score': bestfit.scores_})
    top_features = featureScores.nlargest(k, 'score')
    # x='feature' labels the bars with feature names instead of row positions.
    top_features.plot(kind='bar', x='feature', y='score')
    plt.show()

    # Scores of this call are kept locally so the reported means are not mixed
    # with folds from earlier calls on other datasets or algorithms.
    fold_accuracy, fold_precision, fold_recall = [], [], []

    kfold = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    for train_ix, test_ix in kfold.split(X, y):
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # The same estimator object is refitted on every fold.
        clf = clf.fit(X_train, y_train)
        # Predict the response for the held-out fold
        predicted_cv = clf.predict(X_test)
        fold_accuracy.append(accuracy_score(y_test, predicted_cv))
        # precision_score / recall_score run with their default settings
        # (presumably binary labels — TODO confirm for each dataset).
        fold_precision.append(precision_score(y_test, predicted_cv))
        fold_recall.append(recall_score(y_test, predicted_cv))

    # Only the confusion matrix of the final fold is printed.
    print(confusion_matrix(y_test, predicted_cv))

    # Keep extending the module-level lists so cells that inspect them still
    # see every fold, but summarise only the folds computed in this call.
    accuracy.extend(fold_accuracy)
    precision.extend(fold_precision)
    recall.extend(fold_recall)
    evaluation_func(fold_accuracy, fold_precision, fold_recall)
    
##Evaluation Function
def evaluation_func(accuracy, precision, recall):
    """Print the mean of each score collection under its own heading.

    Parameters
    ----------
    accuracy, precision, recall : sequence of float
        Scores to average with ``np.mean``.

    Returns
    -------
    None
        Output goes to stdout as a heading line followed by a ``Mean :`` line
        for accuracy, precision and recall, in that order.
    """
    report = (
        ("ACCURACY", accuracy),
        ("PRECISION", precision),
        ("RECALL", recall),
    )
    for heading, scores in report:
        print(heading)
        print("Mean :", np.mean(scores))

In [None]:
# Run each dataset below by calling nb_func with the loaded DataFrame and the classifier to evaluate

In [None]:
#Load the python_01 dataset, restricted to the columns listed in p1
filepath = 'python_01.csv'
python_01 = pd.read_csv(filepath, usecols=p1, index_col=None)

#Classifier to evaluate. To try another one, comment out the active line
#and uncomment one of the alternatives:
#algo = GaussianNB()
#algo = svm.SVC(gamma='auto')
#algo = MLPClassifier(hidden_layer_sizes=(150,80),max_iter=200,activation = 'relu',solver='adam',random_state=50)
algo = DecisionTreeClassifier()

#Plot the feature scores and print the cross-validated metrics
nb_func(df=python_01, algo=algo)

In [None]:
#Load the python_02 dataset, restricted to the columns listed in p2
filepath = 'python_02.csv'
python_02 = pd.read_csv(filepath, usecols=p2, index_col=None)

#Classifier to evaluate. To try another one, comment out the active line
#and uncomment one of the alternatives:
#algo = GaussianNB()
#algo = svm.SVC(gamma='auto')
#algo = MLPClassifier(hidden_layer_sizes=(150,80),max_iter=200,activation = 'relu',solver='adam',random_state=50)
algo = DecisionTreeClassifier()

#Plot the feature scores and print the cross-validated metrics
nb_func(df=python_02, algo=algo)

In [None]:
#Load the python_03 dataset, restricted to the columns listed in p3
filepath = 'python_03.csv'
python_03 = pd.read_csv(filepath, usecols=p3, index_col=None)

#Classifier to evaluate. To try another one, comment out the active line
#and uncomment one of the alternatives:
#algo = GaussianNB()
#algo = svm.SVC(gamma='auto')
#algo = MLPClassifier(hidden_layer_sizes=(150,80),max_iter=200,activation = 'relu',solver='adam',random_state=50)
algo = DecisionTreeClassifier()

#Plot the feature scores and print the cross-validated metrics
nb_func(df=python_03, algo=algo)

In [None]:
#Load the CM1 dataset, restricted to the columns listed in c1
filepath = 'promise_CM1.csv'
cm1 = pd.read_csv(filepath, usecols=c1, index_col=None)

#Classifier to evaluate. To try another one, comment out the active line
#and uncomment one of the alternatives:
#algo = GaussianNB()
#algo = svm.SVC(gamma='auto')
#algo = MLPClassifier(hidden_layer_sizes=(150,80),max_iter=200,activation = 'relu',solver='adam',random_state=50)
algo = DecisionTreeClassifier()

#Plot the feature scores and print the cross-validated metrics
nb_func(df=cm1, algo=algo)

In [None]:
#Load the KC3 dataset, restricted to the columns listed in k3
filepath = 'promise_KC3.csv'
kc3 = pd.read_csv(filepath, usecols=k3, index_col=None)

#Classifier to evaluate. To try another one, comment out the active line
#and uncomment one of the alternatives:
#algo = GaussianNB()
#algo = svm.SVC(gamma='auto')
#algo = MLPClassifier(hidden_layer_sizes=(150,80),max_iter=200,activation = 'relu',solver='adam',random_state=50)
algo = DecisionTreeClassifier()

#Plot the feature scores and print the cross-validated metrics
nb_func(df=kc3, algo=algo)