In [23]:
import pandas as pd
import numpy as np 

### Merging different sets of metrics in one dataframe

In [29]:
df1 = pd.read_csv('bug-metrics.csv')
df1 = df1.drop(columns = ['nonTrivialBugs', 'majorBugs', 'criticalBugs', 'highPriorityBugs'])
df2 = pd.read_csv('change-metrics.csv')
df2 = df2.drop(columns = ['nonTrivialBugs', 'majorBugs', 'criticalBugs', 'highPriorityBugs'])
df = pd.merge(df1, df2, on = ['classname','bugs'])
df3 = pd.read_csv('lin-ent.csv')
df3 = df3.drop(columns = ['nonTrivialBugs', 'majorBugs', 'criticalBugs', 'highPriorityBugs'])
df = pd.merge(df, df3, on = ['classname', 'bugs'] )
df4 = pd.read_csv('complexity-code-change.csv')
df = pd.merge(df, df4, on='classname')

In [30]:
def classification(to_convert):
    final_list=[]
    for i in to_convert:
        if i>0.08:
            final_list.append(1)
        else:
            final_list.append(0)
    return final_list

### Separating features and target and feature scaling

In [31]:
from sklearn.preprocessing import MinMaxScaler

# --> Debugging statements

from sklearn.model_selection import train_test_split
##print(df.columns)

# classname has no use and the unnamed column was created due to a wrongly placed space
# will clear it in the future
df = df.drop(columns=['classname'])

##print(df.dtypes)
df.dropna()

#Removing the actual bug count column to extract the metrics
X = df.loc[:, df.columns != 'bugs']
Y = df['bugs']

##print(X.dtypes)

X = X.apply(pd.to_numeric)
##print(X.head())
X_asDF = X

# Feature Scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

Y = [(float(i)-min(Y))/(max(Y)-min(Y)) for i in Y]



In [42]:
from collections import Counter
freq = Counter(classification(Y))
freq

Counter({0: 791, 1: 206})

### Extracting top 5 metrics and calculating Ru

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
feature_scores = {}
r_scores = []

from sklearn.linear_model import LinearRegression
import operator

reg = LinearRegression()

for i in range(X_train.shape[1]):
    reg.fit(X_train[:,i].reshape(-1,1),Y_train)
    feature_scores[i] = reg.score(X_train[:,i].reshape(-1,1),Y_train)

sorted_feature_scores = dict( sorted(feature_scores.items(), key=operator.itemgetter(1),reverse=True))


dict_items = sorted_feature_scores.items()

top_n_metrics = list(dict_items)[:5]
#print(top_n_metrics)
#print(X_asDF.columns[top_n_metrics[0][0]])

list_keys = [x[0] for x in top_n_metrics]
#print(list_keys)
X_train_for_multiple  = X_train[:,list_keys]
X_test_for_multiple  = X_test[:,list_keys]
reg.fit(X_train_for_multiple, Y_train)

rU = reg.score(X_train_for_multiple, Y_train)
print(rU)

0.5490264761576464


### Applying proposed algorithm with Polynomial Regression

In [19]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2)
X_poly = poly.fit_transform(X)


poly.fit(X_poly,Y)
X_train, X_test, Y_train, Y_test = train_test_split(X_poly, Y, test_size=0.2, random_state=1)
feature_scores = {}
r_scores = []

# Fitting Polynomial Regression to the dataset

from sklearn.linear_model import LinearRegression, Lasso
import operator

reg = LinearRegression() 
for i in range(X_train.shape[1]):
    reg.fit(X_train[:,i].reshape(-1,1),Y_train)
    feature_scores[i] = reg.score(X_train[:,i].reshape(-1,1),Y_train)

sorted_feature_scores = dict( sorted(feature_scores.items(), key=operator.itemgetter(1),reverse=True))


dict_items = sorted_feature_scores.items()

top_n_metrics = list(dict_items)[:5]
#print(top_n_metrics)
#print(X_asDF.columns[top_n_metrics[0][0]])

list_keys = [x[0] for x in top_n_metrics]
#print(list_keys)
X_train_for_multiple  = X_train[:,list_keys]
X_test_for_multiple  = X_test[:,list_keys]
reg.fit(X_train_for_multiple, Y_train)

rU = reg.score(X_train_for_multiple, Y_train)
print(rU)

KeyboardInterrupt: 

### Proposed Algorithm with Lasso Regression

In [36]:
from sklearn.linear_model import Lasso
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
feature_scores = {}
r_scores = []

# Fitting Polynomial Regression to the dataset

from sklearn.linear_model import LinearRegression, Lasso, Ridge
import operator

reg = Ridge() 
for i in range(X_train.shape[1]):
    reg.fit(X_train[:,i].reshape(-1,1),Y_train)
    feature_scores[i] = reg.score(X_train[:,i].reshape(-1,1),Y_train)

sorted_feature_scores = dict(sorted(feature_scores.items(), key=operator.itemgetter(1),reverse=True))


dict_items = sorted_feature_scores.items()

top_n_metrics = list(dict_items)[:5]
#print(top_n_metrics)
#print(X_asDF.columns[top_n_metrics[0][0]])

list_keys = [x[0] for x in top_n_metrics]
#print(list_keys)
X_train_for_multiple  = X_train[:,list_keys]
X_test_for_multiple  = X_test[:,list_keys]
reg.fit(X_train_for_multiple, Y_train)

rU = reg.score(X_train_for_multiple, Y_train)
print(rU)

0.5438538893297045


### Calculating weights for the selected features

In [37]:
d = []
for i in list_keys:
    remaining_features = [j for j in list_keys if j != i ]
    X_train_remaining = X_train[:,remaining_features]
    # X_test_remaining = X_test[:,remaining_features]
    reg.fit(X_train_remaining, Y_train)
    train_score = reg.score(X_train_remaining, Y_train)
    print(train_score)
    d.append(rU-train_score)

0.5357771742550714
0.5409279938800345
0.5415225079573114
0.544483030811822
0.5445710278756453


### Normalising weights for and predicting the bug proneness index

In [38]:
norm_d = [(float(i)-min(d))/(max(d)-min(d)) for i in d]
print(norm_d)
#Weights should be mapped to elements in the list_keys
#norm_d are the normalized weights of the n metrics

Y_pred = []

#print(list_keys)
for features_test in X_test:
    tot = 0
    for (i, wt) in zip(list_keys, norm_d):
        tot = tot + features_test[i] * wt
    Y_pred.append(tot)
    
  

[1.0, 0.41427048399891386, 0.34666484682001714, 0.010006655514182719, 0.0]


### Printing the predicted and the actual target

In [39]:
## The paper doesn't mention normalising again but some values don't lie in the range of 0 to 1
## Hence normalising again
Y_pred = [(float(i)-min(Y_pred))/(max(Y_pred)-min(Y_pred)) for i in Y_pred]
Y_pred_final = [round(i,4) for i in Y_pred]
Y_test_final = [round(i,4) for i in Y_test]
for(predicted, actual) in zip(Y_pred_final, Y_test_final):
    print(f"{predicted} - {actual}") 
    print()

0.3521 - 0.1111

0.0111 - 0.0

0.2021 - 0.0

0.0 - 0.0

0.009 - 0.0

0.0 - 0.0

0.0025 - 0.0

0.0921 - 0.1111

0.0013 - 0.0

0.0448 - 0.0

0.0037 - 0.0

0.0099 - 0.0

0.0 - 0.1111

0.0 - 0.0

0.014 - 0.0

0.0037 - 0.1111

0.0087 - 0.0

0.005 - 0.0

0.0643 - 0.0

0.0 - 0.0

0.0012 - 0.0

0.0194 - 0.0

0.0639 - 0.2222

0.0012 - 0.0

0.0102 - 0.0

0.0 - 0.0

0.0506 - 0.0

0.1514 - 0.0

0.0012 - 0.0

0.0546 - 0.0

0.0 - 0.0

0.0012 - 0.0

0.0197 - 0.0

0.0648 - 0.0

0.1201 - 0.0

0.0113 - 0.0

0.0037 - 0.0

0.0255 - 0.0

0.0012 - 0.0

0.0 - 0.0

0.0 - 0.2222

0.0187 - 0.0

0.2027 - 0.0

0.0647 - 0.0

0.074 - 0.0

0.0119 - 0.0

0.1066 - 0.1111

0.0218 - 0.0

0.3125 - 0.4444

0.1895 - 0.4444

0.0246 - 0.1111

0.0296 - 0.1111

0.0037 - 0.0

0.0 - 0.0

0.0165 - 0.0

0.0367 - 0.0

0.0 - 0.0

0.0286 - 0.0

0.0111 - 0.1111

0.2536 - 0.2222

0.0 - 0.0

0.2406 - 0.5556

0.03 - 0.0

0.0 - 0.0

0.0173 - 0.0

0.4786 - 0.1111

0.0138 - 0.0

0.005 - 0.0

0.0 - 0.0

0.0025 - 0.0

0.1064 - 0.1111

0.06 - 

In [17]:
## For comparison and visualization 
# For Linear
print(Y_pred[:5])

[0.12481037664445492, 0.0001748539793171428, 0.1298895562815286, 0.0, 0.0013809737117855948]


In [18]:
print(Y_test[:20])

[0.1111111111111111, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111, 0.0, 0.0, 0.1111111111111111, 0.0, 0.0, 0.0, 0.0]


In [26]:
df.head()

Unnamed: 0,numberOfBugsFoundUntil:,numberOfNonTrivialBugsFoundUntil:,numberOfMajorBugsFoundUntil:,numberOfCriticalBugsFoundUntil:,numberOfHighPriorityBugsFoundUntil:,bugs,numberOfVersionsUntil:,numberOfFixesUntil:,numberOfRefactoringsUntil:,numberOfAuthorsUntil:,...,numberOfPrivateMethods,numberOfPublicAttributes,numberOfPublicMethods,rfc,wmc,CvsEntropy,CvsWEntropy,CvsLinEntropy,CvsLogEntropy,CvsExpEntropy
0,3,2,0,0,0,0,65,4,0,8,...,0.0,0.0,0.0,0.0,0.0,10.9131,0.008302,0.014767,0.253257,0.001125
1,0,0,0,0,0,0,2,0,0,2,...,0.0,0.0,0.0,0.0,0.0,1.10349,0.000657,0.001886,0.027116,0.000351
2,55,48,6,4,2,1,120,10,0,12,...,0.0,0.0,0.000905,0.036114,0.041101,37.8606,0.228509,0.10618,1.0284,0.210412
3,3,3,0,0,0,0,28,4,0,5,...,0.0,0.0,0.001756,0.002711,0.002314,5.86013,0.009105,0.010113,0.14368,0.003485
4,15,13,1,1,0,0,93,17,0,8,...,0.0,0.000695,0.0,0.003749,0.001594,13.596,0.016005,0.025751,0.338602,0.021378


## Function to classify bug count to 0 or 1

In [14]:
from imblearn.over_sampling import SMOTE
Y_train_classified = classification(Y_train)
Y_test_classified = classification(Y_test)
Y_pred_classified=classification(Y_pred_final)

sm = SMOTE(random_state = 2)
X_test_res, Y_test_res = sm.fit_resample(X_test_for_multiple, Y_test_classified)
clf.fit(X_train_for_multiple, Y_train_classified)

NameError: name 'clf' is not defined

In [31]:
from sklearn.metrics import classification_report
print(classification_report(Y_test_classified,Y_pred_classified))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       159
           1       0.62      0.61      0.62        41

    accuracy                           0.84       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.84      0.84      0.84       200



In [None]:
# Plotting of simple linear regression model

import matplotlib.pyplot as plt
X_train_for_singlefeat = X_train[:,list_keys[0]]
X_test_for_singlefeat = X_test[:,list_keys[0]]
X_train_for_singlefeat = X_train_for_singlefeat.reshape(-1,1)
reg.fit(X_train_for_singlefeat,Y_train)
X_test_for_singlefeat_2d = X_test_for_singlefeat.reshape(-1,1)
Y_Predicted = reg.predict(X_test_for_singlefeat_2d)
#plt.scatter(X_test_for_singlefeat_2d,Y_test_final)
plt.xlabel("Actual bug count")
plt.ylabel("Predicted bug index")
#Y_test_final_arr = np.array(Y_test_final)
#slope,intercept = np.polyfit(Y_test_final_arr,Y_pred_final,1)
#Y_intercept = slope*Y_test_final_arr + intercept
#plt.plot(X_test_for_singlefeat_2d,Y_Predicted)
plt.scatter(Y_test_final,Y_Predicted)

In [None]:
# Plotting of multiple linear regression

reg.fit(X_train,Y_train)
Y_Pred_multiplereg = reg.predict(X_test)
#plt.plot(Y_test,Y_Predicted_multiplereg)
plt.scatter(Y_test,Y_Pred_multiplereg)

In [None]:
# Plotting of prosposed model

plt.scatter(Y_test_final,Y_pred_final)

## Classificaton report for testing data
### Simple Linear regression

In [None]:
reg.fit(X_train_for_singlefeat,Y_train)
Y_pred_simple = reg.predict(X_test_for_singlefeat_2d)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred_simple)))

### Multiple Linear regression

In [None]:
reg.fit(X_train,Y_train)
Y_pred_multiple = reg.predict(X_test)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred_multiple)))

### Proposed model

In [40]:
from sklearn.metrics import classification_report
reg.fit(X_train_for_multiple,Y_train)
Y_pred_proposed = reg.predict(X_test_for_multiple)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred_proposed)))

Classification report : 
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       159
           1       0.70      0.56      0.62        41

    accuracy                           0.86       200
   macro avg       0.79      0.75      0.77       200
weighted avg       0.85      0.86      0.85       200



### Applying classification models to the proposed model(5 selected features)

In [44]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB,ComplementNB
Y = df['bugs']
Y_nb = []
print(set(Y))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


In [46]:
from imblearn.over_sampling import SMOTE
clf = GaussianNB()
sm = SMOTE(random_state = 2)
Y_train_classified = classification(Y_train)
X_train_res, Y_train_res = sm.fit_resample(X_train_for_multiple, Y_train_classified)
print(Counter(Y_train_res))
clf.fit(X_train_res, Y_train_res)
Y_pred = clf.predict(X_test_for_multiple)
from sklearn.metrics import accuracy_score,classification_report
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred)))

Counter({0: 632, 1: 632})
Classification report : 
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       159
           1       0.64      0.51      0.57        41

    accuracy                           0.84       200
   macro avg       0.76      0.72      0.73       200
weighted avg       0.83      0.84      0.83       200



In [None]:
clf = ComplementNB()
clf.fit(X_train_for_multiple, classification(Y_train))
Y_pred = clf.predict(X_test_for_multiple)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred)))

In [48]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion = "entropy", splitter="random",min_samples_split=3)
clf.fit(X_train_res, Y_train_res)
print(Counter(Y_train_res))
Y_pred = clf.predict(X_test_for_multiple)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred)))

Counter({0: 632, 1: 632})
Classification report : 
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       159
           1       0.49      0.54      0.51        41

    accuracy                           0.79       200
   macro avg       0.68      0.70      0.69       200
weighted avg       0.80      0.79      0.79       200



In [50]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='saga', penalty = 'l2', C = 0.1, random_state=10)
clf.fit(X_train_res, Y_train_res)
Y_pred = clf.predict(X_test_for_multiple)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred)))

Classification report : 
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       159
           1       0.64      0.61      0.62        41

    accuracy                           0.85       200
   macro avg       0.77      0.76      0.77       200
weighted avg       0.85      0.85      0.85       200



In [52]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=35)
clf.fit(X_train_res, Y_train_res)
Y_pred = clf.predict(X_test_for_multiple)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred)))

Classification report : 
              precision    recall  f1-score   support

           0       0.93      0.78      0.85       159
           1       0.47      0.76      0.58        41

    accuracy                           0.78       200
   macro avg       0.70      0.77      0.71       200
weighted avg       0.83      0.78      0.79       200



In [57]:
from sklearn.ensemble import RandomForestClassifier
rfs = RandomForestClassifier(n_estimators=300, criterion = "gini")
rfs.fit(X_train_res, Y_train_res)
Y_pred = rfs.predict(X_test_for_multiple)
print("Classification report : ")
print(classification_report(classification(Y_test), classification(Y_pred)))

Classification report : 
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       159
           1       0.46      0.44      0.45        41

    accuracy                           0.78       200
   macro avg       0.66      0.65      0.66       200
weighted avg       0.78      0.78      0.78       200



In [61]:
Counter(Y_train_classified)
print(X_train)

[[0.         0.         0.         ... 0.02088787 0.01043805 0.00177681]
 [0.01401869 0.005      0.         ... 0.05292446 0.01730772 0.05042487]
 [0.06542056 0.065      0.05263158 ... 0.10292228 0.04406022 0.04115889]
 ...
 [0.14953271 0.145      0.10526316 ... 0.61903227 0.91153662 0.50895073]
 [0.03738318 0.035      0.         ... 0.51903798 0.87449154 0.40012858]
 [0.02336449 0.01       0.         ... 0.14018949 0.01443694 0.24466886]]
