In [1]:
import codecs
import json
import os
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
warnings.filterwarnings('ignore')

import numpy
#!{sys.executable} -m pip install numpy
from sklearn.feature_extraction.text import TfidfVectorizer
#!{sys.executable} -m pip install sklearn
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelBinarizer

In [2]:
boost_summary = 3
project_keys = ["HTTPCLIENT", "LUCENE", "JCR"]

Load data

In [3]:
def load_data():
    raw_data = []
    data_directory = ".." + os.path.sep + "data"
    for filename in os.listdir(data_directory):
        with codecs.open(data_directory + os.path.sep + filename, "r", "utf-8") as fin:
            raw_data += json.load(fin)
    return raw_data

In [4]:
def get_corpus_labels(raw_data):
    # Corpus building.
    corpus = []
    labels = []
    n_bug = 0
    for n_file in raw_data:

        txt = ""
        for i in range(boost_summary):
            txt += n_file["summary"] + " "

        corpus.append(txt + " " + n_file["description"])
        labels.append(n_file["label"])
        if n_file["label"] == "BUG":
            n_bug += 1
    print(f"{n_bug} BUG / {len(labels)} \n")
    return corpus, labels

In [5]:
def feature_computing(corpus, labels, vectorizer, feature_selection = True, k_best=30000):
    # TF-IDF.
    print("Feature computing.")
    X = vectorizer.fit_transform(corpus)
    print(f"\t{X.shape[1]} features.")

    if feature_selection:
        print("Extracting %d best features by a chi-squared test" % k_best)
        ch2 = SelectKBest(chi2, k=k_best)
        X = ch2.fit_transform(X, labels)

        #if feature_names:  # keep selected feature names.
        #    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
        return X, vectorizer, ch2

    return X, vectorizer

Split data by project

In [6]:
def split_data_by_project(raw_data, stemmer=None):
    # Create dicts of tickets for each project
    dict_data_split = {}
    print("Split data for each project")
    for project_key in project_keys:
        dict_data_split[project_key] = {}
        dict_data_split[project_key]["tickets"] = []
        dict_data_split[project_key]["corpus"] = []
        dict_data_split[project_key]["labels"] = []

    for ticket in raw_data:
        for project_key in project_keys:
            if project_key in ticket["key"]:
                dict_data_split[project_key]["tickets"].append(ticket)

    for project_key in project_keys:
        print("Get corpus and labels for project: ", project_key)
        tickets = dict_data_split[project_key]["tickets"]
        # Get corpus and labels for specific project tickets
        if stemmer is not None:
            corpus, labels = get_corpus_labels(tickets)
        else:
            corpus, labels = get_corpus_labels(tickets)
        dict_data_split[project_key]["corpus"] = corpus
        dict_data_split[project_key]["labels"] = labels

    return dict_data_split

In [7]:
def labels_binarizing(labels):
    lb = LabelBinarizer()
    # Binarize labels with BUG = 0 and NBUG = 1
    labels = numpy.array([number[0] for number in lb.fit_transform(labels)])
    # Inverse 0 and 1 to have good labels, i.e BUG = 1 and NBUG = 0
    return numpy.logical_not(labels).astype(int)

In [8]:
def make_scoring(X, binarized_labels, clf, cv=10):
    scores = cross_val_score(clf, X, binarized_labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f" % scores.mean())
    print("95%% Confidence Interval +/- %0.3f" % (scores.std() * 2))
    print("Standard deviation: %0.3f\n" % scores.std())
    
    scores = cross_val_score(clf, X, binarized_labels, cv=cv, scoring='f1')
    print("F1score: %0.3f" % scores.mean())
    print("95%% Confidence Interval +/- %0.3f" % (scores.std() * 2))
    print("Standard deviation: %0.3f\n" % scores.std())

### Multilayer Perceptron

In [9]:
def score_multilayer_perceptron(raw_data):
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"},  sublinear_tf=True)
    mlp = MLPClassifier(activation='tanh', learning_rate='adaptive', max_iter=100, random_state=0)
    X, vectorizer, chi = feature_computing(corpus, labels, vectorizer, feature_selection=True)
    binarized_labels = labels_binarizing(labels)

    print("=====> Scoring MLP <=====")
    make_scoring(X,binarized_labels,mlp)

In [None]:
raw_data = load_data()
score_multilayer_perceptron(raw_data)

1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 30000 best features by a chi-squared test
=====> Scoring MLP Accuracy <=====


### Stochastic Gradient Descent

In [39]:
def score_stochastic_gradient_descent(raw_data):
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"},  sublinear_tf=True)
    sgd = SGDClassifier(random_state=0, loss='modified_huber', max_iter=5000)
    X, vectorizer, chi = feature_computing(corpus, labels, vectorizer, feature_selection=True)
    binarized_labels = labels_binarizing(labels)

    print("=====> Scoring SGD <=====")
    make_scoring(X, binarized_labels, sgd)

In [40]:
raw_data = load_data()
score_stochastic_gradient_descent(raw_data)

1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 30000 best features by a chi-squared test
=====> Scoring SGD Accuracy <=====
Accuracy: 0.894
95% Confidence Interval +/- 0.029
Standard deviation: 0.015

F1score: 0.841
95% Confidence Interval +/- 0.037
Standard deviation: 0.019



### SVM

In [None]:
def score_svm(raw_data):
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"},  sublinear_tf=True)
    svm = SVC(C=100, gamma='scale')
    X, vectorizer, chi = feature_computing(corpus, labels, vectorizer, feature_selection=True)
    binarized_labels = labels_binarizing(labels)

    print("=====> Scoring SVM <=====")
    make_scoring(X, binarized_labels, svm)

In [None]:
raw_data = load_data()
score_svm(raw_data)

### Random Forest

In [41]:
def score_random_forest(raw_data):
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"},  sublinear_tf=True)
    rf = RandomForestClassifier(n_estimators=20, random_state=0, criterion='entropy')
    X, vectorizer, chi = feature_computing(corpus, labels, vectorizer, feature_selection=True)
    binarized_labels = labels_binarizing(labels)

    print("=====> Scoring RF <=====")
    make_scoring(X, binarized_labels, rf)

In [42]:
raw_data = load_data()
score_random_forest(raw_data)

1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 30000 best features by a chi-squared test
=====> Scoring RF Accuracy <=====
Accuracy: 0.787
95% Confidence Interval +/- 0.033
Standard deviation: 0.017

F1score: 0.610
95% Confidence Interval +/- 0.089
Standard deviation: 0.044



### Ridge Regression Classifier

In [43]:
def score_ridge_classifier(raw_data):
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"},  sublinear_tf=True)
    rrc = RidgeClassifier(random_state=0)
    X, vectorizer, chi = feature_computing(corpus, labels, vectorizer, feature_selection=True)
    binarized_labels = labels_binarizing(labels)

    print("=====> Scoring RRC <=====")
    make_scoring(X, binarized_labels, rrc)

In [44]:
raw_data = load_data()
score_ridge_classifier(raw_data)

1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 30000 best features by a chi-squared test
=====> Scoring RRC Accuracy <=====
Accuracy: 0.882
95% Confidence Interval +/- 0.030
Standard deviation: 0.015

F1score: 0.819
95% Confidence Interval +/- 0.050
Standard deviation: 0.025



### K-Nearest Neighbors classifier

In [45]:
def score_knn(raw_data):
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"},  sublinear_tf=True)
    knn = KNeighborsClassifier(weights='distance', n_neighbors=2)
    X, vectorizer, chi = feature_computing(corpus, labels, vectorizer, feature_selection=True)
    binarized_labels = labels_binarizing(labels)

    print("=====> Scoring KNN <=====")
    make_scoring(X, binarized_labels, knn)

In [46]:
raw_data = load_data()
score_knn(raw_data)

1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 30000 best features by a chi-squared test
=====> Scoring KNN Accuracy <=====
Accuracy: 0.671
95% Confidence Interval +/- 0.018
Standard deviation: 0.009

F1score: 0.108
95% Confidence Interval +/- 0.090
Standard deviation: 0.045

