# Doc2Vec Automatic Bug Severity Prediction Model

## Dataset
Data from 6 NASA projects that use the PITS bug tracking system is used to train and test this model. Each bug report in the dataset includes the following information: **id**, **subject**, **severity**, **description** and **initiation date**.

## Method
1. Load data
2. Create Training and Test Data
3. Construct Doc2Vec Model
4. Build Classifier

In [111]:
import gensim
import os
import collections
import smart_open
import random

import math
import numpy as np
import pandas as pd
from pprint import pprint

# 1. Load Data

In [130]:
def load_data(path):
    """Load the subject and severity columns of a bug-report CSV file.

    The file is read as comma-separated, ISO-8859-1 encoded text. Only the
    columns at positions 1 and 2 (Subject and Severity Rating) are kept, and
    every row whose severity value is missing is dropped.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    numpy.ndarray
        Two-column array with one row per remaining bug report, where
        ``row[0]`` is the subject and ``row[1]`` is the severity.
    """
    df = pd.read_csv(path, sep=',', encoding='ISO-8859-1')
    raw_data = np.array(df)

    # keep only the columns for Subject and Severity Rating
    extract_cols = [1, 2]
    data = raw_data[:, extract_cols]

    # pd.isnull() flags NaN/None cells without raising on non-float values;
    # math.isnan() would raise TypeError as soon as the severity column
    # contains text (e.g. when pandas parsed the column as strings).
    missing_severity = pd.isnull(data[:, 1])

    # drop rows that contain missing severity values
    return data[~missing_severity]

In [131]:
# Dataset file locations: projects A-E are pooled for training,
# project F is kept aside for testing.
pits_train = [
    '../dataset/raw/pitsA.csv',
    '../dataset/raw/pitsB.csv',
    '../dataset/raw/pitsC.csv',
    '../dataset/raw/pitsD.csv',
    '../dataset/raw/pitsE.csv',
]
pits_test = '../dataset/raw/pitsF.csv'

# Load every training project and stack the per-project row arrays into a
# single training array.
train_data = np.concatenate([load_data(csv_path) for csv_path in pits_train])

# The held-out project is loaded on its own.
test_data = load_data(pits_test)

# 2. Create Training and Testing Data

In [132]:
def read_corpus(data, tokens_only=False):
    """Lazily turn rows of bug-report data into Doc2Vec input.

    The first element of every row (``row[0]``) is tokenised with
    ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    data : iterable
        Rows whose first element is the text to tokenise.
    tokens_only : bool, optional
        When true, yield the bare token list of each row. Otherwise (the
        default, used for training data) yield a ``TaggedDocument`` whose
        single tag is the row's position in ``data``.

    Yields
    ------
    list or gensim.models.doc2vec.TaggedDocument
        One item per row, in input order.
    """
    for position, row in enumerate(data):
        tokens = gensim.utils.simple_preprocess(row[0])
        if tokens_only:
            yield tokens
            continue
        # Training documents carry their row position as tag
        yield gensim.models.doc2vec.TaggedDocument(tokens, [position])

In [133]:
# Materialise the generators into lists: the corpora are indexed and
# iterated several times further down (vocabulary, training, evaluation).
test_corpus = list(read_corpus(test_data, tokens_only=True))
train_corpus = list(read_corpus(train_data, tokens_only=False))

In [134]:
# sample of training corpus: the first two entries, each a TaggedDocument
# (token list plus the integer tag assigned by read_corpus)
train_corpus[:2]

[TaggedDocument(words=[u'build', u'unitialized', u'variables'], tags=[0]),
 TaggedDocument(words=[u'build', u'fsw', u'typecast', u'mismatch', u'in', u'memory', u'deallocation'], tags=[1])]

In [135]:
# sample of testing corpus: the first two entries, each a plain token list
# (read_corpus was called with tokens_only=True, so there are no tags)
test_corpus[:2]

[[u'lack',
  u'of',
  u'verification',
  u'of',
  u'requirement',
  u'fsw',
  u'sn',
  u'in',
  u'the',
  u'pulse',
  u'train',
  u'firing',
  u'thruster',
  u'mode'],
 [u'lack',
  u'of',
  u'verification',
  u'of',
  u'requirement',
  u'fsw',
  u'sn',
  u'in',
  u'the',
  u'pulse',
  u'train',
  u'firing',
  u'thruster',
  u'mode']]

# 3. Construct Doc2Vec Model

In [136]:
# instantiate Doc2Vec object: 50-dimensional document vectors, 40 training
# epochs; min_count=2 is gensim's minimum word-frequency threshold for the
# vocabulary (see the gensim Doc2Vec documentation)
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

# build a vocabulary from the tagged training documents
model.build_vocab(train_corpus)

# train model; %time is an IPython magic that prints how long the call took
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 6.94 s, sys: 1.93 s, total: 8.87 s
Wall time: 5.84 s


In [137]:
# Self-similarity check: re-infer a vector for every training document and
# record at which position the document's own tag shows up when all trained
# document vectors are ordered by similarity to the inferred one.
ranks = []
second_ranks = []
# NOTE: `doc_id` and `sims` are intentionally ordinary loop-level names; the
# next cell reads the values they hold after the final iteration.
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

    # position of this document's own tag in the similarity ordering
    ranked_tags = [docid for docid, _ in sims]
    ranks.append(ranked_tags.index(doc_id))

    # keep the second-ranked (tag, similarity) pair for later inspection
    second_ranks.append(sims[1])

In [138]:
# Histogram of the self-similarity ranks: key = rank at which a training
# document's own tag was found, value = number of documents with that rank.
collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus

Counter({0: 546,
         1: 214,
         2: 94,
         3: 72,
         4: 69,
         5: 52,
         6: 46,
         7: 30,
         8: 28,
         9: 19,
         10: 23,
         11: 33,
         12: 24,
         13: 24,
         14: 17,
         15: 15,
         16: 20,
         17: 15,
         18: 24,
         19: 8,
         20: 8,
         21: 13,
         22: 13,
         23: 17,
         24: 6,
         25: 15,
         26: 16,
         27: 10,
         28: 5,
         29: 17,
         30: 11,
         31: 6,
         32: 5,
         33: 6,
         34: 5,
         35: 6,
         36: 8,
         37: 7,
         38: 13,
         39: 6,
         40: 2,
         41: 5,
         42: 9,
         43: 8,
         44: 6,
         45: 2,
         46: 11,
         47: 9,
         48: 8,
         49: 5,
         50: 5,
         51: 5,
         52: 1,
         53: 3,
         54: 7,
         55: 6,
         56: 2,
         57: 8,
         58: 4,
         59: 4,
         60: 2,
   

In [139]:
# Inspect the neighbours of the last training document processed by the
# rank-evaluation loop: `doc_id` and `sims` still hold that loop's final values.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Each entry of `sims` is a (tag, similarity) pair; the tag indexes train_corpus.
# Show the first, second, middle and last entries of the similarity ordering.
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (3281): «vague configurable parameters in section»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (357, 0.866884708404541): «engcntrl test scripts engcntrl srs requirement inconsistent with rvm and fsw_test_»

SECOND-MOST (366, 0.8561243414878845): «engcntrl test scripts engcntrl srs requirement inconsistent with rvm and fsw_test_»

MEDIAN (1873, 0.5102684497833252): «xb test scenario review verification method for requirement is questionable»

LEAST (2740, -0.6542261242866516): «inst test case tc_inst does not trace to srs and is missing trace»



In [140]:
# Pick a random document from the training corpus (no new vector is inferred
# here; the similarity results were already collected in second_ranks)
doc_id = random.randint(0, len(train_corpus) - 1)

# Print the chosen document next to the entry stored for it in second_ranks,
# i.e. the second-ranked (tag, similarity) pair of its similarity query
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (554): «build obc code testing floats for equality in function ac_slewsetup»

Similar Document (1321, 0.8426072597503662): «code review dss digital sun sensor build feb no evidence found for converting the dss output to decimal values»



In [141]:
# Testing the model on a document from the held-out test corpus

# Pick a random document from the test corpus and infer a vector from the model,
# then order all trained document vectors by similarity to the inferred vector
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
# (the tags in `sims` index train_corpus, while doc_id indexes test_corpus)
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (638): «click»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (2702, 0.38667237758636475): «inst test cases and contain possible errors»

MEDIAN (3021, -0.07608343660831451): «level fsw requirement is vaguely worded»

LEAST (666, -0.42191559076309204): «prd is not linked to sc fs rqts»



In [142]:
# compare severity to similar doc: show the (subject, severity) row of one
# training report. The index is hard-coded -- presumably taken from an
# earlier similarity query; update it when re-running.
train_data[1799]

array([u"464-FSW-SPEC-0050 ACS FSW Requirements do not distinguish between 'Cold Restart' and 'Power On'",
       3.0], dtype=object)

In [143]:
# show the (subject, severity) row of one test report; the index is
# hard-coded -- presumably the test document being compared, verify on re-run
test_data[103]

array([u'FSW.TP.06.06: Source inspection is inadequate to verify requirement',
       3], dtype=object)

In [144]:
# number of training reports, i.e. rows left after load_data dropped the
# reports without a severity value
len(train_data)

3282

# Construct a Better Doc2Vec Model
There are two types of paragraph vector models that can be used: distributed memory **(DM)** and distributed bag of words **(DBOW)**. In the original paper, _Distributed Representations of Sentences and Documents_, both models were introduced and evaluated. The authors concluded that both models perform well for certain datasets and should be chosen accordingly; their recommendation was to combine both models for the best results.

We have chosen to follow this recommendation as a starting point when constructing our model. Further exploration could vary the window size and evaluate each model individually against the combined model.

In [145]:
import multiprocessing

# one training worker per available CPU core
cores = multiprocessing.cpu_count()

# Settings shared by both paragraph-vector models.
shared_params = dict(vector_size=50, min_count=2, epochs=40, workers=cores)

# instantiate the Doc2Vec objects: dm=1 selects the distributed-memory (DM)
# variant, here with dm_concat=1, and dm=0 the distributed bag-of-words
# (DBOW) variant (see the gensim Doc2Vec documentation for both flags)
model_DM = gensim.models.doc2vec.Doc2Vec(dm=1, dm_concat=1, **shared_params)
model_DBOW = gensim.models.doc2vec.Doc2Vec(dm=0, **shared_params)

# build a vocabulary for each model from the same tagged training corpus
for paragraph_model in (model_DM, model_DBOW):
    paragraph_model.build_vocab(train_corpus)

In [146]:
# train both models on the same tagged training corpus, each with its own
# corpus count and epoch setting; %time is an IPython magic that prints the
# time taken by each call
%time model_DM.train(train_corpus, total_examples=model_DM.corpus_count, epochs=model_DM.epochs)
%time model_DBOW.train(train_corpus, total_examples=model_DBOW.corpus_count, epochs=model_DBOW.epochs)

CPU times: user 6.16 s, sys: 1.28 s, total: 7.44 s
Wall time: 3.91 s
CPU times: user 4.65 s, sys: 1.32 s, total: 5.96 s
Wall time: 3.72 s


# 4. Construct Classifier
The original paper, _"Automated severity assessment of software defect reports"_, uses the RIPPER rule-learning algorithm to perform severity prediction. A follow-up paper, _"Prediction of defect severity by mining software project reports"_, extends the original method by applying both decision trees (DT) and a multilayer perceptron (MLP).

We have opted to use an MLP to perform our predictions using the paragraph vectors produced by our Doc2Vec models. Rule learning and DT both performed well at the project level but, as highlighted in the above papers, rely on project-specific data and thus do not generalize well.


In [175]:
# Create train and test sets from the Doc2Vec output. The feature vector of
# a report is its DM vector followed by its DBOW vector; the label is the
# severity stored in the second column of the data row.

# Training features: the document vectors learned during training, looked up
# by tag (the tag of a training document is its row position).
X_train = []
for tag in range(len(train_data)):
    dm_vector = list(model_DM.docvecs[tag])
    dbow_vector = list(model_DBOW.docvecs[tag])
    X_train.append(dm_vector + dbow_vector)
Y_train = [row[1] for row in train_data]

# Test features: the test reports have no trained vectors, so a vector is
# inferred from their tokens with each model.
X_test = []
for position in range(len(test_data)):
    tokens = test_corpus[position]
    dm_vector = list(model_DM.infer_vector(tokens))
    dbow_vector = list(model_DBOW.infer_vector(tokens))
    X_test.append(dm_vector + dbow_vector)
Y_test = [row[1] for row in test_data]

In [173]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron on the concatenated Doc2Vec features.
classifier = MLPClassifier(max_iter=10000, alpha=0.7)
# Alternative configuration kept for reference:
#classifier = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=500, alpha=0.0001,solver='sgd', random_state=21,tol=0.000000001)
classifier.fit(X_train, Y_train)

# Score the fitted classifier on the data it was trained on and on the
# held-out test project.
train_score = classifier.score(X_train, Y_train)
test_score = classifier.score(X_test, Y_test)

# Debugging aids kept for reference:
#print  (classifier.predict_proba(X_test))
#print  (classifier.predict(X_test))

# One-row summary table (row label 1) with the classifier name and both scores.
df_results = pd.DataFrame(
    {
        'classifier': ["MLP"],
        'train_score': [train_score],
        'test_score': [test_score],
    },
    index=[1],
    columns=['classifier', 'train_score', 'test_score'],
)
print(df_results)

  classifier  train_score  test_score
1        MLP     0.669714    0.630376
