# Doc2Vec Automatic Bug Severity Prediction Model

## Dataset
Data from 6 NASA projects that have reported using PITS bug tracking system will be used to train and test this model. The dataset consists of bug reports that include the following information: **id**, **subject**, **severity**, **description** and **initiation date**.

## Method
1. Load data
2. Create Training and Test Data
3. Construct Doc2Vec Model
4. Build Classifier

In [3]:
import gensim
import os
import collections
import smart_open
import random

import numpy as np
import pandas as pd
from pprint import pprint

# 1. Load Data

In [4]:
def load_data(path):
    df = pd.read_csv(path, sep=',', encoding='ISO-8859-1')
    raw_data = np.array(df)
    
    # get the columns for Subject and Severity Rating
    extract_cols = [1, 2]
    del_cols = np.delete(np.arange(raw_data.shape[1]), extract_cols)
    data = np.delete(raw_data, del_cols, axis=1)
    
    return data

In [15]:
# dataset file locations
pits_train = ['../dataset/raw/pitsA.csv',
              '../dataset/raw/pitsB.csv',
              '../dataset/raw/pitsC.csv',
              '../dataset/raw/pitsD.csv',
              '../dataset/raw/pitsE.csv',]

pits_test = '../dataset/raw/pitsF.csv'

# construct list of lines from the concatenation of datasets for training
train_data = []
for project in pits_train:
    train_data.append(load_data(project))
train_data = np.concatenate([project for project in train_data])

# construct list of lines for testing
test_data = load_data(pits_test)

# 2. Create Training and Testing Data

In [17]:
def read_corpus(data, tokens_only=False):
    for i, line in enumerate(data):
        if tokens_only:
            yield gensim.utils.simple_preprocess(line[0])
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line[0]), [i])

In [18]:
train_corpus = list(read_corpus(train_data))
test_corpus = list(read_corpus(test_data, tokens_only=True))

In [25]:
# sample of training corpus
train_corpus[:2]

[TaggedDocument(words=['build', 'unitialized', 'variables'], tags=[0]),
 TaggedDocument(words=['build', 'fsw', 'typecast', 'mismatch', 'in', 'memory', 'deallocation'], tags=[1])]

In [29]:
# sample of testing corpus
test_corpus[:2]

[['lack',
  'of',
  'verification',
  'of',
  'requirement',
  'fsw',
  'sn',
  'in',
  'the',
  'pulse',
  'train',
  'firing',
  'thruster',
  'mode'],
 ['lack',
  'of',
  'verification',
  'of',
  'requirement',
  'fsw',
  'sn',
  'in',
  'the',
  'pulse',
  'train',
  'firing',
  'thruster',
  'mode']]

# Construct Doc2Vec Model

In [30]:
# instantiate Doc2Vec object
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

# build a vocabulary
model.build_vocab(train_corpus)

# train model
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 5.5 s, sys: 879 ms, total: 6.37 s
Wall time: 4.37 s


In [31]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])

In [32]:
collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus

Counter({9: 17,
         47: 8,
         35: 13,
         1138: 1,
         55: 3,
         3: 107,
         1: 241,
         16: 22,
         17: 18,
         0: 824,
         1175: 1,
         3190: 1,
         48: 7,
         65: 9,
         2763: 2,
         3408: 1,
         882: 2,
         21: 19,
         1183: 1,
         2927: 1,
         8: 32,
         2806: 1,
         186: 5,
         1483: 1,
         1723: 1,
         797: 1,
         1293: 1,
         26: 14,
         1433: 2,
         586: 2,
         1782: 1,
         289: 2,
         3367: 2,
         37: 5,
         2324: 1,
         7: 42,
         2859: 1,
         6: 58,
         197: 2,
         2: 116,
         2609: 2,
         899: 1,
         33: 15,
         3659: 2,
         227: 1,
         2832: 1,
         76: 6,
         249: 1,
         1459: 2,
         2669: 2,
         215: 4,
         96: 4,
         589: 1,
         1404: 2,
         501: 1,
         700: 1,
         631: 1,
         13: 26,
   

In [33]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (3944): «vague configurable parameters in section»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (3944, 0.6398130059242249): «vague configurable parameters in section»

SECOND-MOST (1797, 0.6026480197906494): «test scenario sbc_bac_restart tbd parameters»

MEDIAN (732, 0.1399850845336914): «projecta sc fsrd rqt sfs is missing link to fgi»

LEAST (127, -0.5926074981689453): «requirement ts in the obc dh srs is incorrect in that it does not agree with the current fp implementation»



In [34]:
# Pick a random document from the corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (3057): «more complete downward trace from kav»

Similar Document (3090, 0.6851406097412109): «downward trace from sc»



In [57]:
# Testing the model

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (426): «no check to see if char is available»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (1874, 0.7482154369354248): «xb test scenario review verification method for requirement is questionable in commandtest scenario»

MEDIAN (3427, 0.384260892868042): «inst test set tecp basic does not verify srs»

LEAST (3655, -0.5016402006149292): «no acs mode listing for initialization function in the flight srs»



In [59]:
# compare severity to similar doc
train_data[3427]

array(['Inst5 test set 8.1 TECP-BASIC does not verify SRS 6.2.1.8', 3],
      dtype=object)

In [56]:
test_data[426]

array(['FSW.DS.11: proc filename is ambiguous', 3], dtype=object)

In [41]:
len(train_data)

3945