# Get issue data and split it

In [87]:
project_names = ["ant-ivy", "archiva", "calcite", "cayenne", "commons-bcel", "commons-beanutils", "commons-codec",
            "commons-collections", "commons-compress", "commons-configuration", "commons-dbcp", "commons-digester",
            "commons-io", "commons-jcs", "commons-jexl", "commons-lang", "commons-math", "commons-net",
            "commons-rdf", "commons-scxml", "commons-validator", "commons-vfs", "deltaspike", "eagle", "giraph", "gora",
            "jspwiki","kylin", "lens", "mahout", "manifoldcf", "nutch", "opennlp", "parquet-mr", "santuario-java",
            "systemml", "tika", "wss4j"]

In [88]:
import numpy as np
from pycoshark.mongomodels import  IssueSystem, Issue
from sklearn.model_selection import train_test_split

Y_target=[]
X_desc=[]

for i in np.arange(len(project_names)):
    project = Project.objects(name=project_names[i]).only('id').get()
    issue_system = IssueSystem.objects(project_id=project.id).only('id','url').get()
    for issue in Issue.objects(issue_system_id=issue_system.id).only('desc','issue_type','title').timeout(False):
        if(issue.issue_type=='Bug'):
            Y_target.append(issue.issue_type)
        elif(issue.issue_type==None):
            continue
        else:
            Y_target.append('Non_bug')
        if (issue.desc == None):
            issue.desc = "nodescr"
        X_desc.append(issue.title+' '+issue.desc)
    X_train, X_test, Y_train, Y_test = train_test_split(X_desc, Y_target, test_size=0.9)


# Clean data

In [89]:
import re
import numpy as np
import string

def cleandata(X_train):
    for i in np.arange(len(X_train)):
        if(X_train[i]==None):
            print(X_train[i])
            X_train[i]="nodescription" 
            continue    
        X_train[i] = re.sub("http(.)*"," link",X_train[i])
        X_train[i] = re.sub("(\r|\n|\r\n)"," ",X_train[i])
        X_train[i] = re.sub("{code(.)*}*[.\n]*{code(.)*}","code",X_train[i])
        X_train[i] = re.sub("{noforamt(.)*}(.)*{noformat(.)*}","code",X_train[i])
        X_train[i] = re.sub("[a-z]*\{(.)*\}","code",X_train[i])
        X_train[i] = re.sub("({) .*(})","code",X_train[i])
        X_train[i] = re.sub("[^\s]*\d[^\s]*"," ",X_train[i])
        X_train[i] = re.sub("[^\s]{15,}?", "", X_train[i])
        X_train[i] = X_train[i].lower()
        exclude = set(string.punctuation)
        X_train[i] = ''.join(ch for ch in X_train[i] if ch not in exclude)
    return X_train


In [92]:
sample = ["I'd like propose adding Zipf distribution to commons-math. I have a patch, but it's incomplete and somewhat inefficient; I'd like to throw it up for discussion though. To follow."]
cleandata(sample)
preparedata(sample)

['id like propos ad zipf distribut commonsmath patch incomplet somewhat ineffici id like throw discuss though follow']

# Stem and Tokenize and Shape the Data

In [26]:
import nltk.tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

def preparedata(X_train):
    for i in np.arange(len(X_train)):
        if(X_train[i]==None):
            print(X_train[i])
            X_train[i]="nodescription" 
        X_train[i] = word_tokenize(X_train[i])
        stopWords = set(stopwords.words('english'))
        filtered = [w for w in X_train[i] if not w in stopWords]
        X_train[i] = [PorterStemmer().stem(w) for w in filtered]
        X_train[i] = ' '.join(X_train[i])
    return X_train

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def shapedata(X_train):
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    return X_train_tfidf


# Build the Model

## Naive Bayes

In [54]:
Testbug= ['work','Fine thanks', 'bad', 'good', 'code']
from sklearn.naive_bayes import MultinomialNB
X_train =cleandata(X_train)

X_train=preparedata(X_train)
#print(X_train)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

Test = cleandata(X_test)
Test = preparedata(Test)
X_new = count_vect.transform(Test)
X_new_tfidf = tfidf_transformer.transform(X_new)

predicted = clf.predict(X_new_tfidf)


In [39]:
#correct predictions
print(np.mean(predicted==Y_test))
#wrong as bug predicted
print(np.mean((predicted=='Bug')&(predicted!=Y_test)))
#wrong as non bug predicted
print(np.mean((predicted!='Bug')&(predicted!=Y_test)))

0.7768644331500881
0.11300694948656778
0.11012861736334405


## Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

n = 1000 # maximal depth

clf = RandomForestClassifier(max_depth=n, random_state=0)
clf.fit(X_train_tfidf,Y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=1000, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [51]:
predicted = clf.predict(X_new_tfidf)

In [52]:
#correct predictions
print(np.mean(predicted==Y_test))
#wrong as bug predicted
print(np.mean((predicted=='Bug')&(predicted!=Y_test)))
#wrong as non bug predicted
print(np.mean((predicted!='Bug')&(predicted!=Y_test)))

0.739679493828441
0.15351104657193237
0.1068094595996266
