In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
df = pd.read_csv("bug_data_exported.csv")
df.head()

Unnamed: 0,id,description,severity
0,91,document properties cannot be listed,major
1,103,layout bug: table cell overflows containing cell,normal
2,104,table saved as text: missing inter-column space,minor
3,105,nested <TABLE>s: bgcolor of inner table not re...,normal
4,133,Navigator draws entities like &lt;&amp;&gt; as...,minor


In [3]:
df = df.dropna()


In [4]:
df.severity.value_counts()

normal      272520
major        35898
critical     31953
minor        19451
blocker       3694
Name: severity, dtype: int64

In [5]:
df['severity_num'] = df.severity.map({'minor':0,'normal':1,'major':2,'critical':4,'blocker':5})


In [6]:
df_normal = df[df.severity_num==1]
df_major = df[df.severity_num==2]
df_critical = df[df.severity_num==4]
df_minor = df[df.severity_num==0]
df_blocker = df[df.severity_num==5]

In [7]:
from sklearn.utils import resample
def resample_df(df):
    return resample(df,replace=True,n_samples=30000,random_state=123)

df_normal = resample_df(df_normal)
df_major = resample_df(df_major)
df_critical = resample_df(df_critical)
df_minor = resample_df(df_minor)
df_blocker = resample_df(df_blocker)

df = pd.concat([df_normal,df_major,df_critical,df_minor,df_blocker])

df.severity.value_counts()


normal      30000
major       30000
critical    30000
minor       30000
blocker     30000
Name: severity, dtype: int64

In [8]:
df.head()

Unnamed: 0,id,description,severity,severity_num
337673,752343,"All tabs with screenshots,without site (for ac...",normal,1
37035,72500,dom table column handling is wrong,normal,1
23480,43909,"<popup orient=""horizontal""> doesn't work",normal,1
268014,569516,"""smokescreen"" flash emulator doesn't work on m...",normal,1
270320,575254,Reduce TabBrowser XBL usage to minimum,normal,1


In [9]:
#define x as bug description and y as the severity
x = df.description
y = df.severity_num
print(x.shape)
print(y.shape)

(150000,)
(150000,)


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33)


In [11]:
#instantiate the vectorizer
vect = CountVectorizer()
vect.fit(x_train)
x_train_dtm = vect.transform(x_train)
tfdf_transformer = TfidfTransformer()
x_train_tfdf = tfdf_transformer.fit_transform(x_train_dtm)

In [12]:
knn = KNeighborsClassifier(n_neighbors=60,n_jobs=-1)
clf = knn.fit(x_train_dtm,y_train)

In [13]:
test = ["Navigator spawn a lot of error messages in infinite loop."]
test_dtm = vect.transform(test)
test_tidf = tfdf_transformer.transform(test_dtm)
predicted = clf.predict(test_tidf)

In [14]:
predicted

array([5])

In [15]:
x_test_dtm = vect.transform(x_test)
x_test_tidf = tfdf_transformer.transform(x_test_dtm)

In [16]:
predicted = clf.predict(x_test_tidf)

In [17]:
predicted

array([4, 5, 1, ..., 5, 5, 5])

In [18]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, predicted)
conf_mat

array([[   3, 2368,  239, 2227, 4986],
       [  13, 2466,  268, 2164, 4961],
       [   1, 2355,  317, 2326, 5050],
       [   0, 1265,  442, 2242, 6005],
       [   0, 1443,  316, 1478, 6565]])

In [19]:
from sklearn import metrics
metrics.accuracy_score(y_test, predicted)

0.2342020202020202