[View in Colaboratory](https://colab.research.google.com/github/sayeed910/nlp/blob/master/bug_classification.ipynb)

In [248]:
!pip install xlrd



In [0]:

import pandas as pd

from pprint import pprint
df = pd.read_excel('https://raw.githubusercontent.com/sayeed910/nlp/master/data/Mozilla_4source.xlsx', header=None, names=['severity', 'summary', 'description'], usecols=[0, 1, 2])

df = df.sample(frac=1)

In [257]:
severities = df['severity']
class_counts = severities.value_counts(ascending=True)

print(class_counts)

total_count = len(severities)
class_probability = [0] * 6
for i in range(1, 6):
    class_probability[i] = class_counts.loc[i] / total_count
    
class_probability

1    283
5    446
2    508
4    697
3    701
Name: severity, dtype: int64


[0,
 0.10740037950664137,
 0.19278937381404174,
 0.266034155597723,
 0.2645161290322581,
 0.16925996204933585]

In [0]:
def nzeroes(n):
    return [0] * n

In [0]:
from collections import defaultdict



class BugClassifier:
    
    def __init__(self, class_count=5):
        self.class_count = class_count
        self.class_probability = nzeroes(class_count+1)
        self.word_count_per_class = defaultdict(lambda : nzeroes(class_count+1))
        self.total_words_per_class = nzeroes(class_count+1)
        self.vocabulary = set()
        self.p_word_given_class = defaultdict(lambda : nzeroes(class_count+1))
        
    def _model(self):
        return (self.class_probability, self.word_count_per_class, self.vocabulary, self.total_words_per_class)
    
    def _calculate_class_probability(self, severities):
        class_counts = severities.value_counts(ascending=True)        
        total_count = len(severities)

        for i in range(1, (self.class_count+1)):
            self.class_probability[i] = class_counts.loc[i] / total_count
        
    
    def _p_words_given_class(self, words, clazz):
        "P(Word|Class) = (Occurence of word in class + 1) / (total words in class + total words in vocabulary)"

        denominator = self.total_words_per_class[clazz] + len(self.vocabulary)
        
        return (((self.word_count_per_class[word][clazz] + 1) / denominator) * 100 for word in words)
    
        
        

    
    def fit(self, sentences, labels):
            
        self._calculate_class_probability(labels)
        
        for severity, sentence in zip(labels, sentences):            
            words = sentence.split()
            self.total_words_per_class[severity] += len(words)
            
            for word in words:
                self.vocabulary.add(word)
                self.word_count_per_class[word][severity] += 1
                
                
                
                
    def predict(self, sentences):
        
        return [self._predict(sentence) for sentence in sentences]
        
    
    
    def _predict(self, sentence):
        """
        P(Class|Words) = P(Word_1|Class) * P(Word_2|Class) ... * P(Class)

        class = argmax(P(Class|Words))
        """

        words = sentence.split()
        
        p_class_given_sentence = [0] + \
            [reduce(operator.mul,
                 self._p_words_given_class(words, clazz), self.class_probability[clazz])
                        for clazz in range(1, (self.class_count + 1))]
        
        return p_class_given_sentence.index(max(p_class_given_sentence))


        

        


In [0]:
def measure_accuracy(predicted_labels, real_labels, class_count=5):
    confusion_matrix = [[0, 0, 0, 0, 0, 0] for _ in range(class_count+1)]
        
    for prediction, real in zip(predicted_labels, real_labels):
        confusion_matrix[prediction][real] += 1
        
    precisions = nzeroes(class_count+1)
    recalls = nzeroes(class_count+1)
    for clazz in range(1, class_count+1):        
        precisions[clazz] = confusion_matrix[clazz][clazz] / sum(confusion_matrix[clazz])
        recalls[clazz] = confusion_matrix[clazz][clazz] / sum([row[clazz] for row in confusion_matrix])
    
#     print(precisions)
#     print(recalls)
    
    precision = sum(precisions) / (len(precisions) - 1)  #the array is 1 size larger
    recall = sum(recalls) / (len(recalls) - 1)  #the array is 1 size larger
    f_measure = lambda measure: (2 * measure[0] * measure[1]) / (measure[0] + measure[1] + .0000001)
    
    return list(map(f_measure, zip(precisions[1:], recalls[1:])))
   
    

In [256]:

total_rows = len(df)
nrows_per_fold = total_rows // 11

f_measures_summary = []
f_measures_description = []

for i in range(1, 11):
    classifier = BugClassifier()
    train_data_index = i * nrows_per_fold
    train_data = df[: train_data_index]
    test_data = df[train_data_index: ]
    
    classifier.fit(train_data['summary'], train_data['severity'])
    predictions = classifier.predict(test_data['summary'])
    
    f_measures_summary.append(measure_accuracy(predictions, test_data['severity']))
    
    classifier = BugClassifier()
    classifier.fit(train_data['description'], train_data['severity'])
    predictions = classifier.predict(test_data['description'])
    
    f_measures_description.append(measure_accuracy(predictions, test_data['severity']))
    

avg_fmeasure_summary = [sum((row[i] for row in f_measures_summary)) / 10 for i in range(5)]
avg_fmeasure_description = [sum((row[i] for row in f_measures_description)) / 10 for i in range(5)]

print("F Measure(Summary):", avg_fmeasure_summary)
print("F Measure(Description):", avg_fmeasure_description)


F Measure(Summary): [0.5977984162507026, 0.5663075606530399, 0.4913573358906298, 0.47227946022575795, 0.3808233998525301]
F Measure(Description): [0.621653832385726, 0.5026126723183791, 0.3982295297094724, 0.5126821318848307, 0.31844189579711096]
