In [46]:
import pandas as pd

In [47]:
def fit(df):
    """Collect the statistics of a multinomial Naive Bayes severity classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows with a 'severity' column (used as a list index, so it
        must hold integers in 0..5; classes 1..5 are the ones modelled) and a
        'summary' column of whitespace-separated text.

    Returns
    -------
    tuple
        ``(class_probability, word_count_per_class, vocabulary,
        total_words_per_class)`` where

        * ``class_probability[c]`` is the share of rows with severity ``c``
          (index 0 is an unused placeholder),
        * ``word_count_per_class[word][c]`` is how often ``word`` occurs in
          summaries of severity ``c`` (a defaultdict of 6-element lists),
        * ``vocabulary`` is the set of all distinct words seen,
        * ``total_words_per_class[c]`` is the number of word tokens seen in
          summaries of severity ``c``.
    """
    from collections import defaultdict

    severity = df['severity']
    counts = severity.value_counts()
    total_count = len(severity)

    class_probability = [0] * 6
    if total_count:  # an empty training set leaves every prior at 0
        for i in range(1, 6):
            # .get() instead of .loc[]: a class that does not occur in this
            # training slice has prior 0 rather than raising KeyError.
            class_probability[i] = counts.get(i, 0) / total_count

    word_count_per_class = defaultdict(lambda: [0, 0, 0, 0, 0, 0])
    total_words_per_class = [0, 0, 0, 0, 0, 0]
    vocabulary = set()

    for key, group in df.groupby('severity'):
        for summary in group['summary']:
            # Missing summaries (None / NaN) carry no words; skip them
            # instead of failing on .split().
            if not isinstance(summary, str):
                continue
            words = summary.split()
            total_words_per_class[key] += len(words)
            for word in words:
                vocabulary.add(word)
                word_count_per_class[word][key] += 1

    return class_probability, word_count_per_class, vocabulary, total_words_per_class
        

# NOTE(review): `vocabulary` and `total_words_per_class` are locals of fit();
# at cell level they only exist after the evaluation cell further down unpacks
# fit()'s return value, so this expression fails on a fresh top-to-bottom run.
(len(vocabulary), total_words_per_class)


(4257, [0, 2567, 6710, 14360, 7937, 4515])

In [48]:
from pprint import pprint

from functools import reduce
import operator

def measure_accuracy(test_data, class_probability, word_count_per_class, vocabulary, total_words_per_class):
    """Score a fitted Naive Bayes model and return the per-class F-measure.

    Each test summary is assigned the class maximising

        log P(class) + sum_i log P(word_i | class)

    with Laplace smoothing:

        P(word | class) = (occurrences of word in class + 1)
                          / (total words in class + vocabulary size)

    Scores are accumulated in log space because the plain product of many
    small probabilities underflows to 0.0 for long summaries.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Rows with a 'summary' column and an integer 'severity' column.
    class_probability : sequence of float
        Prior per class, indexed 1..5 (index 0 unused).
    word_count_per_class : mapping of str -> 6-element list
        Per-class occurrence counts for every training word. It is only
        read: words not seen in training are not inserted.
    vocabulary : set of str
        Distinct words seen during training.
    total_words_per_class : sequence of int
        Number of word tokens seen per class, indexed 1..5.

    Returns
    -------
    list of float
        F-measure for classes 1..5, in that order. A class whose precision
        or recall is undefined (never predicted / never present) scores 0.0.
    """
    import math

    classes = range(1, 6)
    vocabulary_size = len(vocabulary)
    no_counts = [0] * 6  # counts used for words never seen in training

    def log_likelihood(words, clazz):
        """Sum of log P(word | clazz) over `words`, Laplace-smoothed."""
        denominator = total_words_per_class[clazz] + vocabulary_size
        if denominator == 0:
            # Nothing was learned at all; the words carry no information.
            return 0.0
        return sum(
            # .get() keeps unseen words out of a defaultdict model.
            math.log((word_count_per_class.get(word, no_counts)[clazz] + 1) / denominator)
            for word in words
        )

    def predict_severity(summary):
        """Return the most probable class, or 0 if no class has a positive prior."""
        words = summary.split() if isinstance(summary, str) else []
        best_class, best_score = 0, -math.inf
        for clazz in classes:
            prior = class_probability[clazz]
            if prior <= 0:
                continue  # class absent from training data: cannot be predicted
            score = math.log(prior) + log_likelihood(words, clazz)
            if score > best_score:  # strict '>' keeps the lowest class on ties
                best_class, best_score = clazz, score
        return best_class

    # Confusion matrix: result_matrix[predicted][actual]; row/column 0 is the
    # "no prediction" bucket.
    result_matrix = [[0, 0, 0, 0, 0, 0] for _ in range(6)]
    for summary, real in zip(test_data['summary'], test_data['severity']):
        result_matrix[predict_severity(summary)][int(real)] += 1

    def safe_ratio(numerator, denominator):
        """numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    f_measures = []
    for clazz in classes:
        true_positives = result_matrix[clazz][clazz]
        precision = safe_ratio(true_positives, sum(result_matrix[clazz]))
        recall = safe_ratio(true_positives, sum(row[clazz] for row in result_matrix))
        f_measures.append(safe_ratio(2 * precision * recall, precision + recall))

    return f_measures
    
    
    
    
    


    

In [54]:
from pprint import pprint
# Incremental evaluation: the data set is cut into N_CHUNKS equal chunks; in
# round i the model is trained on the first i chunks and tested on every
# remaining row, giving N_CHUNKS - 1 rounds.
N_CHUNKS = 11

df = pd.read_excel('data/Eclipse_4sourcev1.xls', header=None, names=['severity', 'summary', 'description'], usecols=[0, 1, 2])

total_rows = len(df)
nrows_per_fold = total_rows // N_CHUNKS

f_measures = []

for i in range(1, N_CHUNKS):
    train_data_index = i * nrows_per_fold
    train_data = df[:train_data_index]
    test_data = df[train_data_index:]

    class_probability, word_count_per_class, vocabulary, total_words_per_class = fit(train_data)
    f_measures.append(measure_accuracy(test_data, class_probability, word_count_per_class, vocabulary, total_words_per_class))

# Mean F-measure per class across all rounds; the divisor follows the number
# of rounds actually run instead of a separately hard-coded constant.
avg_fmeasure = [sum(per_class) / len(f_measures) for per_class in zip(*f_measures)]

pprint(f_measures)
print(avg_fmeasure)


[[0.09215686274509804,
  0.2739244951712028,
  0.37616822429906543,
  0.24980959634424982,
  0.13256006628003314],
 [0.148479427549195,
  0.2657641610260064,
  0.39447125932426497,
  0.2976827094474153,
  0.19716206123973112],
 [0.1442495126705653,
  0.2570937231298366,
  0.44856815578465065,
  0.3044943820224719,
  0.2636289666395444],
 [0.14984391259105098,
  0.24281150159744408,
  0.46979530703944083,
  0.2921348314606742,
  0.2801556420233463],
 [0.14639905548996457,
  0.25138632162661734,
  0.4689298043728423,
  0.30965682362330404,
  0.3046044864226682],
 [0.14392059553349876,
  0.22549019607843138,
  0.4644194756554307,
  0.32622798887859134,
  0.28484848484848485],
 [0.11487481590574375,
  0.229038854805726,
  0.4555789473684211,
  0.33651551312649164,
  0.28225806451612906],
 [0.13430127041742287,
  0.17923186344238978,
  0.46271094175285793,
  0.3367003367003367,
  0.25806451612903225],
 [0.1818181818181818,
  0.17040358744394618,
  0.47200000000000003,
  0.33139534883720934,

In [50]:
# Sanity check: number of rows in the loaded data set.
len(df)

7373