In [None]:
# tot, train, test, cls, voc = data_cutter(amazon, imdb, yelp)

# good = [word for word in voc if '1' in voc[word]["class"]]
# bad = [word for word in voc if '0' in voc[word]["class"]]

# unq_good = [word for word in good if word not in bad]
# unq_bad = [word for word in bad if word not in good]

# both = [word for word in voc if word in good and word in bad]

# # for word in both:
# #     print("{}\t-> {}".format(word, voc[word]))

# print(cls)
   

In [None]:
import random
from math import *
import time
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
import operator

# NOTE(review): this Lancaster stemmer is instantiated at module level but is
# never referenced by the Data class below — confirm whether stemming was
# meant to be applied during tokenisation.
stemmer = LancasterStemmer()


class Data:
    """Multinomial naive Bayes classifier built from tab-separated labelled sentences.

    Each input file is expected to contain one document per line in the form
    ``<sentence>\\t<integer label>``. Constructing the object reads every file,
    splits the documents into training and test partitions, builds a
    per-class word-frequency vocabulary from the training partition and
    pre-computes the log priors and log likelihoods used by
    :meth:`nb_classifier`.

    Attributes set by ``__init__``:
        total_data, training_data, test_data: lists of ``(label, words)`` tuples.
        doc_classes: ``{label: number of training documents}``.
        vocabulary: ``{word: {"frequency": {label: count}, "class": [labels],
            "probability": {label: log10 likelihood}}}``.
        voc_classes: ``{label: total word occurrences kept in the vocabulary}``.
        log_priors: ``{label: log10 prior}``.
    """

    # Characters removed from every sentence before tokenisation
    # (punctuation and digits). The backslash is escaped explicitly.
    PUNCTUATION = "!@#$%^&*()_'+=,./;[]\\<>?:{}|-`~1234567890"

    def __init__(self, *files, train_partition=0.8, quiet=True, shuffle=True, trim=True):
        """Load ``files`` and train the classifier.

        :param files: paths of tab-separated ``sentence<TAB>label`` files.
        :param train_partition: fraction of the documents used for training.
        :param quiet: when False, progress and summary lines are printed.
        :param shuffle: shuffle the documents before partitioning.
        :param trim: drop words whose class distribution is nearly uniform
            (see :meth:`trim_common`) before computing likelihoods.
        """
        self.files = files
        self.train_partition = train_partition
        self.quiet = quiet
        self.shuffle = shuffle
        self.trim = trim
        (self.total_data,
         self.training_data,
         self.test_data,
         self.doc_classes,
         self.vocabulary) = self.data_cutter()
        if self.trim:
            self.trim_common()
        # Word totals are counted after trimming so the likelihoods are
        # normalised over the vocabulary that is actually kept.
        self.voc_classes = self.count_class()
        self.log_priors = self.log_prior()
        self.log_likelihood()

    def _read_file(self, path):
        """Parse one labelled file.

        :return: ``(documents, line_count, word_count)`` where ``documents``
            is a list of ``(label, words)`` tuples.
        :raises IndexError, ValueError: on a non-blank line that lacks a tab
            separated integer label.
        """
        strip_table = str.maketrans('', '', self.PUNCTUATION)
        documents = []
        line_count = 0
        word_count = 0
        # The context manager guarantees the handle is closed even when a
        # malformed line raises.
        with open(path, "r") as handle:
            for line in handle:
                if not line.strip():
                    # Blank (e.g. trailing) lines carry no document.
                    continue
                fields = line.rstrip('\n').split('\t')
                label = int(fields[1])
                sentence = fields[0].lower().translate(strip_table)
                words = nltk.word_tokenize(sentence)
                documents.append((label, words))
                line_count += 1
                word_count += len(words)
        return documents, line_count, word_count

    def data_cutter(self):
        """Read all files, partition the documents and build the vocabulary.

        :return: ``(total_data, training_data, test_data, doc_classes,
            vocabulary)``. ``doc_classes`` contains every label seen in any
            file, counted over the training partition only (so a label that
            occurs only in the test partition maps to 0).
        """
        total_data = []
        doc_classes = {}

        for f in self.files:
            documents, line_count, word_total = self._read_file(f)
            total_data.extend(documents)
            for label, _ in documents:
                doc_classes.setdefault(label, 0)
            if not self.quiet:
                print('{} finished on line {} with {} words'
                      '\n==========================\n'.format(f, line_count, word_total))

        # Shuffle and partition once, after every file has been read.
        if self.shuffle:
            random.shuffle(total_data)

        split = int(self.train_partition * len(total_data))
        training_data = total_data[:split]
        test_data = total_data[split:]

        stop_words = set(stopwords.words('english'))
        vocabulary = {}
        word_count = 0

        for label, words in training_data:
            doc_classes[label] = doc_classes.get(label, 0) + 1

            for word in words:
                if word in stop_words:
                    continue
                entry = vocabulary.get(word)
                if entry is None:
                    # Every known class starts at zero so later lookups by
                    # class never miss.
                    entry = {"frequency": {l: 0 for l in doc_classes}, "class": []}
                    vocabulary[word] = entry
                if label not in entry["class"]:
                    entry["class"].append(label)
                entry["frequency"][label] = entry["frequency"].get(label, 0) + 1
                word_count += 1

        if not self.quiet:
            print(
                "=====Summary=====\n"
                "Unique words:\t -> {}\n"
                "Total words:\t -> {}\n"
                "Total data:\t -> {}\n"
                "Train data:\t -> {}\n"
                "Test data:\t -> {}\n"
                    .format(len(vocabulary),
                            word_count,
                            len(total_data),
                            len(training_data),
                            len(test_data)))

        return total_data, training_data, test_data, doc_classes, vocabulary

    def count_class(self):
        """Return ``{label: total occurrences of vocabulary words in that class}``."""
        nc = {label: 0 for label in self.doc_classes}
        for data in self.vocabulary.values():
            for cls in nc:
                nc[cls] += data['frequency'].get(cls, 0)
        return nc

    def log_likelihood(self):
        """Store add-one smoothed log10 likelihoods on every vocabulary entry.

        For word ``w`` and class ``c`` the stored value is
        ``log10((count(w, c) + 1) / (count(c) + |V|))`` where ``|V|`` is the
        vocabulary size, so the smoothed probabilities of each class sum to 1.

        :return: ``self`` (allows chaining).
        """
        vocab_size = len(self.vocabulary)
        for entry in self.vocabulary.values():
            entry["probability"] = {
                cls: log10((wfr + 1) / (self.voc_classes[cls] + vocab_size))
                for cls, wfr in entry["frequency"].items()
            }
        if not self.quiet:
            print("Log probabilities calculated")

        return self

    def log_prior(self):
        """Return ``{label: log10(P(label))}`` estimated from the training documents.

        A label without any training document gets ``-inf`` so it can never
        be selected by :meth:`nb_classifier` (and no math domain error is
        raised for ``log10(0)``).
        """
        log_priors = {}
        n_doc = len(self.training_data)
        for cls, n_c in self.doc_classes.items():
            if n_c > 0 and n_doc > 0:
                log_priors[cls] = log10(n_c / n_doc)
            else:
                log_priors[cls] = float('-inf')

        return log_priors

    def nb_classifier(self, doc):
        """Classify a tokenised document.

        :param doc: iterable of word tokens; words outside the vocabulary are
            ignored.
        :return: ``(score, label)`` for the class with the highest summed log
            probability (ties resolve to the larger label).
        """
        sum_c = {}
        for cls, lp in self.log_priors.items():
            sum_c[cls] = lp
            for word in doc:
                if word in self.vocabulary:
                    sum_c[cls] += self.vocabulary[word]['probability'][cls]

        return max(zip(sum_c.values(), sum_c.keys()))

    def test(self):
        """Return the accuracy on the test partition as a percentage.

        Returns ``0.0`` when the test partition is empty instead of dividing
        by zero.
        """
        if not self.test_data:
            return 0.0

        score = 0
        for label, words in self.test_data:
            if self.nb_classifier(words)[1] == label:
                score += 1

        return (score / len(self.test_data)) * 100

    def trim_common(self, min_total=10, max_diff=15):
        """Remove words whose occurrences are spread almost evenly over the classes.

        A word is removed when it was seen in more than one class, occurs
        more than ``min_total`` times overall, and the gap between its
        largest and smallest per-class share is below ``max_diff`` percentage
        points. With two classes this is the absolute difference of the two
        shares.

        :param min_total: words with at most this many occurrences are kept.
        :param max_diff: minimum spread (in percentage points) a frequent
            word needs in order to be kept.
        """
        del_w = []
        for w, fr in self.vocabulary.items():
            if len(fr['class']) <= 1:
                continue
            tot = sum(fr['frequency'].values())
            if tot <= min_total:
                continue
            perc = [(f / tot) * 100 for f in fr['frequency'].values()]
            if max(perc) - min(perc) < max_diff:
                del_w.append(w)

        # Deletion happens after the scan: a dict must not change size while
        # it is being iterated.
        for w in del_w:
            del self.vocabulary[w]

        if not self.quiet:
            print('{} words removed from vocabulary'.format(len(del_w)))


In [1]:
# Smoke-test the classifier on the three labelled-sentence files.
# NOTE(review): NB_DataHandler is imported from the project-local `util`
# module; it looks like the packaged form of the Data class defined in this
# notebook — confirm.
from util import NB_DataHandler



# Data files, given relative to the notebook's working directory.
amazon = "amazon_cells_labelled.txt"
imdb = "imdb_labelled.txt"
yelp = "yelp_labelled.txt"

# quiet=False so the per-file and summary lines are printed during loading.
# `v2` and the three path variables are reused by the cells that follow.
v2 = NB_DataHandler(amazon, imdb, yelp, quiet=False)
# presumably an accuracy percentage on the held-out split, as Data.test()
# returns — verify against util.NB_DataHandler
res = v2.test()
print(res)

amazon_cells_labelled.txt finished on line 1000 with 10202 words

imdb_labelled.txt finished on line 1000 with 14300 words

yelp_labelled.txt finished on line 1000 with 10848 words

=====Summary=====
Unique words:	 -> 3416
Total words:	 -> 28615
Total data:	 -> 3000
Train data:	 -> 2400
Test data:	 -> 600

38 words removed from vocabulary
Log probabilities calculated
82.0


In [None]:
# NOTE(review): report() is not defined on the Data class shown in this
# notebook; presumably it is provided by util.NB_DataHandler — confirm.
v2.report()

In [None]:
# Test accuracy over several independent runs.
# Each run rebuilds the handler from scratch, so the documents are
# re-partitioned every time (assuming NB_DataHandler shuffles by default as
# the Data class in this notebook does — TODO confirm).

import time
import matplotlib.pyplot as plt

runs = 100
vals = []
start = time.time()
for i in range(1, runs+1):
    v2 = NB_DataHandler(amazon, imdb, yelp)
    res = v2.test()
    vals.append(res)
    # Report progress after the run has finished so the elapsed time covers
    # exactly `i` completed runs.
    if i%10 == 0:
        t = time.time() - start
        print("run {}: {}".format(i, t))


avg = sum(vals)/len(vals)
finish = time.time()-start
# The first figure printed after "runs:" is the best (maximum) accuracy.
print("accuracy after {} runs:{} avg:{} \ntime: {}s"
      .format(runs, round(max(vals),3), round(avg, 3), round(finish,3)))

# Label the figure so it can be read on its own.
plt.plot(vals)
plt.title("Accuracy per run")
plt.xlabel("run")
plt.ylabel("accuracy (%)")
plt.show()

In [None]:
# Find common words: list words seen in more than one class, occurring more
# than 10 times overall, whose per-class shares differ by less than 15
# percentage points.

voc = v2.vocabulary

for w, fr in voc.items():
    if len(fr['class']) <= 1:
        continue
    tot = sum(fr['frequency'].values())
    if tot <= 10:
        continue
    perc = [(f/tot)*100 for f in fr['frequency'].values()]
    # Spread between the largest and smallest share; with two classes this
    # equals the absolute difference of the two shares, and it does not rely
    # on there being exactly two entries.
    dif = max(perc) - min(perc)
    if dif < 15:
        print ('{} -> {} -> {}'.format(w, perc, dif))

In [None]:
# Dump the per-class frequency table of every vocabulary word.

voc = v2.vocabulary
for word in voc:
    data = voc[word]
    print('{} -> {}'.format(word, data['frequency']))

In [None]:
# Display the training split held by the handler.
# presumably a list of (label, words) tuples, as built by Data.data_cutter
# in this notebook — verify against util.NB_DataHandler
d = v2.training_data
d

In [None]:
# Recompute and display the per-class word totals.
# NOTE(review): assumes NB_DataHandler.count_class matches Data.count_class
# above (sum of vocabulary frequencies per class) — confirm.
c = v2.count_class()
c

In [None]:
# Display the per-class word totals cached on the handler (in the Data class
# above this attribute is the result of count_class() taken at construction).
v2.voc_classes

In [None]:
# Display the per-class log priors (base-10 logs in the Data class above;
# presumably the same for NB_DataHandler — confirm).
v2.log_priors

In [None]:
# Print the tokens of one test document, then flag each token that the
# vocabulary knows about.
doc = 17
print(v2.test_data[doc][1])
for word in v2.test_data[doc][1]:
    if word not in v2.vocabulary:
        continue
    print('{} -> {}'.format(word, True))
    

In [None]:
# Scratch check of how max() orders (key, value) pairs: tuples compare on
# their first element, so the pair with the largest *key* wins here, not the
# pair with the largest value.
import operator
c = {0:1234, 1:344}
r = max(c.items())
r[1]

In [None]:
# Display the vocabulary entry stored for the word 'not'.
# NOTE(review): raises KeyError if 'not' is absent from the vocabulary
# (e.g. filtered as a stop word or trimmed) — confirm it is expected to exist.
v2.vocabulary['not']

In [None]:
# Trace the evidence for one test document: for every class print its log
# prior followed by the stored log probability of each known word.
# Note that sum_c only ever receives the prior; the word terms are printed
# but not accumulated.
doc = v2.test_data[17][1]

sum_c = {}
for cls in v2.log_priors:
    lp = v2.log_priors[cls]
    print('{} -> {}'.format(cls, lp))
    sum_c[cls] = lp
    for word in doc:
        if word not in v2.vocabulary:
            continue
        print('{} -> {}'.format(word, v2.vocabulary[word]['probability'][cls]))
                    

In [None]:
# NOTE(review): report() does not exist on the Data class defined in this
# notebook; it is presumably a util.NB_DataHandler method — confirm.
v2.report()


In [4]:
# Evaluate the trained handler against the Amazon file alone.
# NOTE(review): unit_test() is not defined on the Data class in this
# notebook; judging by the recorded output it prints the per-file summary and
# yields an accuracy-like figure — confirm against util.NB_DataHandler.
w = v2.unit_test(amazon)
w

amazon_cells_labelled.txt finished on line 1000 with 10202 words



92.0