 # Naive Bayes classifier

In [1]:
import re
from os.path import join
from glob import glob
from random import shuffle, seed

# import regex
from tqdm import tqdm_notebook

import nltk
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [2]:
STEMMER = SnowballStemmer('english')
SEED = 9745

Extract features from the document.

In [3]:
def tokenize(file_name):
    """Read a text file and return its stemmed, filtered word tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. English
    stop words are removed, the remaining tokens are stemmed with the
    module-level ``STEMMER``, and only stems that have at least three
    characters and start with a letter are kept.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of stemmed word strings in the order they appear in the file.
    """
    # Use a context manager so the file handle is closed as soon as the
    # text has been read, instead of leaking it until garbage collection.
    with open(file_name) as text_file:
        text = text_file.read().lower()
    tokens = nltk.word_tokenize(text)

    # Fetch the stop-word list once and turn it into a set: calling
    # stopwords.words() for every token repeats the lookup and makes each
    # membership test a linear scan over a list.
    stop_words = set(stopwords.words('english'))

    # Drop stop words first (on the raw token), then stem what is left.
    words = [STEMMER.stem(t) for t in tokens if t not in stop_words]

    # A word must have 3 or more characters and begin with a letter
    # (re.match only tests the start of the string).
    # NOTE(review): if "contains at least one letter anywhere" was the
    # intent, this would need re.search instead -- confirm.
    words = [w for w in words if len(w) >= 3 and re.match(r'[^\W\d\_]', w)]

    return words

In [4]:
def build_corpus(locations):
    """Tokenize every ``*.txt`` file under each location into a labelled corpus.

    Args:
        locations: Iterable of ``(directory, category)`` pairs. Each
            directory is searched (non-recursively) for ``*.txt`` files.

    Returns:
        A list of ``(tokens, category)`` tuples, one per file, where
        ``tokens`` is the result of ``tokenize`` for that file.
    """
    corpus = []

    for location, category in locations:
        # glob() yields paths in arbitrary, platform-dependent order; sort
        # them so the corpus is assembled in the same order on every run.
        files = sorted(glob(join(location, '*.txt')))
        for file_name in tqdm_notebook(files, desc=category):
            corpus.append((tokenize(file_name), category))

    return corpus

In [5]:
def build_frequency_dist(corpus):
    """Count how often each word occurs across the whole corpus.

    Args:
        corpus: Iterable of ``(words, label)`` pairs; only ``words`` is used.

    Returns:
        A ``FreqDist`` mapping each word to its total number of occurrences
        over all documents.
    """
    # Feed the words of every document, in corpus order, straight into the
    # frequency distribution; the label of each document is ignored.
    return FreqDist(
        word
        for doc_words, _label in corpus
        for word in doc_words
    )

In [6]:
def document_features(features, document):
    """Build a presence/absence feature dict for one document.

    Args:
        features: Iterable of candidate feature words.
        document: Sequence whose first element is the document's word list.

    Returns:
        A dict mapping every word in ``features`` to ``True`` when it occurs
        in the document's words and ``False`` otherwise.
    """
    # A set gives constant-time membership checks for each feature word.
    present = set(document[0])

    flags = {}
    for feature in features:
        flags[feature] = feature in present
    return flags

In [None]:
# Seed the random module with the notebook's fixed SEED so that the shuffle
# below produces the same ordering every time the notebook is run.
seed(SEED)

# Load and tokenize both categories, then mix the documents so the two
# classes are interleaved rather than grouped by directory.
corpus = build_corpus([('data/Rel-Yes', 'Rel-Yes'),
                       ('data/Rel-No', 'Rel-No')])
shuffle(corpus)

# Word frequencies over the entire corpus.
all_words = build_frequency_dist(corpus)




In [None]:
# Show the ten most frequent words with their counts, followed on the next
# line by the count for the word 'record'.
print(all_words.most_common(10), all_words['record'], sep='\n')

In [None]:
# Use the 1000 most frequent words as the feature vocabulary.
# keys() is not ordered by frequency, so slicing it would pick whichever
# 1000 words happened to be inserted first; most_common() ranks by count.
word_features = [word for word, _count in all_words.most_common(1000)]
print(word_features[:10])

In [None]:
# Pair each document's feature dict with its category label (the second
# element of the corpus entry).
feature_set = [
    (document_features(word_features, doc), doc[1])
    for doc in corpus
]