In [147]:
import os
import re
import random
import math
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\czt3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\czt3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\czt3\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\czt3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Read reviews from training file
reviews = open('data/train.txt', 'r').read().split('\n')
reviews = [line.split('\t') for line in reviews]

# Validate reviews strings
for r in reviews:
    if len(r) is not 2:
        print('Rejected invalid input: ', str(r))
        reviews.remove(r)

# Split positive and negative reviews
rw_pos = [r[0] for r in reviews if r[1] == '1']
rw_neg = [r[0] for r in reviews if r[1] == '0']

Rejected invalid input:  ['']


In [3]:
# Load stop words
stop_words = list(set(stopwords.words('english')))

# Select word types
'''
    j -> adjective
    r -> adverb
    v -> verb
'''
sel_wts = ['J', 'V', 'R']

In [29]:
# Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# Extract keywords
def parse_review(r):
    bow = []
    # remove punctuations
    cleaned = re.sub(r'[^(a-zA-Z)\s]','', r)
    # Tokenise
    tokenized = word_tokenize(cleaned)
    # Remove stop words
    stopped = [w for w in tokenized if not w in stop_words]
    # Part of Speed labelling 
    labelled = nltk.pos_tag(stopped)
    # Reserve word types needed
    for w in labelled:
        w_type = w[1][0]
        if w_type in sel_wts:
            word = w[0].lower()
            if w_type == 'V': # convert verbs to present tense
                word = lemmatizer.lemmatize(word, 'v')
            bow.append(word)
    return bow
    
# Extract all selected words from parsed reviews
all_words = []
for r in rw_pos+rw_neg:
    all_words = all_words + parse_review(r)

In [30]:
# Get words frequency
all_words = nltk.FreqDist(all_words)
# Reject low frequency words
all_words = [word for word, freq in all_words.items() if freq > 1]
# all_words

In [36]:
# parse reviews to vectors
null_count = 0
def rw_to_vec(r):
    bow = parse_review(r)
    if len(bow) == 0:
        global null_count
        null_count += 1
    vec = {}
    for w in all_words:
        vec[w] = (w in bow)
    return vec
    
rw_pos_vecs = [(rw_to_vec(r), 1) for r in rw_pos]
rw_neg_vecs = [(rw_to_vec(r), 0) for r in rw_neg]

In [68]:
# Filter review vectors that has no word matched
rw_vecs = []
for tp in rw_pos_vecs + rw_neg_vecs:
    vec = tp[0]
    wd_matched = [k for k,v in vec.items() if v]
    if len(wd_matched) > 0:
        rw_vecs.append(tp)

In [108]:
# Shuffle the vectors
random.shuffle(rw_vecs)

# Split train & test set
train_set_pct = 0.8
split_num = math.floor(len(rw_vecs) * train_set_pct)
train_set = rw_vecs[:split_num]
test_set = rw_vecs[split_num:]

In [109]:
# Train the model (classifier)
model = nltk.NaiveBayesClassifier.train(train_set)


Classifier accuracy percent: 80.11695906432749


In [149]:
# Show accuracy and most informative features
print("Classifier accuracy percent:",(nltk.classify.accuracy(model, test_set))*100)
model.show_most_informative_features(20)

Classifier accuracy percent: 80.11695906432749
Most Informative Features
                friendly = True                1 : 0      =      6.3 : 1.0
                    nice = True                1 : 0      =      5.9 : 1.0
                    love = True                1 : 0      =      5.7 : 1.0
                    wait = True                0 : 1      =      5.7 : 1.0
              disappoint = True                0 : 1      =      4.9 : 1.0
                      do = True                0 : 1      =      4.9 : 1.0
                    dont = True                0 : 1      =      4.6 : 1.0
                    take = True                0 : 1      =      4.5 : 1.0
                    wont = True                0 : 1      =      4.5 : 1.0
                   worst = True                0 : 1      =      4.2 : 1.0
                    know = True                0 : 1      =      4.2 : 1.0
                   never = True                0 : 1      =      4.1 : 1.0
                      so = 

In [151]:
prob_dist = model.prob_classify(test_set[60][0])
for k in prob_dist.samples():
    print(k, prob_dist.prob(k))

0 0.7777502997482557
1 0.22224970025174404


In [145]:
prob_dist.max()

0

In [144]:
model.classify(test_set[50][0])

0