In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

In [2]:
import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.neural_network import MLPClassifier
from bs4 import BeautifulSoup

In [3]:
# Single shared WordNet lemmatizer instance; my_tokenizer calls it to put each token into its base form.
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
# Load the stopword list, one entry per line with trailing whitespace stripped.
# The context manager closes the file handle as soon as the set is built.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

In [5]:
# Read the positive reviews and keep every <review_text> element, in random order.
# The context manager closes the file handle, and the forward-slash path resolves on
# Windows as well as on POSIX systems.
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks whichever parser is
# installed; pin one explicitly if parsing must be identical across machines.
with open('sorted_data_acl/electronics/positive.review') as positive_file:
    positive_soup = BeautifulSoup(positive_file.read())
positive_reviews = positive_soup.find_all('review_text')
np.random.shuffle(positive_reviews)

In [6]:
# Read the negative reviews and keep every <review_text> element, in random order.
# The context manager closes the file handle, and the forward-slash path resolves on
# Windows as well as on POSIX systems.
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks whichever parser is
# installed; pin one explicitly if parsing must be identical across machines.
with open('sorted_data_acl/electronics/negative.review') as negative_file:
    negative_soup = BeautifulSoup(negative_file.read())
negative_reviews = negative_soup.find_all('review_text')
np.random.shuffle(negative_reviews)

In [7]:
# Balance the two classes by oversampling (with replacement) whichever one is smaller.
# Sampling with size=abs(diff) keeps this working when the negatives outnumber the
# positives; a negative size would make np.random.choice raise.
diff = len(positive_reviews) - len(negative_reviews)
minority_reviews = negative_reviews if diff >= 0 else positive_reviews
idxs = np.random.choice(len(minority_reviews), size=abs(diff))
extra = [minority_reviews[i] for i in idxs]
# In-place extension, so the list bound to negative_reviews / positive_reviews grows.
minority_reviews += extra

In [8]:
def my_tokenizer(s):
    """Turn a raw review string into a list of cleaned tokens.

    The text is lower-cased and split with NLTK's word tokenizer. Tokens of
    two characters or fewer are dropped, the rest are lemmatized, and any
    lemma found in ``stopwords`` is removed.

    Args:
        s: review text.

    Returns:
        List of lemmatized tokens in their original order.
    """
    words = nltk.tokenize.word_tokenize(s.lower())
    # The length filter runs before lemmatization, so it applies to the surface form.
    lemmas = (wordnet_lemmatizer.lemmatize(word) for word in words if len(word) > 2)
    return [lemma for lemma in lemmas if lemma not in stopwords]

In [9]:
# Vocabulary state filled in by the indexing loops below:
#   word_index_map  - token -> column index
#   current_index   - next free column index
word_index_map, current_index = {}, 0
# Per-class token lists, plus the raw review texts in the order they are processed.
positive_tokenized, negative_tokenized, orig_reviews = [], [], []

In [10]:
# Tokenize every positive review, remember its raw text, and give each
# previously unseen token the next free vocabulary index.
for review in positive_reviews:
    text = review.text
    orig_reviews.append(text)
    review_tokens = my_tokenizer(text)
    positive_tokenized.append(review_tokens)
    for tok in review_tokens:
        if tok in word_index_map:
            continue
        word_index_map[tok] = current_index
        current_index += 1

In [11]:
# Tokenize every negative review, remember its raw text, and give each
# previously unseen token the next free vocabulary index.
for review in negative_reviews:
    text = review.text
    orig_reviews.append(text)
    review_tokens = my_tokenizer(text)
    negative_tokenized.append(review_tokens)
    for tok in review_tokens:
        if tok in word_index_map:
            continue
        word_index_map[tok] = current_index
        current_index += 1

In [12]:
# Report the vocabulary size: the number of distinct tokens indexed across both review sets.
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 10946


In [13]:
# now let's create our input matrices
def tokens_to_vector(tokens, label):
    """Build one normalized bag-of-words row with the class label appended.

    Args:
        tokens: tokens of a single review; each must be a key of
            ``word_index_map``.
        label: class label stored in the final element of the row.

    Returns:
        1-D float array of length ``len(word_index_map) + 1``. The leading
        entries are token counts divided by the total count (so they sum to
        1); the last entry is ``label``. For a review with no tokens the
        feature entries stay all zero.

    Raises:
        KeyError: if a token is missing from ``word_index_map``.
    """
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        x[word_index_map[t]] += 1
    # The label slot is still 0 here, so the sum is the total token count.
    total = x.sum()
    # An empty token list would otherwise divide by zero and fill the row with NaN.
    if total > 0:
        x = x / total
    x[-1] = label
    return x

In [14]:
# Total number of samples: one per tokenized review, positive and negative combined.
N = len(positive_tokenized) + len(negative_tokenized)

In [15]:
# Fill the sample matrix: one row per review, positives (label 1) first and
# negatives (label 0) after them. The final column holds the label.
data = np.zeros((N, len(word_index_map) + 1))
row = 0
for token_lists, label in ((positive_tokenized, 1), (negative_tokenized, 0)):
    for tokens in token_lists:
        data[row, :] = tokens_to_vector(tokens, label)
        row += 1

In [16]:
# Shuffle the raw texts and the feature rows together so that orig_reviews[i]
# still corresponds to data[i]. No random_state is passed, so the order differs
# from run to run.
orig_reviews, data = shuffle(orig_reviews, data)

In [17]:
# Separate the feature columns from the label stored in the final column.
X, Y = data[:, :-1], data[:, -1]

# Hold out the final 100 (already shuffled) rows for testing; train on the rest.
Xtrain, Xtest = X[:-100], X[-100:]
Ytrain, Ytest = Y[:-100], Y[-100:]

In [18]:
# Multi-layer perceptron with three hidden layers (150, 100 and 50 units),
# trained with Adam for at most 100 iterations; random_state fixes the weight init.
model = MLPClassifier(
    hidden_layer_sizes=(150, 100, 50),
    activation='relu',
    solver='adam',
    max_iter=100,
    random_state=1,
)
model.fit(Xtrain, Ytrain)

# Mean accuracy on the training rows and on the held-out rows.
train_accuracy = model.score(Xtrain, Ytrain)
test_accuracy = model.score(Xtest, Ytest)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

Train accuracy: 1.0
Test accuracy: 0.82


In [19]:
# check misclassified examples
# Both calls run on the full matrix X, so training rows and held-out rows are included.
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

In [20]:
# There can be many misclassified samples, so only the "most" wrong one of each
# class is tracked. The probability bounds start at the extremes so that any
# misclassified sample replaces them.
minP_whenYis1, maxP_whenYis0 = 1, 0
wrong_positive_review = wrong_negative_review = None
wrong_positive_prediction = wrong_negative_prediction = None

In [21]:
# Scan every sample and keep the most confidently misclassified review per class:
# the positive with the lowest p(y=1|x) and the negative with the highest.
for idx in range(N):
    prob, target = P[idx], Y[idx]
    if target == 1:
        if prob < 0.5 and prob < minP_whenYis1:
            wrong_positive_review = orig_reviews[idx]
            wrong_positive_prediction = preds[idx]
            minP_whenYis1 = prob
    elif target == 0:
        if prob > 0.5 and prob > maxP_whenYis0:
            wrong_negative_review = orig_reviews[idx]
            wrong_negative_prediction = preds[idx]
            maxP_whenYis0 = prob

In [22]:
# Show the positive review with the lowest predicted p(y=1|x) and the negative review
# with the highest. The review and prediction variables are still None if no
# misclassified sample of that class was found.
print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.00010549113685596732, pred = 0.0):

You must have received CA-709 (without the cassette component) by mistake.  CA-706 is prepackaged with the cassette component

Most wrong negative review (prob = 0.9996577980180292, pred = 1.0):

After waiting almost 6 months for this Skypephone, and its competitor from Belkin, to come to market through numerous postponements, I just gave up and bought an Asus P525 smartphone. For just over double the price of the Netgear I got a Windows CE PDA with GSM capability and full Skype compatibility. When browsing through your phone contacts your are given the choice to call through GSM or Skype. Because you have built-in Wi-Fi, you can make Skype calls and chats from any covered area, plus you can go on the internet using IE and check your e-mails with Outlook. I have also read that the Skype software that runs on dedicated phones like the Netgear and the Belkin is very crude due to the lack of processing power, whereas