In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

In [2]:
import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from bs4 import BeautifulSoup

In [3]:
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [5]:
positive_reviews = BeautifulSoup(open(r'sorted_data_acl\electronics\positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')
np.random.shuffle(positive_reviews)

In [6]:
negative_reviews = BeautifulSoup(open(r'sorted_data_acl\electronics\negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')
np.random.shuffle(negative_reviews)

In [7]:
diff = len(positive_reviews) - len(negative_reviews)
idxs = np.random.choice(len(negative_reviews), size=diff)
extra = [negative_reviews[i] for i in idxs]
negative_reviews += extra

In [8]:
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens

In [9]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [10]:
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [11]:
for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [12]:
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 10946


In [13]:
# now let's create our input matrices
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [14]:
N = len(positive_tokenized) + len(negative_tokenized)

In [15]:
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [16]:
orig_reviews, data = shuffle(orig_reviews, data)

In [17]:
X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [18]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.7805263157894737
Test accuracy: 0.73


In [19]:
# let's look at the weights for each word
# try it with different threshold values!
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

you 1.1475819567918162
've 0.8145583333556904
n't -2.1042451840353382
buy -0.9055855439331382
little 1.0071952264174695
lot 0.704848746573803
wa -1.7694708568958835
cable 0.5182595661449163
sound 0.9844727651687323
price 2.622273031801436
home 0.5099983859843846
time -0.5795244070381305
recommend 0.622452685540962
highly 0.9525269947066872
quality 1.3855972098295475
try -0.7062906678434426
month -0.8363120100245447
love 1.1587424343122295
ha 0.7574730013476119
memory 0.9685582779806071
returned -0.8028166827913277
bad -0.797344684346985
returning -0.5045290523828067
item -1.0251911451361981
expected 0.5561199397140033
unit -0.534099834706297
doe -1.1673332461794088
bit 0.6332899931143814
perfect 0.9931621588080344
fast 0.9853637241681494
easy 1.7932187067538496
pro 0.5307515440495661
pretty 0.6836610102492499
excellent 1.3434113716287166
then -1.1265770597186
return -1.1904547523450018
happy 0.6367000497347804
space 0.6122997810866645
using 0.6437441557204685
comfortable 0.613436126707

In [20]:
# check misclassified examples
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

In [21]:
# since there are many, just print the "most" wrong samples
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None

In [22]:
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

In [23]:
print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.3450757287185476, pred = 0.0):

I didn't buy this on Amazon but wanted to say this device is great. The only bad thing was MY laptop is old!  Can't go wrong with this one

Most wrong negative review (prob = 0.6023507974107721, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

