# Sentiment Analyzer

In [15]:
import nltk
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
stop_words = nltk.corpus.stopwords.words('english')

### Electronics Category Review

In [27]:
positive_reviews = BeautifulSoup(open('/Users/praga/Downloads/electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('/Users/praga/Downloads/electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [21]:
len(positive_reviews)

983

In [22]:
len(negative_reviews)

402

### Oversampling the Negative Reviews

In [28]:
diff = len(positive_reviews) - len(negative_reviews)
idxs = np.random.choice(len(negative_reviews), size=diff)
extra = [negative_reviews[i] for i in idxs]
negative_reviews += extra

In [29]:
len(negative_reviews)

983

In [30]:
#### or undersample the positive reviews

# np.random.shuffle(positive_reviews)
# positive_reviews = positive_reviews[:len(negative_reviews)]

### Custom Tokenizer

In [35]:
wordnet_lemmatizer = WordNetLemmatizer()

In [73]:
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords.words('english')] # remove stopwords
    tokens= [t for t in tokens if t.isalpha()]
    return tokens

In [74]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [75]:
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [76]:
for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [77]:
positive_reviews[0]

<review_text>\nI purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.\n\nI feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.\n\nAs always, Amazon had it to me in &lt;2 business days\n</review_text>

In [78]:
pos_0=' '.join(positive_tokenized[0])
pos_0

u'purchased unit due frequent blackout area power supply going bad run cable modem router lcd monitor minute enough time save work shut equally important know electronics receiving clean power feel investment minor compared loss valuable data failure equipment due power spike irregular power supply always amazon business day'

In [79]:
negative_reviews[0]

<review_text>\ncons\ntips extremely easy on carpet and if you have a lot of cds stacked at the top\n\npoorly designed, it is a vertical cd rack that doesnt have individual slots for cds, so if you want a cd from the bottom of a stack you have basically pull the whole stack to get to it\n\nputting it together was a pain, the one i bought i had to break a piece of metal just to fit it in its guide holes.\n\nagain..poorly designed... doesnt even fit cds that well, there are gaps, and the cd casses are loose fitting\n\npros\n..........\ni guess it can hold a lot of cds....\n</review_text>

In [80]:
neg_0=' '.join(negative_tokenized[0])
neg_0

u'con tip extremely easy carpet lot cd stacked top poorly designed vertical rack doesnt individual slot cd want bottom stack basically pull whole stack get putting together wa pain one bought break piece metal fit guide hole designed doesnt even fit cd well gap ca loose fitting pro guess hold lot cd'

### Input matrices (Normalized Count Vectorizer)

In [81]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [82]:
N = len(positive_tokenized) + len(negative_tokenized)
N

1966

In [93]:
data = np.zeros((N, len(word_index_map) + 1)) #data is a matrix of 1966 X 7542
i = 0
data.shape

(1966L, 7542L)

Word_index_map is a dictionary which contains the index(location) of all the words in both positive and negative reviews.

In [91]:
word_index_map['purchased'],word_index_map['unit']

(0, 1)

In [98]:
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

In [102]:
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [271]:
type(data)

numpy.ndarray

Data is a matrix which has 1966 rows, which are all the reviews in each row and there are 7542 columns denoting the unique words in these reviews. The values in data matrix is the normalized count value for each of the words.

In [270]:
len(data)

1966

In [103]:
len(data[0])

7542

In [268]:
data[1]

array([0., 0., 0., ..., 0., 0., 0.])

In [269]:
set(data[1])

{0.0, 0.02857142857142857, 0.05714285714285714}

### Setting the Model

In [104]:
orig_reviews, data = shuffle(orig_reviews, data)

In [105]:
X = data[:,:-1]
Y = data[:,-1]

In [106]:
# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]


### Logistic Regression Model

In [155]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print"Train accuracy:{:1.2%}".format(model.score(Xtrain, Ytrain))
print"Test accuracy:{:1.2%}".format(model.score(Xtest, Ytest))

Train accuracy:80.28%
Test accuracy:79.00%


### Naive Bayes Model

In [180]:
from sklearn.naive_bayes import MultinomialNB
model_nb=MultinomialNB()
model_nb.fit(Xtrain, Ytrain)
print"Train accuracy:{:1.2%}".format(model_nb.score(Xtrain, Ytrain))
print"Test accuracy:{:1.2%}".format(model_nb.score(Xtest, Ytest))

Train accuracy:87.41%
Test accuracy:82.00%


In [274]:
from future.utils import iteritems
threshold = 0.5
i=0
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        if i <15:
            print(word, weight)
            i+=1
        else:
            break


(u'unit', -0.6693402198376536)
(u'best', 1.1059264306368357)
(u'much', 0.6370770500863964)
(u'worked', -0.8340595805433348)
(u'easy', 1.1159350589043164)
(u'happy', 0.6851858331743848)
(u'time', -0.618083341483251)
(u'love', 0.9940843407094628)
(u'working', -0.6559656517002291)
(u'printer', 0.5411236225243428)
(u'original', -0.5159647793803978)
(u'returned', -0.5427291111267295)
(u'cable', 0.8620004244369122)
(u'small', 0.8062855702526749)
(u'company', -0.5755700457455178)


In [278]:
from future.utils import iteritems
threshold = 0.5
i=0
for word, index in iteritems(word_index_map):
    weight = model_nb.coef_[0][index]
    if weight > threshold or weight < -threshold:
        if i <15:
            print(word, weight)
            i+=1
        else:
            break

(u'raining', -9.03033192734806)
(u'conspiratively', -9.043577154098081)
(u'yellow', -8.990133118936946)
(u'four', -8.794575962309032)
(u'gag', -9.043577154098081)
(u'circuitry', -9.040691149208946)
(u'hanging', -8.960560001102877)
(u'marching', -9.028539276733541)
(u'shure', -8.994968024757819)
(u'looking', -8.180464248669278)
(u'accupower', -9.030673749262172)
(u'eligible', -9.00848583428681)
(u'electricity', -9.015386483774035)
(u'scold', -9.014589617224829)
(u'unanswered', -9.043577154098081)


Words with negative weights represent negative sentiment.

In [221]:
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)
preds_nb = model_nb.predict(X)
P_nb = model_nb.predict_proba(X)[:,1] # p(y = 1 | x)

### Model Evaluation

#### Logistic Regression

In [201]:
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None

In [212]:
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

In [213]:
print("Most wrong positive review:") 
print(wrong_positive_review)
print('prob = %s, pred = %s, actual = 1'% (minP_whenYis1, wrong_positive_prediction))

Most wrong positive review:

This was a defective unit. Got new unit and it works as expected

prob = 0.393088492994584, pred = 0.0, actual = 1


In [214]:
print("Most wrong negative review:") 
print(wrong_negative_review)
print('prob = %s, pred = %s, actual = 0'% (maxP_whenYis0, wrong_negative_prediction))

Most wrong negative review:

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

prob = 0.6324071475427929, pred = 1.0, actual = 0


#### Naive Bayes

In [222]:
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None

In [223]:
for i in range(N):
    p = P_nb[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds_nb[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds_nb[i]
            maxP_whenYis0 = p

In [224]:
print("Most wrong positive review:") 
print(wrong_positive_review)
print('prob = %s, pred = %s, actual = 1'% (minP_whenYis1, wrong_positive_prediction))

Most wrong positive review:

The Sandisk 512 MB Secure Digitial Ultra II (SDSDH-512-901) sent to me was not the item I ordered.  I returned the item, unopened

prob = 0.44208612683813353, pred = 0.0, actual = 1


In [225]:
print("Most wrong negative review:") 
print(wrong_negative_review)
print('prob = %s, pred = %s, actual = 0'% (maxP_whenYis0, wrong_negative_prediction))

Most wrong negative review:

I found that these DVD-Rs did not work well in my system, were unreliable and slow.  I cannot recommend them

prob = 0.5404190221588154, pred = 1.0, actual = 0


### Ada Boost Classifier

In [283]:
from sklearn.ensemble import AdaBoostClassifier
model_ada= AdaBoostClassifier(n_estimators=75)
model_ada.fit(Xtrain, Ytrain)
print"Train accuracy:{:1.2%}".format(model_ada.score(Xtrain, Ytrain))
print"Test accuracy:{:1.2%}".format(model_ada.score(Xtest, Ytest))

Train accuracy:91.16%
Test accuracy:89.00%


................................................................................................................................................................................................................................................................

# DVD Category Review

In [348]:
positive_reviews = BeautifulSoup(open('/Users/praga/Downloads/sorted_data_acl/dvd/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('/Users/praga/Downloads/sorted_data_acl/dvd/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [349]:
len(positive_reviews)

31

In [350]:
len(negative_reviews)

121

In [351]:
positive_reviews[0]

<review_text>\nI'm not sure why Sony, which now owns I Dream of Jeannie, decided to colorize the first season of this series.  Whatever the reason, you can readily tell by looking at the prices here on Amazon.com that the original black-and-white version of the first season is worth a lot more.  The reason for that is simple--I Dream of Jeannie was originally broadcast in black-and-white.  And for a television fan like myself, that's the ONLY way to watch the first season.\n\nThe episodes themselves are just as I remember seeing them.  Since I wasn't around in 1965, I'm pretty sure I've never seen these without the cuts that have been referenced here.  But to me, they're still pretty good.  The theme music, in my opinion, is every bit as good as the second theme, introduced when Jeannie went to color in 1966.  \n\nThe one thing that truly will drive the purists nuts is the fact that Sony stripped off the old Screen Gems animation from the end of every episode.  That logo was attached t

In [352]:
negative_reviews[0]

<review_text>\nThis entire movie could have run in only 20 minutes and you wouldn't miss anything and might even enjoy it. Unfortunately it ran 88 minutes too long and I couldn't wait for it to end.  I saw it in the theater and the people all around me were all complaining how boring it was. At least a quarter of them walked out before the end. It's that bad. It's a shame, I love a good suspense/horror movie and the decent actors in this movies were waisted\n</review_text>

In [353]:
diff = -len(positive_reviews) + len(negative_reviews)
idxs = np.random.choice(len(positive_reviews), size=diff)
extra = [positive_reviews[i] for i in idxs]
positive_reviews += extra

In [354]:
len(positive_reviews)

121

In [355]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [356]:
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [357]:
for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [358]:
print"len(word_index_map):", len(word_index_map)


len(word_index_map): 4143


In [359]:
N = len(positive_tokenized) + len(negative_tokenized)

In [360]:
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [369]:
orig_reviews, data = shuffle(orig_reviews, data)

In [370]:
X = data[:,:-1]
Y = data[:,-1]

In [374]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

### Model Evaluation

In [376]:
from sklearn.naive_bayes import MultinomialNB
model_nb=MultinomialNB()
model_nb.fit(X_train, y_train)
print"Train accuracy:{:1.2%}".format(model_nb.score(X_train, y_train))
print"Test accuracy:{:1.2%}".format(model_nb.score(X_test, y_test))

Train accuracy:77.78%
Test accuracy:56.25%


In [375]:
model = LogisticRegression()
model.fit(X_train, y_train)

print"Train accuracy:{:1.2%}".format(model.score(X_train, y_train))
print"Test accuracy:{:1.2%}".format(model.score(X_test, y_test))

Train accuracy:88.89%
Test accuracy:73.75%


In [385]:
model_ada= AdaBoostClassifier(n_estimators=13)
model_ada.fit(X_train, y_train)
print"Train accuracy:{:1.2%}".format(model_ada.score(X_train, y_train))
print"Test accuracy:{:1.2%}".format(model_ada.score(X_test, y_test))

Train accuracy:95.68%
Test accuracy:91.25%


In [387]:
preds = model_ada.predict(X)
P = model_ada.predict_proba(X)[:,1] # p(y = 1 | x)


In [388]:
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None

In [389]:
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

In [390]:
print("Most wrong positive review:") 
print(wrong_positive_review)
print('prob = %s, pred = %s, actual = 1'% (minP_whenYis1, wrong_positive_prediction))

Most wrong positive review:

he kept it real thats all i can say. aw yeah those people talkin bout its too much cussin. what u expect im from the hood and every other word i say is a cuss word. i just cant help it. everybody in the hood cusses. but anyways..

prob = 0.4675154477459153, pred = 0.0, actual = 1


In [391]:
print("Most wrong negative review:") 
print(wrong_negative_review)
print('prob = %s, pred = %s, actual = 0'% (maxP_whenYis0, wrong_negative_prediction))

Most wrong negative review:

If you are looking for a good movie to buy for your child, pass on this one. This movie has so many drug references, i can't even begin to explain.(trust me, I just so happen to have taken acid before) This is a movie that NEVER should have been directed toward children. 
  
   If you want your child to be drug free when he/she grows up, do not buy this

prob = 0.9404172966821557, pred = 1.0, actual = 0
