```
題目:電商產品評分文件以機器學習方式分辨是否為正向或負向

 說明：輸入文件 positive.review 和 negative.review，兩者都是XML檔。我們用BeautifulSoup讀進來，
 擷取review_text，然後用NLTK自建Tokenizer。 先產生 word-to-index map 再產生 word-frequency vectors。
 之後 shuffle data 創造 train/test splits，留100個給 test 用。接著用Logistic Regression 分類器
 找出訓練組和測試組的準確度(Accuracy)。接著我們可以看看每個單字的正負權重，可以訂一個閾值，
 比方權重的絕對值大於0.5，以確認情緒是顯著的。最後我們找出根據現有演算法歸類錯誤最嚴重的正向情緒和負向
 情緒的例子。

 延伸:可用不同的tokenizer，不同的tokens_to_vector，不同的ML分類器做改進準確率的比較。最後可用您的
 model去預測unlabeled.review檔的內容。

 範例程式檔名: sentiment_情緒分析.py，以LogisticRegression 方式完成情緒分析。
 模組: sklearn, bs4, numpy, nltk, future
 輸入檔：stopwords.txt, sorted_data_acl/electronics/ 下的 positive.review, negative.review
 成績：辨識百分率

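注意事項：nltk 需要有 punkt、wordnet 和 averaged_perceptron_tagger 資源（my_tokenizer 會呼叫 nltk.pos_tag 做詞性標註）
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')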
資料檔需在適當位置 jupyter 或 colab 才能看到，用colab時要上傳 data 到 ./sample_data 或 mount
```

In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range


import re
import nltk
import numpy as np
from sklearn.utils import shuffle
from nltk.corpus import wordnet 
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

# Stopword list from http://www.lextek.com/manuals/onix/stopwords1.html
# Each line of the file is treated as one stopword (trailing whitespace is
# stripped).  The file is read inside a context manager so the handle is
# closed as soon as the set is built, and the encoding is stated explicitly
# to match how the review files are opened.
with open('stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = {w.rstrip() for w in stopwords_file}

In [3]:
# 另一個 stopwords 的來源
# from nltk.corpus import stopwords
# stopwords.words('english')

# Load the positive and negative reviews.
# Data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
def _load_review_texts(path):
    """Return all ``review_text`` elements found in the review file at *path*.

    The file is read as UTF-8 and parsed with BeautifulSoup's built-in
    ``html.parser``.  It is opened in a context manager so the handle is
    released as soon as the content has been read.

    Args:
        path: Location of a ``*.review`` file.

    Returns:
        The result of ``find_all('review_text')`` on the parsed document;
        each item exposes the review body through its ``.text`` attribute.
    """
    with open(path, encoding='utf-8') as review_file:
        soup = BeautifulSoup(review_file.read(), features='html.parser')
    # find_all is the current spelling of the deprecated findAll alias.
    return soup.find_all('review_text')


positive_reviews = _load_review_texts('sorted_data_acl/electronics/positive.review')
negative_reviews = _load_review_texts('sorted_data_acl/electronics/negative.review')

In [4]:
# 基於nltk自建 tokenizer

def my_tokenizer(s):
    """Convert a raw review string into a list of lemmatized, filtered tokens.

    Processing steps, in order:
      1. replace every non-word character with a space and lowercase the text;
      2. split the text with NLTK's ``word_tokenize``;
      3. discard tokens of two characters or fewer;
      4. POS-tag the remaining tokens and lemmatize each one with the WordNet
         part of speech matching its tag (noun when there is no match);
      5. discard tokens contained in the module-level ``stopwords`` set.

    Args:
        s: The review text to tokenize.

    Returns:
        A list of token strings; it may be empty.
    """
    s = re.sub(r'\W', ' ', s).lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    # Keyed by the first letter of the tag returned by nltk.pos_tag; any
    # other first letter falls back to 'n' (noun) below.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
    }
    tokens = [wordnet_lemmatizer.lemmatize(t, tag_dict.get(pos[0], 'n'))
              for (t, pos) in nltk.pos_tag(tokens)]
    # Stopwords are removed after lemmatization, so the lemma is what is
    # compared against the stopword list.
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

In [5]:
# Build the word-to-index map first; the word-frequency vectors come later.
# The tokenized reviews are kept so tokenization never has to be repeated.
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

# Positive reviews are processed before negative ones, so orig_reviews lines
# up with "all positives, then all negatives".
for review_set, tokenized_bucket in ((positive_reviews, positive_tokenized),
                                     (negative_reviews, negative_tokenized)):
    for review in review_set:
        text = review.text
        orig_reviews.append(text)
        tokens = my_tokenizer(text)
        tokenized_bucket.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue
            # First time this token is seen: give it the next free column.
            word_index_map[token] = current_index
            current_index += 1

print("len(word_index_map):", len(word_index_map))

len(word_index_map): 8876


In [6]:
# now let's create our input matrices
def tokens_to_vector(tokens, label, index_map=None):
    """Build a normalized word-frequency vector with the label appended.

    Args:
        tokens: Iterable of token strings; every token must be a key of the
            index map.
        label: Class label stored in the last element of the result.
        index_map: Optional mapping from token to column index.  Defaults to
            the module-level ``word_index_map``.

    Returns:
        A 1-D float array of length ``len(index_map) + 1``.  The first
        ``len(index_map)`` entries are the token counts divided by the total
        token count; the last entry is ``label``.  When ``tokens`` is empty
        the frequency part is left as all zeros.

    Raises:
        KeyError: If a token is missing from the index map.
    """
    if index_map is None:
        index_map = word_index_map
    x = np.zeros(len(index_map) + 1)  # the last element holds the label
    for t in tokens:
        x[index_map[t]] += 1
    total = x.sum()
    if total == 0:
        # Nothing survived tokenization: report it, and skip the division,
        # which would otherwise fill the whole row with NaN.
        print(tokens)
    else:
        x = x / total  # normalize counts to relative frequencies
    x[-1] = label
    return x

In [7]:
N = len(positive_tokenized) + len(negative_tokenized)

# (N x D+1) matrix: features and label share a row so they can be shuffled
# together later.
data = np.zeros((N, len(word_index_map) + 1))

# Positive reviews (label 1) fill the first rows, negative reviews (label 0)
# the rest.
labeled_tokens = [(tokens, 1) for tokens in positive_tokenized]
labeled_tokens += [(tokens, 0) for tokens in negative_tokenized]
for row, (tokens, label) in enumerate(labeled_tokens):
    data[row, :] = tokens_to_vector(tokens, label)

In [8]:
# Shuffle the reviews and their feature rows together, then carve out the
# train/test splits.  Re-run this cell to try different splits!
orig_reviews, data = shuffle(orig_reviews, data)

# The last column is the label; everything before it is a feature.
X, Y = data[:, :-1], data[:, -1]

# The final 100 rows are held out for testing.
Xtrain, Xtest = X[:-100], X[-100:]
Ytrain, Ytest = Y[:-100], Y[-100:]

In [9]:
# Show how many training rows carry each label (0 = negative, 1 = positive).
np.unique(Ytrain, return_counts=True)

(array([0., 1.]), array([951, 949], dtype=int64))

In [10]:
# Train the logistic-regression classifier and report accuracy on both the
# training split and the held-out split.
model = LogisticRegression(C=100).fit(Xtrain, Ytrain)
for split_name, features, targets in (("Train", Xtrain, Ytrain),
                                      ("Test", Xtest, Ytest)):
    print(split_name + " accuracy:", model.score(features, targets))



Train accuracy: 0.9536842105263158
Test accuracy: 0.81


In [11]:
# Print each vocabulary word whose learned weight is large in magnitude,
# whether positive or negative.
# Try different threshold values!
threshold = 0.5
word_weights = model.coef_[0]
for word, index in iteritems(word_index_map):
    weight = word_weights[index]
    if abs(weight) > threshold:
        print(word, weight)

purchase -0.5143209702486142
this -0.8688947491920044
unit -2.4924610104560414
due -7.919745574385151
frequent 1.5946454137774027
blackout 1.4924837681708605
power 4.188781859895338
supply 3.5942738270642387
bad -17.33089450108214
run 3.2013002122589493
cable 1.9523529847184833
modem 0.8648684691901359
router -5.459996081417225
lcd 1.5707397613907652
monitor 5.68794846452566
minute -5.502187395210485
time -1.4018924025483188
shut -7.372753048028884
equally 2.9944631080344486
electronics -3.007780773994582
receive -3.513621274325259
clean 2.7975200454593745
feel 1.618918355269801
investment 6.21511554578239
minor 8.826826530608164
compare 6.304515808989227
loss -2.7954349469670117
valuable 0.6024182075427139
data 5.050002150177903
failure -3.9217100112052203
equipment 2.922081329649886
spike 1.1492213570866887
irregular 1.0292371548350503
amazon 6.141760725437239
day -7.305977526787214
apc -0.7308050896969609
ups -3.2151783355541212
500 0.5202752959219953
recommendation 2.02889225355406

mono 0.9803956149058909
mike 0.6923887097008934
external -2.5187822484594746
toggle -0.6025744136053731
setting 1.889090330006298
change -7.035474190875804
fly 1.2740767142254465
built 4.5637048179310735
distort -3.6124513344125
horribly -0.6428382293197705
overall 11.696025450239127
pocket -1.2498538152050314
obviously -4.164991594564401
bite 2.365345521278758
trial 0.8395941342707475
error -12.058690469779574
summer 2.8458348748452726
involve -1.5745829431630418
class 1.7906771264230883
fear 1.4743430372886766
requirement 1.4488263554189909
special 4.431516599374614
typical 0.5763541231949314
convert 2.712970491744884
trasnfer -1.204268905039811
yourself -6.885360328998972
rca 0.601816288608317
pause 1.794358069470487
reliably -3.8009191904808173
799 -0.9328460579797156
produce 1.5361013519916589
previous 3.971945343477873
require -1.7776725946680283
future 1.0172817783743344
hubby 3.4178868905641533
rehoboth 0.5252909036847737
beach 3.8613597827812076
thru -3.493302823400467
pain 0.

indoor 1.6645145790963778
terk -0.6527648789564819
zenith 1.0179087432044485
fraction 0.7850818134315634
building 2.0550795279950855
cushiony 3.1522015984466476
living 1.8929655364141713
supportive 0.7806800661164454
smell -0.6178399621073134
garmin 4.0360971258847576
navigation 3.3233060961956546
waterproof -0.8658885856197374
unbox 2.0791682499663406
hop -3.619996519245134
la 2.204209143096771
airtunes 0.8101653473160518
802 4.114036277951627
11g 1.6237027289993042
simplicity 0.8958568164383174
refer -1.3586106754310208
limited -1.3835631914699837
distance -1.2038230579613916
effective 1.4757951137153076
relative 0.5293775099589912
wave 2.2264271446938992
intruding 1.1758335742209958
complains 1.1758335742209958
debate 1.8896099579724288
picturemate 0.6446224191675962
475 0.6446224191675962
4x6 0.6544114750276536
vivid 2.336062320956607
5gb -1.8660358537249613
nero -0.9627863886023821
verification -1.0938104203449885
mixture 0.5968323872363274
inspiron 1.1386560521394482
1505 0.59683

america 1.1080403542201938
logic 0.6279339012129606
wallet 0.868027528378861
expansion 0.5351468557761402
nylon 1.4709758191551592
refers 0.868027528378861
accompaning 0.868027528378861
juice -0.8671068691533892
session -1.0613248788652005
conjunction -2.5473820530937012
sbl 0.9739523047330632
soundcards 0.9739523047330632
successfully -0.9445266413689205
acually 0.5081655160814625
hd650 0.5081655160814625
denon 1.016331032162925
cheapy 0.5081655160814625
bugs 0.5081655160814625
upto -1.4788657371319596
navigator -1.8509742489302232
listing -1.346223750844373
sensor -1.0576298702782512
hr -2.3497288166007233
motion -2.710519691473152
flex -1.207866440953418
vendor -2.7936425224537027
hts 1.129178142190033
hls 1.129178142190033
5687w 1.129178142190033
directv 2.7755158156222723
50g 2.6840050698937024
successor 0.5368010139787406
49g 2.1472040559149623
securedigital 1.6104030419362214
9gb 0.5368010139787406
accessible 0.6762786340435767
recognice 0.5368010139787406
fat16 0.53680101397874

fruitless -1.4570446800736678
compensation -1.0785266473128416
thermal -0.7305407182969668
carafe -0.835877779033177
coffeemaker -0.5572518526887846
hotter -0.5290250577023319
capresso -1.1145037053775693
zojirushi -0.835877779033177
seven -1.3134946582248728
marketing -2.5571055464513615
scramble -1.153465543362198
puzzle -1.3985310446987629
solving -0.6837093050339169
h10 -4.204280201343974
subscription -3.250484925568284
disappearing -1.8745542240928073
desperately -1.8745542240928073
mazda -1.026192774534526
f1500 -1.026192774534526
kvm -1.1784305219460127
upper -1.2134451117944396
era -0.66648195457163
multimedia -0.7080064145541226
farther -0.6295028916482235
employer -0.5126977897063184
winxp -1.8253595473176252
eventual -0.6247463544092187
quaility -0.6566392930978323
lexmark -3.5664627692603474
mild -0.9213354988090994
brown -0.9213354988090994
haze -0.9213354988090994
improved -0.9621438358373315
passably -0.9213354988090994
speechless -0.7595759364764632
lynksys -0.748124907

In [12]:
# Score every review (train and test alike) so misclassified examples can be
# inspected afterwards.
preds = model.predict(X)
class_probabilities = model.predict_proba(X)
P = class_probabilities[:, 1]  # p(y = 1 | x)

In [13]:
# Report only the single worst mistake of each kind: the positive review with
# the lowest p(y = 1 | x) and the negative review with the highest.
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for review, p, y, pred in zip(orig_reviews, P, Y, preds):
    if y == 1:
        # Misclassified positive that is worse than the current record.
        if p < 0.5 and p < minP_whenYis1:
            minP_whenYis1 = p
            wrong_positive_review = review
            wrong_positive_prediction = pred
    elif y == 0:
        # Misclassified negative that is worse than the current record.
        if p > 0.5 and p > maxP_whenYis0:
            maxP_whenYis0 = p
            wrong_negative_review = review
            wrong_negative_prediction = pred

positive_header = "Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction)
print(positive_header)
print(wrong_positive_review)
negative_header = "Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction)
print(negative_header)
print(wrong_negative_review)

Most wrong positive review (prob = 0.020026852552939744, pred = 0.0):

Got my order quickly and found out that these cases are much BETTER than the ones sold here locally, and they were CHEAPER !!

Most wrong negative review (prob = 0.9100625815710439, pred = 1.0):

I like the HP 96 because it can be used in many HP printers that use the HP 98,  but its a better deal than the HP 98 because it contains almost twice as much ink. 

HP's ink cartridges provide excellent performance,  but I think they are over priced 

