## 本日課程：文字預處理。部分內容前面章節可能提過，這裡會將前處理所需的技巧串接起來

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
# NOTE: a .tsv file separates its fields with tab characters; the file read
# here is a .csv.

# The CSV has no header row and is stored as Big5 rather than UTF-8.
dataset = pd.read_csv(
    'movie_feedback.csv',
    encoding='Big5',
    header=None,
)

# Column 0 carries the review text, column 1 the 1/0 label.
X, Y = dataset[0].values, dataset[1].values

In [2]:
from utils import big5ToUtf8, Utf8ToBig5

In [3]:
# NOTE(review): big5ToUtf8 comes from the local `utils` module, which is not
# shown here. Presumably it reads the first path as Big5 and writes a UTF-8
# copy to the second path -- confirm against utils.
big5ToUtf8("movie_feedback.csv", "movie_feedback-utf8.csv")

In [4]:
X[:10]

array(['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . ',
       'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ',
       'effective but too-tepid biopic',
       'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . ',
       "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . ",
       'the film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game . ',
       'offers that rare combination of entertainment and education . ',
       'perhaps no picture ever made has more literally showed that the road to h

In [5]:
Y[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [6]:
dataset

Unnamed: 0,0,1
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
10657,a terrible movie that some people will neverth...,0
10658,there are many definitions of 'time waster' bu...,0
10659,"as it stands , crocodile hunter has the hurrie...",0
10660,the thing looks like a made-for-home-video qui...,0


---

In [7]:
print('review before preprocessing : {}'.format(X[0]))

review before preprocessing : the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 


## 運用re.sub去除部分字元

In [8]:
import re


# Demo of re.sub: keep only ASCII letters and whitespace, then squeeze runs of
# whitespace. The sample is bound to `sample_text` rather than `str`, because
# a global named `str` would shadow the builtin type in every later cell.
sample_text = "fasdkfj asdfklajsdkfj ASF [Dsd] 1238 {497} . go to school: she has a dog .74 {}[]+=-"
# Every character that is neither a-z/A-Z nor whitespace becomes a space.
_str = re.sub(r"[^a-zA-Z\s]", " ", sample_text)
# Collapse each run of whitespace into a single space.
_str = re.sub(r"\s+", " ", _str)

_str

'fasdkfj asdfklajsdkfj ASF Dsd go to school she has a dog '

In [9]:
import re 
# Replace every character outside a-zA-Z with a space ' ', then squeeze each
# run of whitespace down to a single space.
review = re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z]", " ", X[0]))

In [10]:
print('review after re.sub : {}'.format(review))

review after re.sub : the rock is destined to be the st century s new conan and that he s going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal 


In [11]:
review.lower()

'the rock is destined to be the st century s new conan and that he s going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal '

In [12]:
review.upper()

'THE ROCK IS DESTINED TO BE THE ST CENTURY S NEW CONAN AND THAT HE S GOING TO MAKE A SPLASH EVEN GREATER THAN ARNOLD SCHWARZENEGGER JEAN CLAUD VAN DAMME OR STEVEN SEGAL '

## 將所有字母轉為小寫：因為在大部分情境下，區分大小寫並不能提供額外訊息。如同 CV 中顏色無法提供額外訊息時，我們會將圖像轉為灰階，藉此降低複雜度

In [13]:
# Convert the whole review to lower case (rebinds `review`) and show it.
review =  review.lower()
print('review after lower : {}'.format(review))

review after lower : the rock is destined to be the st century s new conan and that he s going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal 


## 斷詞

In [14]:
import nltk
# Split the review into individual words on whitespace and show the list.
print('review after split : {}'.format(review.split()))

review after split : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


* tokenize 相較於 split 會是更好的選擇：split 只依空白切分，無法處理像 `word.` 這種單字與標點相連的情況（無法拆成 `word` 與 `.`）

In [15]:
# Tokenise the review with word_tokenize from the nltk.tokenize module.

review_token = nltk.tokenize.word_tokenize(review)
print('review after tokenize : {}'.format(review_token))

review after tokenize : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


## stopwords: 移除贅字，此步驟為前處理的重要步驟之一，過多的贅字不僅無法提供更多訊息，還會干擾到模型的訓練

In [16]:
# Text clean-up: NLTK provides a ready-made stopword corpus that helps us
# remove unwanted words. Download it (a no-op if it is already up to date).
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s1900\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Load the English stopword list from the NLTK corpus.
# NOTE: this rebinds the name `stopwords`, which the first cell imported via
# `from nltk.corpus import stopwords`; from here on `stopwords` is the list of
# words, not the corpus object.
stopwords = nltk.corpus.stopwords.words('english')

In [18]:
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [19]:
'i' in stopwords

True

In [20]:
# Drop duplicate stopwords by viewing the list as a set.
set(stopwords)

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [21]:
print('review before removing stopwords : {} \n'.format(review_token))
# Build the lookup set once, outside the comprehension, so each membership
# test is a constant-time set lookup instead of re-creating the set per token.
stopword_set = set(stopwords)
# Keep only the tokens that are not stopwords, preserving their order.
review_token_clean = [word for word in review_token if word not in stopword_set]
print('review after removeing stopwords : {}'.format(review_token_clean))

review before removing stopwords : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal'] 

review after removeing stopwords : ['rock', 'destined', 'st', 'century', 'new', 'conan', 'going', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'steven', 'segal']


## Stemming: 詞幹提取
 * ex. loves,loved都變成love
 * 中文沒有詞幹提取的需求

In [22]:
from nltk.stem.porter import PorterStemmer
# Create one Porter stemmer and apply it to every token that survived
# stopword removal.
ps = PorterStemmer()
review_stem = [ps.stem(word) for word in review_token_clean]

In [23]:
review_stem

['rock',
 'destin',
 'st',
 'centuri',
 'new',
 'conan',
 'go',
 'make',
 'splash',
 'even',
 'greater',
 'arnold',
 'schwarzenegg',
 'jean',
 'claud',
 'van',
 'damm',
 'steven',
 'segal']

In [24]:
print('review before stemming : {} \n'.format(review_token_clean))

print('review after stemming : {}'.format(review_stem))

review before stemming : ['rock', 'destined', 'st', 'century', 'new', 'conan', 'going', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'steven', 'segal'] 

review after stemming : ['rock', 'destin', 'st', 'centuri', 'new', 'conan', 'go', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegg', 'jean', 'claud', 'van', 'damm', 'steven', 'segal']


## 練習清理所有的句子

In [25]:
# Alternative load that names the columns (left disabled):
#dataset=pd.read_csv('movie_feedback.csv',encoding = 'Big5',names=['feedback', 'label'] )
# Bind X to the raw review column (column 0) before cleaning every sentence.
X = dataset[0].values

### 流程筆記：
- 1.讀取檔案
- 2.讀取欄位資料：以 pandas 讀成 dataframe，再用 `.values` 取出評論欄位（numpy array）
- 3.去除多餘的字元(不是 a-zA-Z)
- 4.轉成小寫
- 5.split word by word => list
- 6.詞幹提取

In [26]:
# Clean every review: keep letters only, lower-case, split on whitespace,
# stem each word, then re-join into one space-separated string.
# Stopword removal is deliberately skipped here: it would strip negations,
# e.g. "isn't good" would be reduced to "good".
non_letters = re.compile('[^a-zA-Z]')  # compiled once, reused for every review
ps = PorterStemmer()                   # created once, reused for every review
row = len(X)                           # number of reviews (kept for later cells)
corpus = []
for raw_review in X:
    words = non_letters.sub(' ', raw_review).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in words))

In [27]:
corpus

['the rock is destin to be the st centuri s new conan and that he s go to make a splash even greater than arnold schwarzenegg jean claud van damm or steven segal',
 'the gorgeous elabor continu of the lord of the ring trilog is so huge that a column of word cannot adequ describ co writer director peter jackson s expand vision of j r r tolkien s middl earth',
 'effect but too tepid biopic',
 'if you sometim like to go to the movi to have fun wasabi is a good place to start',
 'emerg as someth rare an issu movi that s so honest and keenli observ that it doesn t feel like one',
 'the film provid some great insight into the neurot mindset of all comic even those who have reach the absolut top of the game',
 'offer that rare combin of entertain and educ',
 'perhap no pictur ever made ha more liter show that the road to hell is pave with good intent',
 'steer turn in a snappi screenplay that curl at the edg it s so clever you want to hate it but he somehow pull it off',
 'take care of my cat

## 轉bag-of-words vector

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
#Creating bag of word model
#tokenization(符號化)
from sklearn.feature_extraction.text import CountVectorizer
# Labels: column 1 of the original data.
Y_ = dataset[1].values

# max_features is the number of columns to build; terms are selected by how
# frequently they occur in the corpus.
cv = CountVectorizer(max_features=1500)

# fit_transform gives the sparse bag-of-words result (a matrix that is mostly
# zeros); toarray() turns it into a dense matrix.
# print(cv.fit_transform(corpus))
X_ = cv.fit_transform(corpus).toarray()

In [29]:

# matrix
X_

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
# Labels taken from column index 1 of the original data.

Y_

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

## 選擇練習: 將處理好數據放入 naive_bayes模型，並預測評論為正向或負面，詳細原理之後章節會解釋。

## Training

In [31]:

from sklearn.model_selection import train_test_split

# Split the data into a training set and a test set with train_test_split.
# test_size is the held-out fraction: 0.1 means train : test = 0.9 : 0.1.
# NOTE(review): no random_state is passed, so the split may differ from run
# to run -- confirm whether reproducibility matters here.
X_train, X_test, y_train, y_test = train_test_split(X_, Y_, test_size = 0.1)

# Feature scaling: none is applied in this cell.

# Train the model with Naive Bayes (a classification method).
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

# fit() => the training function
classifier.fit(X_train, y_train)


GaussianNB()

## Inference 預測

In [32]:
def prediction(message):
    """Classify a single raw review string with the fitted model.

    The message is cleaned step by step (non-letters replaced by spaces,
    lower-cased, split on whitespace, Porter-stemmed, re-joined with spaces),
    vectorised with the module-level ``cv`` and passed to the module-level
    ``classifier``.

    Returns the result of ``classifier.predict`` for the one-row input;
    0 means a negative review, 1 means a positive review.
    """
    stemmer = PorterStemmer()
    letters_only = re.sub('[^a-zA-Z]', ' ', message)

    stems = []
    for token in letters_only.lower().split():
        stems.append(stemmer.stem(token))

    features = cv.transform([' '.join(stems)]).toarray()
    return classifier.predict(features)


In [33]:
# Try the classifier on a sample sentence (1 = positive, 0 = negative).
message='I really like this movie!!'

res = prediction(message)
print(res)

[1]


In [34]:
# Second sample; the extra spaces and '!!' are removed by prediction()'s cleaning.
message='A terrible movie  !!'

res = prediction(message)
print(res)

[0]


In [35]:
# Third sample: a short sentence built around the word "hate".
message='I hate it'

res = prediction(message)
print(res)

[0]


In [36]:
# Fourth sample: same shape as the previous one, with "love" instead of "hate".
message='I love it'

res = prediction(message)
print(res)

[1]


In [37]:
# Fifth sample: a sentence about a song rather than a movie.
message='It is a good song'

res = prediction(message)
print(res)

[1]
