# 應用模組

In [3]:
import tarfile   
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
import pyprind
import pandas as pd
import os,re
from nltk.stem.porter import PorterStemmer #波特詞幹還原演算法
import nltk
from nltk.corpus import stopwords

# 資料載入、預處理

In [4]:
             #檔案解壓縮

with tarfile.open('aclImdb_v1.tar.gz','r:gz') as tar:
    tar.extractall()

In [11]:

basepath = 'aclImdb'

labels = {'pos':1,'neg':0}
bar = pyprind.ProgBar(50000)   #檔案數目
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath,s,l)
        for file  in os.listdir(path):
            with open(os.path.join(path,file),'r',encoding= 'utf-8') as infile:  #讀取每個文件內容
                txt = infile.read()
            df = df.append([[txt,labels[l]]],                 #內容、標籤
                           ignore_index = True)
            bar.update()
df.columns = ['review','sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:56


In [12]:
df

Unnamed: 0,review,sentimant
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
49995,"Towards the end of the movie, I felt it was to...",0
49996,This is the kind of movie that my enemies cont...,0
49997,I saw 'Descent' last night at the Stockholm Fi...,0
49998,Some films that you pick up for a pound turn o...,0


In [8]:
df.columns = ['review','sentiment']
df.to_csv('movie_data.csv',index = False, encoding = 'utf-8') #存為csv
df.head(10)

Unnamed: 0,review,sentiment
11841,at a Saturday matinee in my home town. I went ...,0
19602,I love this movie. It is the first film Master...,1
45519,"In the voice over which begins the film, Hughi...",1
25747,"!!! Spoiler alert!!!<br /><br />The point is, ...",0
42642,"This is an excellent film. No, it's not Mel Gi...",1
31902,"This movie is sort of similar to ""Better Off D...",1
30346,Brown of Harvard is a hard movie to pin down. ...,1
12363,No sense going over the story since enough rev...,1
32490,In this TV special Jon is the one who needs a ...,1
26128,Entertainment Tonight has been going down hill...,0


# 以上只需執行一遍或使用以下CSV載入

In [7]:
#隨機排列 讀去資料
np.random.seed(0)
df = pd.read_csv('movie_data.csv')
df = df.reindex(np.random.permutation(df.index))
df

Unnamed: 0,review,sentiment
11841,at a Saturday matinee in my home town. I went ...,0
19602,I love this movie. It is the first film Master...,1
45519,"In the voice over which begins the film, Hughi...",1
25747,"!!! Spoiler alert!!!<br /><br />The point is, ...",0
42642,"This is an excellent film. No, it's not Mel Gi...",1
...,...,...
21243,Although the director tried(the filming was ma...,0
45891,It has been about 50 years since a movie has b...,1
42613,"""Bar Hopping"" seems to be trying to be about t...",0
43567,This awful effort just goes to show what happe...,0


# 字詞轉回特徵向量

In [9]:
count = CountVectorizer()
docs = np.array(['I really like Kris Kristofferson and his usual easy going delivery of lines in his movies',
                 'Age has helped him with his soft spoken low energy style and he will steal a scene effortlessly',
                 'I must give kudos to the cinematography and and the actors'])  #取自df
bag = count.fit_transform(docs)
for text in list(sorted(count.vocabulary_.items())):    #索引排列
    print(text)
print(bag.toarray())

('actors', 0)
('age', 1)
('and', 2)
('cinematography', 3)
('delivery', 4)
('easy', 5)
('effortlessly', 6)
('energy', 7)
('give', 8)
('going', 9)
('has', 10)
('he', 11)
('helped', 12)
('him', 13)
('his', 14)
('in', 15)
('kris', 16)
('kristofferson', 17)
('kudos', 18)
('like', 19)
('lines', 20)
('low', 21)
('movies', 22)
('must', 23)
('of', 24)
('really', 25)
('scene', 26)
('soft', 27)
('spoken', 28)
('steal', 29)
('style', 30)
('the', 31)
('to', 32)
('usual', 33)
('will', 34)
('with', 35)
[[0 0 1 0 1 1 0 0 0 1 0 0 0 0 2 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 1 0 0 0 1 1]
 [1 0 2 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 2 1 0 0 0]]


# TF-IDF(詞頻-反向文件頻率)

In [10]:
#確認關聯性 可供資料分離 ('and', 2) = 0.35 較無關聯 3個資料都有
tfidf = TfidfVectorizer(use_idf = True,
                        norm = 'l2',
                        smooth_idf = True)
np.set_printoptions(precision = 2)
for idx,texts in enumerate(tfidf.fit_transform(docs).toarray()):
    print(idx+1,'\n',texts)

1 
 [0.   0.   0.15 0.   0.26 0.26 0.   0.   0.   0.26 0.   0.   0.   0.
 0.4  0.26 0.26 0.26 0.   0.26 0.26 0.   0.26 0.   0.26 0.26 0.   0.
 0.   0.   0.   0.   0.   0.26 0.   0.  ]
2 
 [0.   0.25 0.15 0.   0.   0.   0.25 0.25 0.   0.   0.25 0.25 0.25 0.25
 0.19 0.   0.   0.   0.   0.   0.   0.25 0.   0.   0.   0.   0.25 0.25
 0.25 0.25 0.25 0.   0.   0.   0.25 0.25]
3 
 [0.3  0.   0.35 0.3  0.   0.   0.   0.   0.3  0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.3  0.   0.   0.   0.   0.3  0.   0.   0.   0.
 0.   0.   0.   0.59 0.3  0.   0.   0.  ]


# 文字清理

In [11]:
df.loc[0,'review'][-50:]

'nd three more acting performances (including Yam).'

In [12]:
# regular expression(re) 正規表式是
def preprocessor(text1):
    text1 = re.sub('<[^>]*>', '',text1)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text1)
    text1 = (re.sub('[\W]',' ',text1.lower()) + ' '.join(emoticons).replace('-',''))
    return text1
preprocessor(df.loc[0,'review'][-50:])


'nd three more acting performances  including yam  '

# 字符轉換

In [13]:
def tokenizer(text1):
    return text1.split()
print(tokenizer(preprocessor(df.loc[0,'review'][-50:])))
print()

['nd', 'three', 'more', 'acting', 'performances', 'including', 'yam']



In [14]:
porter = PorterStemmer()
def tokenizer_porter(text1):
    return [porter.stem(word) for word in tokenizer(text1)]
#tokenizer_porter(preprocessor(df.loc[0,'review'][-50:]))
tokenizer_porter('running processesing tow cups of tea')  #還原字根形式

['run', 'processes', 'tow', 'cup', 'of', 'tea']

# 停用字

In [190]:
#下載停用詞 可自己新增
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
stop = stopwords.words('english')
[w for w in tokenizer_porter((preprocessor(df.loc[0,'review'][0:])))[-10:] if w not in stop] #取最後10個字 並刪除停用詞

['love', 'three', 'act', 'perform', 'includ', 'yam']

# 訓練模型

In [16]:
#訓練模型需要模組
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
x_train = df.loc[:25000,'review'].values
y_train = df.loc[:25000,'sentiment'].values
x_test = df.loc[25000:,'review'].values
y_test = df.loc[25000:,'sentiment'].values

In [15]:
tfidf = TfidfVectorizer(strip_accents = None,
                        lowercase = False,
                        preprocessor = None)

param_grid = [{'vect__ngram_range':[(1,1)],
               'vect__stop_words':[stop,None],        #停用字
               'vect__tokenizer':[tokenizer,
                                  tokenizer_porter],
               'clf__penalty':['l1','l2'],
               'clf__C':[1.0,10.0,100.0]},
              {'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty':['l1','l2'],
               'clf__C':[1.0,10.0,100.0]}]

lr_tfidf = Pipeline([('vect',tfidf),
                     ('clf',LogisticRegression(random_state = 0 ,solver = 'liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf , param_grid,
                           scoring = 'accuracy',
                           cv = 5, verbose = 1 ,
                           n_jobs = -1)
gs_lr_tfidf.fit(x_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0,
                                                           solver='liblinear'))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've...
                                                'yourselves', 'he', 'him',
                                                'his', 'himself', 'she',
                                                "she's", 'her', 'hers',
       

In [34]:
print('最佳參數: %s' % gs_lr_tfidf.best_params_)
print('train acc: %.3f' % gs_lr_tfidf.best_score_)

clf = gs_lr_tfidf.best_estimator_
print('test acc: %.3f' % clf.score(x_test,y_test))   #使用最佳參數預測數據


最佳參數: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000001E16B337DC0>}
train acc: 0.885
test acc: 0.890


# 核外學習

In [19]:
import os
import gzip
if not os.path.isfile('movie_data.csv'):
    if not os.path.isfile('movie_data.csv.gz'):
        print('not exist')
    else:
        with gzip.open('movie_data.csv.gz', 'rb') as in_f,open('movie_data.csv', 'wb') as out_f:
            out_f.write(in_f.read())
            
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)        #skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
            

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [20]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)



clf = SGDClassifier(loss='log', random_state=1)                   #隨機梯度


doc_stream = stream_docs(path='movie_data.csv')

import pyprind        #進度條
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])
for _ in range(45):                                               #45小批文件每小批為1000個文件
    x_train, y_train = get_minibatch(doc_stream, size=1000)
    if not x_train:
        break
    x_train = vect.transform(x_train)
    clf.partial_fit(x_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:20


In [31]:
x_test, y_test = get_minibatch(doc_stream, size=5000)                  #5000個文件評估效能           
x_test = vect.transform(x_test)
print('Accuracy: %.3f' % clf.score(x_test, y_test))

Accuracy: 0.866


In [32]:
clf = clf.partial_fit(x_test, y_test)  #最後5000個文件更新模型

# 主題建模

In [14]:

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)
X = count.fit_transform(df['review'].values)

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)

n_top_words = 5
feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
                    for i in topic.argsort()\
                        [:-n_top_words - 1:-1]]))
    
horror = X_topics[:, 5].argsort()[::-1]

for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')

Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father children girl
Topic 3:
american war dvd music history
Topic 4:
human audience cinema art feel
Topic 5:
police guy car dead murder
Topic 6:
horror house sex blood gore
Topic 7:
role performance comedy actor performances
Topic 8:
series episode episodes tv season
Topic 9:
book version original effects read
Topic 10:
action fight guy fun guys

Horror movie #1:
Emilio Miraglia's first Giallo feature, The Night Evelyn Came Out of the Grave, was a great combination of Giallo and Gothic horror - and this second film is even better! We've got more of the Giallo side of the equation this time around, although Miraglia doesn't lose the Gothic horror stylings tha ...

Horror movie #2:
This film marked the end of the "serious" Universal Monsters era (Abbott and Costello meet up with the monsters later in "Abbott and Costello Meet Frankentstein"). It was a somewhat desparate, yet fun attempt to revive the classic monsters of t



# 序列化

In [33]:
import pickle
import os

dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)   
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)

In [47]:
clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb')) #反序列化

In [64]:
#執行情緒分類
label = {0:'negative', 1:'positive'}

example = [str(preprocessor(df.loc[1,'review']))]
x = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%' %\
      (label[clf.predict(x)[0]], 
       np.max(clf.predict_proba(x))*100))

Prediction: negative
Probability: 77.53%


In [66]:
preprocessor(df.loc[1,'review']) 

'i was just watching a forensic files marathon on court tv  the episode was identical to the plot of this movie  right down to the incest secret and the affair with the sister subplot  i don t recall any based on a true story disclaimer  but the case does have mow written all over it  apparently it chronicles the real homicide of ruby morris by her husband earl  sentenced to 25 years to life for her murder  just goes to show you  truth can be stranger than fiction  because i thought the lifetime plot was contrived and a more than  a stretch  insofar as believability goes  i m with the other posters who said the acting was bad  i didn t notice it with all of the players  though  it was really the lead character  the daughter  whose performance was bad '