<a href="https://colab.research.google.com/github/silverstar0727/1day-1commit-challenge/blob/master/machine_learning_ch8%269_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMBDb영화 리뷰

In [0]:
!pip install pyprind

In [0]:
import pyprind
import pandas as pd
import os

basepath = 'C:\\Users\\silve\\Desktop\\aclImdb_v1\\aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

df = pd.DataFrame()
for s in('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding = 'utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index = True)
            pbar.update()
            
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:05


In [0]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index = False, encoding = 'utf-8')

In [0]:
df = pd.read_csv('movie_data.csv', encoding = 'utf-8')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [0]:
df.shape

(50000, 2)

### BOW model

In [0]:
# BoW 모델을 만듦
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array(['The sun is shining', 'The weather is sweet', 
                 'The sun is shining, the weater is sweet, and one and one is two'])

bag = count.fit_transform(docs)

CountVetorizer모듈에 내장되어 있는 vocabulary메소드 사용

단어와 정수를 매핑하여 딕셔너리형태로 저장.

In [0]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 9, 'sweet': 5, 'weater': 8, 'and': 0, 'one': 2, 'two': 7}


In [0]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0 0]
 [0 1 0 0 0 1 1 0 0 1]
 [2 3 2 1 1 1 2 1 1 0]]


## 단어의 적합성 평가
tf-idf(term frequency-inverse document frequency)

In [0]:
# 사이킷런의 TfidTransFormer클래스 사용
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf = True, norm = 'l2', smooth_idf = True)
np.set_printoptions(precision = 2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.   0.  ]
 [0.   0.39 0.   0.   0.   0.5  0.39 0.   0.   0.66]
 [0.5  0.44 0.5  0.19 0.19 0.19 0.29 0.25 0.25 0.  ]]


In [0]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [0]:
# 텍스트 데이터 정제
import re

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+',' ', text.lower()) + ' '.join(emoticons).replace('-',''))
    return text

In [0]:
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [0]:
preprocessor("<\a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [0]:
# 문서를 토큰으로 나누기
def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [0]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [0]:
# 불용어집합을 다운로드
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\silve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [0]:
# 문서분류를 위한 로지스틱 회귀 모델 훈련
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[:25000, 'review'].values
y_test = df.loc[:25000, 'sentiment'].values

In [0]:
# 40분 소요
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents = None, lowercase = False, preprocessor = None)
param_grid = [{'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1,1)],
             'vect__stop_words': [tokenizer, tokenizer_porter],
             'vect__use_idf': [False],
             'vect__norm': [None],
             'clf__penalty': ['l1', 'l2'],
             'clf__C': [1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf),
                    ('clf', LogisticRegression(solver = 'liblinear', random_state = 0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring = 'accuracy', cv = 5, verbose = 1, n_jobs = 1)
gs_lr_tfidf.fit(X_train, y_train)

In [0]:
print('cv 정확도: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('테스트 정확도: %.3f' % clf.score(X_test, y_test))

# 대용량 데이터처리

In [0]:
#불용어를 제외한 단어토큰으로 분리
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+',' ', text.lower()) + ' '.join(emoticons).replace('-',''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [0]:
def stream_docs(path):
    with open(path, 'r', encoding = 'utf-8') as csv:
        next(csv) # 헤더 넘기기
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [0]:
next(stream_docs(path = 'movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [0]:
def get_minibatch(doc_stream, size):
    docs, y = [],[]
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
            
    except StopIteration:
        pass
    return docs, y


In [0]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error = 'ignore', n_features = 2**21, preprocessor = None, tokenizer = tokenizer)
clf = SGDClassifier(loss = 'log', random_state = 1, max_iter = 1)
doc_stream = stream_docs(path = 'movie_data.csv')

In [0]:
import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0,1])

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        break
    
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:25


In [0]:
X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)

print('정확도: %.3f' % clf.score(X_test, y_test))

정확도: 0.868


In [0]:
clf = clf.partial_fit(X_test, y_test)

## 잠재 디리클레 할당을 사용한 토픽 모델링 (LDA)
5장의 LDA와는 다름 (여기서의 LDA는 Latent Dirichlet Allocation)

베이지안 추론에 대한 수학적 깊이가 필요함

In [0]:
# 사이킷런의 LDA
#데이터 읽기
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding = 'utf-8')

In [0]:
# CountVectorizer 클래스를 이용하여 BoW행렬을 만들기
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words = 'english', max_df = .1, max_features = 5000) # 빈도가 높은 단어를 max_df를 통해 제외
X = count.fit_transform(df['review'].values)

In [0]:
# 문서에서 열개의 토픽을 추정하도록 LatentDirichletAllocation추정기를 BoW행렬에 학습하는 방법을 보여줌
# 시간 오래걸림
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components = 10, random_state = 123, learning_method = 'batch')
X_topics = lda.fit_transform(X)

In [0]:
lda.components_.shape

(10, 5000)

In [0]:
n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print('토픽 %d:' % (topic_idx + 1))
    print(' '.join([feature_names[i] for i in topic.argsort() [:-n_top_words - 1: -1]]))


토픽 1:
worst minutes awful script stupid
토픽 2:
family mother father children girl
토픽 3:
american war dvd music tv
토픽 4:
human audience cinema art sense
토픽 5:
police guy car dead murder
토픽 6:
horror house sex girl woman
토픽 7:
role performance comedy actor performances
토픽 8:
series episode war episodes tv
토픽 9:
book version original read novel
토픽 10:
action fight guy guys cool


In [0]:
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\n공포영화 #%d' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')


공포영화 #1
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

공포영화 #2
Okay, what the hell kind of TRASH have I been watching now? "The Witches' Mountain" has got to be one of the most incoherent and insane Spanish exploitation flicks ever and yet, at the same time, it's also strangely compelling. There's absolutely nothing that makes sense here and I even doubt there  ...

공포영화 #3
<br /><br />Horror movie time, Japanese style. Uzumaki/Spiral was a total freakfest from start to finish. A fun freakfest at that, but at times it was a tad too reliant on kitsch rather than the horror. The story is difficult to summarize succinctly: a carefree, normal teenage girl starts coming fac ...


# chapter 9
# 웹 애플리케이션에 머신러닝 모델 내장

In [0]:
import pickle
import os

# 웹 애플리케이션에 필요한 파일과 데이터를 저장할 movieclassifier디렉토리 생성
# pkl_objects는 서브디렉토리: python 객체를 저장함
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
    
# dump메소드로 불용어를 직렬화 하여 저장
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol = 4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol = 4)

In [0]:
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname('C:\\Users\\silve\Desktop\\movieclassifier\\pkl_objects')
stop = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-','')
    tokenized = [w for w in text.split() if w not in stop]
    
    return tokenized

vect = HashingVectorizer(decode_error = 'ignore', n_features = 2 ** 21, preprocessor = None, tokenizer = tokenizer)

In [0]:
!pip install vectors



In [0]:
# 분류기 복원
import pickle
import re
import os

clf = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'classifier.pkl'), 'rb'))

In [0]:
# 문서샘플 전처리 후 예측
import numpy as np

label = {0: '음성', 1: '양성'}

example = ['I love this movie']
X = vect.transform(example)

print('예측: %s \n확률: %.2f%%' % (label[clf.predict(X)[0]], np.max(clf.predict_proba(X)) * 100))

예측: 양성 
확률: 81.44%


# 데이터를 저장하기 위한 SQLite

In [0]:
import sqlite3

conn = sqlite3.connect('reviews.sqlite') # connect 메소드를 이용한 파일연결
c = conn.cursor()# curcor 메소드를 이용하여 코드조작

c.execute('DROP TABLE IF EXISTS review_db') #rebiew_db생성
c.execute('CREATE TABLE review_db' \
         '(review TEXT, sentiment INTEGER, date TEXT)')

example1 = 'I love this movie'
c.execute("INSERT INTO review_db"\
         " (review, sentiment, date) VALUES"\
         " (?, ?, DATETIME('now'))", (example1, 1))

example2 = 'I disliked this movie'
c.execute("INSERT INTO review_db"\
         " (review, sentiment, date) VALUES"\
         " (?, ?, DATETIME('now'))", (example2, 0))

conn.commit()
conn.close()

In [0]:
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute("SELECT * FROM review_db WHERE date"\
         " BETWEEN '2017-01-01 00:00:00' AND DATETIME('now')")
results = c.fetchall()

In [0]:
conn.close()
print(results)

[('I love this movie', 1, '2020-03-02 07:08:44'), ('I disliked this movie', 0, '2020-03-02 07:08:44')]


# 플라스크 웹 애플리케이션

In [0]:
!pip install flask



In [0]:
''' 디렉터리 구조
1st_flask_app_1/
    app.py
    templates/
        first_app.html
'''
from flask import Flask, render_template

app = Flask(__name__)
@app.route('/')
def index():
    return render_template('first_app.html')

if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


In [0]:
python app.py

SyntaxError: invalid syntax (<ipython-input-49-b851947b46d7>, line 1)

In [0]:
!pip install wtforms

Collecting wtforms
  Downloading WTForms-2.2.1-py2.py3-none-any.whl (166 kB)
Installing collected packages: wtforms
Successfully installed wtforms-2.2.1


In [0]:
'''
1st_flask_app_2/
    app.py
    static/
        style.css
    templates/
        _forhelpers.html
        first_app.html
        hello.html'''
from flask import Flask, render_template, request
from wtforms import Form, TextAreaField, validators

app = Flask(__name__)

class HelloForm(Form):
    sayhellp = TextAreaField('', [validators.DataRequired()])
    
@app.route('/')
def index():
    form = HelloForm(request.form)
    return render_template('first_app.html', form = form)

@app.route('/hello', methods = ['POST'])
def hello():
    form = HelloFrom(request.form)
    if request.method == 'POST' and form.validate():
        name = request.form['sayhello']
        return render_template('hello.html', name = name)
    return render_template('first_app.html', form = form)

if __name__ == '__main__':
    app.run(debug = True)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [0]:
# Jinja 템플릿을 이용하여 매크로 구현
{% macro render_field(field) %}