In [1]:
from keras import models
from keras import layers

# Two stacked Dense layers passed directly to the Sequential constructor.
model = models.Sequential([
    # 32-unit layer; input_shape=(784,) declares 784 features per sample.
    layers.Dense(32, input_shape=(784,)),
    # No input_shape given here: Keras sizes this layer's input to match
    # the output of the layer before it.
    layers.Dense(10),
])

Using TensorFlow backend.


In [2]:
def read_data(filename):
    """Load a tab-separated ratings file into a list of rows.

    Args:
        filename: Path to a UTF-8 text file whose first line is a header
            (id document label) and whose remaining lines are tab-separated
            records.

    Returns:
        A list with one entry per data line, each entry being the list of
        strings obtained by splitting that line on tabs. The header line is
        not included. An empty or header-only file yields an empty list.
    """
    with open(filename, 'rt', encoding='utf-8') as f:
        # Drop the header line; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        # Iterate physical lines rather than using str.splitlines():
        # splitlines() also breaks on characters such as \x0b, \x0c, \x85,
        # \u2028 and \u2029, which would cut a document containing them into
        # several malformed rows.
        return [line.rstrip('\n').split('\t') for line in f]

# Read both splits with the same loader.
train_data, test_data = (
    read_data('./ratings_train.txt'),
    read_data('./ratings_test.txt'),
)

In [3]:
from konlpy.tag import Okt

# Create one Okt tagger instance up front; tokenize() below calls okt.pos()
# on it for every document.
okt = Okt()

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [4]:
import json
import os
from pprint import pprint

def tokenize(doc):
    """Convert a raw sentence into a list of 'token/TAG' strings.

    The module-level ``okt`` tagger is called with ``norm=True``
    (normalization) and ``stem=True`` (reduce tokens to their stem form);
    each tagged item it returns is joined with '/'.
    """
    tagged = okt.pos(doc, norm=True, stem=True)
    joined = []
    for parts in tagged:
        joined.append('/'.join(parts))
    return joined

# Tokenizing the full corpus is slow, so the result is cached as JSON.
# Reuse the cache only when BOTH files exist; otherwise rebuild both.
if os.path.isfile('train_docs.json') and os.path.isfile('test_docs.json'):
    # Must be opened for reading: a handle opened in append mode ('a') is
    # not readable, so json.load() on it raises io.UnsupportedOperation.
    # Entries loaded from JSON are [tokens, label] lists rather than tuples.
    with open('train_docs.json', 'r', encoding='utf-8') as f:
        train_docs = json.load(f)
    with open('test_docs.json', 'r', encoding='utf-8') as f:
        test_docs = json.load(f)
else:
    # row[1] is the document text and row[2] is its label string.
    train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
    test_docs = [(tokenize(row[1]), row[2]) for row in test_data]
    # Save as JSON files; ensure_ascii=False keeps the Korean text readable.
    with open('train_docs.json', 'w', encoding="utf-8") as make_file:
        json.dump(train_docs, make_file, ensure_ascii=False, indent="\t")
    with open('test_docs.json', 'w', encoding="utf-8") as make_file:
        json.dump(test_docs, make_file, ensure_ascii=False, indent="\t")

# Use pprint so the nested (tokens, label) sample prints in a readable layout.
pprint(train_docs[0])

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증나다/Adjective',
  '목소리/Noun'],
 '0')


In [5]:
# Flatten the token list of every training document into one corpus-wide list.
tokens = []
for entry in train_docs:
    tokens.extend(entry[0])
print(len(tokens))

2159921


In [6]:
import nltk
# Wrap the flat token list in an nltk.Text named 'NMSC'; its vocab() is used
# in the next cell to pick the most frequent tokens.
text = nltk.Text(tokens, name='NMSC')

In [7]:
# This takes quite a while! To save time, lower the argument to most_common.
# Keep only the token from each (token, count) pair of the 1000 most common.
selected_words = [word for word, _ in text.vocab().most_common(1000)]

def term_frequency(doc):
    """Count how often each selected word occurs in a tokenized document.

    Args:
        doc: A sequence of string tokens, such as the list returned by
            tokenize(). It must be a token sequence, not a raw string.

    Returns:
        A list of ints aligned with the module-level ``selected_words``:
        element i is the number of times ``selected_words[i]`` appears in
        ``doc`` (0 when absent).
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from collections import Counter

    # Tally the document once instead of rescanning it for every selected
    # word; missing words read back as 0 from the Counter.
    counts = Counter(doc)
    return [counts[word] for word in selected_words]

# Build the term-frequency vectors and the label lists for the training split.
train_x, train_y = [], []
for doc_tokens, label in train_docs:
    train_x.append(term_frequency(doc_tokens))
    train_y.append(label)

# Same for the test split.
test_x, test_y = [], []
for doc_tokens, label in test_docs:
    test_x.append(term_frequency(doc_tokens))
    test_y.append(label)

In [8]:
import numpy as np

# Convert every feature/label list to a float32 NumPy array in one pass.
x_train, x_test, y_train, y_test = (
    np.asarray(values).astype('float32')
    for values in (train_x, test_x, train_y, test_y)
)

In [9]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics

# Binary classifier over the term-frequency vectors: two 64-unit ReLU layers
# followed by a single sigmoid output.
model = models.Sequential()
# input_shape=(1000,) must match the number of entries in selected_words.
model.add(layers.Dense(64, activation='relu', input_shape=(1000,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Use the `learning_rate` keyword: `lr` is a deprecated alias for it.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

model.fit(x_train, y_train, epochs=10, batch_size=512)
# Evaluate on the held-out test split (loss followed by the compiled metric).
results = model.evaluate(x_test, y_test)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
import csv
# Collect every field of every record in the cleaned text file into one flat
# list of strings.
feedback_list = []

with open('../../(1) cleaner/mycelebs_content.csv', 'r', encoding='utf-8') as f:
    # NOTE(review): delimiter='\n' looks intended to make each line a single
    # field rather than splitting on commas; confirm against the file format.
    csv_reader = csv.reader(f, delimiter = '\n')
    for row in csv_reader:
        # extend() flattens the rows in linear time; collecting the rows and
        # calling sum(rows, []) would re-copy the growing list for every row.
        feedback_list.extend(row)

# Preview the first few entries.
feedback_list[:5]

['마이셀럽스는 어떤 회사?마이셀럽스는국내 최초로 인공지능을 활용한 빅데이터 기반의 취향 검색 서비스를 제공하는 회사 이다.년 월맥킨지컴퍼니 디지털전략 부문을 담당했고 그룹 최고디지털책임자부사장를 지낸 도준웅 씨가 설립했다. 자체 개발한 인공지능 솔루션 시스템인를 운영하며 데이터의 수집',
 '시각화',
 '지능탑재',
 '라이브 업데이트 등의 서비스를 제공한다. 이 시스템은 아마존웹서비스로부터즉시 수익화가 가능한 인공지능 솔루션 이라는 평가를 받았다. 부킹닷컴',
 '신세계면세점']

In [11]:
# Number of feedback entries that will be classified below.
len(feedback_list)

1393

In [12]:
def predict_pos_neg_fix(review):
    """Classify one review and record a formatted result line.

    The review is tokenized, turned into a term-frequency vector, and scored
    by the module-level ``model``. A score above 0.5 appends a line to the
    module-level ``positive001`` list; otherwise a line is appended to
    ``negative001``. Both lists must exist before this function is called.

    Args:
        review: The review text to classify.

    Returns:
        None. The result is recorded in ``positive001`` or ``negative001``.
    """
    token = tokenize(review)
    tf = term_frequency(token)
    # Add a leading batch axis so the model receives a batch of one sample.
    data = np.expand_dims(np.asarray(tf).astype('float32'), axis=0)
    # Extract the single predicted value explicitly: calling float() directly
    # on an array with ndim > 0 is deprecated in NumPy.
    score = float(np.asarray(model.predict(data)).item())
    if score > 0.5:
        positive001.append("[{}], 긍정 텍스트 확률 : {:.2f}%".format(review, score * 100))
    else:
        negative001.append("[{}], 부정 텍스트 확률 : {:.2f}%".format(review, (1 - score) * 100))

In [13]:
# Start from empty result lists so re-running this cell does not duplicate
# entries; predict_pos_neg_fix() appends to these.
positive001 = []
negative001 = []

# Score every feedback entry (coerced to str before classification).
for feedback in feedback_list:
    predict_pos_neg_fix(str(feedback))

import pandas as pd

# Write each result list as a one-column CSV with no header and no index.
evaluate_pos001 = pd.DataFrame(positive001)
evaluate_neg001 = pd.DataFrame(negative001)
evaluate_pos001.to_csv("./mycelebs_content_positive.csv", header = False, index = False)
evaluate_neg001.to_csv("./mycelebs_content_negative.csv", header = False, index = False)