# How To Use NLP

## 4. MLP를 이용한 Sentiment Analysis
본 강의서 사용된 자료출처: https://github.com/e9t/nsmc

### 4.1 데이터 전처리

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import find
import gensim
import tensorflow as tf

root_dir = "../data"
train_file = root_dir + '/ratings_train-Copy1.txt'
test_file = root_dir+ '/ratings_train-Copy1.txt'

  from ._conv import register_converters as _register_converters


#### 4.1.1데이터 로딩

In [2]:
train = pd.read_csv(train_file, delimiter = '\t')
test = pd.read_csv(test_file, delimiter='\t')

In [3]:
y_train = train.label.values
y_test = train.label.values

#### 4.1.2 데이터 확인

In [4]:
train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,1
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


#### 라벨: 부정평가(0), 긍정평가(1) 구분

In [5]:
print(y_train)
print(len(y_train))

[0 1 1 ... 0 1 0]
150000


In [6]:
train.loc[0]

id                      9976970
document    아 더빙.. 진짜 짜증나네요 목소리
label                         0
Name: 0, dtype: object

#### 4.1.3 형태소 분석

In [5]:
from konlpy.tag import Twitter, Komoran
from time import strftime
import time
twitter = Twitter()
komoran = Komoran()

In [6]:
print(train.loc[0, 'document'])
print(twitter.morphs(train.loc[0, 'document'], stem=True, norm=True))
print(komoran.morphs(train.loc[0, 'document']))
print(twitter.pos(train.loc[0, 'document']))
print(komoran.pos(train.loc[0, 'document']))

아 더빙.. 진짜 짜증나네요 목소리
['아', '더빙', '..', '진짜', '짜증', '나네', '요', '목소리']
['아', '더빙', '.', '.', '진짜', '짜증', '나', '네요', '목소리']
[('아', 'Exclamation'), ('더빙', 'Noun'), ('..', 'Punctuation'), ('진짜', 'Noun'), ('짜증', 'Noun'), ('나네', 'Verb'), ('요', 'Eomi'), ('목소리', 'Noun')]
[('아', 'IC'), ('더빙', 'NNG'), ('.', 'SF'), ('.', 'SF'), ('진짜', 'MAG'), ('짜증', 'NNG'), ('나', 'VV'), ('네요', 'EC'), ('목소리', 'NNG')]


#### 형태소분석-train/test set의 morpheme을 떼어내 저장
train/test_segementation : 각 단어의 형태소만을 떼어내 저장한 것  
train/test_sentence: 1문장씩 들어 있는 것

In [7]:
def segmentation(sequental_data):
    segmentation  = []
    for i in range(len(sequental_data)):
        if isinstance(sequental_data.loc[i, 'document'], float):
            continue
        tokens = komoran.morphs(test.loc[i,'document'])
        segmentation.append(tokens)
    return segmentation

In [8]:
def parsing(segmented_data):
    sentence = []
    for i in range(len(segmented_data)):
        temp = ''
        for j in range(len(segmented_data[i])-1):
            temp += train_segmentation[i][j] + ' '
        temp += segmented_data[i][len(segmented_data[i])-1]
        sentence.append(temp)
    return sentence

In [10]:
#Twitter Speed calc
train_segmentation  = []
start_time = time.time()
for i in range(len(train)):
    if isinstance(train.loc[i, 'document'], float):
        continue
    tokens = twitter.morphs(train.loc[i,'document'], norm=True, stem=True)
    train_segmentation.append(tokens)
print("--- %s seconds ---" %(time.time() - start_time))    

--- 139.8429720401764 seconds ---


In [9]:
#Komoran Speed calc - 코모란을 씁시다
train_segmentation  = []
start_time = time.time()
for i in range(len(train.index)):
    if isinstance(train.loc[i, 'document'], float):
        continue
    tokens = komoran.morphs(train.loc[i,'document'])
    train_segmentation.append(tokens)
print("--- %s seconds ---" %(time.time() - start_time))    

--- 52.736218214035034 seconds ---


In [9]:
train_segmentation = segmentation(train)
test_segmentation  = segmentation(test)

In [10]:
train_sentence = parsing(train_segmentation)
test_sentence = parsing(test_segmentation)

In [17]:
train_segmentation[13]

['담백',
 '하',
 '고',
 '깔끔',
 '하',
 '아서',
 '좋',
 '다',
 '.',
 '신문',
 '기사',
 '로',
 '만',
 '보다',
 '보',
 '면',
 '자꾸',
 '잊어버리',
 'ㄴ다',
 '.',
 '그',
 '들',
 '도',
 '사람',
 '이',
 '었',
 '다는',
 '것',
 '을',
 '.']

In [16]:
#만들어놓은 list들 저장
np.save('./data/train_segmentation', train_segmentation)
np.save('./data/test_segmentation', test_segmentation)
np.save('./data/train_sentence', train_sentence)
np.save('./data/test_sentence', test_sentence)

In [15]:
#이렇게 만들어놓은 np 데이터는 load명령어로 불러옵니다.
train_segmentation = np.load('../data/train_segmentation.npy')
train_sentence = np.load('../data/train_sentence.npy')
test_segmentation = np.load('../data/test_segmentation.npy')
print(train_segmentation[0])

['아', '더빙', '.', '.', '진짜', '짜증', '나', '네요', '목소리']


#### 4.1.4 Word2Vec

In [22]:
train_segmentation[1]

['흠',
 '...',
 '포스터',
 '보고',
 '초딩',
 '영화',
 '줄',
 '....',
 '오버',
 '연기',
 '조차',
 '가볍',
 '지',
 '않',
 '구나']

In [20]:
model = gensim.models.Word2Vec(train_segmentation,min_count=2, window=5, size=300, sg=1)

KeyboardInterrupt: 

In [17]:
#model.save('./data/sentiment.bin')
#모델 불러오기
model = gensim.models.Word2Vec.load('./data/sentiment.bin')

FileNotFoundError: [Errno 2] No such file or directory: './data/sentiment.bin'

In [15]:
vectors=model.wv

#### 4.1.5 Vectorization

In [21]:
print(train_segmentation[:1],'\n\n',type(train_segmentation),'\n\n')
print(train_sentence[:1],'\n\n',type(train_sentence))

[['아', '더빙', '.', '.', '진짜', '짜증', '나', '네요', '목소리']] 

 <class 'list'> 


['아 더빙 . . 진짜 짜증 나 네요 목소리'] 

 <class 'list'>


#### **단어장 생성**

In [None]:
word_dict =[]
for sentences in train_segmentation:
    for segmentation in sentences:
        word_dict.append(segmentation)
print(word_dict[:10])
print(len(word_dict))
print(type(word_dict))

In [19]:
word2num = {word:(index+1) for index,word in enumerate(set(word_dict))} 
num2word = {(index+1):word for index,word in enumerate(set(word_dict))}
print(len(word2num))
print(type(word2num))
for i in range(5):
      print(list(num2word.items())[i])

61280
<class 'dict'>
(1, '진실')
(2, '맨발')
(3, '조센진')
(4, '조니 뎁')
(5, '못밌겠네!')


#### 단어를 Vectorize하기

In [20]:
embedding_vector = [np.zeros(shape=300)]
for index, word in enumerate(word2num.keys()):
    if word not in vectors.vocab:
        embedding_vector += [np.zeros(shape=300)] #[np.random.normal(scale=1e-2, size=300)] 이걸 왜 넣었지?
    else:
        embedding_vector += [vectors[word]]

In [21]:
word2num['<UNK>'] = 0
num2word[0] = '<UNK>'

In [22]:
def  sentence2index (segment):
    idx = []
    for sentence in segment:
        temp = []
        for segment in sentence:
            if segment not in word2num.keys():
                segment = '<UNK>'
            temp.append(word2num[segment])
        idx.append(temp)
    return idx

In [23]:
train_idx = sentence2index(train_segmentation)
test_idx = sentence2index(test_segmentation)
train_idx[:1]
test_idx[:1]

[[6934, 60580, 59897, 59897, 25762, 36305, 41423, 35038, 52574]]

In [24]:
def idx2EmbedSum(idxSet, embedding_vector):
    sum_w2v = []
    for idxes in idxSet:
        temp = np.zeros(shape=300)
        for idx in idxes:
            temp += embedding_vector[idx]
        sum_w2v.append(temp)
    return sum_w2v

In [25]:
X_train_w2v = idx2EmbedSum(train_idx, embedding_vector)
X_test_w2v = idx2EmbedSum(test_idx, embedding_vector)

In [26]:
print(len(X_test_w2v[999]))

300


### 4.2 학습및 테스트하기

### 4.2.1 Tensorflow 기본 동작 원리

<img src='./data/pic/3.jpg'>

In [38]:
output = 2 #0과 1이므로
embedding = 300 #보통 Word2Vec형태의 임베딩, 300차원으로 한다.
hidden = 128 #hidden layer을 지날때의 벡터 차원, 아무렇게 바꿔줘도 된다.
learning_rate = 1e-3 #learning rate, 케이스에 따라 다르긴 한데, 보통 0.01에서 시작한다.
epoch = 10 #몇 번 돌 것인가
batch_size = 64 #batch norm을 할때 한번에 읽어들일 개수

In [39]:
tf.set_random_seed(0)
#Placeholder
X = tf.placeholder(tf.float32, shape=[None, embedding]) #그래프를 그릴땐, 몇개인지 정의하지 않는다, 300차원
Y = tf.placeholder(tf.int32, shape=None) #정답, 0과 1로만 되어있다

In [40]:
#Weight, activation
W1 = tf.Variable(tf.truncated_normal([embedding, hidden]))
b1 = tf.Variable(tf.truncated_normal([hidden]))
activate = tf.nn.relu(tf.matmul(X, W1)+b1)#보통 nlp에서 activation function은 hyperbolantic tangent를 쓴다
W2 = tf.Variable(tf.truncated_normal([hidden, output]))
b2 = tf.Variable(tf.truncated_normal([output]))

In [41]:
#Cost 계산
logits = tf.matmul(activate, W2) + b2
hypothesis = tf.nn.softmax(logits)
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
prediction = tf.cast(tf.argmax(hypothesis, 1), tf.int32)
correct = tf.equal(prediction, Y)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [42]:
#Session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# train
for epoch in range(epoch):
    total_batch = int(len(y_train) / batch_size)
    cost_avg = 0
    print('< epoch :', (epoch+1), '>')
    for i in range(total_batch):
        if i == (total_batch-1):
            batch_xs = X_train_w2v[(i*batch_size):len(y_train)]
            batch_ys = y_train[(i*batch_size):len(y_train)]
        else:
            batch_xs = X_train_w2v[i*batch_size:(i+1)*batch_size]
            batch_ys = y_train[i*batch_size:(i+1)*batch_size]       
        cost_val, _ = sess.run([cost, optimizer], feed_dict={X: batch_xs, Y: batch_ys})
        cost_avg += cost_val
        if i % 1000 == 999:
            print('%d' % (i+1), 'Cost: ', '{:.3f}'.format(cost_avg/1000))
            cost_avg = 0

< epoch : 1 >
1000 Cost:  0.721
2000 Cost:  0.695
< epoch : 2 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 3 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 4 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 5 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 6 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 7 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 8 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 9 >
1000 Cost:  0.695
2000 Cost:  0.696
< epoch : 10 >
1000 Cost:  0.695
2000 Cost:  0.696


In [43]:
# test
test_batch = int(len(X_test_w2v) / batch_size)
test_acc = 0
for i in range(test_batch):
    if i == (test_batch-1):
        batch_xs = X_test_w2v[(i*batch_size):len(X_test_w2v)]
        batch_ys = y_test[(i*batch_size):y_test.shape[0]]
    else:
        batch_xs = X_test_w2v[i*batch_size:(i+1)*batch_size]
        batch_ys = y_test[i*batch_size:(i+1)*batch_size]       
    acc = sess.run(accuracy, feed_dict={X: batch_xs, Y: batch_ys})
    test_acc += acc
print('Accuracy: ', '{:.3f}'.format(test_acc/test_batch))

Accuracy:  0.501
