# Sentiment Classification & Embedding I

* Before RNN & Embedding Layer

# 01. What data do we use?

In [None]:
import pandas as pd
import zipfile as zf

In [None]:
# Download the label file and the zipped review file into the working directory.
# The leading `!` hands the line to the shell (IPython/Jupyter syntax);
# `-o` sets the local file name curl writes to.
!curl -o labels.txt https://raw.githubusercontent.com/RayleighKim/Example_datasets/master/sentiment_data/labels.txt
!curl -o reviews.zip https://raw.githubusercontent.com/RayleighKim/Example_datasets/master/sentiment_data/reviews.zip

In [None]:
# Read the labels into a single-column DataFrame (the file has no header row).
labels = pd.read_csv('labels.txt', header=None, names=['labels'])

# Read reviews.txt straight out of the archive without extracting it to disk.
# The context managers close both the archive and the member file handle as
# soon as parsing is done, instead of leaving them open for the whole session.
with zf.ZipFile('reviews.zip') as zf_r:
    with zf_r.open('reviews.txt') as review_file:
        reviews = pd.read_csv(review_file, header=None, names=['reviews'])

In [None]:
# Peek at the first rows of the raw label table.
labels.head()

In [None]:
# Binary target column: 1 where the label is 'positive', 0 for everything else.
labels['y'] = (labels['labels'] == 'positive').astype('int64')

labels.head()

In [None]:
# Peek at the first rows of the raw review table.
reviews.head()

In [None]:
# Target as a 2-D NumPy array with a single column, taken from the 'y' column.
y = labels[['y']].to_numpy()

# From here on `labels` and `reviews` are plain Python lists rather than
# DataFrames; both right-hand sides are evaluated before either name is rebound.
labels, reviews = labels['labels'].tolist(), reviews['reviews'].tolist()

In [None]:
def preview(i):
    """Print the label and the first 80 characters of review number ``i``.

    Reads the module-level ``labels`` and ``reviews`` sequences. The printed
    line always ends with "...", whether or not the review was truncated.
    """
    label = labels[i]
    snippet = reviews[i][:80]
    print("  :  ".join((label, snippet)) + "...")

In [None]:
# Number of reviews in the list.
len(reviews)

In [None]:
# Full text of the first review.
reviews[0]

In [None]:
# Label that belongs to the first review.
labels[0]

# Discussion : 무엇으로 무엇을 어떻게 예측하려 하는가?

In [None]:
# Print a small header, then a handful of hand-picked (label, review) samples.
print("labels.txt  :  reviews.txt", "--------------------------", sep="\n")
for sample_index in (2490, 12786, 6267, 24965, 11947, 2312):
    preview(sample_index)

# Q1. Tokenizer를 이용하여, TFIDF vector를 만들 것.

* 만들고 나서 x와 y를 제작한다.
    * reviews --> x
    * labels --> y
    * 80%는 트레이닝 셋으로, 20%는 테스트 셋으로



* Bag of Words : Binary ver
> texts_to_matrix(reviews, mode='binary')
* Bag of Words : Counting ver
> texts_to_matrix(reviews, mode='count')
* TF-IDF
> texts_to_matrix(reviews, mode='tfidf')

In [None]:
from tensorflow import keras
# `from ... import` is required to pull a single name out of a module;
# the previous `import ... import Tokenizer` form was a SyntaxError.
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Create a tokenizer with default settings and let it scan every review
# so it can build its vocabulary and word statistics.
t = Tokenizer()
t.fit_on_texts(reviews)

In [None]:
# Inspect what the fitted tokenizer collected.
print(len(t.word_counts))  # number of entries in the word-count table
print(t.document_count)  # number of texts the tokenizer was fitted on
print(t.word_index)  # word -> integer index mapping
print(t.word_docs) # number of documents that contain each word

In [None]:
# Only the first `use_len` reviews are vectorised. Per the original note,
# running on all of them "blows up" (presumably memory, since the resulting
# matrix is dense with one column per vocabulary entry).
use_len = 10000

# Q1 asks for TF-IDF features. texts_to_matrix defaults to mode='binary',
# so the TF-IDF weighting has to be requested explicitly.
x = t.texts_to_matrix(reviews[:use_len], mode='tfidf')
y = y[:use_len]

# 모델링!



In [None]:
import matplotlib.pyplot as plt

from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# 혹시 이미 그려둔 그래프가 있다면 날려줘!
keras.backend.clear_session()

# model에 순차적으로 레이어를 쌓아가겠다는 의도!
model = keras.models.Sequential()

# 인풋을 받아, weight을 곱하고, bias를 더해주고
# activation은 없애보자!

model.add( layers.Dense(2048, input_shape=(74073,), activation='swish')  )
model.add( layers.Dense(1024, activation='swish')  )

model.add( layers.Dense(1, activation = 'sigmoid')     )

adam = keras.optimizers.Adam(lr = 0.0001)

# 컴파일 해주렴!
model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics =['accuracy'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has stopped improving.
es = EarlyStopping(
    monitor='val_loss',
    min_delta=0,  # smallest change that still counts as an improvement
    patience=4,   # how many epochs without improvement to tolerate
    verbose=1,
)

# Train on everything except the last 2000 rows; 20% of that training
# portion is set aside for validation via `validation_split`.
history = model.fit(
    x[:-2000],
    y[:-2000],
    batch_size=2048,
    epochs=10,
    verbose=1,
    validation_split=0.2,
    callbacks=[es],
)

In [None]:
# Hold-out set: the last 2000 rows of the features and of the targets.
test_x = x[-2000:]
test_y = y[-2000:]

In [None]:
# Score the model on the hold-out rows; index 0 of the result is printed as
# the loss and index 1 as the accuracy.
performance_test = model.evaluate(test_x, test_y, batch_size=512)

test_loss, test_accuracy = performance_test[0], performance_test[1]
print('Test Loss : {:.6f},  Test Accuracy : {:.3f}%'.format(test_loss, test_accuracy*100))

In [None]:
# Reduce `history` to the plain dict of per-epoch metrics. If it already is a
# dict (the cell was run before) it is left as it is.
history = history if isinstance(history, dict) else history.history

# Training curve first, validation curve second, matching the legend order.
for curve_name in ('accuracy', 'val_accuracy'):
    plt.plot(history[curve_name])

plt.title('Accuracy : Training vs Validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Training', 'Validation'], loc=0)
plt.show()

In [None]:
# Reduce `history` to the plain dict of per-epoch metrics. If it already is a
# dict (the cell was run before) it is left as it is.
history = history if isinstance(history, dict) else history.history

# Training curve first, validation curve second, matching the legend order.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history[curve_name])

plt.title('Loss : Training vs Validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Training', 'Validation'], loc=0)
plt.show()