<a href="https://colab.research.google.com/github/sandrakaku/ml0930/blob/master/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Download the IMDB movie-review archive through the Keras file cache and
# unpack it. The origin uses HTTPS because the archive is extracted
# automatically, so it should not travel over an unauthenticated connection.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

In [None]:
import glob # used to find every review text file so the dataset can be assembled
# Example: glob.glob("/root/.keras/datasets/aclImdb/train/pos/*")
import os
# Earlier draft, kept for reference: collect the data in a plain dictionary.
# datas = { # prepare an empty dictionary
#     "content":[],
#     "target":[]
# }
import pandas as pd

def read(path):
  """Return the full text of the file at ``path``, decoded as UTF-8."""
  handle = open(path, "r", encoding="utf-8")
  try:
    return handle.read()
  finally:
    # Always release the file handle, even if reading/decoding fails.
    handle.close()


def get_data(t):
  """Load one split of the IMDB review dataset into a DataFrame.

  Args:
    t: Name of the split sub-directory under ``aclImdb`` (this notebook
      calls it with ``"train"`` and ``"test"``).

  Returns:
    A ``pandas.DataFrame`` with a ``contents`` column (the review text) and
    a ``sentiment`` column (1 for files under ``pos``, 0 for files under
    ``neg``). All positive rows come first, then all negative rows; each
    group is ordered by file path.

  Raises:
    FileNotFoundError: If the ``pos`` or ``neg`` folder of the split yields
      no ``*.txt`` files, e.g. because the archive is not extracted next to
      the downloaded file.
  """
  dn = os.path.dirname(dataset)
  paths = []
  sentiments = []
  for folder, label in (("pos", 1), ("neg", 0)):
    pattern = os.path.join(dn, "aclImdb", t, folder, "*.txt")
    # glob returns names in arbitrary filesystem order; sort so that the
    # row order is reproducible across machines and runs.
    matched = sorted(glob.glob(pattern))
    if not matched:
      # Fail loudly here instead of returning an empty frame that only
      # breaks much later (tokenizer / fit).
      raise FileNotFoundError("no review files match {}".format(pattern))
    paths.extend(matched)
    sentiments.extend([label] * len(matched))
  # Read eagerly into a list rather than handing pandas a lazy map object.
  contents = [read(p) for p in paths]
  return pd.DataFrame({
      "contents": contents,
      "sentiment": sentiments
  })
# Build one DataFrame per split (train first, then test) and preview the test split.
train_df, test_df = (get_data(split) for split in ("train", "test"))
test_df

In [None]:
"-".join(map(str, [1, 2, 3, 4, 5])) # (side note, unrelated to the main topic) map applies the same function to every element

In [None]:
# Hyper-parameters shared by the tokenizer, padding and embedding cells.
TOK = 3000 # vocabulary size: passed to Tokenizer(num_words=...) and used for the embedding row count (TOK+1)
LEN = 512 # number of token ids every review is padded / truncated to
EM = 128 # embedding width per word ("sentiment features"); original note: with BERT one would pick about 500 — TODO confirm

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Keras only keeps word indices strictly below num_words, so TOK + 1 is
# required for indices 1..TOK to survive (index 0 is reserved for padding).
# With num_words=TOK the word at index TOK would never reach the model and
# its embedding row would stay at its random initial value.
tok = Tokenizer(num_words=TOK + 1)
# fit_on_texts is the "fit" half of fit_transform: it builds the vocabulary,
# here from the training reviews only.
tok.fit_on_texts(train_df["contents"])
# texts_to_sequences replaces every kept word with its integer index.
x_train_seq = tok.texts_to_sequences(train_df["contents"])
x_test_seq = tok.texts_to_sequences(test_df["contents"])

In [None]:
# tok.word_index maps word -> index; tok.index_word is the reverse mapping.
# Show only the first entries: the full dictionary holds every word seen in
# the training reviews and would flood the cell output.
pd.Series(tok.index_word).head(20)

In [None]:
# Bring every review to exactly LEN token ids (pad the short ones, cut the long ones).
# padding="pre" and truncating="pre" are the Keras defaults, spelled out here:
#   truncating="pre"  drops ids from the front, e.g. 12345 -> 45
#   truncating="post" drops ids from the back,  e.g. 12345 -> 12
from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_options = dict(maxlen=LEN, padding="pre", truncating="pre")
x_train_pad = pad_sequences(x_train_seq, **pad_options)
x_test_pad = pad_sequences(x_test_seq, **pad_options)
pd.DataFrame(x_test_pad)

In [None]:
# The embedding part starts below

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D   # 詞意
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.layers import SimpleRNN

# Embedding rows: indices 1..TOK for tokens plus index 0 for padding = TOK + 1,
# so the layer holds (TOK + 1) * EM parameters (3001 * 128).
embedding = Embedding(
    input_dim=TOK + 1,
    output_dim=EM,      # every word is described by EM learned features
    mask_zero=True,
    input_length=LEN,   # every review is LEN token ids long
)
# Average the word vectors of a review (used here instead of Flatten()).
pooling = GlobalAveragePooling1D()
# Two-class softmax classifier on the averaged vector; the network is a
# linear model up to the softmax (original note: "the Word2Vec approach").
classifier = Dense(units=2, activation="softmax")

layers = [embedding, pooling, classifier]
model = Sequential(layers)
model.summary()

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
# Integer class labels with a 2-unit softmax output -> sparse categorical cross-entropy.
loss_fn = SparseCategoricalCrossentropy()
optimizer = Adam()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

In [None]:
# Label column of each split (1 = review from the pos folder, 0 = from the neg folder).
y_train, y_test = (frame["sentiment"] for frame in (train_df, test_df))

In [None]:
# epochs: how many passes are made over the whole training set.
# Example: 60000 samples, 2 epochs, batch size 200 -> 60000 * 2 / 200 weight updates.
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# validation_split takes the LAST 10% of the rows exactly as passed in, before
# any shuffling. train_df lists every positive review ahead of every negative
# one, so without this permutation the validation set would contain negative
# reviews only and the training set would be class-imbalanced.
# A fixed seed keeps the split reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(x_train_pad))
x_train_shuffled = x_train_pad[shuffle_idx]
y_train_shuffled = np.asarray(y_train)[shuffle_idx]

callbacks = [
    # Keep the weights of the best epoch on disk.
    ModelCheckpoint("model.h5", save_best_only=True),
    # Stop after 5 epochs without improvement and roll back to the best weights.
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_shuffled,
     y_train_shuffled,
     batch_size=100,
     epochs=50,
     validation_split=0.1,
     callbacks=callbacks,
     verbose=2)

In [None]:
# Score the trained model on the held-out test split.
model.evaluate(x=x_test_pad, y=y_test)

In [None]:
# Inference-only model: the same embedding + pooling stack as `model`,
# without the classifier head, so its output is the averaged word vector.
w = model.layers[0].get_weights()  # weights of the trained embedding layer
layers = [
  Embedding(input_dim=TOK + 1, output_dim=EM, mask_zero=True),
  GlobalAveragePooling1D(),
]
infer = Sequential(layers)
# Copy the trained embedding into the new model's embedding layer.
infer.layers[0].set_weights(w)
infer.summary()

In [None]:
# Sanity check: look up the learned vector of a single word.
# Training inputs had shape (25000, 512): many reviews of 512 ids each.
# Here the input is one "review" that contains just one token id.
target = "the"
target_id = tok.word_index[target]
pre = infer.predict([[target_id]])
pre[0]

In [None]:
# Export the learned embeddings in word2vec text format:
# a "<count> <dim>" header, then one "<word> v1 v2 ..." line per word.
#
# Feeding a single token id through `infer` returns that token's embedding row
# unchanged (the masked average of one vector is the vector itself), so the
# rows are read straight from the weight matrix instead of calling predict()
# once per word.
embedding_matrix = infer.layers[0].get_weights()[0]
# Never announce more vectors than the tokenizer actually knows; this also
# avoids a KeyError on tok.index_word for very small corpora.
n_words = min(TOK, len(tok.index_word))

# `with` guarantees the file is closed even if a write or lookup fails.
with open("vec.txt", "w", encoding="utf-8") as f:
  f.write("{} {}\n".format(n_words, EM))
  for i in range(1, n_words + 1):  # index 0 is padding, so start at 1
    v = " ".join(map(str, embedding_matrix[i]))
    f.write("{} {}\n".format(tok.index_word[i], v))

In [None]:
from gensim.models import KeyedVectors
# Load the exported vectors back as gensim KeyedVectors (plain-text word2vec format).
w2v = KeyedVectors.load_word2vec_format("vec.txt", binary=False)

In [None]:
# Single vector lookup: w2v["the"]
# Vocabulary listing: w2v.vocab on gensim < 4, w2v.key_to_index on gensim >= 4.
# Nearest neighbours of the query word; a KeyError is raised if the word is
# not among the exported vectors.
w2v.most_similar(positive=["christmas"])