In [0]:
%tensorflow_version 2.x

In [0]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

In [0]:
import os
import glob
import pandas as pd
dn = os.path.dirname(dataset)
dn = os.path.join(dn, "aclImdb")

def get_data(n):
    train_dn = os.path.join(dn, n)
    contents = []
    sentiment = []
    pos_fn = os.path.join(train_dn, "pos", "*.txt")
    for fn in glob.glob(pos_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(1)

    neg_fn = os.path.join(train_dn, "neg", "*.txt")
    for fn in glob.glob(neg_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(0)

    df = pd.DataFrame({
        "content":contents,
        "sentiment":sentiment
    }, columns=["content", "sentiment"])
    return df

In [4]:
train_df = get_data("train")
test_df = get_data("test")
test_df

Unnamed: 0,content,sentiment
0,Fair and nifty little science fiction/horror f...,1
1,I think Gerard's comments on the doc hit the n...,1
2,This is one of the best Czech movies I have ev...,1
3,"This film, for what it was set out to be, succ...",1
4,"Excellent performance by Mary KAy Place, Steve...",1
...,...,...
24995,First I have to say that I have read everythin...,0
24996,"""The Duke"" is a film based in the heart of the...",0
24997,So we're supposed to find it funny that this w...,0
24998,This show had a promising start as sort of the...,0


In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
# 這一步比較像以前的 CountVectorizer的fit(找出多少種)
tok.fit_on_texts(train_df["content"])

In [0]:
# 好習慣, 正像反向先準備好
index_2_word = tok.index_word
word_2_index = {v:k for k, v in tok.index_word.items()}

In [7]:
# 真的轉換成數字, transform
# 這些數字都會在精選3000個單字裡面
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1776,1777,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,1,6,3,84,17,10,216,9,337.0,8.0,28.0,311.0,41.0,1.0,2419.0,8.0,1.0,519.0,258.0,1.0,422.0,59.0,1461.0,894.0,1.0,83.0,454.0,150.0,4.0,1.0,23.0,21.0,63.0,671.0,8.0,11.0,19.0,7.0,7.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,32,155,1,646,4,484,2,113,8.0,11.0,2425.0,17.0,6.0,2767.0,4.0,1.0,52.0,115.0,4.0,151.0,359.0,1280.0,2.0,23.0,52.0,49.0,292.0,344.0,15.0,344.0,2.0,190.0,15.0,190.0,9.0,6.0,28.0,4.0,1.0,115.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,11,6,3,726,209,118,2052,253,1664.0,31.0,2338.0,2.0,24.0,294.0,1011.0,5.0,24.0,2373.0,2.0,3.0,1016.0,1751.0,1.0,790.0,131.0,675.0,552.0,6.0,31.0,3.0,1301.0,693.0,174.0,4.0,70.0,106.0,153.0,11.0,17.0,6.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,764,13,128,157,675,320,34,90,1.0,36.0,99.0,5.0,695.0,16.0,7.0,7.0,24.0,284.0,431.0,835.0,3.0,2125.0,5.0,1.0,295.0,4.0,150.0,142.0,7.0,7.0,26.0,59.0,891.0,1.0,120.0,16.0,32.0,633.0,72.0,59.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,90,8,2706,6,32,318,17,12,86.0,104.0,25.0,73.0,8.0,1138.0,18.0,23.0,21.0,57.0,1883.0,4.0,1.0,2387.0,254.0,921.0,44.0,20.0,28.0,157.0,624.0,906.0,126.0,520.0,2.0,2070.0,22.0,5.0,64.0,86.0,1.0,2997.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,803,516,9,3,297,60,6,1,672.0,10.0,199.0,5.0,99.0,20.0,60.0,10.0,1132.0,43.0,39.0,806.0,2353.0,8.0,11.0,418.0,10.0,1578.0,2353.0,155.0,232.0,36.0,1.0,127.0,63.0,63.0,1095.0,2.0,21.0,2914.0,30.0,29.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,58,365,2,10,1608,12,17,233,311.0,2.0,72.0,66.0,28.0,4.0,1.0,830.0,916.0,8.0,1.0,17.0,6.0,21.0,420.0,5.0,27.0,160.0,30.0,29.0,18.0,9.0,6.0,40.0,35.0,643.0,2.0,9.0,1503.0,98.0,1878.0,1688.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,10,553,35,73,41,11,17,86,9.0,13.0,3.0,84.0,1176.0,2.0,28.0,4.0,145.0,399.0,1353.0,99.0,12.0,781.0,251.0,448.0,4.0,88.0,1176.0,99.0,40.0,66.0,5.0,64.0,70.0,10.0,1608.0,9.0,2.0,10.0,25.0,5.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,1165,908,55,651,841,22,680,9,328.0,149.0,566.0,1.0,948.0,276.0,91.0,40.0,69.0,276.0,10.0,40.0,89.0,37.0,2113.0,1287.0,14.0,10.0,194.0,731.0,2.0,79.0,2065.0,42.0,239.0,50.0,1911.0,2450.0,71.0,29.0,43.0,209.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
# 因為後面接 MLP , 一定要讓你的 input 是一樣的
# 可以控制前或後的截長補短
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_padseq = pad_sequences(x_train_seq, maxlen=256)
x_test_padseq = pad_sequences(x_test_seq, maxlen=256)
pd.DataFrame(x_test_padseq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,14,2550,3,28,4,1,1321,115,136,6,2896,1894,2,2,3,1053,2,971,5,10,188,336,543,1026,5,120,12,26,6,53,5,24,151,2,9,128,492,37,3,1379
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,9,45,1,872,548,933,218,5,22,42,1290,321,2,2423,3,2,2727,2044,80,1,93,4,110,4,1,2,369,29,671,4,3,49,18,78,273,20,3,191,4,336
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,84,367,2,1,153,23,63,84,10,37,322,99,35,10,63,421,11,28,10,59,383,9,5,256,34,490,5,121,139,41,312,179,322,2,65,110,100,1,322,8
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,8,11,19,2,470,5,27,968,4,9,2,15,12,10,2521,1,19,15,48,9,13,79,143,651,10,202,9,35,10,67,1494,9,16,82,81,12,2535,112,570,9
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,938,1,527,30,1,1286,4,65,2218,7,7,54,548,48,1132,4,110,3,527,6,36,33,77,815,8,5,28,39,50,8,11,19,7,7,59,116,5,64,3,751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,66,39,86,251,56,117,13,1,61,116,4,38,110,38,2,45,312,1583,35,119,100,9,38,110,13,8,2,9,559,55,12,56,97,1597,761,142,292,171,97,25,...,11,6,28,1425,62,60,59,25,1810,3,73,125,1252,276,291,77,78,9,30,1,558,28,67,94,73,50,4,11,62,31,883,1,776,271,39,5,60,23,526,535
24996,31,589,34,296,3,34,6,2925,5,350,2,2891,24,151,31,397,24,115,5,2891,24,159,1297,1421,14,1,919,106,47,5,2,1,1151,659,1864,26,117,269,2,995,...,78,21,383,11,19,5,2704,17,448,18,10,78,542,383,9,5,908,1838,2166,34,654,5,354,3,160,19,15,65,503,2,589,448,34,654,5,64,8,24,474,214
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1259,5,175,48,23,1,993,15,82,71,109,1208,1368,10,479,12,158,548,5,256,1079,5,1,882,5,1,102,34,40,5,1,89,435,126,55,6,1079,160,882,439
24998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,82,153,23,580,1,102,580,1134,2,33,303,5,27,962,1225,2753,1451,81,7,7,5,3,461,6,37,8,3,8,58,649,1582,93,5,13,93,125,2,73,50,817


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dropout, Dense
# NLP : 0 要保留給padding用 -> mask_zero=True (0有用處)
# input_dim : 進去的有幾種 3000 + 1(0 padding), input_length : 進去有幾個
INPUT_DIM = 3000 + 1
EMBEDDING_DIM = 64
INPUT_LENGTH = 256
model = Sequential()
model.add(Embedding(INPUT_DIM,
                    EMBEDDING_DIM,
                    mask_zero=True,
                    input_length = INPUT_LENGTH))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(2, activation="softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 64)           192064    
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               2097280   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 2,289,602
Trainable params: 2,289,602
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(), optimizer="adam", metrics=["accuracy"])

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
stop_callback = EarlyStopping(patience=3, restore_best_weights=True)
save_callback = ModelCheckpoint("fashion.h5", save_best_only=True)
y_train = train_df["sentiment"]
y_test = test_df["sentiment"]
model.fit(x_train_padseq, y_train,
          batch_size=50,
          epochs=100,
          validation_split=0.1,
          callbacks=[stop_callback, save_callback],
          verbose=2)

Epoch 1/100
450/450 - 13s - loss: 0.4109 - accuracy: 0.7984 - val_loss: 0.3815 - val_accuracy: 0.8216
Epoch 2/100
450/450 - 13s - loss: 0.1346 - accuracy: 0.9514 - val_loss: 0.4941 - val_accuracy: 0.8088
Epoch 3/100
450/450 - 13s - loss: 0.0250 - accuracy: 0.9924 - val_loss: 0.9939 - val_accuracy: 0.7700
Epoch 4/100
450/450 - 13s - loss: 0.0065 - accuracy: 0.9985 - val_loss: 0.7758 - val_accuracy: 0.8372


<tensorflow.python.keras.callbacks.History at 0x7fd040a2f278>

In [12]:
model.evaluate(x_test_padseq, y_test)



[0.3128603994846344, 0.8654400110244751]

In [16]:
# input , output 皆為向量 tensor
# 看各層 model.layers[0].output
# 把每個詞做成詞向量
infer = Sequential()
infer.add(Embedding(INPUT_DIM,
                    EMBEDDING_DIM))
# 單獨拿這層的參數
w = model.layers[0].get_weights()
infer.set_weights(w)
infer.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          192064    
Total params: 192,064
Trainable params: 192,064
Non-trainable params: 0
_________________________________________________________________


In [19]:
w = input("你要轉換哪一個詞?")
data = [[word_2_index[w]]]
print("詞向量:", infer.predict(data))

你要轉換哪一個詞?good
詞向量: [[[-0.06411489  0.02748772 -0.0405018   0.0049464  -0.00893738
   -0.01833113 -0.02910353  0.01299307  0.04077014 -0.04832683
    0.02746807  0.01805099  0.01061262 -0.03244929  0.00480672
    0.04472292 -0.02388347 -0.00808236  0.08513205 -0.02855348
   -0.03491329  0.03920214 -0.04623974 -0.00802679 -0.03181931
   -0.07206565 -0.03942411 -0.04761915  0.03745254  0.05573238
   -0.02545472 -0.01394451 -0.02975556  0.05515446 -0.03093215
   -0.00821895 -0.01599044  0.03184147  0.08336147  0.0241491
   -0.07712875  0.02064331  0.00667083  0.04128896  0.05031335
    0.00889582  0.01676182 -0.0308644   0.00843534  0.02098008
   -0.01032273  0.01945371 -0.04237732  0.04481575  0.02253096
   -0.00358478  0.03588423 -0.0147132   0.01170244  0.00684702
   -0.03767297 -0.01067339 -0.0757203   0.02544866]]]
