In [1]:
import pandas as pd
# 加载数据集
path = "datasets/spam.csv"
data = pd.read_csv(path, encoding='latin-1')
# 获得短信文字内容（特征值）
X = data['v2'].values
# 获得标签，并将标签转化为0与1的形式
y = data['v1'].astype('category').cat.codes.values
# 将数据集分为训练集与测试集
X_train = X[0: 2000]
y_train = y[0: 2000]
X_test = X[2000: 2200]
y_test = y[2000: 2200]

In [2]:
from keras import backend as K
from keras.layers import Layer
import tensorflow_hub as hub
import tensorflow as tf
class ElmoEmbeddingLayer(Layer):
    def __init__(self):
        super(ElmoEmbeddingLayer, self).__init__()
    def build(self, input_shape):
        # 加载ELMO模型
        self.elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
        super(ElmoEmbeddingLayer, self).build(input_shape)
    def call(self, x):
        # 指定ELMO模型相关参数
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1), 
                           as_dict=True, 
                           signature='default')['default']
        return result
    def compute_output_shape(self, input_shape):
        # 指定ELMO模型的输出数据
        return (input_shape[0], 1024)

Using TensorFlow backend.


In [3]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
# 将整句作为输入
inputs = Input(shape=(1,), dtype=tf.string)
# 使用ELMO模型获得上下文词向量
embedding = ElmoEmbeddingLayer()(inputs)
# 使用全连接神经网络完成分类任务
dense = Dense(256, activation='relu')(embedding)
outputs = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=[inputs], outputs=outputs)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.


W0318 23:37:21.739906  5752 deprecation.py:323] From D:\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0318 23:37:23.200212  5752 saver.py:1483] Saver not created because there are no variables in the graph to restore


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 262,657
Trainable params: 262,657
Non-trainable params: 0
_________________________________________________________________


In [4]:
embedding

<tf.Tensor 'elmo_embedding_layer_1/module_apply_default/truediv:0' shape=(?, 1024) dtype=float32>

In [5]:
model.compile(loss='binary_crossentropy', 
              optimizer=Adam(), 
              metrics=['accuracy'])
model.fit(X_train, 
          y_train, 
          epochs=3, 
          validation_split=0.2,
          verbose=2,
          batch_size=16)

Instructions for updating:
Use tf.cast instead.


W0318 23:37:23.486565  5752 deprecation.py:323] From D:\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Train on 1600 samples, validate on 400 samples
Epoch 1/3
 - 69s - loss: 0.1215 - acc: 0.9494 - val_loss: 0.0556 - val_acc: 0.9800
Epoch 2/3
 - 56s - loss: 0.0438 - acc: 0.9850 - val_loss: 0.0407 - val_acc: 0.9900
Epoch 3/3
 - 66s - loss: 0.0235 - acc: 0.9925 - val_loss: 0.0240 - val_acc: 0.9950


<keras.callbacks.History at 0x252f8340dd8>

In [6]:
import tensorflow_hub as hub
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
embeddings = elmo(
    ["the cat is on the mat", "dogs are in the fog"],
    signature="default",
    as_dict=True)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0318 23:40:42.816405  5752 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [7]:
embeddings

{'word_emb': <tf.Tensor 'module_1_apply_default/bilm/Reshape_1:0' shape=(2, 6, 512) dtype=float32>,
 'lstm_outputs2': <tf.Tensor 'module_1_apply_default/concat_1:0' shape=(2, ?, 1024) dtype=float32>,
 'sequence_len': <tf.Tensor 'module_1_apply_default/Sum:0' shape=(2,) dtype=int32>,
 'elmo': <tf.Tensor 'module_1_apply_default/aggregation/mul_3:0' shape=(2, 6, 1024) dtype=float32>,
 'default': <tf.Tensor 'module_1_apply_default/truediv:0' shape=(2, 1024) dtype=float32>,
 'lstm_outputs1': <tf.Tensor 'module_1_apply_default/concat:0' shape=(2, 6, 1024) dtype=float32>}

In [8]:
from keras import backend as K
K.get_value(embeddings['sequence_len'])

array([6, 5])