In [1]:
# Toy corpus: three unsegmented Chinese sentences that the following cells
# segment into words, convert to integer ids and pad.
sentences = [
    "今天外面天气很好",
    "人工智能正在改变世界",
    "我们明天早上一起去图书馆学习"
]

In [2]:
import jieba
# Segment every sentence into words with jieba and join the words back together
# with single spaces, producing one space-delimited string per input sentence.
new_sentences = [" ".join(jieba.cut(text)) for text in sentences]
print(new_sentences)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\BEIZHO~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.783 seconds.
Prefix dict has been built succesfully.


['今天 外面 天气 很 好', '人工智能 正在 改变 世界', '我们 明天 早上 一起 去 图书馆 学习']


In [3]:
# Echo the segmented sentences as the cell result (same content as the print above).
new_sentences

['今天 外面 天气 很 好', '人工智能 正在 改变 世界', '我们 明天 早上 一起 去 图书馆 学习']

In [4]:
from keras.preprocessing.text import Tokenizer
# Fit a Keras Tokenizer on the space-delimited sentences to build the vocabulary.
# num_words=20 is above the 16 distinct words shown in the output below, so the
# cap presumably drops nothing for this corpus (see the Keras Tokenizer docs).
tokenizer = Tokenizer(num_words=20)
tokenizer.fit_on_texts(new_sentences)
# word_index maps each word to its integer id; the printed ids start at 1, not 0.
word_index = tokenizer.word_index
print(word_index)

Using TensorFlow backend.


{'今天': 1, '外面': 2, '天气': 3, '很': 4, '好': 5, '人工智能': 6, '正在': 7, '改变': 8, '世界': 9, '我们': 10, '明天': 11, '早上': 12, '一起': 13, '去': 14, '图书馆': 15, '学习': 16}


In [5]:
from keras.preprocessing.text import text_to_word_sequence
# Replace every word with its id from the fitted tokenizer. The result is a list
# of id lists, one per sentence, whose lengths differ (5, 4 and 7 ids below).
sequences = tokenizer.texts_to_sequences(new_sentences)
print(sequences)

[[1, 2, 3, 4, 5], [6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16]]


In [6]:
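# NOTE: disabled one-hot example. `samples` is not defined in any cell shown here
# (the tokenized corpus above is named `new_sentences`), so define it or swap the
# argument before re-enabling these lines.
# one_hot_result = tokenizer.texts_to_matrix(samples, mode='binary')
# print(one_hot_result)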

In [7]:
from keras.preprocessing.sequence import pad_sequences
# padding='pre': shorter sequences get zeros added on the left so every row ends
# up as long as the longest sequence (7 ids in the output below).
padded_sequences = pad_sequences(sequences, padding='pre')
print(padded_sequences)

[[ 0  0  1  2  3  4  5]
 [ 0  0  0  6  7  8  9]
 [10 11 12 13 14 15 16]]


In [8]:
# padding='post': same target length, but the zeros are appended on the right.
# Note that this overwrites the `padded_sequences` value from the previous cell.
padded_sequences = pad_sequences(sequences, padding='post')
print(padded_sequences)

[[ 1  2  3  4  5  0  0]
 [ 6  7  8  9  0  0  0]
 [10 11 12 13 14 15 16]]


In [9]:
# Force a fixed width of 6: shorter sequences are zero-padded on the left, and
# longer ones lose ids from the front (the 7-id sentence drops its first id, 10).
padded_sequences = pad_sequences(
    sequences,
    maxlen=6,
    truncating='pre',
    padding='pre',
)
print(padded_sequences)

[[ 0  1  2  3  4  5]
 [ 0  0  6  7  8  9]
 [11 12 13 14 15 16]]


In [10]:
from keras.models import Sequential
from keras.layers import Embedding

# Rebuild the full-length, left-padded batch. The In [9] cell overwrote
# `padded_sequences` with a variant truncated to maxlen=6, which would not match
# the input length the model below is built for.
padded_sequences = pad_sequences(sequences, padding='pre')

# Derive the layer dimensions from the data instead of hard-coding 17 and 7, so
# the cell keeps working when the corpus changes:
# - vocab_size: token ids start at 1 and 0 is the padding value, so the layer
#   needs len(word_index) + 1 rows to cover every id that can appear.
# - sequence_length: number of columns in the padded batch.
vocab_size = len(word_index) + 1
sequence_length = padded_sequences.shape[1]

# A model with a single Embedding layer that maps each id to a 3-value vector.
model = Sequential()
embedding = Embedding(input_dim=vocab_size,
                      input_length=sequence_length,
                      output_dim=3)
model.add(embedding)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 3)              51        
Total params: 51
Trainable params: 51
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Show the batch that is fed to the model: 3 rows of 7 ids, zero-padded on the left.
padded_sequences

array([[ 0,  0,  1,  2,  3,  4,  5],
       [ 0,  0,  0,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16]])

In [12]:
# Print the embedding layer's weights. The output is a list holding one 17 x 3
# float32 matrix: one row per input id, one column per output dimension.
print(embedding.get_weights())

[array([[-0.02805706,  0.02496606, -0.02966118],
       [-0.04563084, -0.01646264, -0.02802988],
       [ 0.02386883, -0.01998019, -0.00714093],
       [ 0.01394485,  0.02984795,  0.03769021],
       [-0.04217581,  0.0048572 ,  0.03675225],
       [ 0.00317006, -0.01415241, -0.02957951],
       [-0.00254823, -0.03221738,  0.02868478],
       [ 0.00424597,  0.02475877,  0.03217186],
       [ 0.04807809,  0.00277642, -0.0001649 ],
       [ 0.04473079, -0.02043191, -0.04517198],
       [ 0.03993343,  0.03947297,  0.03220726],
       [-0.04395154,  0.03991366, -0.04074151],
       [ 0.01765602, -0.03504368,  0.04820906],
       [ 0.00785128, -0.04658672, -0.00375912],
       [ 0.0246125 ,  0.04838799,  0.02064878],
       [ 0.02953463,  0.00808661, -0.02835951],
       [ 0.03881926, -0.0053608 , -0.03023617]], dtype=float32)]


In [13]:
# Run the padded id batch through the model and keep the result for inspection.
prediction = model.predict(padded_sequences)

In [14]:
# Show the model output: every id has been replaced by a 3-value vector, and the
# vectors at padding positions (id 0) equal the first row of the weight matrix
# printed earlier.
prediction

array([[[-0.02805706,  0.02496606, -0.02966118],
        [-0.02805706,  0.02496606, -0.02966118],
        [-0.04563084, -0.01646264, -0.02802988],
        [ 0.02386883, -0.01998019, -0.00714093],
        [ 0.01394485,  0.02984795,  0.03769021],
        [-0.04217581,  0.0048572 ,  0.03675225],
        [ 0.00317006, -0.01415241, -0.02957951]],

       [[-0.02805706,  0.02496606, -0.02966118],
        [-0.02805706,  0.02496606, -0.02966118],
        [-0.02805706,  0.02496606, -0.02966118],
        [-0.00254823, -0.03221738,  0.02868478],
        [ 0.00424597,  0.02475877,  0.03217186],
        [ 0.04807809,  0.00277642, -0.0001649 ],
        [ 0.04473079, -0.02043191, -0.04517198]],

       [[ 0.03993343,  0.03947297,  0.03220726],
        [-0.04395154,  0.03991366, -0.04074151],
        [ 0.01765602, -0.03504368,  0.04820906],
        [ 0.00785128, -0.04658672, -0.00375912],
        [ 0.0246125 ,  0.04838799,  0.02064878],
        [ 0.02953463,  0.00808661, -0.02835951],
        [ 0.0388