# CNN应用于文本分类

1、文本数据预处理

2、构建卷积模型，注意卷积核大小的设计

3、将卷积后的特征图池化成一个特征

4、将多个特征拼接成一个，传入全连接层

In [10]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Hyper-parameters shared by the cells below:
#   num_features        - vocabulary cap passed to the IMDB loader and used as
#                         the embedding table size
#   sequence_length     - number of tokens every review is padded/cut to
#   embedding_dimension - width of each word vector
num_features = 3000
sequence_length = 300
embedding_dimension = 100

# Load the IMDB sentiment dataset with the vocabulary limited to
# `num_features` word indices, then unpack the two (inputs, labels) splits.
train_split, test_split = tf.keras.datasets.imdb.load_data(num_words=num_features)
x_train, y_train = train_split
x_test, y_test = test_split

# Both arrays are still 1-D at this point (one entry per review).
print(x_train.shape, x_test.shape, sep='\n')

(25000,)
(25000,)


In [11]:
# Bring every review to exactly `sequence_length` tokens so the data becomes a
# dense 2-D array. 'pre' padding/truncation are the pad_sequences defaults,
# written out here only to make the behavior explicit.
x_train = pad_sequences(
    x_train,
    maxlen=sequence_length,
    padding='pre',
    truncating='pre',
)
x_test = pad_sequences(
    x_test,
    maxlen=sequence_length,
    padding='pre',
    truncating='pre',
)
print(x_train.shape, x_test.shape, sep='\n')

(25000, 300)
(25000, 300)


In [12]:
# Default kernel heights (number of consecutive tokens each kernel spans).
filter_size = [3,4,5]
def convolution(kernel_heights=None, num_filters=64):
    """Build the multi-kernel convolution sub-model used for text features.

    Each kernel height gets its own branch: a Conv2D whose kernel spans the
    full embedding width, followed by a max-pool covering the whole resulting
    feature map, so every filter contributes exactly one value. The branch
    outputs are concatenated along the channel axis.

    Args:
        kernel_heights: iterable of kernel heights in tokens. Defaults to the
            module-level ``filter_size`` list when omitted.
        num_filters: number of convolution filters per branch (default 64).

    Returns:
        A ``keras.Model`` mapping an input of shape
        ``(sequence_length, embedding_dimension, 1)`` to an output of shape
        ``(1, 1, num_filters * len(kernel_heights))`` per sample.

    Raises:
        ValueError: if ``kernel_heights`` is empty.
    """
    if kernel_heights is None:
        kernel_heights = filter_size
    kernel_heights = list(kernel_heights)
    if not kernel_heights:
        raise ValueError("kernel_heights must contain at least one kernel size")

    # Input: sequence_length x embedding_dimension x 1 (300x100x1 with the
    # constants defined in this file).
    inn = layers.Input(shape=(sequence_length, embedding_dimension, 1))
    cnns = []
    for size in kernel_heights:
        # filters: number of convolution kernels in this branch. The kernel is
        # as wide as the embedding, so it only slides along the token axis.
        conv = layers.Conv2D(
            filters=num_filters,
            kernel_size=(size, embedding_dimension),
            strides=1,
            padding='valid',
            activation='relu',
        )(inn)
        # With 'valid' padding the feature map is (sequence_length-size+1) x 1;
        # pooling over that entire map keeps one maximum per filter.
        pool = layers.MaxPool2D(
            pool_size=(sequence_length - size + 1, 1),
            padding='valid',
        )(conv)
        cnns.append(pool)  # num_filters features per branch

    # Concatenate the branches on the channel axis (64 x 3 = 192 with the
    # defaults); a single branch needs no concatenation.
    out = cnns[0] if len(cnns) == 1 else layers.concatenate(cnns)

    model = keras.Model(inputs=inn, outputs=out)
    return model
        
def cnn_mulfilter():
    """Build and compile the multi-kernel CNN text classifier.

    Pipeline: token ids -> embedding -> add a channel axis -> multi-kernel
    convolution sub-model (``convolution()``) -> flatten -> small dense head
    with dropout -> single sigmoid unit for binary classification.

    The input shape is declared with an explicit ``keras.Input`` rather than
    the ``input_length`` argument of ``Embedding``: that argument is
    deprecated in Keras 3, while an explicit input works on both Keras
    generations and guarantees the model is built before ``summary()``.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    model = keras.Sequential([
        # One integer token id per position of the padded review.
        keras.Input(shape=(sequence_length,), dtype='int32'),
        # input_dim=num_features: size of the vocabulary (3000 distinct words).
        layers.Embedding(input_dim=num_features, output_dim=embedding_dimension),
        # Conv2D expects a trailing channel axis.
        layers.Reshape((sequence_length, embedding_dimension, 1)),
        convolution(),
        layers.Flatten(),
        layers.Dense(10, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    return model

# Build the compiled classifier once at module level (later cells train it),
# then print its layer-by-layer summary.
model = cnn_mulfilter()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          300000    
_________________________________________________________________
reshape_1 (Reshape)          (None, 300, 100, 1)       0         
_________________________________________________________________
model_1 (Functional)         (None, 1, 1, 192)         76992     
_________________________________________________________________
flatten_1 (Flatten)          (None, 192)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1930      
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        

In [13]:
# Train for 5 epochs in mini-batches of 64, with validation_split=0.1 setting
# aside a tenth of the training data for validation. The returned History
# object is kept for the accuracy plot in the next cell.
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Plot per-epoch accuracy on the training and validation data recorded by fit().
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

plt.plot(train_acc, label='training')
plt.plot(val_acc, label='validation')
plt.legend(loc='upper left')  # labels come from the `label=` of each curve
plt.show()