In [36]:
import keras
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing import sequence
from keras.layers import Dense, Dropout, Activation

In [37]:
# Minimal Conv1D example: 16 filters of width 3, stride 1, 'same' padding and ReLU,
# over inputs of shape (1000, 300) — presumably 1000 tokens x 300-d word vectors; TODO confirm.
# NOTE: `model` is re-created from scratch in the training cell further down.
model = Sequential()
model.add(Conv1D(filters = 16, kernel_size = 3, padding = 'same', activation = 'relu', strides = 1, input_shape = (1000, 300)))

由于词之间的相对垂直关系是任意的，因此关联信息主要体现在水平方向上。 所以卷积使用的卷积核为扁平的一维卷积核。

# 准备数据

cd introduction_to_ml_with_python/data
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar xvzf aclImdb_v1.tar.gz

In [10]:
import glob
import os

from random import shuffle


def pre_process_data(filepath):
    """
    Load labelled text samples from a directory with 'pos' and 'neg' subfolders.

    Args:
        filepath: Directory that contains the 'pos' and 'neg' subdirectories,
            each holding one ``*.txt`` file per sample.

    Returns:
        A shuffled list of ``(label, text)`` tuples, where ``label`` is 1 for
        files found under 'pos' and 0 for files found under 'neg'.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')

    pos_label = 1
    neg_label = 0

    dataset = []

    # Positive samples are read first, then negative ones; the final shuffle
    # removes that ordering.
    for label, directory in ((pos_label, positive_path), (neg_label, negative_path)):
        # glob.escape keeps any glob metacharacters in the directory name literal.
        pattern = os.path.join(glob.escape(directory), '*.txt')
        for filename in glob.glob(pattern):
            # errors='ignore' silently drops bytes that are not valid UTF-8.
            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                dataset.append((label, f.read()))

    shuffle(dataset)
    return dataset

In [11]:
# Build the shuffled list of (label, review_text) tuples for the training split.
dataset = pre_process_data('H:/NLP_Data/aclImdb_v1/aclImdb/train')

In [12]:
# Peek at the first (label, text) tuple to sanity-check the loader output.
dataset[0]

(1,
 "To all the miserable people who have done everything from complain about the dialogue, the budget, the this and the that....who wants to hear it? IF you missed the point of this beyond-beautiful movie, that's your loss. The rest of us who deeply love this movie do not care what you think. I am a thirthysomething guy who has seen thousands of movies in my life, and this one stands in its own entity, in my book. It was not supposed to be a documentary, or a completely factual account of what happened that night. It is the most amazing love story ever attempted. I know that it is the cynical 90's and the millennium has everyone in a tizzy, but come on. Someone on this comments board complained that it made too much money! How lame is that? It made bundles of money in every civilized country on the planet, and is the top grossing film in the planet. I will gladly side with the majority this time around. Okay, cynics, time to crawl back under your rock, I am done.")

In [13]:
from nlpia.loaders import get_data
# Load the pretrained 'w2v' word vectors through nlpia.
# NOTE(review): `limit` presumably caps how many vectors are loaded (here 200,000) — confirm against nlpia docs.
word_vectors = get_data('w2v', limit = 200000)

  [datetime.datetime, pd.datetime, pd.Timestamp])
  MIN_TIMESTAMP = pd.Timestamp(pd.datetime(1677, 9, 22, 0, 12, 44), tz='utc')
  np = pd.np
  np = pd.np
INFO:nlpia.constants:Starting logger in nlpia.constants...
  np = pd.np
  np = pd.np
INFO:nlpia.loaders:No BIGDATA index found in H:\ANACONDA\lib\site-packages\nlpia\data\bigdata_info.csv so copy H:\ANACONDA\lib\site-packages\nlpia\data\bigdata_info.latest.csv to H:\ANACONDA\lib\site-packages\nlpia\data\bigdata_info.csv if you want to "freeze" it.
INFO:nlpia.futil:Reading CSV with `read_csv(*('H:\\ANACONDA\\lib\\site-packages\\nlpia\\data\\mavis-batey-greetings.csv',), **{'low_memory': False})`...
INFO:nlpia.futil:Reading CSV with `read_csv(*('H:\\ANACONDA\\lib\\site-packages\\nlpia\\data\\sms-spam.csv',), **{'low_memory': False})`...
INFO:nlpia.loaders:Downloading w2v
INFO:nlpia.web:URL too short: w2v
DEBUG:nlpia.futil:regex pattern = ^[.]?([^.]*)\.([^.]{1,10})*\.300d\.zip$, string=googlenews-vectors-negative300.bin.gz
DEBUG:nlpia.fu

In [27]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
import numpy as np

In [14]:
def tokenize_and_vectorize(dataset):
    """
    Convert each sample's text into a list of word vectors.

    Args:
        dataset: Iterable of samples in which ``sample[1]`` is the raw text.
            ``sample[0]`` is not read here.

    Returns:
        A list with one entry per sample, in the same order as ``dataset``.
        Each entry is the list of ``word_vectors[token]`` lookups for that
        sample's tokens. Tokens whose lookup raises ``KeyError`` are skipped,
        so an entry can be shorter than the token count, or empty.

    Relies on the module-level ``word_vectors`` mapping being loaded already.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # No matching token in the Google w2v vocab

        vectorized_data.append(sample_vecs)

    return vectorized_data

In [15]:
def collect_expected(dataset):
    """ Peel off the target value (the first element) of every sample in the dataset """
    return [record[0] for record in dataset]


In [20]:
# Features: one list of word vectors per sample. Targets: the matching labels, in the same order.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [31]:
# 80/20 train/test split by position; the dataset was already shuffled in pre_process_data.
split_point = int(len(vectorized_data) * .8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [32]:

# Number of token vectors kept per sample: longer samples are truncated, shorter ones zero-padded
maxlen = 400
batch_size = 32         # How many samples to show the net before backpropagating the error and updating the weights
embedding_dims = 300    # Length of the token vectors we will create for passing into the Convnet
filters = 250           # Number of filters we will train
kernel_size = 3         # The width of the filters, actual filters will each be a matrix of weights of size: embedding_dims x kernel_size or 300 x 3 in our case
hidden_dims = 250       # Number of neurons in the plain feed forward net at the end of the chain
epochs = 2              # Number of times we will pass the entire training dataset through the network


CNN网络的输入尺寸需要一致，但每个文本的词向量序列长度各不相同，因此需要把过长的样本截断、把过短的样本补0。

In [33]:
def pad_trunc(data, maxlen):
    """
    For a given dataset pad with zero vectors or truncate to maxlen.

    Args:
        data: Sequence of samples, each a sequence of equal-length vectors.
        maxlen: Number of vectors every returned sample must contain.

    Returns:
        A new list with one new list per sample, each exactly ``maxlen`` long:
        samples longer than ``maxlen`` keep their first ``maxlen`` vectors,
        shorter ones are right-padded with zero vectors. The input samples
        are never modified.

    The zero-vector length is taken from the first non-empty sample, so an
    empty ``data`` or an empty leading sample no longer raises IndexError.
    If every sample is empty the padding vectors are empty lists.
    """
    # Find the vector length from the first sample that actually has a vector.
    vector_len = next((len(sample[0]) for sample in data if len(sample) > 0), 0)

    # One zero vector is shared by every padded slot to avoid allocating a
    # fresh list per slot; treat the padding entries as read-only.
    zero_vector = [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slicing + list() copies, so the caller's sample is left untouched.
        temp = list(sample[:maxlen])
        temp.extend([zero_vector] * (maxlen - len(temp)))
        new_data.append(temp)
    return new_data



In [34]:
# Bring every sample to exactly maxlen token vectors (truncate or zero-pad).
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Convert the nested lists to arrays of shape (num_samples, maxlen, embedding_dims);
# the label lists become NumPy arrays as well.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)


# 训练CNN模型

In [38]:
# Build, train and persist the 1-D CNN classifier using the hyperparameters defined above.
print('Build model...')
model = Sequential()

# Add a Conv1D layer that learns `filters` word-group filters,
# each spanning `kernel_size` consecutive token vectors:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))
# Global max pooling keeps the single largest activation of each filter:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer (dropout is applied before the ReLU activation here):
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
# Single sigmoid output, so the loss is binary cross-entropy.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# NOTE: the held-out test split is passed as validation data during training.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
# Save the architecture as JSON and the trained weights separately.
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("cnn_weights.h5")
print('Model saved.')


Build model...
Epoch 1/2
Epoch 2/2
Model saved.
