In [1]:
# coding:utf-8

from keras.layers.core import Activation, Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import nltk  
import collections  
import numpy as np

Using TensorFlow backend.


In [4]:
# NLTK helpers used in this notebook:
#   nltk.sent_tokenize(text)  -> split a text into sentences
#   nltk.word_tokenize(sent)  -> split a sentence into word/punctuation tokens

# First pass over the corpus: collect the statistics used to choose the
# vocabulary size and the padded sequence length.
maxlen = 0                          # token count of the longest sentence
word_freqs = collections.Counter()  # token -> number of occurrences
num_recs = 0                        # number of labelled sentences read
# The file is only read, so open it read-only; 'r+' would needlessly require
# write permission on the data file.
with open('training.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        # Each record is unpacked as "<label>\t<sentence>". Split on the first
        # tab only, so a tab inside the sentence cannot break the unpacking.
        label, sentence = line.split("\t", 1)
        words = nltk.word_tokenize(sentence.lower())
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
        num_recs += 1
print ('max_len ',maxlen)
print ('nb_words ', len(word_freqs))

('max_len ', 42)
('nb_words ', 2330)


### The output above shows that the dataset contains 2330 distinct tokens (punctuation such as '.' and '!' is counted as tokens) and that the longest sentence has 42 tokens, again including punctuation.
### We therefore keep the 2000 most frequent tokens as the vocabulary and fix the sentence length at 40. Sentences in the training set that are shorter than 40 tokens are padded with 'PAD', and any token that is not in the vocabulary we have defined is replaced by 'UNK'.

In [6]:
MAX_FEATURES = 2000         # number of most frequent tokens kept in the vocabulary
MAX_SENTENCE_LENGTH = 40    # every encoded sentence is padded/truncated to this length

# Map each of the MAX_FEATURES most common tokens to an integer index.
# Indices 0 and 1 are reserved for the special tokens, so real tokens start
# at 2. Corpus tokens are lower-cased before lookup, so the upper-case
# special names cannot collide with a real token.
word2index = {
    word: rank + 2
    for rank, (word, _count) in enumerate(word_freqs.most_common(MAX_FEATURES))
}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {index: word for word, index in word2index.items()}

# Derived from the mapping rather than hard-coded as 2002, so it stays correct
# if MAX_FEATURES changes or the corpus has fewer distinct tokens than that.
vocab_size = len(word2index)

# Second pass over the corpus: encode every sentence as a list of indices.
# Plain lists are used so this cell does not depend on a record count
# computed elsewhere and needs no manual row counter.
unk_index = word2index["UNK"]
encoded_sentences = []
label_values = []
# Read-only mode: the data file is never written.
with open('training.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        # Split on the first tab only so a tab inside the sentence is harmless.
        label, sentence = line.split("\t", 1)
        words = nltk.word_tokenize(sentence.lower())
        encoded_sentences.append([word2index.get(word, unk_index) for word in words])
        label_values.append(int(label))

y = np.array(label_values, dtype=float)

# pad_sequences returns a 2-D integer array with MAX_SENTENCE_LENGTH columns.
# NOTE(review): per the Keras docs the defaults pad and truncate at the front
# ('pre'), so sentences longer than MAX_SENTENCE_LENGTH lose leading tokens.
X = sequence.pad_sequences(encoded_sentences, maxlen=MAX_SENTENCE_LENGTH)
# 80/20 split; the fixed random_state makes the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
EMBEDDING_SIZE = 128      # size of the vector learned for each vocabulary index
HIDDEN_LAYER_SIZE = 64    # number of units in the LSTM layer

# Network: embedding -> LSTM -> single sigmoid output for the binary label.
# LSTM arguments:
#   dropout           - fraction of the units to drop for the linear
#                       transformation of the inputs
#   recurrent_dropout - fraction of the units to drop for the linear
#                       transformation of the recurrent state
model = Sequential([
    Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH),
    LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2),
    Dense(1),
    Activation("sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [8]:
BATCH_SIZE = 32    # batch size passed to fit() and, later, evaluate()
NUM_EPOCHS = 10    # number of epochs to train for

# The held-out split (Xtest, ytest) is passed as the validation data.
# fit() is left as the last expression so its History object is the cell output.
model.fit(
    x=Xtrain,
    y=ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(Xtest, ytest)
)

Train on 5668 samples, validate on 1418 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11ce6a350>

In [9]:
# Score the trained model on the held-out split, then print a few randomly
# chosen test sentences next to their predicted and true labels.
loss, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
print("\nTest score: %.3f, accuracy: %.3f" % (loss, acc))
print('{}   {}      {}'.format('预测','真实','句子'))
for sample_no in range(5):
    idx = np.random.randint(len(Xtest))
    # Take a one-row slice instead of reshape(1, 40): this keeps the batch
    # dimension without hard-coding the sequence length, so the cell keeps
    # working when MAX_SENTENCE_LENGTH is changed.
    xtest = Xtest[idx:idx + 1]
    ylabel = ytest[idx]
    # predict() returns one row per input; [0][0] is the single sigmoid output.
    ypred = model.predict(xtest)[0][0]
    # Index 0 is the PAD token, so it is dropped when rebuilding the sentence.
    sent = " ".join(index2word[x] for x in xtest[0] if x != 0)
    print(' {}      {}     {}'.format(int(round(ypred)), int(ylabel), sent))
    


Test score: 0.048, accuracy: 0.991
预测   真实      句子
 1      1     the da vinci code was absolutely awesome !
 1      1     i wanted desperately to love'the da vinci code as a film .
 1      1     the da vinci code is an awesome book ! ! !
 0      0     i really hate the da vinci code ...
 0      0     brokeback mountain is fucking horrible..


In [22]:
# Names of the values returned by model.evaluate(), in order.
# Uses the function-call form of print so the cell is valid on Python 3 as
# well as Python 2 (the bare `print x` statement is Python 2 only).
print(model.metrics_names)

['loss', 'acc']


In [25]:
INPUT_SENTENCES = ['I like what you say.','I love you']

# Encode each sentence as vocabulary indices; tokens missing from the
# vocabulary map to the 'UNK' index.
encoded_inputs = []
for text in INPUT_SENTENCES:
    tokens = nltk.word_tokenize(text.lower())
    encoded_inputs.append(
        [word2index[tok] if tok in word2index else word2index['UNK'] for tok in tokens]
    )

# Pad to the fixed model input length, then round each sigmoid output to 0 or 1.
XX = sequence.pad_sequences(encoded_inputs, maxlen=MAX_SENTENCE_LENGTH)
labels = [int(round(row[0])) for row in model.predict(XX)]

# Display names for the two classes (runtime strings, printed as-is).
label2word = {1:'积极', 0:'消极'}
for predicted, text in zip(labels, INPUT_SENTENCES):
    print('{}   {}'.format(label2word[predicted], text))

积极   I like what you say.
积极   I love you


In [19]:
# Scratch check of the tokenizer + Counter pipeline on the first record only.
# A separately named counter is used so that the corpus-wide `word_freqs`
# (which the vocabulary is built from) is not overwritten by this one-line count.
first_line_freqs = collections.Counter()
with open('training.txt','r') as f:
    first_line = f.readline().strip()
if first_line:  # nothing to show for an empty file or a blank first line
    # Split on the first tab only so a tab inside the sentence is harmless.
    label, sentence = first_line.split("\t", 1)
    first_line_freqs.update(nltk.word_tokenize(sentence.lower()))
    # Function-call form of print: valid on both Python 2 and Python 3.
    print(first_line_freqs.most_common(2))

[('code', 1), ('just', 1)]


In [20]:
# Scratch demo: an "empty" object-dtype array of length 3 (NumPy stores a
# Python type such as `list` as dtype=object).
# Note: this rebinds the name X.
X = np.empty(shape=(3,), dtype=object)

In [21]:
# Bare expression: the notebook displays the repr of X as this cell's output.
X

array([None, None, None], dtype=object)