https://github.com/UKPLab/deeplearning4nlp-tutorial/blob/master/LICENSE

In [1]:
import numpy as np
import os

# Preprocessing

## Read input data

In [2]:
def readFile(filepath):
    """Read a labelled sentence file.

    Every non-blank line is expected to look like
    ``<int label> <token> <token> ...`` (whitespace separated).

    Args:
        filepath: Path of the text file to read.

    Returns:
        A ``(sentences, labels)`` tuple: ``sentences`` is a list of token
        lists and ``labels`` the list of integer labels, both in file order.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    sentences = []
    labels = []

    # The context manager closes the handle even if a malformed line raises.
    with open(filepath) as infile:
        for line in infile:
            splits = line.split()
            if not splits:
                # Blank line (e.g. a trailing empty line): nothing to parse.
                continue

            labels.append(int(splits[0]))
            sentences.append(splits[1:])

    print(filepath, len(sentences), 'sentences')
    return sentences, labels

In [3]:
# Load the three splits; each readFile call returns a (sentences, labels) pair.
traind, devd, testd = [
    readFile(path)
    for path in (
        'data/sent-class-cnn/train.txt',
        'data/sent-class-cnn/dev.txt',
        'data/sent-class-cnn/test.txt',
    )
]

data/sent-class-cnn/train.txt 5330 sentences
data/sent-class-cnn/dev.txt 2664 sentences
data/sent-class-cnn/test.txt 2668 sentences


In [4]:
# traind is the (sentences, labels) pair returned by readFile; show the first tokenised sentence.
traind[0][:1]

[['i',
  'like',
  'my',
  'christmas',
  'movies',
  'with',
  'more',
  'elves',
  'and',
  'snow',
  'and',
  'less',
  'pimps',
  'and',
  "ho's",
  '.']]

In [5]:
# Second element of the readFile result holds the labels; show the first one.
traind[1][:1]

[0]

## Word index

In [6]:
# Lower-cased vocabulary over all three splits
# (element 0 of each split holds the tokenised sentences).
unique_words = {
    token.lower()
    for split in (traind, devd, testd)
    for tokens in split[0]
    for token in tokens
}

In [7]:
# Reserve a vocabulary entry for the padding token; createMatrix also falls back to its index for unknown words.
unique_words.add('__PADDING__')

In [8]:
# Build the word -> row-index lookup used for the embedding matrix.
# * '__PADDING__' is pinned to index 0 because pad_sequences fills with 0 by
#   default; with an arbitrary padding index the padded positions would look
#   up some real word's embedding instead.
# * The remaining words are sorted because iteration order of a set of
#   strings changes between interpreter runs (hash randomisation), which
#   would make the indices irreproducible.
# '__PADDING__' is expected to be in unique_words already, so the number of
# entries still equals len(unique_words).
vocabulary = ['__PADDING__'] + sorted(unique_words - {'__PADDING__'})
word_index_dict = {word: index for index, word in enumerate(vocabulary)}

## GloVe embeddings

In [9]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line of the file is "<word> <v1> <v2> ... <vN>", whitespace separated.
glove_index = {}
# The GloVe files are UTF-8: state it explicitly so the platform's default
# encoding cannot break the load, and let the context manager close the file
# even if parsing fails part-way through.
with open('data/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_index[word] = coefs

In [10]:
embedding_dim = 100
# One row per vocabulary entry. Rows start as zeros, so words without a
# GloVe vector simply keep an all-zero embedding.
embedding_matrix = np.zeros((len(unique_words), embedding_dim))
for token, row in word_index_dict.items():
    vector = glove_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [11]:
# Sanity check: (number of vocabulary entries, embedding_dim).
embedding_matrix.shape

(21348, 100)

## Sentence index matrices

In [12]:
def createMatrix(sentences):
    """Translate tokenised sentences into lists of vocabulary indices.

    Tokens are lower-cased before being looked up in the module-level
    ``word_index_dict``; tokens without an entry receive the index of
    ``'__PADDING__'``.

    Args:
        sentences: Iterable of sentences, each an iterable of token strings.

    Returns:
        A list containing one list of integer indices per input sentence.
    """
    fallback_index = word_index_dict['__PADDING__']

    return [
        [word_index_dict.get(token.lower(), fallback_index) for token in tokens]
        for tokens in sentences
    ]

In [13]:
# Index-encode the sentences (element 0) of every split.
train_mat, dev_mat, test_mat = [
    createMatrix(split[0]) for split in (traind, devd, testd)
]

In [14]:
# Token counts of the first three training sentences: lengths vary, hence the padding later on.
for position in range(3):
    print(len(train_mat[position]))

16
28
18


In [15]:
# :: Find the longest sentence in our dataset ::
# default=0 keeps the result at 0 when there are no sentences at all.
max_sentence_len = max(map(len, train_mat + dev_mat + test_mat), default=0)
max_sentence_len

59

In [16]:
# Label vectors: element 1 of every split holds the integer labels.
y_train, y_dev, y_test = [
    np.array(split[1]) for split in (traind, devd, testd)
]

In [17]:
from keras.preprocessing import sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
# Pad every sentence to the length of the longest one.
# pad_sequences fills with 0 unless told otherwise; pass the '__PADDING__'
# index explicitly instead of relying on it being 0, so padded positions map
# to the padding token and not to whichever word happens to own index 0.
padding_index = word_index_dict['__PADDING__']
X_train = sequence.pad_sequences(train_mat, maxlen=max_sentence_len, value=padding_index)
X_dev = sequence.pad_sequences(dev_mat, maxlen=max_sentence_len, value=padding_index)
X_test = sequence.pad_sequences(test_mat, maxlen=max_sentence_len, value=padding_index)

In [19]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, concatenate, Embedding, Convolution1D, MaxPooling1D, GlobalMaxPooling1D
from keras.regularizers import Regularizer

In [20]:
# Integer word indices in, frozen pre-trained embedding vectors out.
words_input = Input(
    shape=(max_sentence_len,),
    dtype='int32',
    name='words_input',
)
wordsEmbeddingLayer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],  # initialise from the matrix built above
    trainable=False,             # keep the embeddings fixed during training
)
words = wordsEmbeddingLayer(words_input)

In [21]:
# Now we add a variable number of convolutions: one branch per kernel size,
# each followed by max-over-time pooling, all reading the same embeddings.
words_convolutions = []
for filter_length in (1, 2, 3):
    conv_layer = Convolution1D(
        filters=50,
        kernel_size=filter_length,
        padding='same',
        activation='relu',
        strides=1,
    )
    conv_features = conv_layer(words)
    pooled_features = GlobalMaxPooling1D()(conv_features)
    words_convolutions.append(pooled_features)

output = concatenate(words_convolutions)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [22]:
import keras

In [23]:
# Classifier head: merge the pooled convolution branches, then
# dropout -> L2-regularised tanh layer -> dropout -> single sigmoid unit.
merged = concatenate(words_convolutions)
merged = Dropout(0.5)(merged)
hidden = Dense(
    100,
    activation='tanh',
    kernel_regularizer=keras.regularizers.l2(0.01),
)(merged)
hidden = Dropout(0.25)(hidden)
output = Dense(
    1,
    activation='sigmoid',
    kernel_regularizer=keras.regularizers.l2(0.01),
)(hidden)

model = Model(inputs=[words_input], outputs=[output])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words_input (InputLayer)        (None, 59)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 59, 100)      2134800     words_input[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 59, 50)       5050        embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 59, 50)       10050       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [24]:
n_epochs = 20
for epoch in range(n_epochs):
    print("\n------------- Epoch %d ------------" % (epoch+1))
    # Train for exactly one epoch per iteration so both held-out sets can be
    # scored after every pass over the training data.
    model.fit(X_train, y_train, batch_size=50, epochs=1, verbose=2)

    # The model was compiled with metrics=['accuracy'], so evaluate yields
    # the loss followed by the accuracy.
    dev_loss, dev_accuracy = model.evaluate(X_dev, y_dev, verbose=0)
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

    dev_report = (dev_accuracy*100, dev_loss)
    test_report = (test_accuracy*100, test_loss)
    print("Dev-Accuracy: %.2f%% (loss: %.4f)" % dev_report)
    print("Test-Accuracy: %.2f%% (loss: %.4f)" % test_report)


------------- Epoch 1 ------------
Epoch 1/1
 - 2s - loss: 1.6136 - acc: 0.5257
Dev-Accuracy: 60.06% (loss: 1.2605)
Test-Accuracy: 60.68% (loss: 1.2585)

------------- Epoch 2 ------------
Epoch 1/1
 - 1s - loss: 1.1076 - acc: 0.6242
Dev-Accuracy: 71.21% (loss: 0.9118)
Test-Accuracy: 70.50% (loss: 0.9144)

------------- Epoch 3 ------------
Epoch 1/1
 - 1s - loss: 0.8445 - acc: 0.7011
Dev-Accuracy: 72.56% (loss: 0.7524)
Test-Accuracy: 71.78% (loss: 0.7575)

------------- Epoch 4 ------------
Epoch 1/1
 - 1s - loss: 0.7087 - acc: 0.7319
Dev-Accuracy: 72.18% (loss: 0.6717)
Test-Accuracy: 71.70% (loss: 0.6821)

------------- Epoch 5 ------------
Epoch 1/1
 - 1s - loss: 0.6224 - acc: 0.7565
Dev-Accuracy: 74.40% (loss: 0.6129)
Test-Accuracy: 73.16% (loss: 0.6253)

------------- Epoch 6 ------------
Epoch 1/1
 - 1s - loss: 0.5687 - acc: 0.7705
Dev-Accuracy: 74.29% (loss: 0.5836)
Test-Accuracy: 73.80% (loss: 0.5974)

------------- Epoch 7 ------------
Epoch 1/1
 - 1s - loss: 0.5228 - acc: 0.