In [5]:
import os
import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.models import Model
from keras.layers import Conv1D, Dense, Embedding, Flatten, Input, LSTM, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

Using TensorFlow backend.


In [6]:
#
# Configuration
#
MAX_NB_WORDS=25000
MAX_SEQUENCE_LENGTH=1000
N_GLOVE_TOKENS=400000
EMBEDDING_DIM = 100

In [7]:
#
# Load the data
#
positive_dir = "aclImdb/train/pos"
negative_dir = "aclImdb/train/neg"

def read_text(filename):
        with open(filename,encoding='utf8') as f:
                return f.read().lower()

print ("Reading negative reviews.")
negative_text = [read_text(os.path.join(negative_dir, filename))
        for filename in tqdm.tqdm(os.listdir(negative_dir))]
        
print ("Reading positive reviews.")
positive_text = [read_text(os.path.join(positive_dir, filename))
        for filename in tqdm.tqdm(os.listdir(positive_dir))]


labels_index = { "negative": 0, "positive": 1 }

labels = [0 for _ in range(len(negative_text))] + \
        [1 for _ in range(len(negative_text))]
    
texts = negative_text + positive_text
 


tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np_utils.to_categorical(np.asarray(labels))
print ("data.shape = {0}, labels.shape = {1}".format(data.shape, labels.shape))

x_train, x_test, y_train, y_test = train_test_split(data, labels)


#
# Load word embeddings
#
print("Loading word embeddings.")
embeddings_index = dict()
with open("glove.6B.100d.txt",encoding='utf8') as f:
        for line in tqdm.tqdm(f, total=N_GLOVE_TOKENS):
                values = line.split()
                word, coefficients = values[0], np.asarray(values[1:], dtype=np.float32)
                embeddings_index[word] = coefficients

embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

print ("embedding_matrix.shape = {0}".format(embedding_matrix.shape))

embedding_layer = Embedding(len(word_index)+1,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)

Reading negative reviews.


100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [01:12<00:00, 173.37it/s]


Reading positive reviews.


100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [01:11<00:00, 175.90it/s]


data.shape = (25000, 1000), labels.shape = (25000, 2)
Loading word embeddings.


100%|███████████████████████████████████████████████████████████████████████| 400000/400000 [00:14<00:00, 28484.99it/s]


embedding_matrix.shape = (88583, 100)


In [8]:
#
# Build 1D ConvNet
#
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
embedded_sequences = embedding_layer(sequence_input)


x = Conv1D(128, 5, activation="relu")(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation="relu")(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation="relu")(x)


#x = LSTM(64, dropout_W=0.2, dropout_U=0.2)(x)
#x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation="relu")(x)

preds = Dense(len(labels_index), activation="softmax")(x)

model = Model(sequence_input, preds)
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["acc"])

#
# Train the model
#
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          nb_epoch=4, batch_size=128)



Train on 18750 samples, validate on 6250 samples
Epoch 1/4
Epoch 2/4


Epoch 3/4


Epoch 4/4




<keras.callbacks.History at 0x22b1feedf28>

In [9]:
%store -r data
data.head()

Unnamed: 0,sentiment,text
1,positive,Greetings and welcome to the Microsoft Fiscal ...
10,positive,technology. Microsoft 365 helps every organiza...
100,positive,is coming from the line of Walter Pritchard wi...
101,neutral,we're growing at eye-popping rates right now. ...
102,positive,the key things that we think about is differen...


In [10]:
texts = data.text
labels = data.sentiment.map(str.lower).map({'positive' : 1, 'neutral' : 0, 'negative' : 0}).map(int)

In [11]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index


test_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [12]:
labels_pred = model.predict(test_data)
labels_pred

array([[ 0.77918488,  0.22081508],
       [ 0.93208724,  0.06791282],
       [ 0.89135247,  0.10864758],
       ..., 
       [ 0.83458674,  0.16541333],
       [ 0.78091168,  0.21908833],
       [ 0.82761705,  0.17238298]], dtype=float32)

In [13]:
l_p = np.argmax(labels_pred, axis=1)

In [14]:
confusion_matrix(y_pred=l_p, y_true=labels)

array([[119, 186],
       [120, 197]], dtype=int64)

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         8858300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
__________