In [2]:
#https://github.com/richliao/textClassifier/blob/master/textClassifierConv.py

import numpy as np
import pandas as pd
import _pickle as cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPool1D, Embedding, Merge, Dropout
from keras.models import Model

In [4]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [5]:
def clean_str(string):
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [7]:
data_train = pd.read_csv("../Data/imdb_kaggle/labeledTrainData.tsv", sep="\t")

In [9]:
data_train.shape

(25000, 3)

In [10]:
texts = []
labels = []

In [20]:
for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx])
    texts.append(clean_str(str(text.get_text().encode("ascii", "ignore"))))
    labels.append(data_train.sentiment[idx])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [21]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [31]:
word_index = tokenizer.word_index
print("Found %s unique tokens" % len(word_index))

Found 83244 unique tokens


In [33]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [35]:
data.shape

(25000, 1000)

In [36]:
labels = to_categorical(np.asarray(labels))

In [38]:
labels.shape, data.shape

((25000, 2), (25000, 1000))

In [39]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [40]:
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [42]:
print('Number of positive and negative reviews in traing and validation set ')
y_train.sum(axis=0), y_val.sum(axis=0)

Number of positive and negative reviews in traing and validation set 


(array([  9957.,  10043.]), array([ 2543.,  2457.]))

In [44]:
GLOVE_DIR = "../../Data/glove1.2-6B"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, "glove.6B.100d.txt"), encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

In [45]:
print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 100d.


In [47]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

In [53]:
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Model

In [54]:
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True)

In [58]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")

In [59]:
embedding_sequences = embedding_layer(sequence_input)

In [63]:
l_conv1 = Conv1D(128, 5, activation="relu")(embedding_sequences)

In [65]:
l_pool1 = MaxPool1D(5)(l_conv1)

In [68]:
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPool1D(5)(l_cov2)

In [69]:
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPool1D(35)(l_cov3)

In [70]:
l_flat = Flatten()(l_pool3)

In [71]:
l_Dense = Dense(128, activation="relu")(l_flat)

In [72]:
preds = Dense(2, activation="softmax")(l_Dense)

In [73]:
model = Model(sequence_input, preds)

In [74]:
model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["acc"])

In [75]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         8324500   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 35, 128)           82048     
__________

In [76]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=128)

  


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1dc01a82ef0>

### Model 2

In [77]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

In [78]:
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [79]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [92]:
convs = []
filter_sizes = [3, 4, 5]

In [93]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
embedded_sequences = embedding_layer(sequence_input)

In [94]:
for fsz in filter_sizes:
    l_conv = Conv1D(filters=128,kernel_size=fsz,activation='relu')(embedded_sequences)
    l_pool = MaxPool1D(5)(l_conv)
    convs.append(l_pool) 

In [96]:
l_merge = Merge(mode="concat", concat_axis=1)(convs)

  """Entry point for launching an IPython kernel.


In [98]:
l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
l_pool1 = MaxPool1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPool1D(30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

In [99]:
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1000, 100)     8324500     input_3[0][0]                    
____________________________________________________________________________________________________
conv1d_12 (Conv1D)               (None, 998, 128)      38528       embedding_2[1][0]                
____________________________________________________________________________________________________
conv1d_13 (Conv1D)               (None, 997, 128)      51328       embedding_2[1][0]                
___________________________________________________________________________________________

In [None]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=20, batch_size=50)