In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
#Read the IMDB dataset with 25K reviews for training. 

df = pd.read_csv("./all/labeledTrainData.tsv", sep = '\t', 
                 error_bad_lines=False )
print("Total no. of reviews are ", df.shape[0])
print("cols are ", df.columns)
print("Sample reviews are ")
print(df.loc[:5,['review','sentiment']])

Total no. of reviews are  25000
cols are  Index(['id', 'sentiment', 'review'], dtype='object')
Sample reviews are 
                                              review  sentiment
0  With all this stuff going down at the moment w...          1
1  \The Classic War of the Worlds\" by Timothy Hi...          1
2  The film starts with a manager (Nicholas Bell)...          0
3  It must be assumed that those who praised this...          0
4  Superbly trashy and wondrously unpretentious 8...          1
5  I dont know why people think this is such a ba...          1


In [3]:
#Import the stopwords (common words) to be removed from the corpus
"""
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
s = set(stopwords.words('english'))
s.remove('not')
print("Stopwords length", len(s))
"""

'\nimport re\nimport nltk\nnltk.download(\'stopwords\')\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\ncorpus = []\ns = set(stopwords.words(\'english\'))\ns.remove(\'not\')\nprint("Stopwords length", len(s))\n'

### 引入外部資料源的向量

In [5]:
word2vec = {}
with open('./all/glove.6B.50d.txt', encoding="utf8") as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Found 400000 word vectors.


### 將每篇文章做預處理，最後塞入一個向量

In [6]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
MAX_VCOCAB_SIZE = 5000
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 1500

tokenizer = Tokenizer( filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ')
sequences = tokenizer.fit_on_texts(df['review'])
word_index = tokenizer.word_index
documents = tokenizer.texts_to_sequences(df['review'])
#print(word_index)
token_count = len(word_index)+1
print('Found {} unique tokens.'.format(token_count))

#print(t.word_counts)
print("Total documents ", tokenizer.document_count)
#print(t.word_index)
#print(t.word_docs)
print("max sequence length:", max(len(s) for s in documents))
print("min sequence length:", min(len(s) for s in documents))

# pad sequences so that we get a N x T matrix
data = pad_sequences(documents, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print('Shape of data tensor:', data.shape)
print(data[1])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 88583 unique tokens.
Total documents  25000
max sequence length: 2493
min sequence length: 10
Shape of data tensor: (25000, 1500)
[  1 353 322 ...   0   0   0]


### 將每個字對應glove引入的向量

In [7]:
print('Filling pre-trained embeddings...')
embedding_matrix = np.zeros((token_count, EMBEDDING_DIM))
for word, i in word_index.items():
  #if i < MAX_VOCAB_SIZE:
    embedding_vector = word2vec.get(word) #get(word) is used instead of [word] as it won't give exception in case word is not found
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
      embedding_matrix[i,:] = embedding_vector

print("Sample embedded dimension ")
print(embedding_matrix[10][:5])

Filling pre-trained embeddings...
Sample embedded dimension 
[ 0.11891   0.15255  -0.082073 -0.74144   0.75917 ]


In [8]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dropout, Dense, GlobalAveragePooling1D 
from keras.layers import Embedding, Conv2D, GlobalMaxPooling1D 
from keras import regularizers

embedding_layer = Embedding(
  token_count,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False)

In [9]:
model = Sequential()
model.add(embedding_layer)#, input_shape= (token_count, EMBEDDING_DIM))
model.add(Conv1D(filters = 64, kernel_size = 4, padding = 'same', activation='relu'))
                 #input_shape=(token_count,EMBEDDING_DIM)))
model.add(MaxPooling1D())#kernel_size=500))
model.add(Conv1D(filters = 128, kernel_size = 3, padding = 'same',  activation='relu', 
                 kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Conv1D(filters = 256, kernel_size = 2, padding = 'same', activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(64, activation='relu'))
#model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalMaxPooling1D())

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1500, 50)          4429150   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1500, 64)          12864     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 750, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 750, 128)          24704     
_________________________________________________________________
dropout_1 (Dropout)          (None, 750, 128)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 375, 128)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 375, 256)          65792     
__________

In [10]:
from keras.layers import Input, Dense, Concatenate
from keras.models import Model

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))

x = embedding_layer(inputs)
print(x)
x1 = Conv1D(filters = 100, kernel_size = 3, padding = 'same', activation='relu'
           ,kernel_regularizer=regularizers.l1(0.01))(x)
                 #input_shape=(token_count,EMBEDDING_DIM)))
x1 = GlobalMaxPooling1D()(x1)

x2 = Conv1D(filters = 100, kernel_size = 4, padding = 'same', activation='relu'
           ,kernel_regularizer=regularizers.l1(0.01))(x)
                 #input_shape=(token_count,EMBEDDING_DIM)))
x2 = GlobalMaxPooling1D()(x2) #pool_size=1500


x3 = Conv1D(filters = 100, kernel_size = 5, padding = 'same', activation='relu'
           ,kernel_regularizer=regularizers.l1(0.01))(x)
                 #input_shape=(token_count,EMBEDDING_DIM)))
x3 = GlobalMaxPooling1D()(x3)

# a layer instance is callable on a tensor, and returns a tensor
print(x3)
x = Concatenate()([x1,x2,x3])
print(x)
x = Dense(192)(x)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.25)(x)
output = Dense(1, activation='sigmoid')(x)

# This creates a model that includes
# the Input layer and three Dense layers
Fmodel = Model(inputs=inputs, outputs=output)
Fmodel.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
print(Fmodel.summary())

Tensor("embedding_1_1/Gather:0", shape=(?, 1500, 50), dtype=float32)
Tensor("global_max_pooling1d_4/Max:0", shape=(?, 100), dtype=float32)
Tensor("concatenate_1/concat:0", shape=(?, 300), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1500)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1500, 50)     4429150     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 1500, 100)    15100       embedding_1[1][0]                
__________________________________________________________________________________________________
conv1

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, df['sentiment'], 
                                                    test_size=0.2, random_state=42)


print(x_train.shape)
#model.fit(x_train, y_train , batch_size=96, epochs=35, validation_split = 0.25)
#score = model.evaluate(x_test, y_test, batch_size=32)

(20000, 1500)


In [12]:
Fmodel.fit(x_train, y_train , batch_size=96, epochs=35, validation_split = 0.1)

Train on 18000 samples, validate on 2000 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x130410780>

In [13]:
print("Concatenated CNN Result")
print("Loss & accuracty on test set is", Fmodel.evaluate(x_test, y_test))

#print("Traditional CNN Result")
#print("Loss & accuracty on test set is", model.evaluate(x_test, y_test))

Concatenated CNN Result
Loss & accuracty on test set is [0.7597295370101929, 0.5038]
