In [18]:
# Word Embeddings + CNN = Text Classification

# Develop an embedding + CNN model for sentimental analysis 

import string,re
from nltk.corpus import stopwords
import os 
from os import listdir

In [20]:
def load_doc(filename):
    file = open(filename,'r')
    text = file.read()
    file.close()
    return text

In [9]:
def clean_doc(doc):
    tokens = doc.split()
    text = [t for t in tokens if t not in string.punctuation]
    text = [t for t in text if t.isalpha()]
    text = [t for t in text if t not in stopwords.words('english')]
    text = [t.lower() for t in text]
    text = [t for t in text if len(t) > 1]
    return text

In [11]:
#laod doc and add to vocab
def add_doc_to_vocab(filename,vocab):
    doc = load_doc(filename)
    preprocessed_doc = clean_doc(doc)
    vocab.update(preprocessed_doc)

In [12]:
def process_docs(directory,vocab):
    
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        add_doc_to_vocab(path,vocab)

In [13]:
# save list to file
def save_to_list(lines,filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

In [14]:
from collections import Counter

In [21]:
vocab = Counter()

#add all docs to vocab 
process_docs('review_polarity/txt_sentoken/pos', vocab)
process_docs('review_polarity/txt_sentoken/neg', vocab)


In [22]:
print(len(vocab))

36037


In [23]:
# keep tokens with minimum occurence
min_occurence = 2

tokens = [k for k,c in vocab.items() if c >= min_occurence]
print(len(tokens))

save_to_list(tokens,'vocab1.txt')

23260


In [24]:
vocab_filename = 'vocab1.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())


In [30]:
def process_docs(directory, vocab, is_train):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
    # skip any reviews in the test set
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents

In [40]:
def clean_doc(doc,vocab):
    tokens = doc.split()
    text = [t for t in tokens if t not in string.punctuation]
    text = [t for t in text if t.isalpha()]
    text = [t for t in text if t not in stopwords.words('english')]
    text = [t.lower() for t in text]
    text = [t for t in text if len(t) > 1]
    text = [t for t in text if t in vocab]
    text = ' '.join(text)
    return text

In [43]:
import numpy as np
def load_clean_dataset(vocab, is_train):
    # load documents
    neg = process_docs('review_polarity/txt_sentoken/neg', vocab, is_train)
    pos = process_docs('review_polarity/txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    # prepare labels
    labels = np.array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels

The next step is to encode each document as a sequence of integers. The Keras Embedding
layer requires integer inputs where each integer maps to a single token that has a specific
real-valued vector representation within the embedding. These vectors are random at the
beginning of training, but during training become meaningful to the network. We can encode
the training documents as sequences of integers using the Tokenizer class in the Keras API.
First, we must construct an instance of the class then train it on all documents in the training
dataset. In this case, it develops a vocabulary of all tokens in the training dataset and develops
a consistent mapping from words in the vocabulary to unique integers. We could just as easily
develop this mapping ourselves using our vocabulary file. The create tokenizer() function
below will prepare a Tokenizer from the training data.

In [35]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

Now that the mapping of words to integers has been prepared, we can use it to encode the
reviews in the training dataset. We can do that by calling the texts to sequences() function
on the Tokenizer. We also need to ensure that all documents have the same length. This is a
requirement of Keras for efficient computation. We could truncate reviews to the smallest size
or zero-pad (pad with the value 0) reviews to the maximum length, or some hybrid. In this case,
we will pad all reviews to the length of the longest review in the training dataset. First, we can
find the longest review using the max() function on the training dataset and take its length.
We can then call the Keras function pad sequences() to pad the sequences to the maximum
length by adding 0 values on the end.

In [48]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def encode_seq(tokenzier,max_length,docs):
    tokens = tokenizer.texts_to_sequences(docs)
    
    padded_seq = pad_sequences(tokens,maxlen=max_length,padding='post')
    
    return padded_seq
    

In [46]:
train_docs , ytrain = load_clean_dataset(vocab,True)
test_docs, ytest = load_clean_dataset(vocab,False)

tokenizer = create_tokenizer(train_docs)

max_length = max([len(s.split()) for s in train_docs])




AttributeError: 'Tokenizer' object has no attribute 'text_to_sequences'

In [49]:
Xtrain = encode_seq(tokenizer,max_length,train_docs)
Xtest = encode_seq(tokenizer,max_length,test_docs)

In [50]:
vocab_size = len(tokenizer.word_index) + 1

In [56]:
print(vocab_size)

13850


We are now ready to define our neural network model. The model will use an Embedding
layer as the first hidden layer. The Embedding layer requires the specification of the vocabulary
size, the size of the real-valued vector space, and the maximum length of input documents. The
vocabulary size is the total number of words in our vocabulary, plus one for unknown words.
This could be the vocab set length or the size of the vocab within the tokenizer used to integer
encode the documents



In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Conv1D,MaxPooling1D,Flatten

We will use a 100-dimensional vector space, but you could try other values, such as 50 or
150. Finally, the maximum document length was calculated above in the max length variable
used during padding. The complete model definition is listed below including the Embedding
layer. We use a Convolutional Neural Network (CNN) as they have proven to be successful
at document classification problems. A conservative CNN configuration is used with 32 filters
(parallel fields for processing words) and a kernel size of 8 with a rectified linear (relu) activation
function. This is followed by a pooling layer that reduces the output of the convolutional layer
by half.
Next, the 2D output from the CNN part of the model is flattened to one long 2D vector to
represent the features extracted by the CNN. The back-end of the model is a standard Multilayer
Perceptron layers to interpret the CNN features. The output layer uses a sigmoid activation
function to output a value between 0 and 1 for the negative and positive sentiment in the review

In [59]:
def define_model(vocab_size,max_length):
    model = Sequential()
    model.add(Embedding(vocab_size,150,input_length = max_length))
    model.add(Conv1D(filters=32,kernel_size=8,activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

In [60]:
model = define_model(vocab_size,max_length)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1186, 150)         2077500   
_________________________________________________________________
conv1d (Conv1D)              (None, 1179, 32)          38432     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 589, 32)           0         
_________________________________________________________________
flatten (Flatten)            (None, 18848)             0         
_________________________________________________________________
dense (Dense)                (None, 10)                188490    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 2,304,433
Trainable params: 2,304,433
Non-trainable params: 0
____________________________________________

In [61]:
model.fit(Xtrain,ytrain,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ebba5ca8e0>

In [62]:
model.save('embedding_model.h5')

In [63]:
# Evaluating model

lossTrain,accTrain = model.evaluate(Xtrain,ytrain)
lossTest,accTest = model.evaluate(Xtest,ytest)

print(accTrain)
print(accTest)

1.0
0.8849999904632568


In [66]:
# classify a review as negative or positive 

def predict_sentiment(review,vocab,tokenizer,max_length,model):
    
    preprocessed_doc = clean_doc(review,vocab)
    
    padded = encode_seq(tokenizer,max_length,[review])
    
    yhat = model.predict(padded)
    
    #retrieve predicted percentage and label
    percent_pos = yhat[0,0]
    if round(percent_pos) == 0:
        return (1-percent_pos), 'NEGATIVE'
        return percent_pos, 'POSITIVE'

    

In [70]:
text = 'Everyone will enjoy this film. I love it, recommended!'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))


TypeError: cannot unpack non-iterable NoneType object

In [68]:
# test negative text
text = 'This is a bad movie. Do not watch it. It sucks.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [This is a bad movie. Do not watch it. It sucks.]
Sentiment: NEGATIVE (55.143%)


# N-gram CNN model for sentimental analysis

A standard deep learning model for text classification and sentiment analysis uses a word
embedding layer and one-dimensional convolutional neural network. The model can be expanded
by using multiple parallel convolutional neural networks that read the source document using
different kernel sizes. This, in effect, creates a multichannel convolutional neural network for
text that reads text with different n-gram sizes (groups of words). In this tutorial, you will
discover how to develop a multichannel convolutional neural network for sentiment prediction
on text movie review data

In [74]:
from pickle import dump
# save a dataset to file
def save_dataset(dataset, filename):
    dump(dataset, open(filename, 'wb'))
    print('Saved: %s' % filename)


In [75]:
# save training datasets
save_dataset([train_docs, ytrain], 'train.pkl')
save_dataset([test_docs, ytest], 'test.pkl')


Saved: train.pkl
Saved: test.pkl


In [78]:
# Develop multichannel model
# 1) Encode data

from pickle import load 
def load_dataset(filename):
    return load(open(filename,'rb'))

In [79]:
trainLines,trainLabels = load_dataset('train.pkl')

In [81]:
print(trainLines[0])

plot two teen couples go church party drink drive get accident one guys dies girlfriend continues see life nightmares deal watch movie sorta find critique movie teen generation touches cool idea presents bad package makes review even harder one write since generally applaud films attempt break mold mess head lost highway memento good bad ways making types films folks one correctly seem taken pretty neat concept executed terribly problems movie well main problem simply jumbled starts normal fantasy world audience member idea going dreams characters coming back dead others look like dead strange apparitions chase scenes tons weird things happen simply explained personally mind trying unravel film every give clue get kind fed biggest problem obviously got big secret hide seems want hide completely final five minutes make things entertaining thrilling even engaging meantime really sad part arrow dig flicks like actually figured point strangeness start make little bit sense still make film 

In Keras, a multiple-input model can be defined using the functional API. We will define a
model with three input channels for processing 4-grams, 6-grams, and 8-grams of movie review
text. Each channel is comprised of the following elements:
 Input layer that defines the length of input sequences.
 Embedding layer set to the size of the vocabulary and 100-dimensional real-valued representations.
 Conv1D layer with 32 filters and a kernel size set to the number of words to read at once.
 MaxPooling1D layer to consolidate the output from the convolutional layer.
 Flatten layer to reduce the three-dimensional output to two dimensional for concatenation.
The output from the three channels are concatenated into a single vector and process by a
Dense layer and an output layer. The function below defines and returns the model. As part of
defining the model, a summary of the defined model is printed and a plot of the model graph is
created and saved to file.

In [105]:
from pickle import load
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import concatenate

In [109]:
def build_model(vocab_size,length):
    
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size,100)(inputs1)
    conv1 = Conv1D(filters=32,kernel_size=4,activation='relu')(embedding1)
    drop1 = Dropout(0.4)(conv1)
    maxpool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(maxpool1)
    
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size,100)(inputs2)
    conv2 = Conv1D(filters=32,kernel_size=6,activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    maxpool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(maxpool2)
    
    #channel 3 
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size,100)(inputs3)
    conv3 = Conv1D(filters=32,kernel_size=8,activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    maxpool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(maxpool3)
    
    #merge 
    merged = concatenate([flat1,flat2,flat3])
    
    #Dense layers
    Dense1 = Dense(10,activation='relu')(merged)
    outputs = Dense(1,activation='sigmoid')(Dense1)
    
    model = Model(inputs = [inputs1,inputs2,inputs3],outputs=outputs)
    
    #compile
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics = ['accuracy'])
    
    model.summary()
    
    return model
    
    

In [84]:
#getting the training data
trainLines , trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')

In [85]:
print(testLines[0])

may critic alive harbors much affection monster movies delighted entertainment ron yarn tremors even last yarn anaconda something films causes lower return saturday youth spent company creature black blob deep rising yarn quite pass test sure enough modern monster movie ingredients place conspicuously collection bait excuse characters isolated location derelict cruise ship south china sea comic relief least one big explosion elements like sleazy anthony heald also appears marine slinky international jewel thief famke janssen whose white cotton tank top hides heart gold happens deep rising noteworthy primarily mechanical manner spits ingredients terrorist crew led mercenary wes studi boat captain finnegan treat williams shows loot cruise ship sea monsters show eat mercenary crew survivors make closing credits go lights hard work much enthusiasm sort especially monster make laugh every time makes scream laughs provided almost entirely kevin generally amusing mechanic stephen sommers seem

In [86]:
tokenizer = create_tokenizer(trainLines)

In [87]:
max_length = max([len(t) for t in trainLines])

In [88]:
vocab_size = len(tokenizer.word_index) + 1

In [90]:
Xtrain = encode_seq(tokenizer,max_length,trainLines)
Xtest = encode_seq(tokenizer,max_length,testLines)

In [110]:
model = build_model(vocab_size,max_length)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 8711)]       0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 8711)]       0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           [(None, 8711)]       0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 8711, 100)    1385000     input_11[0][0]                   
____________________________________________________________________________________________

In [117]:
from tensorflow.keras.utils import plot_model
import graphviz
import pydot
plot_model(model,show_shapes=True,to_file='model.png')

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [119]:
model.fit([Xtrain,Xtrain,Xtrain],trainLabels,epochs=1,batch_size=16)



<tensorflow.python.keras.callbacks.History at 0x2eb805fd400>

In [120]:
model.save('N-gram-CNN.h5')

In [121]:
loss, acc = model.evaluate([Xtest,Xtest,Xtest],testLabels)
print(acc)

0.5


# Extensions 

 Different n-grams. Explore the model by changing the kernel size (number of n-grams)
used by the channels in the model to see how it impacts model skill.

 More or Fewer Channels. Explore using more or fewer channels in the model and see
how it impacts model skill.

 Shared Embedding. Explore configurations where each channel shares the same word
embedding and report on the impact on model skill.16.7. Further Reading 187

 Deeper Network. Convolutional neural networks perform better in computer vision
when they are deeper. Explore using deeper models here and see how it impacts model
skill.

 Truncated Sequences. Padding all sequences to the length of the longest sequence
might be extreme if the longest sequence is very different to all other reviews. Study the
distribution of review lengths and truncate reviews to a mean length.

 Truncated Vocabulary. We removed infrequently occurring words, but still had a large
vocabulary of more than 25,000 words. Explore further reducing the size of the vocabulary
and the effect on model skill.

 Epochs and Batch Size. The model appears to fit the training dataset quickly. Explore
alternate configurations of the number of training epochs and batch size and use the test
dataset as a validation set to pick a better stopping point for training the model.

 Pre-Train an Embedding. Explore pre-training a Word2Vec word embedding in the
model and the impact on model skill with and without further fine tuning during training.

 Use GloVe Embedding. Explore loading the pre-trained GloVe embedding and the
impact on model skill with and without further fine tuning during training.

 Train Final Model. Train a final model on all available data and use it make predictions
on real ad hoc movie reviews from the internet.



