Sentiment analysis is a familiar task in the NLP domain. In the **TextClassification_NeuralNetwork** notebook, we explored a CNN model on movie reviews using a **word**-based approach. There, we learned the **word embedding** method and showed how the Keras **Embedding** layer outperforms the **bag-of-words** approach. In this notebook we study an n-gram CNN model for sentiment analysis.   
In Keras, a multiple-input model can be defined using the functional API. We will define a model with three input channels for processing 4-grams, 6-grams and 8-grams of movie review text. Each channel is comprised of the following elements:   
* **Input** layer defines the length of sequences
* **Embedding** layer set to the size of vocabulary and 100-dimensional real-valued representation   
* **Conv1D** layer with 32 filters and a kernel size set to the number of words to read at once.   
* **MaxPooling1D** layer to consolidate the output from the convolutional layer.   
* **Flatten** layer to reduce the three-dimensional output to two dimensions for concatenation.   

The outputs from the three channels are concatenated into a single vector and processed by a **Dense** layer and an output layer. This can be done as follows:

In [17]:
def _build_channel(length, vocab_size, kernel_size):
    # Build one channel: Input -> Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten.
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)
    return inputs, flat


def create_model(length, vocab_size, kernel_sizes=(4, 6, 8)):
    """Build and compile the multi-channel n-gram CNN classifier.

    One channel is created per entry of ``kernel_sizes``. Every channel has
    its own ``Input`` of shape ``(length,)``, its own 100-dimensional
    ``Embedding`` over ``vocab_size`` entries, a 32-filter ``Conv1D`` whose
    kernel size is the channel's n-gram width, ``Dropout(0.5)``,
    ``MaxPooling1D(pool_size=2)`` and ``Flatten``. The flattened channel
    outputs are concatenated and passed through ``Dense(10, relu)`` and a
    single sigmoid output unit.

    Args:
        length: Number of token ids in each (padded) input sequence.
        vocab_size: Number of rows in each embedding table.
        kernel_sizes: Convolution kernel size of each channel. The default
            ``(4, 6, 8)`` reproduces the original three-channel model.

    Returns:
        The model, compiled with binary cross-entropy loss, the Adam
        optimizer and an accuracy metric. It expects one input array per
        channel, in the order given by ``kernel_sizes``. The model summary
        is printed as a side effect.
    """
    channels = [_build_channel(length, vocab_size, size) for size in kernel_sizes]
    channel_inputs = [channel_input for channel_input, _ in channels]
    channel_outputs = [channel_output for _, channel_output in channels]

    # merge: a single channel needs no concatenation
    if len(channel_outputs) > 1:
        merged = concatenate(channel_outputs)
    else:
        merged = channel_outputs[0]

    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=channel_inputs, outputs=outputs)

    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary
    model.summary()

    return model
    


## Remaining code

In [4]:
from pickle import load,dump
import numpy as np
import string
import re
from nltk.corpus import stopwords
from os import listdir
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Dropout, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate

In [7]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode ``'r'``).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

#generate clean tokens
def clean_doc(doc):
    """Turn a raw document into a space-joined string of cleaned tokens.

    The document is split on whitespace; every character listed in
    ``string.punctuation`` is stripped from each token; a token is kept only
    if it is purely alphabetic, is not in NLTK's English stop-word list and
    is longer than one character. Case is left unchanged.

    Args:
        doc: The raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    raw_tokens = doc.split()
    punct_pattern = re.compile('[%s]'% re.escape(string.punctuation))
    english_stops = set(stopwords.words('english'))

    kept = []
    for raw in raw_tokens:
        word = punct_pattern.sub('', raw)
        if not word.isalpha():
            # drops empty strings and anything containing digits/symbols
            continue
        if word in english_stops:
            continue
        if len(word) <= 1:
            continue
        kept.append(word)
    return ' '.join(kept)

# load all docs in a directory
def process_docs(directory, is_train):
    """Load and clean every document of one split found in *directory*.

    Files whose name starts with ``'cv9'`` form the test split; all other
    files form the training split.

    Args:
        directory: Directory whose entries are read via ``listdir``.
        is_train: Truthy to collect the training split, falsy for the
            test split.

    Returns:
        A list with one cleaned string (from ``clean_doc``) per selected
        file, in ``listdir`` order.
    """
    cleaned = []
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # Training skips held-out files; testing skips everything else.
        if held_out == bool(is_train):
            continue
        raw_text = load_doc(directory + '/' + name)
        cleaned.append(clean_doc(raw_text))
    return cleaned

def load_clean_dataset(is_train, data_dir='/home/tri/Downloads/txt_sentoken'):
    """Load and clean one split of the review corpus, with its labels.

    Args:
        is_train: Truthy for the training split, falsy for the test split
            (forwarded to ``process_docs``).
        data_dir: Directory containing the ``neg`` and ``pos``
            sub-directories. Defaults to the path this notebook was
            originally run with.

    Returns:
        A ``(docs, labels)`` tuple. ``docs`` holds the cleaned negative
        documents followed by the positive ones; ``labels`` is the
        parallel list with ``0`` for negative and ``1`` for positive.
    """
    neg = process_docs(data_dir + '/neg', is_train)
    pos = process_docs(data_dir + '/pos', is_train)
    docs = neg + pos

    labels = [0] * len(neg) + [1] * len(pos)
    return docs, labels
 
def save_dataset(dataset, filename):
    """Pickle *dataset* to *filename* and print a confirmation line.

    Args:
        dataset: Any picklable object.
        filename: Destination path; opened in binary write mode, so an
            existing file is overwritten.
    """
    # Close (and thereby flush) the file deterministically instead of
    # leaving the handle to the garbage collector.
    with open(filename, 'wb') as handle:
        dump(dataset, handle)
    print('Saved: %s' % filename)
    
    
# Build both splits: True selects the training documents, False the test ones.
train_docs , ytrain = load_clean_dataset(True)
test_docs,ytest = load_clean_dataset(False)

# Persist each split as a [docs, labels] pair so later cells can reload it
# without re-cleaning the corpus.

save_dataset([train_docs,ytrain],'train.pkl')
save_dataset([test_docs,ytest],'test.pkl')

    

Saved: train.pkl
Saved: test.pkl


In [10]:
# load a clean dataset
def load_dataset(filename):
    """Unpickle and return the object stored in *filename*.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were produced by this notebook or another trusted source.
    with open(filename, 'rb') as handle:
        return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` fitted on *lines*.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# calculate the maximum document length
def max_length(lines):
    """Return the largest whitespace-separated token count among *lines*.

    Args:
        lines: Iterable of strings.

    Returns:
        The token count of the longest line, or ``0`` when *lines* is
        empty (previously this raised ``ValueError``).
    """
    return max((len(s.split()) for s in lines), default=0)


# encoding a list of lines
def encode_text(tokenizer, lines, length):
    """Integer-encode *lines* and pad the sequences to *length*.

    Args:
        tokenizer: Object providing ``texts_to_sequences`` (a fitted Keras
            ``Tokenizer`` in this notebook).
        lines: The texts to encode.
        length: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [None]:
# Training driver: reload the cleaned training split, encode it, then build,
# fit and save the multi-channel CNN.

# load the [docs, labels] pair written earlier by save_dataset
trainLines, trainLabels = load_dataset('train.pkl')

# fit the tokenizer on the training documents only
tokenizer = create_tokenizer(trainLines)

# the longest training document sets the padded sequence length
length= max_length(trainLines)
print('Max document length: %d' % length)

# vocabulary size is the number of known words plus one
# (presumably because word_index starts at 1 and index 0 is left for padding
# -- TODO confirm against the Keras Tokenizer docs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' %vocab_size)

# integer-encode and pad every training document to `length`
trainX = encode_text(tokenizer, trainLines, length)

# build and compile the model (also prints its summary)
model = create_model(length, vocab_size)

# fit model: the same encoded matrix is supplied once per input channel
model.fit([trainX, trainX, trainX], np.array(trainLabels), epochs=7, batch_size =16)

# save the trained model to disk
model.save('model.h5')

Max document length: 1380
Vocabulary size: 44277
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_11 (InputLayer)            (None, 1380)          0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 1380)          0                                            
____________________________________________________________________________________________________
input_13 (InputLayer)            (None, 1380)          0                                            
____________________________________________________________________________________________________
embedding_11 (Embedding)         (None, 1380, 100)     4427700     input_11[0][0]                   
__________________________________________