In [1]:
TXT_SENTOKEN_DIR = 'review_polarity/txt_sentoken'

In [2]:
from nltk.corpus import stopwords
import string
import re
import os


def load_doc(filename):
    with open(filename, 'r') as f:
        text = f.read()
    return text

def clean_doc(doc):
    tokens = doc.split()
    
    re_punc = re.compile('[{}]'.format(re.escape(string.punctuation)))
    
    tokens = [re_punc.sub('', w) for w in tokens]
    
    tokens = [w for w in tokens if w.isalpha()]
    
    stop_words = set(stopwords.words('english'))
    
    tokens = [w for w in tokens if not w in stop_words]
    
    tokens = [w for w in tokens if len(w) > 1]
    return tokens

def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    
    tokens = clean_doc(doc)
    
    vocab.update(tokens)
    
def process_docs(directory, vocab):
    for fn in os.listdir(directory):
        if fn.startswith('cv9'):
            continue
        
        path = os.path.join(directory, fn)
        add_doc_to_vocab(path, vocab)
        
def save_list(lines, filename):
    data = '\n'.join(lines)
    
    with open(filename, 'w') as f:
        f.write(data)
        

In [3]:
fn = os.path.join(TXT_SENTOKEN_DIR, 'pos/cv000_29590.txt')
text = load_doc(fn)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [4]:
from collections import Counter
vocab = Counter()

process_docs(os.path.join(TXT_SENTOKEN_DIR, 'neg'), vocab)
process_docs(os.path.join(TXT_SENTOKEN_DIR, 'pos'), vocab)

print(len(vocab))

min_occurrence = 2

tokens = [k for k,c in vocab.items() if c >= min_occurrence]

print(len(tokens))

save_list(tokens, 'vocab.txt')

44276
25767


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.utils import plot_model
from numpy import array

def process_docs(directory, vocab, is_train):
    documents = list()
    
    for fn in os.listdir(directory):
        if is_train and fn.startswith('cv9'):
            continue
        if not is_train and not fn.startswith('cv9'):
            continue
        
        path = os.path.join(directory, fn)
        
        doc = load_doc(path)
        
        tokens = clean_doc(doc, vocab)
        
        documents.append(tokens)
    return documents

def clean_doc(doc, vocab):
    tokens = doc.split()
    
    re_punc = re.compile('[{}]'.format(re.escape(string.punctuation)))
    
    tokens = [re_punc.sub('', w) for w in tokens]
    
    tokens = [w for w in tokens if w in vocab]
    
    tokens = ' '.join(tokens)
    return tokens

def load_clean_dataset(vocab, is_train):
    neg = process_docs(os.path.join(TXT_SENTOKEN_DIR, 'neg'), vocab, is_train)
    pos = process_docs(os.path.join(TXT_SENTOKEN_DIR, 'pos'), vocab, is_train)
    
    docs = neg + pos
    
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def encode_docs(tokenizer, max_length, docs):
    encoded = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded

def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    
    model.summary()
    plot_model(model, to_file='embedding_cnn.png', show_shapes=True)
    return model

Using TensorFlow backend.
  return f(*args, **kwds)


In [6]:
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

train_docs, ytrain = load_clean_dataset(vocab, True)

tokenizer = create_tokenizer(train_docs)

vocab_size = len(tokenizer.word_index) + 1

print('Vocabulary size :{}'.format(vocab_size))

max_length = max([len(s.split()) for s in train_docs])

print('Maximum length: {}'.format(max_length))

x_train = encode_docs(tokenizer, max_length, train_docs)

model = define_model(vocab_size, max_length)

model.fit(x_train, ytrain, epochs=10, verbose=2)

model.save('cnn-model.h5')

Vocabulary size :25768
Maximum length: 1317
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                209610    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 2,812,053
Trainable params: 2,812,053
Non-trainable params: 0
______________________

In [7]:
from keras.models import load_model
test_docs, ytest = load_clean_dataset(vocab, False)
x_test = encode_docs(tokenizer, max_length, test_docs)

In [8]:
model = load_model('cnn-model.h5')
_, acc = model.evaluate(x_train, ytrain, verbose=0)
print('Train Accuracy: {}'.format(acc*100))

_, acc = model.evaluate(x_test, ytest, verbose=0)
print('Test Accuracy: {}'.format(acc*100))

Train Accuracy: 100.0
Test Accuracy: 87.5


In [9]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    line = clean_doc(review, vocab)
    
    padded = encode_docs(tokenizer, max_length, [line])
    
    yhat = model.predict(padded, verbose=0)
    
    percent_pos = yhat[0,0]
    
    if round(percent_pos) == 0:
        return (1-percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'

In [10]:
text = 'Everyone will enjoy this film. I love it, recommneded!'

percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: {}\nSentiment: {} ({})'.format(text, sentiment, percent*100))

Review: Everyone will enjoy this film. I love it, recommneded!
Sentiment: NEGATIVE (51.3288289308548)


In [11]:
text = 'This is a bad movie. Do not watch it. It sucks.'

percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: {}\nSentiment: {} ({})'.format(text, sentiment, percent*100))

Review: This is a bad movie. Do not watch it. It sucks.
Sentiment: NEGATIVE (57.188668847084045)
