Movie Review Polarity Dataset (review_polarity.tar.gz, 3MB): http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

In [3]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.utils.vis_utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [4]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

In [5]:
# turn a doc into clean tokens
def clean_doc(doc):
    """Tokenise raw text into a filtered list of words.

    Each whitespace-separated token has every ``string.punctuation``
    character stripped, and is then kept only if it is purely alphabetic,
    is not an NLTK English stop word, and is longer than one character.

    Args:
        doc: The raw document text.

    Returns:
        The surviving tokens, in their original order.
    """
    raw_tokens = doc.split()

    # character class that matches any single punctuation character
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))

    # NOTE(review): tokens are not lower-cased before the stop-word test, so
    # capitalised stop words presumably survive - confirm this is intended
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw in raw_tokens:
        word = punct_pattern.sub('', raw)
        if not word.isalpha():
            continue
        if word in english_stops:
            continue
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

In [6]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a space-separated line of vocab words.

    Args:
        filename: Path of the document to load.
        vocab: Collection of allowed tokens; anything not in it is dropped.

    Returns:
        The cleaned, in-vocabulary tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    in_vocab = (token for token in cleaned if token in vocab)
    return ' '.join(in_vocab)

In [7]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
    """Convert every review of one split in a directory into a token line.

    Files whose name starts with ``'cv9'`` form the test split; every other
    file belongs to the training split.

    Args:
        directory: Folder containing the review files.
        vocab: Collection of allowed tokens, passed on to ``doc_to_line``.
        is_train: Truthy to process the training split, falsy for the test
            split.

    Returns:
        A list with one cleaned, space-separated line per selected file,
        ordered by file name.
    """
    lines = []
    # listdir() returns names in arbitrary order; sorting keeps the row order
    # reproducible between runs and machines
    for filename in sorted(listdir(directory)):
        in_test_split = filename.startswith('cv9')
        wanted = (not in_test_split) if is_train else in_test_split
        if not wanted:
            continue
        # create the full path of the file, then load and clean it
        lines.append(doc_to_line(directory + '/' + filename, vocab))
    return lines

In [8]:
# load and clean a dataset
def load_clean_dataset(vocab, is_train, data_dir='c:/Download/review_polarity/txt_sentoken'):
    """Load the negative and positive reviews of one split with their labels.

    Args:
        vocab: Collection of allowed tokens, passed on to ``process_docs``.
        is_train: Truthy for the training split, falsy for the test split.
        data_dir: Folder holding the ``neg`` and ``pos`` sub-directories.
            Defaults to the location the notebook originally hard-coded, so
            existing two-argument calls behave as before.

    Returns:
        A ``(docs, labels)`` tuple: ``docs`` lists the negative lines followed
        by the positive lines, and ``labels`` holds 0 for each negative and 1
        for each positive document, in the same order.
    """
    # load documents
    neg = process_docs(data_dir + '/neg', vocab, is_train)
    pos = process_docs(data_dir + '/pos', vocab, is_train)
    docs = neg + pos
    # prepare labels: 0 = negative, 1 = positive
    labels = [0] * len(neg) + [1] * len(pos)
    return docs, labels

In [9]:
# fit a tokenizer
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: Texts to fit the tokenizer on.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [10]:
# define the model
def define_model(n_words):
    """Build and compile the bag-of-words classifier.

    The network is one 50-unit ReLU hidden layer followed by a single sigmoid
    output unit, compiled with binary cross-entropy loss, the Adam optimizer
    and an accuracy metric. A summary of the model is printed.

    Args:
        n_words: Length of each input vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    # define network
    network = Sequential([
        Dense(50, input_shape=(n_words,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    # compile network
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # summarize defined model
    network.summary()
    #plot_model(network, to_file='model.png', show_shapes=True)
    return network

In [11]:
# load the vocabulary: a whitespace-separated token file turned into a set
vocab_filename = 'newvocab.txt'
vocab = set(load_doc(vocab_filename).split())



In [12]:
# load all reviews: one call for the training split, one for the test split
train_docs, ytrain = load_clean_dataset(vocab, is_train=True)
test_docs, ytest = load_clean_dataset(vocab, is_train=False)


In [13]:
# inspect the container type of the loaded training documents
type(train_docs)

list

In [14]:
# inspect the container type of the training labels
type(ytrain)

list

In [22]:
import numpy as np

In [23]:
# convert the training labels from a Python list to a NumPy array
# (presumably because model.fit expects array-like labels - confirm)
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; restart the kernel and run all cells to confirm reproducibility
ytrain = np.array(ytrain)

In [24]:
# confirm the training labels are now a NumPy array
type(ytrain)

numpy.ndarray

In [27]:
# convert the test labels from a Python list to a NumPy array, mirroring ytrain
ytest = np.array(ytest)

In [28]:
# confirm the test labels are now a NumPy array
type(ytest)

numpy.ndarray

In [15]:
# create the tokenizer
# it is fitted on the training documents only, so the test documents do not
# influence the word index
tokenizer = create_tokenizer(train_docs)


In [16]:
# encode data
# each document becomes one row of a document-by-word matrix; both splits are
# encoded with the same train-fitted tokenizer so their columns line up
# mode='freq' selects frequency-based scoring of each word within a document
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')


In [17]:
# inspect the type of the encoded training matrix
type(Xtrain)

numpy.ndarray

In [18]:
# (documents, columns) of the encoded training matrix
Xtrain.shape

(1800, 25768)

In [19]:
# (documents, columns) of the encoded test matrix
Xtest.shape

(200, 25768)

In [20]:
# define the model
# the input width is the number of columns of the encoded matrix; Xtrain and
# Xtest were encoded by the same tokenizer, so either gives the same value
n_words = Xtest.shape[1]
print("n_words : ",n_words)


# build and compile the network; define_model also prints the summary
model = define_model(n_words)


n_words :  25768
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                1288450   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 1,288,501
Trainable params: 1,288,501
Non-trainable params: 0
_________________________________________________________________


In [25]:
# fit network
# keep the returned History object so the per-epoch loss and accuracy can be
# inspected later, and so the cell does not end with a bare History repr
history = model.fit(Xtrain, ytrain, epochs=10, verbose=2)


Epoch 1/10
57/57 - 0s - loss: 0.6916 - accuracy: 0.5911
Epoch 2/10
57/57 - 0s - loss: 0.6818 - accuracy: 0.6872
Epoch 3/10
57/57 - 0s - loss: 0.6622 - accuracy: 0.8817
Epoch 4/10
57/57 - 0s - loss: 0.6306 - accuracy: 0.8406
Epoch 5/10
57/57 - 0s - loss: 0.5911 - accuracy: 0.9350
Epoch 6/10
57/57 - 0s - loss: 0.5451 - accuracy: 0.9417
Epoch 7/10
57/57 - 0s - loss: 0.4977 - accuracy: 0.9506
Epoch 8/10
57/57 - 0s - loss: 0.4509 - accuracy: 0.9561
Epoch 9/10
57/57 - 0s - loss: 0.4060 - accuracy: 0.9650
Epoch 10/10
57/57 - 0s - loss: 0.3649 - accuracy: 0.9678


<tensorflow.python.keras.callbacks.History at 0x1fa29e09940>

In [29]:

# evaluate
# evaluate() returns the loss followed by the metrics the model was compiled
# with, which here is accuracy only
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
# acc is scaled by 100 to report it as a percentage
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 87.000000
