In [1]:
from numpy import array
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from pandas import DataFrame
from matplotlib import pyplot

import nltk
nltk.download('stopwords')

from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding, as before).

    Returns:
        The complete file content as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
def clean_doc(doc):
    """Turn raw review text into a list of cleaned word tokens.

    Each whitespace-separated token has all ``string.punctuation``
    characters removed and is then kept only if it is purely alphabetic,
    is not an NLTK English stop word, and is longer than one character.

    NOTE(review): stop words are matched *after* punctuation is stripped
    and without any case folding, so a contraction whose apostrophe was
    removed (e.g. "dont", "doesnt") is not caught by this filter -- confirm
    whether that is intended.

    Args:
        doc: The text to tokenize.

    Returns:
        List of surviving tokens, in their original order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # discard anything that is not purely letters once punctuation is gone
        if not word.isalpha():
            continue
        # discard stop words
        if word in english_stops:
            continue
        # discard single-character leftovers
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

In [4]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    """Count the cleaned tokens of one file into ``vocab``.

    Args:
        filename: Path of the document to load.
        vocab: Object with an ``update`` method (a ``Counter`` in this
            notebook); it is modified in place.
    """
    # load -> clean -> accumulate counts, in a single pipeline
    vocab.update(clean_doc(load_doc(filename)))

In [5]:
# load all docs in a directory
def process_docs(directory, vocab):
    """Add every training document found in ``directory`` to ``vocab``.

    Files whose name starts with 'cv9' are treated as the test set and
    are left out of the vocabulary.

    Note: a later cell of this notebook defines another ``process_docs``
    taking ``(directory, vocab, is_train)``; once that cell has run, this
    two-argument version is shadowed.

    Args:
        directory: Folder containing the review files.
        vocab: Counter-like object updated in place.
    """
    for name in listdir(directory):
        # only files outside the 'cv9' test split contribute to the vocab
        if not name.startswith('cv9'):
            add_doc_to_vocab(directory + '/' + name, vocab)

In [6]:
# Build the vocabulary from the training portion of both review classes
vocab = Counter()
for review_dir in ('Dataset/review_polarity/txt_sentoken/pos',
                   'Dataset/review_polarity/txt_sentoken/neg'):
    process_docs(review_dir, vocab)
# report how many distinct words were seen ...
print(len(vocab))
# ... and the 50 most frequent of them
print(vocab.most_common(50))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


# New Section

In [7]:
# Drop rare words: a token survives only if it was counted at least
# `min_occurance` times across the training documents
min_occurance = 2
tokens = [word for word, count in vocab.items() if count >= min_occurance]
print(len(tokens))

25767


In [8]:
# save list to file
def save_list(lines, filename):
    """Write an iterable of strings to ``filename``, one item per line.

    Items are joined with '\\n'; no trailing newline is written. An
    existing file is overwritten.

    Args:
        lines: Iterable of strings to store.
        filename: Destination path (opened in text mode with the platform
            default encoding, as before).

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager closes (and flushes) the handle even if write()
    # raises, which the previous explicit open()/close() pair did not.
    with open(filename, 'w') as file:
        file.write(data)

In [9]:
# Persist the filtered vocabulary to 'vocab.txt', one token per line, so it
# can be reloaded from disk instead of being recomputed.
save_list(tokens, 'vocab.txt')

In [10]:
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a single line of known words.

    Args:
        filename: Path of the document to load.
        vocab: Container supporting ``in``; only tokens it contains are kept.

    Returns:
        The kept tokens joined by single spaces (an empty string when no
        token survives).
    """
    cleaned = clean_doc(load_doc(filename))
    # keep the original token order, dropping out-of-vocabulary words
    return ' '.join(word for word in cleaned if word in vocab)

In [11]:
def process_docs(directory, vocab, is_train):
    """Convert the documents of one split in ``directory`` to text lines.

    Files whose name starts with 'cv9' form the test split; all other
    files form the training split.

    Args:
        directory: Folder containing the review files.
        vocab: Container of allowed words, forwarded to ``doc_to_line``.
        is_train: Truthy to process the training split, falsy to process
            the test ('cv9') split.

    Returns:
        List with one vocabulary-filtered line per selected file, in the
        order ``listdir`` yields the files.
    """
    # a file is selected when its "belongs to test split" flag matches
    # the split that was requested
    want_test = not is_train
    return [
        doc_to_line(directory + '/' + name, vocab)
        for name in listdir(directory)
        if name.startswith('cv9') == want_test
    ]

In [12]:
def evaluate_mode(Xtrain, ytrain, Xtest, ytest, n_repeats=30, epochs=50):
    """Train and score a fresh network several times; collect accuracies.

    For every repeat a new model (one 50-unit ReLU hidden layer followed by
    a single sigmoid output) is built, compiled with binary cross-entropy
    and Adam, fitted on the training data and evaluated on the test data.

    Args:
        Xtrain: Training feature matrix.
        ytrain: Training labels.
        Xtest: Test feature matrix; its second dimension sets the input width.
        ytest: Test labels.
        n_repeats: Number of independent train/evaluate rounds. Defaults to
            30, the value that used to be hard-coded.
        epochs: Training epochs per round. Defaults to 50, the value that
            used to be hard-coded.

    Returns:
        List of ``n_repeats`` test accuracies, one per round.
    """
    scores = list()
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
        # define network
        model = Sequential()
        model.add(Dense(50, input_shape=(n_words,), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # compile network
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # fit network
        model.fit(Xtrain, ytrain, epochs=epochs, verbose=2)
        # evaluate; only the accuracy is recorded, the loss is not used
        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % ((i+1), acc))
    return scores

In [13]:
def prepare_data(train_docs, test_docs, mode):
    """Encode training and test documents as matrices.

    A new ``Tokenizer`` is fitted on the training documents only and then
    used to encode both document sets.

    Args:
        train_docs: Training documents (also used to fit the tokenizer).
        test_docs: Test documents.
        mode: Scoring mode forwarded to ``Tokenizer.texts_to_matrix``.

    Returns:
        Tuple ``(Xtrain, Xtest)`` of the encoded training and test matrices.
    """
    encoder = Tokenizer()
    # the word index is learned from the training documents alone
    encoder.fit_on_texts(train_docs)
    return (
        # encoded training set
        encoder.texts_to_matrix(train_docs, mode=mode),
        # encoded test set
        encoder.texts_to_matrix(test_docs, mode=mode),
    )

In [14]:
# load the vocabulary saved earlier and turn it into a set for fast lookups
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# load all training reviews (negatives first, then positives)
positive_lines = process_docs('Dataset/review_polarity/txt_sentoken/pos', vocab, True)
negative_lines = process_docs('Dataset/review_polarity/txt_sentoken/neg', vocab, True)
train_docs = negative_lines + positive_lines
# Labels follow the document order above: 0 = negative, 1 = positive.
# They are sized from the lists actually loaded rather than a hard-coded
# 900/900, so labels and documents cannot silently fall out of step.
ytrain = array([0] * len(negative_lines) + [1] * len(positive_lines))

# load all test reviews (same ordering: negatives first, then positives)
positive_lines = process_docs('Dataset/review_polarity/txt_sentoken/pos', vocab, False)
negative_lines = process_docs('Dataset/review_polarity/txt_sentoken/neg', vocab, False)
test_docs = negative_lines + positive_lines
# sized from the loaded lists instead of a hard-coded 100/100
ytest = array([0] * len(negative_lines) + [1] * len(positive_lines))

# create the tokenizer and fit it on the training documents only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

# encode training data set
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
print(Xtrain.shape)

# encode test data set
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
print(Xtest.shape)

(1800, 25768)
(200, 25768)


In [15]:
# Input width equals the number of columns of the encoded matrices
n_words = Xtest.shape[1]

# Network: one hidden layer of 50 ReLU units feeding a single sigmoid unit
model = Sequential([
    Dense(50, input_shape=(n_words,), activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Binary cross-entropy loss, Adam optimizer, accuracy tracked as the metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Fit the network on the encoded training matrix and its labels for 50
# epochs; verbose=2 makes Keras log one summary line per epoch.
model.fit(Xtrain, ytrain, epochs=50, verbose=2)

Epoch 1/50
57/57 - 2s - 43ms/step - accuracy: 0.5806 - loss: 0.6912
Epoch 2/50
57/57 - 1s - 22ms/step - accuracy: 0.7556 - loss: 0.6798
Epoch 3/50
57/57 - 1s - 21ms/step - accuracy: 0.8511 - loss: 0.6590
Epoch 4/50
57/57 - 1s - 22ms/step - accuracy: 0.9239 - loss: 0.6273
Epoch 5/50
57/57 - 1s - 20ms/step - accuracy: 0.9211 - loss: 0.5863
Epoch 6/50
57/57 - 1s - 21ms/step - accuracy: 0.9256 - loss: 0.5416
Epoch 7/50
57/57 - 1s - 18ms/step - accuracy: 0.9511 - loss: 0.4933
Epoch 8/50
57/57 - 1s - 21ms/step - accuracy: 0.9544 - loss: 0.4455
Epoch 9/50
57/57 - 1s - 19ms/step - accuracy: 0.9589 - loss: 0.4008
Epoch 10/50
57/57 - 1s - 18ms/step - accuracy: 0.9656 - loss: 0.3587
Epoch 11/50
57/57 - 1s - 17ms/step - accuracy: 0.9717 - loss: 0.3210
Epoch 12/50
57/57 - 1s - 17ms/step - accuracy: 0.9811 - loss: 0.2871
Epoch 13/50
57/57 - 1s - 16ms/step - accuracy: 0.9844 - loss: 0.2573
Epoch 14/50
57/57 - 1s - 18ms/step - accuracy: 0.9856 - loss: 0.2306
Epoch 15/50
57/57 - 1s - 20ms/step - accura

<keras.src.callbacks.history.History at 0x275b9d76c90>

In [17]:
# Score the trained model on the test matrix. evaluate() is unpacked into
# the loss and the single metric configured at compile time (accuracy).
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
# acc is a fraction, so scale it by 100 to print a percentage
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 91.000003


In [18]:
def predict_sentiment(review, vocab, tokenizer, model):
    """Classify one review text with a fitted tokenizer and trained model.

    The review is cleaned with ``clean_doc``, reduced to words present in
    ``vocab``, encoded as a single-row 'freq' matrix and passed to the model.

    Args:
        review: Raw review text.
        vocab: Container of allowed words (supports ``in``).
        tokenizer: Fitted tokenizer providing ``texts_to_matrix``.
        model: Trained model providing ``predict``.

    Returns:
        The first predicted value rounded to the nearest integer. In this
        notebook positive reviews are trained with label 1 and negative
        reviews with label 0.
    """
    # keep only in-vocabulary words, preserving their order
    known_words = (word for word in clean_doc(review) if word in vocab)
    # a one-element batch: the model expects a matrix, not a single vector
    encoded_review = tokenizer.texts_to_matrix([' '.join(known_words)], mode='freq')
    probability = model.predict(encoded_review, verbose=0)[0, 0]
    return round(probability)

In [19]:
# Smoke-test the classifier: first a positive text, then a negative one
for text in ('Best movie ever!', 'This is a bad movie.'):
    print(predict_sentiment(text, vocab, tokenizer, model))

1
0
