In [1]:
# load libraries
import random
import numpy as np
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from hmmlearn import hmm
from tqdm import tqdm
from sklearn.metrics import classification_report

2022-07-04 11:31:55.120002: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-04 11:31:55.120017: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# load database using keras: the most frequent 10000 words
(train_data, train_labels), \
(test_data, test_labels) = imdb.load_data(num_words = 10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [4]:
# Find the maximum of all max indexes
max_word_index = max([max(sequence) for sequence in train_data])
min_word_index = min([min(sequence) for sequence in train_data])
max_seq_len = max([len(sequence) for sequence in train_data])
min_seq_len = min([len(sequence) for sequence in train_data])
print(f'Maximum word index (training data): {max_word_index}')
print(f'Minimum word index (training data): {min_word_index}')
print(f'Maximum seq length (training data): {max_seq_len}')
print(f'Minimum seq length (training data): {min_seq_len}')

Maximum word index (training data): 9999
Minimum word index (training data): 1
Maximum seq length (training data): 2494
Minimum seq length (training data): 11


In [5]:
# step 1 : get word index
word_index = imdb.get_word_index()
ind = random.randint(0,len(train_data))

# step 2: reverse word index to map integer indexes to their respective words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Step 3: decode the review, mapping integer indices to words
#
# indices are off by 3 because 0, 1, and 2 are reserverd indices for "padding", "Start of sequence" and "unknown"
decoded_review = ' '.join([reverse_word_index.get(i-3, '?') for i in train_data[ind]])
label = 'positive review' if train_labels[ind] == 1 else 'negative review'

print(f'REVIEW:\n {decoded_review}\n')
print(f'Encoded sequence of words:\n {train_data[ind]}\n')
print(f'Label: {label}\n')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
REVIEW:
 ? the true measure of any fictional piece of work is whether or not the characters grow from their experiences and emerge from the experience altered in some significant way note that this change need not be positive or ? at the end br br by that measure enchanted april is a ? success as a film in general it succeeds quite well excellent ensemble cast well developed characters you come to care about wonderful script and beautiful sets and locations in short the film is well enchanting although all the performances are first rate three must be mentioned ? lawrence jim ? and joan ? it says something when miranda richardson does her usual fine work and yet is overshadowed by so many others in the cast most highly recommended particularly if you are a romantic at heart further ? ? not

Encoded sequence of words:
 [1, 4, 283, 4160, 7, 101, 2615, 418, 7, 157, 9, 726, 42, 24, 4, 105

In [6]:
# split dataset into positive and negative review ones
train_data_pos = train_data[train_labels==1]
train_data_neg = train_data[train_labels==0]
seqlen_train_pos = [len(sequence) for sequence in train_data_pos]
seqlen_train_neg = [len(sequence) for sequence in train_data_neg]
print(f'Number of positive reviews: {len(train_data_pos)}')
print(f'Number of negative reviews: {len(train_data_neg)}')
print(f'Maximum length of the positive reviews: {max(seqlen_train_pos)}')
print(f'Maximum lengths of the negative reviews: {max(seqlen_train_neg)}')

Number of positive reviews: 12500
Number of negative reviews: 12500
Maximum length of the positive reviews: 2494
Maximum lengths of the negative reviews: 1571


In [7]:
# create training matrices
X_train_pos = np.concatenate(train_data_pos).reshape(-1,1)
X_train_neg = np.concatenate(train_data_neg).reshape(-1,1)
print(X_train_pos.shape)
print(X_train_neg.shape)

(3019537, 1)
(2948304, 1)


In [8]:
# build HMMs, one for each category
hmm_pos = hmm.MultinomialHMM(
    n_components=7,
    n_iter=10)
hmm_neg = hmm.MultinomialHMM(
    n_components=7,
    n_iter=10)

# train HMMs (it may take a while)
hmm_pos.fit(X_train_pos, seqlen_train_pos)
hmm_neg.fit(X_train_neg, seqlen_train_neg)

In [10]:
# score each test sample, that is, calculate P(O|model)
(true_pos, true_neg, false_pos, false_neg) = (0, 0, 0, 0)
for x, l in zip(test_data, test_labels):
    score_pos = hmm_pos.score(np.asarray(x).reshape(-1,1))
    score_neg = hmm_neg.score(np.asarray(x).reshape(-1,1))
    if l == 1:
        if score_pos > score_neg:
            true_pos += 1
        else:
            false_neg += 1
    else:
        if score_pos > score_neg:
            false_pos += 1
        else:
            true_neg += 1

# number of samples
nsamples_all = len(test_data)
nsamples_pos = len(test_data[test_labels==1])
nsamples_neg = len(test_data[test_labels==0])

# calculate accuracy, precision, recall and F1
acc = (true_pos + true_neg) / nsamples_all
pre_class0 = true_pos / (true_pos + false_pos)
pre_class1 = true_neg / (true_neg + false_neg)
rec_class0 = true_pos / (true_pos + false_neg)
rec_class1 = true_neg / (true_neg + false_pos)
f1_score_class0 = 2.0 * pre_class0 * rec_class0 / (pre_class0 + rec_class0)
f1_score_class1 = 2.0 * pre_class1 * rec_class1 / (pre_class1 + rec_class1)

print(f'Accuracy in the test set: {acc}')
print(f'Precision in the test set (class 0): {pre_class0}')
print(f'Recall in the test set (class 0): {rec_class0}')
print(f'F1 score in the test set (class 0): {f1_score_class0}')
print(f'Precision in the test set (class 1): {pre_class1}')
print(f'Recall in the test set (class 1): {rec_class1}')
print(f'F1 score in the test set (class 1): {f1_score_class1}')

Accuracy in the test set: 0.81644
Precision in the test set (class 0): 0.847004123168699
Recall in the test set (class 0): 0.7724
F1 score in the test set (class 0): 0.8079835976400686
Precision in the test set (class 1): 0.7908242041026395
Recall in the test set (class 1): 0.86048
F1 score in the test set (class 1): 0.8241829814949618


In [9]:
# eval model using Scikit-learn
# score each test sample, that is, calculate P(O|model)
y_test = np.asarray(test_labels)
y_pred = np.zeros(y_test.shape)
for i, x in enumerate(test_data):
    score_pos = hmm_pos.score(np.asarray(x).reshape(-1,1))
    score_neg = hmm_neg.score(np.asarray(x).reshape(-1,1))
    if score_pos > score_neg:
        y_pred[i] = 1.0
    else:
        y_pred[i] = 0.0
        
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.79      0.86      0.82     12500
           1       0.85      0.77      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

