In [1]:
import random
import numpy as np
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from hmmlearn import hmm

2022-06-30 06:50:33.591148: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-30 06:50:33.591182: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Load the IMDB review dataset shipped with Keras as (data, labels) pairs for
# the train and test splits. `num_words=10000` limits the vocabulary used to
# encode the reviews.
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

In [3]:
# Summary statistics of the training set: the smallest/largest word index that
# occurs in any encoded review, and the shortest/longest review length.
train_seq_lengths = list(map(len, train_data))
max_word_index = max(map(max, train_data))
min_word_index = min(map(min, train_data))
max_seq_len = max(train_seq_lengths)
min_seq_len = min(train_seq_lengths)
print(f'Maximum word index: {max_word_index}')
print(f'Minimum word index: {min_word_index}')
print(f'Maximum seq length: {max_seq_len}')
print(f'Minimum seq length: {min_seq_len}')

Maximum word index: 9999
Minimum word index: 1
Maximum seq length: 2494
Minimum seq length: 11


In [7]:
# Decode one randomly chosen training review back into text and show it
# together with its encoded form and its label.
word_index = imdb.get_word_index()

# Step 1: pick a random position in the training set.
# random.randrange excludes the stop value, so the result is always a valid
# index. (random.randint(0, 25000) includes both endpoints and could return
# 25000, one past the end of the training set, raising an IndexError.)
ind = random.randrange(len(train_data))

# Step 2: invert the word -> index mapping so indices can be looked up.
reverse_word_index = {index: word for word, index in word_index.items()}

# Step 3: decode the review, mapping integer indices to words.
#
# Indices are offset by 3 because 0, 1 and 2 are reserved for "padding",
# "start of sequence" and "unknown"; anything without a word maps to '?'.
decoded_review = ' '.join(reverse_word_index.get(i - 3, '?') for i in train_data[ind])
label = 'positive review' if train_labels[ind] == 1 else 'negative review'

print(f'REVIEW:\n {decoded_review}\n')
print(f'Encoded sequence of words:\n {train_data[ind]}\n')
print(f'Label: {label}\n')

REVIEW:
 ? this movie is so bad it's comical in fact mystery science theatre 3000 the television show in which three characters watch and parody bad movies used this very film to mock i suggest watching it maybe on youtube instead of actually seeing this movie br br please do not see hobgoblins if you're not prepared to stop within the first scene actually do not see this movie period please at least not seriously its jokes are not funny to say the least and you'll have much more fun ? or watching a parody of it then viewing the movie br br you may feel yourself becoming sick upon watching so spare yourself read a book do the ? anything is more fun than watching hobgoblins

Encoded sequence of words:
 [1, 14, 20, 9, 38, 78, 45, 2849, 11, 192, 736, 1067, 1716, 5083, 4, 699, 123, 11, 63, 289, 105, 106, 5, 2111, 78, 102, 343, 14, 55, 22, 8, 7259, 13, 1467, 149, 12, 279, 23, 6221, 305, 7, 165, 319, 14, 20, 10, 10, 591, 81, 24, 67, 8029, 48, 335, 24, 2848, 8, 570, 746, 4, 86, 136, 165, 81, 

In [8]:
# split dataset into positive and negative review ones
train_data_pos = train_data[train_labels==1]
train_data_neg = train_data[train_labels==0]
print(len(train_data_pos))
print(len(train_data_neg))
seqlen_train_pos = [len(sequence) for sequence in train_data_pos]
seqlen_train_neg = [len(sequence) for sequence in train_data_neg]

12500
12500


In [10]:
# create training matrices
X_train_pos = np.concatenate(train_data_pos).reshape(-1,1)
X_train_neg = np.concatenate(train_data_neg).reshape(-1,1)
print(X_train_pos.shape)
print(X_train_neg.shape)

(3019537, 1)
(2948304, 1)


In [15]:
# build HMMs, one for each category
hmm_pos = hmm.GaussianHMM(n_components=7, covariance_type="tied", n_iter=10)
hmm_neg = hmm.GaussianHMM(n_components=7, covariance_type="tied", n_iter=10)

# train HMMs (it may take while)
hmm_pos.fit(X_train_pos, seqlen_train_pos)
hmm_neg.fit(X_train_neg, seqlen_train_neg)

In [17]:
# calculate accuracy, precision, recall and F1
true_pos = 0
true_neg = 0
false_pos = 0
false_neg = 0
for x,l in zip(test_data, test_labels):
    score_pos = hmm_pos.score(np.asarray(x).reshape(-1,1))
    score_neg = hmm_neg.score(np.asarray(x).reshape(-1,1))
    if l == 1:
        if score_pos > score_neg:
            true_pos += 1
        else:
            false_neg += 1
    else:
        if score_pos > score_neg:
            false_pos += 1
        else:
            true_neg += 1

nsamples = len(test_data)
nsamples_pos = len(test_data[test_labels==1])
nsamples_neg = len(test_data[test_labels==0])

acc = (true_pos + true_neg) / nsamples
pre = true_pos / (true_pos + false_pos)
rec = true_pos / (true_pos + false_neg)
f1_score = 2.0 * pre*rec/(pre + rec)

print(f'Accuracy in the test set: {acc}')
print(f'Precision in the test set: {pre}')
print(f'Recall in the test set: {rec}')
print(f'F1 score in the test set: {f1_score}')

Accuracy in the test set: 0.53516
Precision in the test set: 0.5464048146974976
Recall in the test set: 0.414
F1 score in the test set: 0.4710755086250057
