In [1]:
import collections
import math
import os
import random
import tarfile
import re
from six.moves import urllib
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  return f(*args, **kwds)
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'
def download_file(url_path):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path + DOWNLOADED_FILENAME, DOWNLOADED_FILENAME)
    print('Found and verified file from this path: ', url_path + DOWNLOADED_FILENAME)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [3]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")
def get_reviews(dirname, positive=True):
    label = 1 if positive else 0
    reviews = []
    labels = []
    for file in os.listdir(dirname):
        if file.endswith(".txt"):
            with open(dirname + file, 'r+') as f:
                review = f.read()
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, '', review)
                reviews.append(review)
                labels.append(label)
    return reviews, labels

     
    

In [4]:
def extract_labels_data():
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()
            tar.close()
    positive_reviews, positive_labels = get_reviews("aclImdb/train/pos/", positive=True)
    negative_reviews, negative_labels = get_reviews("aclImdb/train/neg/",positive=False)
    data = positive_reviews+negative_reviews
    labels = positive_labels+negative_labels
    return data, labels

In [5]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_file(URL_PATH)


Found and verified file from this path:  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gzImdbReviews.tar.gz
Downloaded file:  ImdbReviews.tar.gz


In [6]:
data,labels = extract_labels_data()

In [7]:
len(labels),len(data)

(25000, 25000)

In [8]:
max_document_length = max([len(x.split(" ")) for x in data])
print(max_document_length)

2470


In [9]:
MAX_SEQUENCE_LENGTH = 250

In [10]:
words = np.load('aclImdb/wordsList.npy')

In [11]:
words[:5], len(words)

(array([b'0', b',', b'.', b'of', b'to'], dtype='|S68'), 400000)

In [12]:
def get_word_index_dict(words):
    dict = {}
    index = 0
    for word in words:
        dict[word.decode('utf-8')] = index
        index += 1
    return dict

In [13]:
dictionary = get_word_index_dict(words)

In [14]:
dictionary['good']

219

In [15]:
review_ids = []
def convert_review_to_ids(data, words):
    words_list = words.tolist()
    progress = 0
    for review in data:
        review_id = []
        index = 0
        for word in review:
            if index >= MAX_SEQUENCE_LENGTH:
                break
            try:
                review_id.append(dictionary[word])
            except KeyError:
                review_id.append(0)
            index += 1
        if len(review_id) < MAX_SEQUENCE_LENGTH:
            review_id = np.pad(review_id, (0, MAX_SEQUENCE_LENGTH - index), 'constant')
        review_ids.append(review_id)
        progress += 1
        
        if progress % 1000 == 0:
            print("Progress: {}/{}".format(progress, len(data)))    
        

In [16]:
convert_review_to_ids(data, words)

Progress: 1000/25000
Progress: 2000/25000
Progress: 3000/25000
Progress: 4000/25000
Progress: 5000/25000
Progress: 6000/25000
Progress: 7000/25000
Progress: 8000/25000
Progress: 9000/25000
Progress: 10000/25000
Progress: 11000/25000
Progress: 12000/25000
Progress: 13000/25000
Progress: 14000/25000
Progress: 15000/25000
Progress: 16000/25000
Progress: 17000/25000
Progress: 18000/25000
Progress: 19000/25000
Progress: 20000/25000
Progress: 21000/25000
Progress: 22000/25000
Progress: 23000/25000
Progress: 24000/25000
Progress: 25000/25000


In [17]:
review_ids[1825]

[41,
 2404,
 1110,
 0,
 1911,
 1110,
 7,
 1968,
 0,
 1534,
 4868,
 1993,
 1110,
 0,
 2159,
 1110,
 1911,
 1911,
 41,
 1556,
 5025,
 1110,
 0,
 2159,
 5918,
 41,
 3814,
 3410,
 1534,
 0,
 7,
 1556,
 4868,
 6479,
 2159,
 0,
 2159,
 5918,
 41,
 1534,
 0,
 3880,
 41,
 5025,
 1993,
 0,
 1534,
 4868,
 0,
 41,
 0,
 5140,
 7,
 1534,
 0,
 3420,
 1911,
 1110,
 3420,
 7,
 1911,
 1110,
 1968,
 0,
 3880,
 4868,
 1911,
 0,
 2159,
 5918,
 1110,
 0,
 5140,
 4868,
 1911,
 1534,
 2159,
 0,
 1864,
 4868,
 3814,
 3880,
 6479,
 1534,
 41,
 3814,
 3410,
 0,
 1993,
 6479,
 1968,
 1968,
 5025,
 1110,
 1968,
 0,
 5918,
 4868,
 1911,
 1911,
 41,
 1556,
 5025,
 3524,
 0,
 1534,
 2159,
 1911,
 6479,
 1864,
 2159,
 6479,
 1911,
 1110,
 1968,
 0,
 5140,
 5918,
 41,
 5025,
 1110,
 0,
 2159,
 5918,
 1110,
 1911,
 1110,
 0,
 1993,
 7,
 3524,
 0,
 1556,
 1110,
 0,
 1993,
 1110,
 1911,
 41,
 2159,
 0,
 2159,
 4868,
 0,
 1534,
 4868,
 1993,
 1110,
 0,
 4868,
 3880,
 0,
 2159,
 5918,
 1110,
 1534,
 1110,
 0,
 7,
 1864,
 1

In [18]:
review_ids = np.load('aclImdb/idsMatrix.npy')
review_ids.shape, review_ids[:5]

((25000, 250),
 array([[174943,    152,     14, ...,      0,      0,      0],
        [ 26494,     46, 399999, ...,   2153,    144,      7],
        [  6520, 399999,     21, ...,      0,      0,      0],
        [    37,     14,   2407, ...,      0,      0,      0],
        [    37,     14,     36, ...,      0,      0,      0]], dtype=int32))

In [19]:
x_data = review_ids
y_output = np.array(labels)

In [20]:
vacabulary_size = len(words)
print(vacabulary_size)

400000


In [21]:
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [22]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [23]:
tf.reset_default_graph()

x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32,[None])





In [24]:
num_epochs = 20
batch_size = 25
embedding_size = 50
max_label = 2

In [25]:
saved_embeddings = np.load('aclImdb/wordVectors.npy')  
embeddings = tf.nn.embedding_lookup(saved_embeddings, x)
saved_embeddings.shape

(400000, 50)

In [26]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(embedding_size)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell,output_keep_prob=0.75)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


In [27]:
_,(encoding,_)=tf.nn.dynamic_rnn(lstmCell,embeddings,dtype=tf.float32)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [28]:
encoding.get_shape()

TensorShape([Dimension(None), Dimension(50)])

In [29]:
logits = tf.layers.dense(encoding,max_label,activation=None)

Instructions for updating:
Use keras.layers.dense instead.


In [30]:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=y)
loss = tf.reduce_mean(cross_entropy)

In [31]:
prediction = tf.equal(tf.argmax(logits, 1), tf.cast(y,tf.int64))
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

In [32]:
optimizer = tf.train.AdamOptimizer(0.01)
train_step = optimizer.minimize(loss)




In [33]:
init = tf.global_variables_initializer()




In [34]:
with tf.Session() as sess:
    init.run()
    for epoch in range(num_epochs):
        num_batches = int(len(train_data)//batch_size)+1
        for i in range(num_batches):
            min_ix = i*batch_size
            max_ix = np.min([len(train_data), ((i+1)*batch_size)])
            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]

            train_dict = {x: x_train_batch, y: y_train_batch}
            sess.run(train_step, feed_dict=train_dict)
            
            train_loss, train_acc = sess.run([loss, accuracy], feed_dict=train_dict)
        test_dict = {x: test_data, y: test_target}
        test_loss, test_acc = sess.run([loss, accuracy], feed_dict=test_dict)
        print('Epoch: {}, TestLoss: {:.2}, TestAcc: {:.5}'.format(epoch + 1, test_loss, test_acc))




2024-06-15 06:52:53.789966: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA


Epoch: 1, TestLoss: 0.7, TestAcc: 0.495
Epoch: 2, TestLoss: 0.71, TestAcc: 0.502
Epoch: 3, TestLoss: 0.71, TestAcc: 0.496
Epoch: 4, TestLoss: 0.72, TestAcc: 0.505
Epoch: 5, TestLoss: 0.71, TestAcc: 0.552
Epoch: 6, TestLoss: 0.7, TestAcc: 0.532
Epoch: 7, TestLoss: 0.84, TestAcc: 0.626
Epoch: 8, TestLoss: 0.55, TestAcc: 0.77
Epoch: 9, TestLoss: 0.56, TestAcc: 0.78
Epoch: 10, TestLoss: 0.59, TestAcc: 0.768
Epoch: 11, TestLoss: 0.65, TestAcc: 0.777
Epoch: 12, TestLoss: 0.65, TestAcc: 0.78
Epoch: 13, TestLoss: 0.66, TestAcc: 0.761
Epoch: 14, TestLoss: 0.74, TestAcc: 0.764
Epoch: 15, TestLoss: 0.73, TestAcc: 0.759
Epoch: 16, TestLoss: 0.75, TestAcc: 0.768
Epoch: 17, TestLoss: 0.77, TestAcc: 0.758
Epoch: 18, TestLoss: 0.8, TestAcc: 0.77
Epoch: 19, TestLoss: 1.2, TestAcc: 0.504
Epoch: 20, TestLoss: 0.79, TestAcc: 0.484
