In [1]:
import tensorflow as tf
from tensorflow import keras
from pathlib import Path

root='http://ai.stanford.edu/~amaas/data/sentiment/'
filename='aclImdb_v1.tar.gz'
# Download the archive (get_file caches it, here under ~/.keras/datasets) and unpack it.
filepath=keras.utils.get_file(filename,root+filename,extract=True)
# The archive unpacks into a directory named `aclImdb`. The exact case matters
# on case-sensitive filesystems (Linux/macOS); the lowercase spelling only
# resolves on case-insensitive ones such as Windows.
path=Path(filepath).parent/'aclImdb'
path

WindowsPath('C:/Users/User/.keras/datasets/aclimdb')

In [2]:
import os

def reviewpath(dirpath):
    """Return the paths of all ``*.txt`` files directly inside ``dirpath``.

    Args:
        dirpath: a ``pathlib.Path``-like object exposing ``glob``.

    Returns:
        A list of path strings, in whatever order ``glob`` yields them.
    """
    return list(map(str,dirpath.glob('*.txt')))

# Collect the review file paths for each split and sentiment class.
train_pos=reviewpath(path/'train'/'pos')
train_neg=reviewpath(path/'train'/'neg')
# The archive only ships `train/` and `test/` directories (there is no
# `valid/`), so the held-out reviews are read from `test/`; they are split
# into separate test and validation sets afterwards.
test_valid_pos=reviewpath(path/'test'/'pos')
test_valid_neg=reviewpath(path/'test'/'neg')

len(train_pos),len(train_neg)

(12500, 12500)

In [3]:
import numpy as np

# Shuffle BOTH classes before slicing so the test/validation split is random
# for negative reviews too (previously only the positive list was shuffled).
# A dedicated, seeded generator keeps the split reproducible across runs
# without touching NumPy's global random state.
split_rng=np.random.RandomState(42)
split_rng.shuffle(test_valid_pos)
split_rng.shuffle(test_valid_neg)

# First 5000 files of each class form the test set, the remainder the validation set.
test_pos=test_valid_pos[:5000]
test_neg=test_valid_neg[:5000]
valid_pos=test_valid_pos[5000:]
valid_neg=test_valid_neg[5000:]

In [19]:
def imdbdata(pathpos,pathneg):
    """Build a ``tf.data.Dataset`` of ``(review text, label)`` pairs.

    Every file is read fully into memory as UTF-8 text. Negative reviews
    (label 0) come first, followed by positive reviews (label 1).

    Args:
        pathpos: iterable of file paths holding positive reviews.
        pathneg: iterable of file paths holding negative reviews.

    Returns:
        A dataset sliced from a string tensor of reviews and an integer
        tensor of labels.
    """
    def read_text(filename):
        with open(filename,encoding='utf8') as handle:
            return handle.read()

    negative_texts=[read_text(name) for name in pathneg]
    positive_texts=[read_text(name) for name in pathpos]
    texts=negative_texts+positive_texts
    targets=[0]*len(negative_texts)+[1]*len(positive_texts)
    return tf.data.Dataset.from_tensor_slices((tf.constant(texts),tf.constant(targets)))

batchsize=32

# Only the training pipeline is shuffled; all three are batched and prefetch one batch.
trainset=(
    imdbdata(train_pos,train_neg)
    .shuffle(25000)
    .batch(batchsize)
    .prefetch(1)
)
validset=(
    imdbdata(valid_pos,valid_neg)
    .batch(batchsize)
    .prefetch(1)
)
testset=(
    imdbdata(test_pos,test_neg)
    .batch(batchsize)
    .prefetch(1)
)

In [46]:
# Peek at the first three (review, label) pairs of the unbatched training data.
for x,y in imdbdata(train_pos,train_neg).take(3):
    print(x,y,sep='\n')
    print()

tf.Tensor(b"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int32)

tf.Tensor(b"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public a

In [11]:
def preprocess(xbatch,n_words=50):
    """Tokenize a batch of review strings into a fixed-width string tensor.

    Each review is cut to its first 300 units (bytes, the ``tf.strings.substr``
    default), lowercased, stripped of ``<br>`` tags and of every character
    outside ``a-z``, then split on whitespace.

    Args:
        xbatch: string tensor of reviews; the shape arithmetic below assumes
            it is rank 1 (one string per example).
        n_words: number of token columns in the result.

    Returns:
        A dense string tensor with ``n_words`` columns; rows with fewer
        tokens are filled with ``b'<pad>'``.
    """
    # Keep the batch dimension and force the second dimension to n_words.
    target_shape=tf.shape(xbatch)*tf.constant([1,0])+tf.constant([0,n_words])
    text=tf.strings.lower(tf.strings.substr(xbatch,0,300))
    text=tf.strings.regex_replace(text,b'<br\\s*/?>',b" ")
    text=tf.strings.regex_replace(text,b'[^a-z]',b' ')
    tokens=tf.strings.split(text)
    return tokens.to_tensor(shape=target_shape,default_value=b'<pad>')
# Two short reviews used to sanity-check the preprocessing helpers.
x_example = tf.constant([
    "It's a great, great movie! I loved it.",
    "It was terrible, run away!!!",
])
preprocess(x_example)

<tf.Tensor: shape=(2, 50), dtype=string, numpy=
array([[b'it', b's', b'a', b'great', b'great', b'movie', b'i', b'loved',
        b'it', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
       [b'it', b'was', b'terrible', b'run', b'away', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'

In [12]:
from collections import Counter

def get_voca(data_sample,max_size=1000):
    """Build a vocabulary list from the most frequent tokens in ``data_sample``.

    Args:
        data_sample: batch of review strings, passed through ``preprocess``.
        max_size: maximum number of words kept, not counting the padding token.

    Returns:
        A list starting with ``b'<pad>'`` followed by up to ``max_size`` words
        in decreasing order of frequency.
    """
    token_rows=preprocess(data_sample).numpy()
    # Padding cells are ignored so they never compete with real words.
    frequencies=Counter(
        token
        for row in token_rows
        for token in row
        if token!=b'<pad>'
    )
    top_words=[token for token,_ in frequencies.most_common(max_size)]
    return [b'<pad>']+top_words

# Show the vocabulary built from the two example reviews.
get_voca(x_example)

[b'<pad>',
 b'it',
 b'great',
 b's',
 b'a',
 b'movie',
 b'i',
 b'loved',
 b'was',
 b'terrible',
 b'run',
 b'away']

In [32]:
class BagofWord(keras.layers.Layer):
    """Layer that turns batches of token ids into per-example token counts."""

    def __init__(self,n_tokens,dtype=tf.int32,**kwargs):
        """Store the number of distinct token ids.

        Args:
            n_tokens: depth used for the one-hot encoding in ``call``.
            dtype: layer dtype forwarded to ``keras.layers.Layer``.
            **kwargs: forwarded unchanged to ``keras.layers.Layer``.
        """
        super().__init__(dtype=dtype,**kwargs)
        self.n_tokens=n_tokens

    def call(self,inputs):
        """Count how often each token id occurs along axis 1 of ``inputs``."""
        counts=tf.reduce_sum(tf.one_hot(inputs,self.n_tokens),axis=1)
        # Drop column 0 - presumably the padding id; confirm against the vocabulary.
        return counts[:,1:]
    
def meanembedding(inputs):
    """Average ``inputs`` over axis 1 and scale by the square root of the word count.

    A position along axis 1 counts as a word when its vector along the last
    axis has at least one non-zero component.

    Args:
        inputs: tensor of embeddings - assumed (batch, words, embedding dim);
            confirm against the preceding layer.

    Returns:
        ``reduce_mean(inputs, axis=1)`` multiplied by ``sqrt(number of words)``.
    """
    nonzero_components=tf.math.count_nonzero(inputs,axis=-1)
    word_count=tf.math.count_nonzero(nonzero_components,axis=-1,keepdims=True)
    scale=tf.math.sqrt(tf.cast(word_count,tf.float32))
    averaged=tf.reduce_mean(inputs,axis=1)
    return averaged*scale

In [34]:
max_vocabulary_size = 1000
n_oov_buckets = 100
# Upper bound on the token ids fed to the embedding: vocabulary + OOV buckets + padding id.
n_tokens = max_vocabulary_size + n_oov_buckets + 1
embedding_size = 20

# TextVectorization must learn its vocabulary before use: without adapt() it
# has no learned vocabulary, so the embedding would never see distinct word ids.
# The vocabulary size is tied to max_vocabulary_size instead of a second
# hard-coded 1000.
text_vectorization=keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=max_vocabulary_size)
# Adapt on the review text only; the labels of the training set are dropped.
text_vectorization.adapt(trainset.map(lambda review,label:review))

model=keras.models.Sequential([
    text_vectorization,
    # mask_zero=True marks id 0 as padding for downstream layers.
    keras.layers.Embedding(input_dim=n_tokens,output_dim=embedding_size,mask_zero=True),
    keras.layers.Lambda(meanembedding),
    keras.layers.Dense(100,activation='relu'),
    # Single sigmoid unit: probability that the review is positive (label 1).
    keras.layers.Dense(1,activation='sigmoid')
])

In [35]:

# Binary sentiment classification: cross-entropy loss, Nadam optimizer, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='nadam',
    metrics=['accuracy'],
)
# Train for five epochs, evaluating on the validation set after each one.
model.fit(
    trainset,
    epochs=5,
    validation_data=validset,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1edaec1efd0>

In [36]:
import tensorflow_datasets as tfds

# Load the IMDB reviews through TensorFlow Datasets (downloaded on first use).
datasets=tfds.load(name='imdb_reviews')
# NOTE(review): `testste` looks like a typo for `testset`, and `trainset` now
# replaces the file-based training pipeline built earlier - confirm both are intended.
trainset,testste=[datasets[split] for split in ('train','test')]

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\User\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


HBox(children=(HTML(value='Dl Completed...'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='…

HBox(children=(HTML(value='Dl Size...'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'…







HBox(children=(HTML(value='Generating splits...'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Generating train examples...'), FloatProgress(value=1.0, bar_style='info', layout=L…

HBox(children=(HTML(value='Shuffling imdb_reviews-train.tfrecord...'), FloatProgress(value=0.0, max=25000.0), …

HBox(children=(HTML(value='Generating test examples...'), FloatProgress(value=1.0, bar_style='info', layout=La…

HBox(children=(HTML(value='Shuffling imdb_reviews-test.tfrecord...'), FloatProgress(value=0.0, max=25000.0), H…

HBox(children=(HTML(value='Generating unsupervised examples...'), FloatProgress(value=1.0, bar_style='info', l…

HBox(children=(HTML(value='Shuffling imdb_reviews-unsupervised.tfrecord...'), FloatProgress(value=0.0, max=500…

[1mDataset imdb_reviews downloaded and prepared to C:\Users\User\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


In [39]:
# Each TFDS element is a dict; show the text and label of the first four.
for example in trainset.take(4):
    print(example['text'],example['label'],sep='\n')

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on t