# import necessary packages

In [1]:
from tensorflow.keras import layers
import tensorflow.keras as keras
import tensorflow as tf

import tensorflow_datasets as tfds

from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

Using TensorFlow backend.


# imdb_reviews/submwords8k - 8000 vocab size
# read train_data, test_data and info

In [2]:
(train_data, test_data), info = tfds.load('imdb_reviews/subwords8k', split=(tfds.Split.TRAIN, tfds.Split.TEST), 
                                          with_info=True, as_supervised=True)

# Type of 'info'

In [3]:
type(info)

tensorflow_datasets.core.dataset_info.DatasetInfo

# What is in 'info'

In [4]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Pot

# info.features

In [5]:
info.features

FeaturesDict({
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
})

# info.features['text']

In [6]:
info.features['text']

Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>)

# info.features['text'].encoder

In [7]:
info.features['text'].encoder

<SubwordTextEncoder vocab_size=8185>

In [9]:
encoder = info.features['text'].encoder

# info.features['text'].encoder.vocab_size

In [10]:
encoder.vocab_size

8185

# info.features['text'].encoder.subwords - after tokenization

In [11]:
info.features['text'].encoder.subwords

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_',
 't_',
 'with_',
 'for_',
 '.<',
 'on_',
 'but_',
 'movie_',
 ' (',
 'are_',
 'his_',
 'have_',
 'film_',
 'not_',
 'ing_',
 'be_',
 'ed_',
 'you_',
 ' "',
 'it',
 'd_',
 'an_',
 'he_',
 'by_',
 'at_',
 'one_',
 'who_',
 'y_',
 'from_',
 'e_',
 'or_',
 'all_',
 'like_',
 'they_',
 '" ',
 'so_',
 'just_',
 'has_',
 ') ',
 'her_',
 'about_',
 'out_',
 'This_',
 'some_',
 'ly_',
 'movie',
 'film',
 'very_',
 'more_',
 'It_',
 'would_',
 'what_',
 'when_',
 'which_',
 'good_',
 'if_',
 'up_',
 'only_',
 'even_',
 'their_',
 'had_',
 'really_',
 'my_',
 'can_',
 'no_',
 'were_',
 'see_',
 'she_',
 '? ',
 'than_',
 '! ',
 'there_',
 'get_',
 'been_',
 'into_',
 ' - ',
 'will_',
 'much_',
 'story_',
 'because_',
 'ing',
 'time_',
 'n_',
 'we_',
 'ed',
 'me_',
 ': ',
 'most_',
 'other_',
 'don',
 'do_',
 'm_',
 'es_',
 'how_',
 'also

# type of 'train_data'

In [12]:
type(train_data)

tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter

# see what is in train_data

In [13]:
for x,y in train_data:
    
    print(x)
    print(y)
    
    break

tf.Tensor(
[  62   18   41  604  927   65    3  644 7968   21   35 5096   36   11
   43 2948 5240  102   50  681 7862 1244    3 3266   29  122  640    2
   26   14  279  438   35   79  349  384   11 1991    3  492   79  122
  188  117   33 4047 4531   14   65 7968    8 1819 3947    3   62   27
    9   41  577 5044 2629 2552 7193 7961 3642    3   19  107 3903  225
   85  198   72    1 1512  738 2347  102 6245    8   85  308   79 6936
 7961   23 4981 8044    3 6429 7961 1141 1335 1848 4848   55 3601 4217
 8050    2    5   59 3831 1484 8040 7974  174 5773   22 5240  102   18
  247   26    4 3903 1612 3902  291   11    4   27   13   18 4092 4008
 7961    6  119  213 2774    3   12  258 2306   13   91   29  171   52
  229    2 1245 5790  995 7968    8   52 2948 5240 8039 7968    8   74
 1249    3   12  117 2438 1369  192   39 7975], shape=(163,), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)


# types contained in train_data 

In [14]:
for x,y in train_data:
    
    print(type(x))
    print(type(y))
    
    break

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


# this is how to take the array in an EagerTensor element in train_data

In [15]:
for x,y in train_data:
    
    print(x.numpy())
    
    break

[  62   18   41  604  927   65    3  644 7968   21   35 5096   36   11
   43 2948 5240  102   50  681 7862 1244    3 3266   29  122  640    2
   26   14  279  438   35   79  349  384   11 1991    3  492   79  122
  188  117   33 4047 4531   14   65 7968    8 1819 3947    3   62   27
    9   41  577 5044 2629 2552 7193 7961 3642    3   19  107 3903  225
   85  198   72    1 1512  738 2347  102 6245    8   85  308   79 6936
 7961   23 4981 8044    3 6429 7961 1141 1335 1848 4848   55 3601 4217
 8050    2    5   59 3831 1484 8040 7974  174 5773   22 5240  102   18
  247   26    4 3903 1612 3902  291   11    4   27   13   18 4092 4008
 7961    6  119  213 2774    3   12  258 2306   13   91   29  171   52
  229    2 1245 5790  995 7968    8   52 2948 5240 8039 7968    8   74
 1249    3   12  117 2438 1369  192   39 7975]


# take the text sequences (integer encoded sequences) in train_data into a list
# also take the labels into a list

In [16]:
train_text_sequences = list()
train_labels = list()
for x,y in train_data:
    train_text_sequences.append(x.numpy())
    train_labels.append(y.numpy())

# take the text sequences list and labels list into a dataframe

In [17]:
dct_train_labeled_text = {'text sequences': train_text_sequences, 'labels': train_labels}

In [18]:
df_train_labeled_text = pd.DataFrame(dct_train_labeled_text)

In [19]:
df_train_labeled_text.head()

Unnamed: 0,text sequences,labels
0,"[62, 18, 41, 604, 927, 65, 3, 644, 7968, 21, 3...",0
1,"[12, 31, 93, 867, 7, 1256, 6585, 7961, 421, 36...",0
2,"[636, 102, 4714, 8, 1, 4333, 4, 4135, 47, 1325...",0
3,"[62, 9, 1, 312, 6, 32, 23, 4, 7809, 47, 7513, ...",1
4,"[249, 929, 31, 2699, 104, 2, 51, 1, 707, 13, 1...",1


# the text sequences are of variable length

In [20]:
length_of_train_text = list()
for i,r in df_train_labeled_text.iterrows():
    length_of_train_text.append(len(r['text sequences']))

In [21]:
length_of_train_text[:5]

[163, 142, 200, 117, 106]

In [22]:
max(length_of_train_text)

3944

In [23]:
min(length_of_train_text)

11

# take the text sequences and labels in the test_data into lists

In [24]:
test_text_sequences = list()
test_labels = list()
for x,y in test_data:
    test_text_sequences.append(x.numpy())
    test_labels.append(y.numpy())

# convert these lists into a dataframe 

In [25]:
dct_test_labeled_text = {'text sequences': test_text_sequences, 'labels': test_labels}

In [26]:
df_test_labeled_text = pd.DataFrame(dct_test_labeled_text)

In [27]:
df_test_labeled_text.head()

Unnamed: 0,text sequences,labels
0,"[173, 29, 185, 13, 115, 1956, 8044, 3, 398, 12...",1
1,"[133, 2237, 64, 1229, 1795, 6, 4, 615, 7974, 1...",1
2,"[3567, 47, 3634, 7978, 7974, 7981, 2, 7998, 58...",0
3,"[7128, 127, 2196, 2185, 1098, 1040, 687, 1145,...",0
4,"[249, 4, 2256, 3293, 453, 3483, 7961, 5014, 12...",1


# length of text sequences in test_data are of variable length

In [28]:
length_of_test_text = list()
for i,r in df_test_labeled_text.iterrows():
    length_of_test_text.append(len(r['text sequences']))

In [29]:
length_of_test_text[:5]

[283, 451, 839, 394, 103]

In [30]:
max(length_of_test_text)

3454

In [31]:
min(length_of_test_text)

9

# take the train text sequences and test text sequences as lists

In [32]:
X_train = list(df_train_labeled_text['text sequences'])

In [33]:
X_test = list(df_test_labeled_text['text sequences'])

# pad the text sequences
# add 0s to text sequences having length less than the max length
# truncate the text sequences having length more than the max length to max length

In [34]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# after padding see the lengths of text sequences in train and test data 

In [35]:
train_post_padding_length = list()
for a in X_train:
    train_post_padding_length.append(len(a))
    
test_post_padding_length = list()
for a in X_test:
    test_post_padding_length.append(len(a))

In [36]:
train_post_padding_length[:5]

[100, 100, 100, 100, 100]

In [37]:
test_post_padding_length[:5]

[100, 100, 100, 100, 100]

# build the neural network model_1 - global average pooling 1D

In [38]:
embedding_dim=16

model_1 = keras.Sequential([
  layers.Embedding(encoder.vocab_size, embedding_dim),
  layers.GlobalAveragePooling1D(),
  layers.Dense(16, activation='relu'),
  layers.Dense(1)
])

model_1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 131,249
Trainable params: 131,249
Non-trainable params: 0
_________________________________________________________________


# take the train labels as a numpy array

In [39]:
y_train = list(df_train_labeled_text['labels'])
y_train = np.array(y_train)

# take the test labels as a numpy array 

In [40]:
y_test = list(df_test_labeled_text['labels'])
y_test = np.array(y_test)

# compile the model_1

In [41]:
model_1.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

# train the model_1

In [42]:
history_1 = model_1.fit(X_train, y_train, epochs=5)

Train on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# evaluate the trained model_1 on test data

In [43]:
model_1.evaluate(X_test, y_test)



[0.4727481142568588, 0.80756]

# build the neural network model_2 - lstm

In [44]:
embedding_dim=16

model_2 = keras.Sequential([
  layers.Embedding(encoder.vocab_size, embedding_dim),
  layers.LSTM(128),
  layers.Dense(16, activation='relu'),
  layers.Dense(1)
])

model_2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          130960    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               74240     
_________________________________________________________________
dense_2 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 207,281
Trainable params: 207,281
Non-trainable params: 0
_________________________________________________________________


# compile the model_2

In [45]:
model_2.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

# train the model_2

In [46]:
history_2 = model_2.fit(X_train, y_train, epochs=5)

Train on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# evaluate the model_2

In [None]:
model_2.evaluate(X_test, y_test)