# import necessary packages

In [1]:
from tensorflow.keras import layers
import tensorflow.keras as keras
import tensorflow as tf

import tensorflow_datasets as tfds

from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

Using TensorFlow backend.


# imdb_reviews/submwords8k - 8000 vocab size
# read train_data, test_data and info

In [2]:
(train_data, test_data), info = tfds.load('imdb_reviews/subwords8k', split=(tfds.Split.TRAIN, tfds.Split.TEST), 
                                          with_info=True, as_supervised=True)

# Type of 'info'

In [3]:
type(info)

tensorflow_datasets.core.dataset_info.DatasetInfo

# What is in 'info'

In [4]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Pot

# info.features

In [5]:
info.features

FeaturesDict({
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
})

# info.features['text']

In [6]:
info.features['text']

Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>)

# info.features['text'].encoder

In [7]:
info.features['text'].encoder

<SubwordTextEncoder vocab_size=8185>

In [8]:
encoder = info.features['text'].encoder

# info.features['text'].encoder.vocab_size

In [9]:
encoder.vocab_size

8185

# info.features['text'].encoder.subwords - after tokenization

In [10]:
info.features['text'].encoder.subwords

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_',
 't_',
 'with_',
 'for_',
 '.<',
 'on_',
 'but_',
 'movie_',
 ' (',
 'are_',
 'his_',
 'have_',
 'film_',
 'not_',
 'ing_',
 'be_',
 'ed_',
 'you_',
 ' "',
 'it',
 'd_',
 'an_',
 'he_',
 'by_',
 'at_',
 'one_',
 'who_',
 'y_',
 'from_',
 'e_',
 'or_',
 'all_',
 'like_',
 'they_',
 '" ',
 'so_',
 'just_',
 'has_',
 ') ',
 'her_',
 'about_',
 'out_',
 'This_',
 'some_',
 'ly_',
 'movie',
 'film',
 'very_',
 'more_',
 'It_',
 'would_',
 'what_',
 'when_',
 'which_',
 'good_',
 'if_',
 'up_',
 'only_',
 'even_',
 'their_',
 'had_',
 'really_',
 'my_',
 'can_',
 'no_',
 'were_',
 'see_',
 'she_',
 '? ',
 'than_',
 '! ',
 'there_',
 'get_',
 'been_',
 'into_',
 ' - ',
 'will_',
 'much_',
 'story_',
 'because_',
 'ing',
 'time_',
 'n_',
 'we_',
 'ed',
 'me_',
 ': ',
 'most_',
 'other_',
 'don',
 'do_',
 'm_',
 'es_',
 'how_',
 'also

# type of 'train_data'

In [11]:
type(train_data)

tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter

# see what is in train_data

In [12]:
for x,y in train_data:
    
    print(x)
    print(y)
    
    break

tf.Tensor(
[  62    9    4  301 4161  267  148    1 3240 1779  787    3   62 2315
  260 7968   21 1240   20  445   20  261  204    2    5   15  635 7742
  149   97  101   25  184 4127    3 4666 7913  690   25    9  176    1
  175  233   60   14  694    2   26   30 1858 3162   34    9 3636   40
  267   11   14 2362 8050    3   19  695   29   51 1816 7138    2   26
   14  101    1  397 3953  199  615    3   19  328    9 3362 4712 7961
    5    1  184    9   77 4167   64 1152    2   55   12  459 1461    6
 1713 3326   11 1147    7 1464 5691 7961  421 8026   38 1746 1074  618
   54   65    3 1987    2    1 3326   29  214    5  318 2338 3960 8039
    2    5  325    2   14   32    9 4157 7961   44  883 7975], shape=(138,), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)


# types contained in train_data 

In [13]:
for x,y in train_data:
    
    print(type(x))
    print(type(y))
    
    break

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


# this is how to take the array in an EagerTensor element in train_data

In [14]:
for x,y in train_data:
    
    print(x.numpy())
    
    break

[  62    9    4  301 4161  267  148    1 3240 1779  787    3   62 2315
  260 7968   21 1240   20  445   20  261  204    2    5   15  635 7742
  149   97  101   25  184 4127    3 4666 7913  690   25    9  176    1
  175  233   60   14  694    2   26   30 1858 3162   34    9 3636   40
  267   11   14 2362 8050    3   19  695   29   51 1816 7138    2   26
   14  101    1  397 3953  199  615    3   19  328    9 3362 4712 7961
    5    1  184    9   77 4167   64 1152    2   55   12  459 1461    6
 1713 3326   11 1147    7 1464 5691 7961  421 8026   38 1746 1074  618
   54   65    3 1987    2    1 3326   29  214    5  318 2338 3960 8039
    2    5  325    2   14   32    9 4157 7961   44  883 7975]


# take the text sequences (integer encoded sequences) in train_data into a list
# also take the labels into a list

In [15]:
train_text_sequences = list()
train_labels = list()
for x,y in train_data:
    train_text_sequences.append(x.numpy())
    train_labels.append(y.numpy())

# take the text sequences list and labels list into a dataframe

In [16]:
dct_train_labeled_text = {'text sequences': train_text_sequences, 'labels': train_labels}

In [17]:
df_train_labeled_text = pd.DataFrame(dct_train_labeled_text)

In [18]:
df_train_labeled_text.head()

Unnamed: 0,text sequences,labels
0,"[62, 9, 4, 301, 4161, 267, 148, 1, 3240, 1779,...",0
1,"[2130, 99, 12, 18, 55, 2554, 2, 3508, 5, 7995,...",0
2,"[4491, 40, 6, 1, 7450, 34, 4798, 80, 4, 238, 7...",1
3,"[398, 105, 14, 9, 4, 98, 13, 732, 22, 63, 333,...",0
4,"[62, 9, 33, 4, 132, 65, 3, 69, 2494, 1, 293, 5...",1


# the text sequences are of variable length

In [19]:
length_of_train_text = list()
for i,r in df_train_labeled_text.iterrows():
    length_of_train_text.append(len(r['text sequences']))

In [20]:
length_of_train_text[:5]

[138, 200, 708, 146, 126]

In [21]:
max(length_of_train_text)

3944

In [22]:
min(length_of_train_text)

11

# take the text sequences and labels in the test_data into lists

In [23]:
test_text_sequences = list()
test_labels = list()
for x,y in test_data:
    test_text_sequences.append(x.numpy())
    test_labels.append(y.numpy())

# convert these lists into a dataframe 

In [24]:
dct_test_labeled_text = {'text sequences': test_text_sequences, 'labels': test_labels}

In [25]:
df_test_labeled_text = pd.DataFrame(dct_test_labeled_text)

In [26]:
df_test_labeled_text.head()

Unnamed: 0,text sequences,labels
0,"[69, 5680, 22, 155, 6819, 7961, 6197, 309, 215...",1
1,"[12, 582, 448, 14, 44, 82, 1080, 2667, 464, 44...",0
2,"[62, 631, 32, 620, 3783, 8, 84, 3877, 190, 3, ...",1
3,"[6270, 762, 21, 5290, 6724, 3077, 8, 11, 59, 4...",0
4,"[8002, 7968, 111, 81, 33, 215, 7, 613, 82, 101...",0


# length of text sequences in test_data are of variable length

In [27]:
length_of_test_text = list()
for i,r in df_test_labeled_text.iterrows():
    length_of_test_text.append(len(r['text sequences']))

In [28]:
length_of_test_text[:5]

[226, 248, 203, 163, 159]

In [29]:
max(length_of_test_text)

3454

In [30]:
min(length_of_test_text)

9

# take the train text sequences and test text sequences as lists

In [31]:
X_train = list(df_train_labeled_text['text sequences'])

In [32]:
X_test = list(df_test_labeled_text['text sequences'])

# pad the text sequences
# add 0s to text sequences having length less than the max length
# truncate the text sequences having length more than the max length to max length

In [33]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# after padding see the lengths of text sequences in train and test data 

In [34]:
train_post_padding_length = list()
for a in X_train:
    train_post_padding_length.append(len(a))
    
test_post_padding_length = list()
for a in X_test:
    test_post_padding_length.append(len(a))

In [35]:
train_post_padding_length[:5]

[100, 100, 100, 100, 100]

In [36]:
test_post_padding_length[:5]

[100, 100, 100, 100, 100]

# build the neural network model_1 - global average pooling 1D

In [37]:
embedding_dim=16

model_1 = keras.Sequential([
  layers.Embedding(encoder.vocab_size, embedding_dim),
  layers.GlobalAveragePooling1D(),
  layers.Dense(16, activation='relu'),
  layers.Dense(1)
])

model_1.summary()

NameError: name 'keras' is not defined

# take the train labels as a numpy array

In [None]:
y_train = list(df_train_labeled_text['labels'])
y_train = np.array(y_train)

# take the test labels as a numpy array 

In [None]:
y_test = list(df_test_labeled_text['labels'])
y_test = np.array(y_test)

# compile the model_1

In [None]:
model_1.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

# train the model_1

In [None]:
history_1 = model_1.fit(X_train, y_train, epochs=5)

# evaluate the trained model_1 on test data

In [None]:
model_1.evaluate(X_test, y_test)

# build the neural network model_2 - lstm

In [None]:
embedding_dim=16

model_2 = keras.Sequential([
  layers.Embedding(encoder.vocab_size, embedding_dim),
  layers.LSTM(128),
  layers.Dense(16, activation='relu'),
  layers.Dense(1)
])

model_2.summary()

# compile the model_2

In [None]:
model_2.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

# train the model_2

In [None]:
history_2 = model_2.fit(X_train, y_train, epochs=5)

# evaluate the model_2

In [None]:
model_2.evaluate(X_test, y_test)