In [1]:
import pandas as pd

In [2]:
# Load the review data from the CSV file in the working directory.
reviews_df = pd.read_csv(
    'movie_data.csv',
    encoding='utf-8',
)

In [3]:
# Split the 'sentiment' column off as the label series.
# pop() removes the column from reviews_df in place, so re-running this cell
# without reloading the CSV raises KeyError.
label = reviews_df.pop('sentiment')

In [4]:
import tensorflow as tf

In [5]:
# Pair each row of the remaining feature columns with its label in a tf.data dataset.
ds_raw = tf.data.Dataset.from_tensor_slices(
    (reviews_df.values, label.values)
)

In [6]:
# Total number of examples. len() is used instead of Series.count() because
# count() skips missing values; with any NaN label it would be smaller than the
# number of elements in ds_raw (which is built from label.values), shrinking
# the shuffle buffer and skewing the split sizes derived from this number.
ds_raw_count = len(label)

In [7]:
# Fix TensorFlow's global random seed so repeated runs are comparable.
tf.random.set_seed(42)

In [8]:
# Shuffle with a buffer as large as the example count. Reshuffling on each
# iteration is disabled so every pass over ds_raw yields the same order, which
# the positional take/skip splits rely on to stay disjoint.
ds_raw = ds_raw.shuffle(
    ds_raw_count,
    reshuffle_each_iteration=False,
)

In [9]:
# Half of the examples are held out for testing; 80% of the remainder is used
# for training and whatever is left over becomes the validation set.
test_count = int(ds_raw_count * 0.5)
train_count = int((ds_raw_count - test_count) * 0.8)
valid_count = int(ds_raw_count - (test_count + train_count))

In [10]:
# Show the three split sizes as the cell output.
train_count, valid_count, test_count

(20000, 5000, 25000)

In [11]:
# Positional split of the shuffled dataset: the first test_count elements are
# the test set; of what follows, the first train_count elements are the
# training set and the rest is the validation set.
ds_train_and_valid_raw = ds_raw.skip(test_count)
ds_test_raw = ds_raw.take(test_count)
ds_valid_raw = ds_train_and_valid_raw.skip(train_count)
ds_train_raw = ds_train_and_valid_raw.take(train_count)

In [12]:
import tensorflow_datasets as tfds
from collections import Counter

In [13]:
# Running tally of token frequencies.
word_counts = Counter()
# Tokenizer used to split raw review text into tokens.
# NOTE(review): newer tensorflow_datasets releases appear to expose this API
# under `tfds.deprecated.text` - confirm against the installed version.
tokenizer = tfds.features.text.Tokenizer()

In [14]:
# Count tokens over the training split only; the validation and test splits
# are not iterated here, so they do not contribute to the counts.
for features, _ in ds_train_raw.as_numpy_iterator():
    # The first value of the feature row is the review text.
    word_counts.update(tokenizer.tokenize(features[0]))

In [15]:
# Print the text and label of the first five training examples.
for features, sentiment in ds_train_raw.take(5).as_numpy_iterator():
    print(features[0], sentiment)

b'When I fist watched the movie, I said to myself, "so a film can be made like this." Wong Kar Wai\'s gorgeous poetic love story captured me throughout and even after the film. I must admit this is one of the best love movies, maybe the best of all, I have ever watched. The content and the form overlaps perfectly. As watching the secret love we see the characters in bounded frames that limits their movements as well as their feelings. Beautiful camera angles and the lighting makes the feelings and the blues even touchable. I want to congratulate Christopher Doyle and Pin Bing Lee for their fantastic cinematography which creates the mood for love. Also the music defines the sadness of the love which plays along the beautiful slow motion frames and shows the characters in despairing moods. And of course the performances of the actors which makes the love so real. Eventually, all the elements in the film combined in a perfect way under the direction of WKW and give the audience the feelin

In [16]:
# Encoder that maps text to integer token ids, built from the tokens collected
# in word_counts.
text_encoder = tfds.features.text.TokenTextEncoder(word_counts)

In [17]:
def encode_text(text_tensor, label):
    """Encode the review text of one example as token ids.

    Args:
        text_tensor: Eager tensor holding one feature row; its first element
            is taken as the review text.
        label: Label of the example, passed through unchanged.

    Returns:
        A tuple of (result of text_encoder.encode on the review text, label).
    """
    raw_text = text_tensor.numpy()[0]
    return text_encoder.encode(raw_text), label

In [18]:
# Try the encoder eagerly on the first five training examples.
for text_tensor, sentiment in ds_train_raw.take(5):
    print(encode_text(text_tensor, sentiment))

([1, 2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 5, 12, 2, 32, 33, 17, 34, 35, 36, 5, 37, 24, 38, 39, 5, 37, 36, 40, 2, 41, 42, 4, 43, 44, 29, 5, 45, 46, 47, 48, 49, 5, 50, 24, 51, 52, 5, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 61, 59, 63, 64, 65, 66, 29, 5, 67, 68, 5, 63, 29, 5, 69, 30, 70, 2, 71, 8, 72, 73, 74, 29, 75, 76, 77, 78, 59, 79, 80, 81, 82, 5, 83, 78, 24, 84, 5, 85, 86, 5, 87, 36, 5, 24, 81, 88, 89, 5, 90, 91, 92, 56, 29, 93, 5, 53, 54, 94, 95, 96, 36, 97, 5, 98, 36, 5, 99, 81, 68, 5, 24, 10, 100, 101, 40, 5, 102, 54, 5, 12, 103, 54, 11, 104, 105, 106, 5, 107, 36, 108, 29, 109, 5, 110, 5, 111, 112, 24], <tf.Tensor: shape=(), dtype=int64, numpy=1>)
([113, 114, 115, 116, 117, 11, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 5, 128, 36, 5, 6, 129, 130, 131, 5, 132, 133, 134, 135, 6, 136, 137, 138, 54, 61, 139, 61, 136, 13, 140, 141, 6, 34, 10, 142, 57, 136, 143, 144, 8, 145, 146, 124, 147, 29, 148, 149

In [19]:
def encode_text_map(text_tensor, label):
    """Wrap encode_text in tf.py_function so it can be passed to Dataset.map.

    Args:
        text_tensor: Feature tensor of one example.
        label: Label tensor of the same example.

    Returns:
        The result of tf.py_function: two outputs, both declared as tf.int64
        (the encoded text and the label).
    """
    output_dtypes = [tf.int64, tf.int64]
    return tf.py_function(
        func=encode_text,
        inp=[text_tensor, label],
        Tout=output_dtypes,
    )

In [20]:
# Apply the same text encoding to the training, validation and test splits.
ds_train_encoded, ds_valid_encoded, ds_test_encoded = (
    split.map(encode_text_map)
    for split in (ds_train_raw, ds_valid_raw, ds_test_raw)
)

In [21]:
# Print the token ids and label of the first five encoded training examples.
for token_ids, sentiment in ds_train_encoded.take(5).as_numpy_iterator():
    print(token_ids, sentiment)

[  1   2   3   4   5   6   2   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31   5  12   2  32
  33  17  34  35  36   5  37  24  38  39   5  37  36  40   2  41  42   4
  43  44  29   5  45  46  47  48  49   5  50  24  51  52   5  53  54  55
  56  57  58  59  60  61  62  61  59  63  64  65  66  29   5  67  68   5
  63  29   5  69  30  70   2  71   8  72  73  74  29  75  76  77  78  59
  79  80  81  82   5  83  78  24  84   5  85  86   5  87  36   5  24  81
  88  89   5  90  91  92  56  29  93   5  53  54  94  95  96  36  97   5
  98  36   5  99  81  68   5  24  10 100 101  40   5 102  54   5  12 103
  54  11 104 105 106   5 107  36 108  29 109   5 110   5 111 112  24] 1
[113 114 115 116 117  11 118 119 120 121 122 123 124 125 126 127   5 128
  36   5   6 129 130 131   5 132 133 134 135   6 136 137 138  54  61 139
  61 136  13 140 141   6  34  10 142  57 136 143 144   8 145 146 124 147
  29 148 149   5 150  36  17  12 151 152 152 153 136

In [22]:
# Batch every split in groups of 32. The token-id sequence is padded along its
# single dimension ([-1]); the label is a scalar ([]) and needs no padding.
ds_train, ds_valid, ds_test = (
    encoded.padded_batch(32, padded_shapes=([-1], []))
    for encoded in (ds_train_encoded, ds_valid_encoded, ds_test_encoded)
)

In [23]:
# Print the first padded sequence of each of the first five training batches.
for batch_tokens, _ in ds_train.take(5).as_numpy_iterator():
    print(batch_tokens[0])

[  1   2   3   4   5   6   2   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31   5  12   2  32
  33  17  34  35  36   5  37  24  38  39   5  37  36  40   2  41  42   4
  43  44  29   5  45  46  47  48  49   5  50  24  51  52   5  53  54  55
  56  57  58  59  60  61  62  61  59  63  64  65  66  29   5  67  68   5
  63  29   5  69  30  70   2  71   8  72  73  74  29  75  76  77  78  59
  79  80  81  82   5  83  78  24  84   5  85  86   5  87  36   5  24  81
  88  89   5  90  91  92  56  29  93   5  53  54  94  95  96  36  97   5
  98  36   5  99  81  68   5  24  10 100 101  40   5 102  54   5  12 103
  54  11 104 105 106   5 107  36 108  29 109   5 110   5 111 112  24   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   

[ 572 5029 5030 1067 5031 4229  351   11 2337 2458   54 5032   29 3154
    5  530 5033 5034    2  137 1155    8  125   29    5 5035 5036 2337
 2458    2  115  116 4336  237 1852  125  146  908 5037 5038   12   34
  585  264  134 2550 5037 5038 1277   54  167  867 2529 5039 1340  503
  337  189 4096  726    2  309  505 5040 4021    8  294    5   12  478
 2723  152  152 3179 5041   34 3496   11 2856   36 5042  202  337  751
   54   11 5043   57 5044    5 5045   29    5 2353   21  525 5046   34
 5047  256 5048    5 5049  141  615  116 1166  825  139  237    5 1282
   34  249   35  357 4825    8  941 4547 5050 3920   51 2546  949   29
    5  560   36    5 5046   29    5 5041   29   36    5 5043  152  152
   43   12   34 5051 5052   78   11 5053   12   29 5054  367   54 2613
  460 1173  616  475 1337    8   14  794  625   43   25   13   14   11
  287  998    8  440  185  207  237   40   68 1448 2800   43   12  512
   11  458 5055  185    5  455  237  460 2898  264   11 5056 5057   12
  374 

In [24]:
from tensorflow.keras import Sequential

In [25]:
# Start with an empty Sequential model; layers are appended in the cells below.
conv_bidir_lstm_model = Sequential()

In [26]:
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional, GRU, Dense, Flatten

In [27]:
# Embedding layer: maps each token id to a 32-dimensional vector.
# The vocabulary size is the number of distinct tokens plus 2 - presumably the
# extra ids cover padding and out-of-vocabulary tokens; confirm against the
# TokenTextEncoder in use.
embedding_size = 32
vocab_size = len(word_counts) + 2
conv_bidir_lstm_model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
    )
)

In [28]:
# 1D convolution over the embedded sequence: 128 filters spanning 3 positions each.
conv_bidir_lstm_model.add(
    Conv1D(
        filters=128,
        kernel_size=3,
        activation='relu',
    )
)

In [29]:
# Max pooling with a window of 5 along the sequence dimension.
conv_bidir_lstm_model.add(
    MaxPooling1D(pool_size=5)
)

In [30]:
# Bidirectional LSTM with 16 units per direction. return_sequences=True keeps
# the full output sequence so the next recurrent layer can consume it.
conv_bidir_lstm_model.add(
    Bidirectional(
        LSTM(
            units=16,
            dropout=0.5,
            recurrent_dropout=0.5,
            return_sequences=True,
        )
    )
)

In [31]:
# Second LSTM with 32 units. return_sequences=False emits only the final output,
# collapsing the sequence into one vector per example.
conv_bidir_lstm_model.add(
    LSTM(
        units=32,
        dropout=0.5,
        recurrent_dropout=0.5,
        return_sequences=False,
    )
)

In [32]:
# Fully connected hidden layer with 64 ReLU units.
conv_bidir_lstm_model.add(
    Dense(64, activation='relu')
)

In [33]:
# Output layer: a single sigmoid unit, matching the binary cross-entropy loss
# the model is compiled with.
conv_bidir_lstm_model.add(
    Dense(1, activation='sigmoid')
)

In [34]:
# Print the layer-by-layer overview (output shapes and parameter counts).
conv_bidir_lstm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2791616   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         12416     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 32)          18560     
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 64)                2112      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6

In [35]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
conv_bidir_lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [36]:
from tensorflow.keras.callbacks import TensorBoard
import time

In [37]:
# TensorBoard callback writing logs (with per-epoch histograms and the model
# graph) under ./logs/conv_bidir_lstm_model.
# The path is built with os.path.join instead of the literal
# '.\logs\conv_bidir_lstm_model': that literal relies on the invalid escape
# sequences '\l' and '\c' (deprecated in Python) and uses a Windows-only
# separator, so on other platforms it would create a single directory whose
# name contains backslashes. On Windows the resulting path is unchanged.
import os

tensorboard_callback = TensorBoard(
    log_dir=os.path.join('.', 'logs', 'conv_bidir_lstm_model'),
    histogram_freq=1,
    write_graph=True,
)

In [38]:
# Train for 8 epochs on the batched training set, validating on ds_valid after
# each epoch and logging through the TensorBoard callback.
conv_bidir_lstm_model.fit(
    ds_train,
    epochs=8,
    validation_data=ds_valid,
    callbacks=[tensorboard_callback],
)

Epoch 1/8
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x287ecbb9e80>

In [39]:
# Evaluate the trained model on the held-out test batches; the two returned
# values are the loss and the accuracy metric configured in compile().
loss, accuracy = conv_bidir_lstm_model.evaluate(ds_test)

