In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import CountVectorizer

tf.random.set_seed(42)

## IMDB movie reviews

## Retrieving and preparing the Data

We will work with the IMDb movie reviews data.

In [61]:
# Read in the IMDB Dataset into "data". Do not set an index column

# YOUR CODE HERE

data = pd.read_csv("IMDB Dataset.csv")


In [62]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [63]:
# Replace all "negative" and "positive" sentiment values with o and 1 respectively.
# You can use a simple logical operator instead of label encodeing. 

# YOUR CODE HERE
replacements = {'positive': 1 , 'negative': 0}

# replace values using the .map() method
data['sentiment'] = data['sentiment'].map(replacements).fillna(data['sentiment'])
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [65]:
# Get the dependent data and assign to y
# YOUR CODE HERE
y= data['sentiment']

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64

In [67]:
from sklearn.model_selection import train_test_split

# Split the X data (data['review']) and y data into X_train, X_test, y_train, and y_test
# With a test size of 0.2 and a random_state of 42

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

In [68]:
print(f"""
Train samples: {X_test.shape[0]}
Test samples: {y_test.shape[0]}
"""
)


Train samples: 10000
Test samples: 10000



In [69]:
y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

Inspect the frequence of each sentiment in the traning dataset (it is balanced!)

In [74]:
# Calculate the training data's frequency and assign the output to "frequency"

# YOUR CODE HERE
frequency = X_train['sentiment'].value_counts() / X_train.shape[0]

print(frequency)

0    0.500975
1    0.499025
Name: sentiment, dtype: float64


In [78]:
# Let's turn the target into a dummy vector

# YOUR CODE HERE
y_train = pd.get_dummies(X_train['sentiment']).to_numpy()


In [79]:
y_train.shape

(40000, 2)

## Unigram Multi-hot Encoding Baseline

Next, let us see the performance of a neural net that is trained from the scratch using multi-hot encoding. 

In [80]:
# Set the maximum number of tokens to 2412. 
# Also set up our Text Vectorization layer using multi-hot encoding

# YOUR CODE HERE
max_tokens = 2412
text_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="multi_hot")

In [82]:
# The vocabulary that will be indexed is given by the text corpus on our train dataset
# YOUR CODE HERE
text_vectorization.adapt(X_train['review'])
text_vectorization.get_vocabulary()[-20:]

['believed',
 'albert',
 'witness',
 'morgan',
 'exploitation',
 'edward',
 'draw',
 'costume',
 'costs',
 'continuity',
 'asleep',
 'angles',
 'amateur',
 'promising',
 'display',
 'deadly',
 'hide',
 'haunted',
 'gordon',
 'figured']

In [85]:
# We vectorize our input
# YOUR CODE HERE
X_train = text_vectorization(X_train['review'])
X_test = text_vectorization(X_test['review'])

In [98]:
# Now create your model. start with 32 dense relu layers, a dropout layer of 0.5, and a final softmax layer

inputs = keras.Input(shape=(max_tokens, ))
x = keras.layers.Dense(32, activation="relu")(inputs)
x = keras.layers.Dropout(0.5)(x)
outputs = keras.layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs, outputs)


model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 2412)]            0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                77216     
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 66        
Total params: 77,282
Trainable params: 77,282
Non-trainable params: 0
_________________________________________________________________


In [99]:
# Compile your model

# YOUR CODE HERE
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])

In [101]:
# YOUR CODE HERE
model.fit(x=X_train, y=y_train,
          validation_data=(X_test, y_test),
          epochs=10,
          batch_size=32,)

Epoch 1/10

ValueError: in user code:

    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1330 test_function  *
        return step_function(self, iterator)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1320 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1313 run_step  **
        outputs = model.test_step(data)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1270 test_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/compile_utils.py:201 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/codio/.local/lib/python3.6/site-packages/keras/losses.py:141 __call__
        losses = call_fn(y_true, y_pred)
    /home/codio/.local/lib/python3.6/site-packages/keras/losses.py:245 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /home/codio/.local/lib/python3.6/site-packages/keras/losses.py:1666 categorical_crossentropy
        y_true, y_pred, from_logits=from_logits, axis=axis)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /home/codio/.local/lib/python3.6/site-packages/keras/backend.py:4839 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py:1161 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 2) are incompatible


In [102]:
# Evaluate your model. You should be able to get your model to 85% at this point
# YOUR CODE HERE
model.evaluate(x=X_test, y=y_test)

ValueError: in user code:

    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1330 test_function  *
        return step_function(self, iterator)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1320 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1313 run_step  **
        outputs = model.test_step(data)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/training.py:1270 test_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    /home/codio/.local/lib/python3.6/site-packages/keras/engine/compile_utils.py:201 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/codio/.local/lib/python3.6/site-packages/keras/losses.py:141 __call__
        losses = call_fn(y_true, y_pred)
    /home/codio/.local/lib/python3.6/site-packages/keras/losses.py:245 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /home/codio/.local/lib/python3.6/site-packages/keras/losses.py:1666 categorical_crossentropy
        y_true, y_pred, from_logits=from_logits, axis=axis)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /home/codio/.local/lib/python3.6/site-packages/keras/backend.py:4839 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /home/codio/.local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py:1161 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 2) are incompatible


## Extend Baseline Model

Let's create more complex models to increase the accuracy on our test sample. Try combining different models by changing:
- Number of hidden units
- Adding another hidden layer.
- Changing the number of epochs.
- Using bigrams instead of unigrams.

To guide your search for the best parameters, note how the accuracy changes on both train and test data.

In [94]:
# Begin your model here

# YOUR CODE HERE
inputs = keras.Input(shape=(max_tokens, ))
x = keras.layers.Dense(40, activation="relu")(inputs)
x = keras.layers.Dropout(0.2)(x)
outputs = keras.layers.Dense(5, activation="softmax")(x)

model = keras.Model(inputs, outputs)


model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 2412)]            0         
_________________________________________________________________
dense_2 (Dense)              (None, 40)                96520     
_________________________________________________________________
dropout_1 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 205       
Total params: 96,725
Trainable params: 96,725
Non-trainable params: 0
_________________________________________________________________
