In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import CountVectorizer

tf.random.set_seed(42)

## IMDB movie reviews

## Retrieving and preparing the Data

We will work with the IMDb movie reviews data.

In [56]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Read in the IMDB Dataset into "data". Do not set an index column

# YOUR CODE HERE
#raise NotImplementedError()
data=pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')


In [None]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [58]:
# Replace all "negative" and "positive" sentiment values with o and 1 respectively.
# You can use a simple logical operator instead of label encodeing.
data['sentiment']=data['sentiment'].replace({'positive':1,'negative':0})
data.head()
# YOUR CODE HERE
#raise NotImplementedError()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [59]:
# Get the dependent data and assign to y
# YOUR CODE HERE
#raise NotImplementedError()
y=data.pop('sentiment')
print(y[0:10])

0    1
1    1
2    1
3    0
4    1
5    1
6    1
7    0
8    0
9    1
Name: sentiment, dtype: int64


In [60]:
from sklearn.model_selection import train_test_split
# Split the X data (data['review']) and y data into X_train, X_test, y_train, and y_test
# With a test size of 0.2 and a random_state of 42
# YOUR CODE HERE
#raise NotImplementedError()
X_train, X_test,y_train, y_test = train_test_split(data['review'],y, test_size=0.2, random_state=42)

In [61]:
print(f"""
Train samples: {X_test.shape[0]}
Test samples: {y_test.shape[0]}
"""
)


Train samples: 10000
Test samples: 10000



In [None]:
y_train

39084    0
30892    0
45275    1
16398    1
13653    1
        ..
11284    0
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 39997, dtype: int64

Inspect the frequence of each sentiment in the traning dataset (it is balanced!)

In [62]:
# Calculate the training data's frequency and assign the output to "frequency"

# YOUR CODE HERE
#raise NotImplementedError()
frequency = X_train.value_counts()
print(frequency)

review
Loved today's show!!! It was a variety and not solely cooking (which would have been great too). Very stimulating and captivating, always keeping the viewer peeking around the corner to see what was coming up next. She is as down to earth and as personable as you get, like one of us which made the show all the more enjoyable. Special guests, who are friends as well made for a nice surprise too. Loved the 'first' theme and that the audience was invited to play along too. I must admit I was shocked to see her come in under her time limits on a few things, but she did it and by golly I'll be writing those recipes down. Saving time in the kitchen means more time with family. Those who haven't tuned in yet, find out what channel and the time, I assure you that you won't be disappointed.                                                                                                                                                                                                         

In [63]:
# Let's turn the target into a dummy vector

# YOUR CODE HERE
#raise NotImplementedError()
y_train = pd.get_dummies(y_train).to_numpy()
y_test = pd.get_dummies(y_test).to_numpy()

In [None]:
y_train.shape

(40000, 2)

## Unigram Multi-hot Encoding Baseline

Next, let us see the performance of a neural net that is trained from the scratch using multi-hot encoding.

In [64]:
# Set the maximum number of tokens to 2412.
# Also set up our Text Vectorization layer using multi-hot encoding

# YOUR CODE HERE
#raise NotImplementedError()
max_tokens = 2412
text_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="multi_hot")

In [65]:
# The vocabulary that will be indexed is given by the text corpus on our train dataset
# YOUR CODE HERE
#raise NotImplementedError()
text_vectorization.adapt(X_train)

In [66]:
# We vectorize our input
# YOUR CODE HERE
#raise NotImplementedError()
X_train=text_vectorization(X_train)
X_test=text_vectorization(X_test)


In [67]:
# Now create your model. start with 32 dense relu layers, a dropout layer of 0.5, and a final softmax layer

# YOUR CODE HERE
#raise NotImplementedError()
inputs = keras.Input(shape=(max_tokens, ))
x = keras.layers.Dense(32, activation="relu")(inputs)
x = keras.layers.Dropout(0.5)(x)
outputs = keras.layers.Dense(2, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 2412)]            0         
                                                                 
 dense_4 (Dense)             (None, 32)                77216     
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 77282 (301.88 KB)
Trainable params: 77282 (301.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [68]:
# Compile your model

# YOUR CODE HERE
#raise NotImplementedError()
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])

In [69]:
# YOUR CODE HERE
#raise NotImplementedError()
model.fit(x=X_train, y=y_train,
          validation_data=(X_test, y_test),
          epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x788d8010bee0>

In [70]:
# Evaluate your model. You should be able to get your model to 85% at this point
# YOUR CODE HERE
#raise NotImplementedError()

model.evaluate(x=X_test, y=y_test)



[0.3757628798484802, 0.873199999332428]

In [87]:
#raw_text_data = tf.convert_to_tensor([["This movie was excellent"],])
raw_text_data = tf.convert_to_tensor([["This movie was bad"],])

vect_data = text_vectorization(raw_text_data)
predictions = model.predict(vect_data)
print(predictions)
print(f"{float(predictions[0,0] * 100):.2f} % Negative")
print(f"{float(predictions[0,1] * 100):.2f} % Positive")


[[0.97292274 0.02707731]]
97.29 % Negative
2.71 % Positive


In [None]:
# Begin your model here

# YOUR CODE HERE
#raise NotImplementedError()