<a href="https://colab.research.google.com/github/r-zeeshan/imdb-review-classifier/blob/main/imdb_review_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import tensorflow as tf
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, Input, Conv1D, GlobalMaxPool1D, Dropout, MaxPool1D
from sklearn.preprocessing import LabelEncoder

In [2]:
!wget --no-check-certificate "https://docs.google.com/uc?export=download&id=1rxUd_7UpGz4WVGM1Bjb2uJ74ummZlfTy" -O imdb.csv

--2023-03-05 08:54:08--  https://docs.google.com/uc?export=download&id=1rxUd_7UpGz4WVGM1Bjb2uJ74ummZlfTy
Resolving docs.google.com (docs.google.com)... 108.177.112.138, 108.177.112.113, 108.177.112.100, ...
Connecting to docs.google.com (docs.google.com)|108.177.112.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0k-7s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/pffd8hstqnn7e4iekeii3fvdt8fpa508/1678006425000/14298511845850949288/*/1rxUd_7UpGz4WVGM1Bjb2uJ74ummZlfTy?e=download&uuid=65d83831-23fc-46c3-91fc-1c01705f2fc9 [following]
--2023-03-05 08:54:14--  https://doc-0k-7s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/pffd8hstqnn7e4iekeii3fvdt8fpa508/1678006425000/14298511845850949288/*/1rxUd_7UpGz4WVGM1Bjb2uJ74ummZlfTy?e=download&uuid=65d83831-23fc-46c3-91fc-1c01705f2fc9
Resolving doc-0k-7s-docs.googleusercontent.com (doc-0k-7s-docs.googleusercontent.com)... 209.85.147.132, 2607:f

In [3]:
df = pd.read_csv("/content/imdb.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
label_encoder = LabelEncoder()
df["sentiment"] = label_encoder.fit_transform(df["sentiment"])

In [5]:
df["sentiment"].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [6]:
label_encoder.classes_

array(['negative', 'positive'], dtype=object)

In [7]:
data_sentences = df["review"].to_list()
data_labels = df["sentiment"].to_list()

In [8]:
train_sentences, X_test, train_labels, y_test = train_test_split(data_sentences, data_labels, test_size = 0.3)

In [9]:
test_sentences, val_sentences, test_labels, val_labels = train_test_split(X_test, y_test, test_size = 0.5)

In [10]:
len(train_sentences), len(test_sentences), len(val_sentences)

(35000, 7500, 7500)

In [11]:
# Calculate the average sentence length
sent_lens = [len(sentence.split()) for sentence in train_sentences]
avg_sent_len = np.mean(sent_lens)
output_seq_len = int(np.percentile(sent_lens, 98))
output_seq_len

778

In [12]:
vocab_size = 49600 # From Kaggle Dataset page

In [13]:
# Creating fast loading dataset with tf.data api

train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels)).batch(128).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels)).batch(128).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels)).batch(128).prefetch(tf.data.AUTOTUNE)

In [14]:
# Creating a custom text vectorizer layer using tf.keras.layers.TextVectorizer
text_vectorizer = TextVectorization(max_tokens = vocab_size, output_mode = "int", output_sequence_length=output_seq_len)
text_vectorizer.adapt(train_sentences)

In [15]:
text_vocab = text_vectorizer.get_vocabulary()
print(f"No. of words in vocab: {len(text_vocab)}")
print(f"Most common words in vocab: {text_vocab[:5]}")
print(f"Least common words in data: {text_vocab[-5:]}")

No. of words in vocab: 49600
Most common words in vocab: ['', '[UNK]', 'the', 'a', 'and']
Least common words in data: ['emotionalbr', 'emorys', 'emmerson', 'emits', 'emit']


In [16]:
# Creating a custom text embedding layer using tf.keras.layers.Embedding
text_embedding = Embedding(input_dim = len(text_vocab),
                           output_dim = 512,
                           mask_zero=True,
                           name="text_embedding")

In [32]:
# Creating the final model
token_input = Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(token_input)
x = text_embedding(x)
x = Conv1D(128, kernel_size=12, padding="same", activation="relu")(x)
x = MaxPool1D()(x)
x = Conv1D(128, kernel_size=12, padding="same", activation="relu")(x)
x = GlobalMaxPool1D()(x)
output = Dense(1, activation="sigmoid")(x)


model = tf.keras.Model(inputs = token_input,
                       outputs = output)

In [33]:
model.compile(loss="binary_crossentropy",
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [34]:
# Setup EarlyStopping callback to stop training if model's val_loss doesn't improve for 3 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", # watch the val loss metric
                                                  patience=3) # if val loss decreases for 3 epochs in a row, stop training

# Creating learning rate reduction callback
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss",  
                                                 factor=0.2, # multiply the learning rate by 0.2 (reduce by 5x)
                                                 patience=2,
                                                 verbose=1, # print out when learning rate goes down 
                                                 min_lr=1e-7)

In [35]:
history = model.fit(train_dataset,
                    epochs=50,
                    validation_data= val_dataset,
                    callbacks=[early_stopping, reduce_lr])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 5/50


In [36]:
model.evaluate(test_dataset)



[0.9731510877609253, 0.8704000115394592]

In [54]:
test_sentence = """
                I wish I had enjoyed it, given the Tim Burton credential, and my moviegoer love for this director and his nightmarish visions and lovable weird characters. But this is no Tim Burton per se creation, it is a Netflix series, Netflix flows in its veins from the start, from the trailer and the "Paint it Black" cover (which somehow sounds like the cover of the Westworld cover), and the buzz around it. I was expecting "Wednesday" but watching it was like a cold shower, going back to reality, Netflix reality that is : pleasing the majority of its users, transforming the Addams girl into an obnoxious teen in school with her peers, obnoxious teens. Then there was no point in the Addams pariah basis. And no point in keeping watching the series.
                """

In [52]:
def make_preds(model, sentence, label_encoder):

    pred = int(model.predict([sentence])[0])

    return label_encoder.inverse_transform([pred])[0]


In [55]:
make_preds(model, test_sentence, label_encoder)



'negative'