In [205]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds

# Download the Dataset

In [206]:
imdb, info=tfds.load("imdb_reviews",with_info=True,as_supervised=True)

# preview the raw format 

In [207]:
for example in imdb['train'].take(2):
  print(example)

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

# Splitting the dataset

In [208]:
# Get the train and test sets
train_data,test_data=imdb['train'],imdb['test']

In [209]:
# Initialize sentences and labels lists
training_sentences=[]
training_labels=[]
testing_sentences=[]
testing_labels=[]

In [210]:
# Loop over all training examples and save the sentences and labels
for s,l in train_data:
    training_sentences.append(s.numpy().decode('utf8'))
    training_labels.append(l.numpy())

In [211]:
# Loop over all test examples and save the sentences and labels
for s,l in test_data:
    testing_sentences.append(s.numpy().decode('utf8'))
    testing_labels.append(l.numpy())

In [212]:
# Convert labels lists to numpy array
training_labels_final=np.array(training_labels)
testing_labels_final=np.array(testing_labels)

# Generate Padded Sequences

In [213]:
vocab_size=10000
embedding_dim=16
max_length=120
trunc_type='post'
oov_tok="<OOV>"

In [214]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [215]:
tokenizer=Tokenizer(num_words=vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index=tokenizer.word_index
sequences=tokenizer.texts_to_sequences(training_sentences)
padded=pad_sequences(sequences,maxlen=max_length,truncating=trunc_type)
testing_sequences=tokenizer.texts_to_sequences(testing_sentences)
testing_padded=pad_sequences(testing_sequences,maxlen=max_length)

# Build and Compile the Model

In [216]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),  # Use global average pooling instead of flatten
    tf.keras.layers.Dense(6, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.5)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.7),  # Increased dropout rate
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [217]:
model.build((None, max_length))  # Specify input shape here

In [218]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [219]:
model.summary()

# Train the Model

In [220]:
num_epochs=25
model.fit(padded,training_labels_final,epochs=num_epochs,
         validation_data=(testing_padded,testing_labels_final))

Epoch 1/25
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6518 - loss: 2.4495 - val_accuracy: 0.8329 - val_loss: 0.5257
Epoch 2/25
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7891 - loss: 0.5038 - val_accuracy: 0.8497 - val_loss: 0.4007
Epoch 3/25
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8063 - loss: 0.4262 - val_accuracy: 0.8380 - val_loss: 0.3847
Epoch 4/25
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8224 - loss: 0.3976 - val_accuracy: 0.8512 - val_loss: 0.3628
Epoch 5/25
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8266 - loss: 0.3723 - val_accuracy: 0.8458 - val_loss: 0.3646
Epoch 6/25
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8381 - loss: 0.3445 - val_accuracy: 0.8408 - val_loss: 0.3761
Epoch 7/25
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x2799de99010>

# Visualize Word Embeddings

In [221]:
# Get the embedding layer from the model
e=model.layers[0]

In [222]:
# Get the weights of the embedding layer
weights=e.get_weights()[0]

In [223]:
print(weights)

[[-4.9490377e-04 -4.5261849e-03 -3.4274138e-03 ... -7.1180877e-03
  -1.9360798e-03  1.7043961e-02]
 [-1.2369893e-03  4.1365912e-03 -8.4127551e-03 ...  3.9281595e-02
  -2.0472264e-02  1.0298745e-02]
 [-4.0566631e-02 -2.4201242e-02  2.0007195e-02 ...  1.5141465e-02
  -4.9818117e-02  3.4140926e-02]
 ...
 [ 1.0710270e-01  1.3043296e-01 -1.7831598e-01 ... -2.1687101e-01
   2.2218329e-01 -4.5774490e-02]
 [-5.7224602e-01 -5.4475617e-01  3.7928215e-01 ...  4.9360916e-01
  -5.5924988e-01  5.4991490e-01]
 [-2.9065998e-02 -1.7957645e-02  4.5398753e-03 ... -6.4608596e-02
  -7.5748698e-03  7.9571851e-02]]


In [224]:
weights.shape

(10000, 16)

In [225]:
import io

In [226]:
# Get the index-word dictionary
reverse_word_index=tokenizer.index_word

In [227]:
# Open writeable files
out_v=io.open('vecs.tsv','w',encoding='utf-8')
out_m=io.open('meta.tsv','w',encoding='utf-8')

In [228]:
for word_num in range(1,vocab_size):
    # Get the word associated at the current index
    word= reverse_word_index[word_num]
    # Get the embedding weights associated with the current index
    embeddings=weights[word_num]
    # Write the word name
    out_m.write(word+"\n")
    # Write the word embedding
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [229]:
out_v.close()
out_m.close()

In [230]:
# Import files utilities in Colab
try:
    from google.colab import files
except ImportError:
    pass
# Download the files
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')