<a href="https://colab.research.google.com/github/rahul-bellam/nlp-lab/blob/main/lab6_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow




In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

In [3]:

# Load Dataset
df = pd.read_csv("/content/IMDB Dataset.csv")

# Keep 1.1K Positive & 1.1K Negative Reviews
df_positive = df[df['sentiment'] == 'positive'].sample(n=1100, random_state=1)
df_negative = df[df['sentiment'] == 'negative'].sample(n=1100, random_state=1)
df = pd.concat([df_positive, df_negative]).sample(frac=1, random_state=1)

df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

# Train-Test Split
train_data = df.iloc[:2000]
test_data = df.iloc[2000:]


In [16]:
df.describe()

Unnamed: 0,sentiment
count,2200.0
mean,0.5
std,0.500114
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [4]:
X_train, y_train = train_data['review'].values, train_data['sentiment'].values
X_test, y_test = test_data['review'].values, test_data['sentiment'].values

In [5]:
# Tokenization & Padding
vocab_size = 10000
max_length = 500
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

In [6]:
# 1️⃣ Define the Model
model = Sequential()

model.add(SimpleRNN(100, activation='relu', input_shape=(max_length, 1)))  # RNN Layer with 100 units
model.add(Dense(250, activation='relu'))  # Fully connected layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# 2️⃣ Compile the Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 3️⃣ Print Model Summary
print(model.summary())

  super().__init__(**kwargs)


None


In [7]:
# 4️⃣ Reshape Input for RNN (RNN expects 3D inputs: batch_size, timesteps, features)
X_train_rnn = np.expand_dims(X_train_padded, axis=-1)
X_test_rnn = np.expand_dims(X_test_padded, axis=-1)

# 5️⃣ Train the Model
model.fit(X_train_rnn, y_train, validation_data=(X_test_rnn, y_test), epochs=10, batch_size=128, verbose=2)


Epoch 1/10
16/16 - 11s - 700ms/step - accuracy: 0.4965 - loss: 2.8015 - val_accuracy: 0.4750 - val_loss: 1.2251
Epoch 2/10
16/16 - 3s - 172ms/step - accuracy: 0.4975 - loss: 1.4406 - val_accuracy: 0.4600 - val_loss: 2.3468
Epoch 3/10
16/16 - 3s - 182ms/step - accuracy: 0.5140 - loss: 1.5988 - val_accuracy: 0.4850 - val_loss: 2.3900
Epoch 4/10
16/16 - 6s - 380ms/step - accuracy: 0.5120 - loss: 2.0270 - val_accuracy: 0.4750 - val_loss: 1.6090
Epoch 5/10
16/16 - 3s - 176ms/step - accuracy: 0.5125 - loss: 1.8195 - val_accuracy: 0.4750 - val_loss: 2.3254
Epoch 6/10
16/16 - 5s - 320ms/step - accuracy: 0.5135 - loss: 2.1974 - val_accuracy: 0.4650 - val_loss: 6.8143
Epoch 7/10
16/16 - 5s - 318ms/step - accuracy: 0.5120 - loss: 1.6682 - val_accuracy: 0.4750 - val_loss: 3.1331
Epoch 8/10
16/16 - 3s - 180ms/step - accuracy: 0.5140 - loss: 1.6922 - val_accuracy: 0.4750 - val_loss: 1.9336
Epoch 9/10
16/16 - 6s - 358ms/step - accuracy: 0.5170 - loss: 0.9307 - val_accuracy: 0.4700 - val_loss: 1.1817


<keras.src.callbacks.history.History at 0x7e959a5bc710>

In [8]:
# 6️⃣ Evaluate the Model
scores = model.evaluate(X_test_rnn, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

# 7️⃣ Save the Model
model.save("sentiment_model_no_embedding.h5")



Accuracy: 47.00%


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

embedding_dim = 100
vocab_size = 10000
max_length = 500

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.build(input_shape=(None, max_length))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


print(model.summary())



None


In [13]:
# 7️⃣ Train the Model
model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=5, batch_size=128, verbose=2)



Epoch 1/5
16/16 - 17s - 1s/step - accuracy: 0.5055 - loss: 1.0078 - val_accuracy: 0.4650 - val_loss: 0.8466
Epoch 2/5
16/16 - 15s - 917ms/step - accuracy: 0.6620 - loss: 0.6459 - val_accuracy: 0.5100 - val_loss: 1.1295
Epoch 3/5
16/16 - 11s - 701ms/step - accuracy: 0.7975 - loss: 0.4659 - val_accuracy: 0.6550 - val_loss: 0.6567
Epoch 4/5
16/16 - 10s - 650ms/step - accuracy: 0.9770 - loss: 0.1395 - val_accuracy: 0.6150 - val_loss: 0.6494
Epoch 5/5
16/16 - 10s - 627ms/step - accuracy: 0.9960 - loss: 0.0332 - val_accuracy: 0.6500 - val_loss: 0.7143


<keras.src.callbacks.history.History at 0x7e9595247150>

In [14]:

scores = model.evaluate(X_test_padded, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

model.save("sentiment_model_with_embedding.h5")




Accuracy: 65.00%
