In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import wandb
from datasets import load_dataset

# Fetch the 'imdb' dataset through the Hugging Face `datasets` loader
dataset = load_dataset('imdb')

# Build one pandas DataFrame per split (train first, then test)
train_df, test_df = (pd.DataFrame(dataset[split]) for split in ('train', 'test'))

# Print five randomly drawn training rows as a quick sanity check
print(train_df.sample(n=5))
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense,Activation
from scipy import stats



# Length handed to pad_sequences as `maxlen` for both splits
max_sequence_length = 130

# Keras tokenizer with its vocabulary capped via num_words=5000,
# fitted on the training reviews only
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])

# Encode each split as integer sequences, then pad at the end ('post')
# so every row has max_sequence_length entries
X_train, X_test = (
    pad_sequences(
        tokenizer.texts_to_sequences(split_df['text']),
        maxlen=max_sequence_length,
        padding='post',
    )
    for split_df in (train_df, test_df)
)

# Sentiment targets (the `label` column) as NumPy arrays
y_train, y_test = train_df['label'].values, test_df['label'].values

In [None]:
# Model hyper-parameters.
# The embedding table size is taken from the tokenizer instead of repeating the
# literal 5000: if the tokenizer's `num_words` is ever changed, a stale
# hard-coded value would leave token ids outside the embedding's range.
vocab_size = tokenizer.num_words
embedding_dim = 100

# Baseline network: Embedding -> SimpleRNN(32, relu) -> single sigmoid unit
model = Sequential()
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        # Same value as len(X_train[0]), but read from the padding setting
        # rather than from the first data row.
        input_length=max_sequence_length,
    )
)
model.add(SimpleRNN(32, return_sequences=False, activation="relu"))
model.add(Dense(1, activation='sigmoid'))  # binary sentiment output

# Compile for binary classification
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split is also used as validation data here, so the
# validation metrics and the final test metrics come from the same examples.
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Another 3-epoch fit on the same `model` object (it has already been fitted
# once above), this time with batch size 128 and keeping the returned History
# so the accuracy/loss curves can be plotted from it.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_data=(X_test, y_test),
    verbose=1,
)

In [None]:

# Accuracy per epoch from the most recent `history`: training vs. validation
fig, ax = plt.subplots()
for metric_key, curve_label in (("accuracy", "Train"), ("val_accuracy", "Test")):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title("Accuracy")
ax.set_ylabel("Accuracy")
ax.set_xlabel("Epochs")
ax.legend()
plt.show()


In [None]:

# Loss per epoch from the most recent `history`: training vs. validation
fig, ax = plt.subplots()
train_curve = history.history["loss"]
val_curve = history.history["val_loss"]
ax.plot(train_curve, label="Train")
ax.plot(val_curve, label="Test")
ax.set(title="Loss", ylabel="Loss", xlabel="Epochs")
ax.legend()
plt.show()


# EDA



# Word count of every training review (tokens separated by whitespace)
train_df['text_length'] = [len(review.split()) for review in train_df['text']]

# Histogram of the word counts
length_ax = train_df['text_length'].hist(bins=50, color='skyblue', edgecolor='black')
length_ax.set_title('Text Length Distribution in IMDb Dataset')
length_ax.set_xlabel('Number of Words in Review')
length_ax.set_ylabel('Frequency')
plt.show()

# Quantiles of the word counts, useful when picking max_sequence_length
length_quantiles = train_df['text_length'].quantile([0.25, 0.5, 0.75, 0.9, 0.95, 1.0])
print(f"Quantiles of text length: \n{length_quantiles}")


In [None]:

# Character-length distribution for both the train and test splits.
# The counts go into a dedicated `char_length` column: `text_length` already
# holds the per-review WORD count for train_df, and the review-length boxplot
# later in the notebook labels that column as "number of words". Overwriting
# it with character counts here silently changed what that plot shows.
train_df['char_length'] = train_df['text'].apply(len)
test_df['char_length'] = test_df['text'].apply(len)

# Overlay the two histograms (with KDE curves) on one figure
plt.figure(figsize=(10, 6))
sns.histplot(train_df['char_length'], bins=50, kde=True, color='blue', label='Train Set')
sns.histplot(test_df['char_length'], bins=50, kde=True, color='orange', label='Test Set')
plt.title('Distribution of Text Lengths')
plt.xlabel('Number of Characters in Review')
plt.legend()
plt.show()

# Show a few examples of reviews from the dataset
print(train_df['text'].head())

In [None]:
# Share of each sentiment label in the train and test splits
train_class_distribution = train_df['label'].value_counts(normalize=True)
test_class_distribution = test_df['label'].value_counts(normalize=True)

print("Train class distribution:", train_class_distribution)
print("Test class distribution:", test_class_distribution)

# Bar chart of the training split.
# value_counts() orders rows by frequency, not by label, so the bars are sorted
# by label first; otherwise the fixed tick labels below (position 0 ->
# 'Negative', position 1 -> 'Positive') could end up on the wrong bars.
train_class_distribution.sort_index().plot(kind='bar', color=['blue', 'red'], alpha=0.7)
plt.title("Train Data Class Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Proportion")
plt.xticks([0, 1], ['Negative', 'Positive'], rotation=0)
plt.show()


In [None]:
from wordcloud import WordCloud

# Concatenate every training review into a single space-separated string
all_reviews = ' '.join(train_df['text'])

# Configure the cloud first, then generate it from the combined text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_reviews)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()


In [None]:
# Boxplot of the `text_length` column, grouped by sentiment label
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=train_df, x='label', y='text_length', ax=ax)
ax.set_title("Review Length vs Sentiment")
ax.set_xlabel("Sentiment (0=Negative, 1=Positive)")
ax.set_ylabel("Review Length (number of words)")
plt.show()

# Experimentation

In [None]:
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import Adam

# Hyperparameters for this experiment
lr = 0.001
batch_size = 32
epochs = 3

# Optimizer under test
optimizer = Adam(learning_rate=lr)

# Train a freshly initialised copy of the baseline architecture, compiled with
# the optimizer above. Previously the optimizer was created but never attached
# to anything, so this cell just kept training the already-fitted `model` with
# whatever optimizer it had been compiled with.
experiment_model = clone_model(model)
experiment_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# One W&B run per experiment. The cell used to call wandb.init() twice with
# different projects; only the first, experiment-specific run name is kept and
# the hyperparameters are recorded in the run config instead.
wandb.init(
    project="imdb_sentiment_analysis_Experimentation_for conclusion",
    name="rnn_model_training_Adam_32_001",
    config={"optimizer": "Adam", "learning_rate": lr, "batch_size": batch_size, "epochs": epochs},
)

# Train one epoch at a time so metrics can be logged after every epoch
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs} with Learning Rate {lr} and Batch Size {batch_size}")

    # Use the configured batch size (it was hard-coded to 64 before)
    history = experiment_model.fit(
        X_train,
        y_train,
        epochs=1,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    # Each fit() call covers a single epoch, so every metric list has one entry
    logs = history.history

    wandb.log({
        "epoch": epoch + 1,
        "training_loss": logs["loss"][0],
        "validation_loss": logs["val_loss"][0],
        "validation_accuracy": logs["val_accuracy"][0],
        "learning_rate": float(experiment_model.optimizer.learning_rate.numpy()),
    })

# After training, evaluate the model on the test data
test_loss, test_accuracy = experiment_model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})

# summary() prints its table and returns None, so logging its return value
# stored nothing useful. Capture the text through print_fn instead, show it,
# and attach it to the run summary.
summary_lines = []
experiment_model.summary(print_fn=lambda line, **_: summary_lines.append(line))
summary_text = "\n".join(summary_lines)
print(summary_text)
wandb.run.summary["model_summary"] = summary_text

# Finish the W&B run
wandb.finish()


In [None]:
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import Adam

# Hyperparameters for this experiment
lr = 0.001
batch_size = 64
epochs = 3

# Optimizer under test
optimizer = Adam(learning_rate=lr)

# Train a freshly initialised copy of the baseline architecture, compiled with
# the optimizer above. Previously the optimizer was created but never attached
# to anything, so this cell just kept training the already-fitted `model` with
# whatever optimizer it had been compiled with.
experiment_model = clone_model(model)
experiment_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# One W&B run per experiment. The cell used to call wandb.init() twice with
# different projects; only the first, experiment-specific run name is kept and
# the hyperparameters are recorded in the run config instead.
wandb.init(
    project="imdb_sentiment_analysis_Experimentation_for conclusion",
    name="rnn_model_training_Adam_64_001",
    config={"optimizer": "Adam", "learning_rate": lr, "batch_size": batch_size, "epochs": epochs},
)

# Train one epoch at a time so metrics can be logged after every epoch
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs} with Learning Rate {lr} and Batch Size {batch_size}")

    # Batch size comes from the hyperparameter above rather than a literal
    history = experiment_model.fit(
        X_train,
        y_train,
        epochs=1,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    # Each fit() call covers a single epoch, so every metric list has one entry
    logs = history.history

    wandb.log({
        "epoch": epoch + 1,
        "training_loss": logs["loss"][0],
        "validation_loss": logs["val_loss"][0],
        "validation_accuracy": logs["val_accuracy"][0],
        "learning_rate": float(experiment_model.optimizer.learning_rate.numpy()),
    })

# After training, evaluate the model on the test data
test_loss, test_accuracy = experiment_model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})

# summary() prints its table and returns None, so logging its return value
# stored nothing useful. Capture the text through print_fn instead, show it,
# and attach it to the run summary.
summary_lines = []
experiment_model.summary(print_fn=lambda line, **_: summary_lines.append(line))
summary_text = "\n".join(summary_lines)
print(summary_text)
wandb.run.summary["model_summary"] = summary_text

# Finish the W&B run
wandb.finish()


In [None]:
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import Adam

# Hyperparameters for this experiment
lr = 0.0005
batch_size = 64
epochs = 3

# Optimizer under test
optimizer = Adam(learning_rate=lr)

# Train a freshly initialised copy of the baseline architecture, compiled with
# the optimizer above. Previously the optimizer was created but never attached
# to anything, so this cell just kept training the already-fitted `model` with
# whatever optimizer it had been compiled with.
experiment_model = clone_model(model)
experiment_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# One W&B run per experiment. The cell used to call wandb.init() twice with
# different projects; only the first, experiment-specific run name is kept and
# the hyperparameters are recorded in the run config instead.
wandb.init(
    project="imdb_sentiment_analysis_Experimentation_for conclusion",
    name="rnn_model_training_Adam_64_0005",
    config={"optimizer": "Adam", "learning_rate": lr, "batch_size": batch_size, "epochs": epochs},
)

# Train one epoch at a time so metrics can be logged after every epoch
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs} with Learning Rate {lr} and Batch Size {batch_size}")

    # Batch size comes from the hyperparameter above rather than a literal
    history = experiment_model.fit(
        X_train,
        y_train,
        epochs=1,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    # Each fit() call covers a single epoch, so every metric list has one entry
    logs = history.history

    wandb.log({
        "epoch": epoch + 1,
        "training_loss": logs["loss"][0],
        "validation_loss": logs["val_loss"][0],
        "validation_accuracy": logs["val_accuracy"][0],
        "learning_rate": float(experiment_model.optimizer.learning_rate.numpy()),
    })

# After training, evaluate the model on the test data
test_loss, test_accuracy = experiment_model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})

# summary() prints its table and returns None, so logging its return value
# stored nothing useful. Capture the text through print_fn instead, show it,
# and attach it to the run summary.
summary_lines = []
experiment_model.summary(print_fn=lambda line, **_: summary_lines.append(line))
summary_text = "\n".join(summary_lines)
print(summary_text)
wandb.run.summary["model_summary"] = summary_text

# Finish the W&B run
wandb.finish()


In [None]:
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import SGD

# Hyperparameters for this experiment
lr = 0.0005
batch_size = 64
epochs = 3

# Optimizer under test
optimizer = SGD(learning_rate=lr)

# Train a freshly initialised copy of the baseline architecture, compiled with
# the optimizer above. Previously the optimizer was created but never attached
# to anything, so this cell just kept training the already-fitted `model` with
# whatever optimizer it had been compiled with.
experiment_model = clone_model(model)
experiment_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# One W&B run per experiment. The cell used to call wandb.init() twice with
# different projects; only the first, experiment-specific run name is kept and
# the hyperparameters are recorded in the run config instead.
wandb.init(
    project="imdb_sentiment_analysis_Experimentation_for conclusion",
    name="rnn_model_training_SGD_64_0005",
    config={"optimizer": "SGD", "learning_rate": lr, "batch_size": batch_size, "epochs": epochs},
)

# Train one epoch at a time so metrics can be logged after every epoch
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs} with Learning Rate {lr} and Batch Size {batch_size}")

    # Batch size comes from the hyperparameter above rather than a literal
    history = experiment_model.fit(
        X_train,
        y_train,
        epochs=1,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    # Each fit() call covers a single epoch, so every metric list has one entry
    logs = history.history

    wandb.log({
        "epoch": epoch + 1,
        "training_loss": logs["loss"][0],
        "validation_loss": logs["val_loss"][0],
        "validation_accuracy": logs["val_accuracy"][0],
        "learning_rate": float(experiment_model.optimizer.learning_rate.numpy()),
    })

# After training, evaluate the model on the test data
test_loss, test_accuracy = experiment_model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})

# summary() prints its table and returns None, so logging its return value
# stored nothing useful. Capture the text through print_fn instead, show it,
# and attach it to the run summary.
summary_lines = []
experiment_model.summary(print_fn=lambda line, **_: summary_lines.append(line))
summary_text = "\n".join(summary_lines)
print(summary_text)
wandb.run.summary["model_summary"] = summary_text

# Finish the W&B run
wandb.finish()


In [None]:
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import RMSprop

# Hyperparameters for this experiment
lr = 0.0005
batch_size = 64
epochs = 3

# Optimizer under test
optimizer = RMSprop(learning_rate=lr)

# Train a freshly initialised copy of the baseline architecture, compiled with
# the optimizer above. Previously the optimizer was created but never attached
# to anything, so this cell just kept training the already-fitted `model` with
# whatever optimizer it had been compiled with.
experiment_model = clone_model(model)
experiment_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# One W&B run per experiment. The cell used to call wandb.init() twice with
# different projects; only the first, experiment-specific run name is kept and
# the hyperparameters are recorded in the run config instead.
wandb.init(
    project="imdb_sentiment_analysis_Experimentation_for conclusion",
    name="rnn_model_training_RMS_64_0005",
    config={"optimizer": "RMSprop", "learning_rate": lr, "batch_size": batch_size, "epochs": epochs},
)

# Train one epoch at a time so metrics can be logged after every epoch
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs} with Learning Rate {lr} and Batch Size {batch_size}")

    # Batch size comes from the hyperparameter above rather than a literal
    history = experiment_model.fit(
        X_train,
        y_train,
        epochs=1,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    # Each fit() call covers a single epoch, so every metric list has one entry
    logs = history.history

    wandb.log({
        "epoch": epoch + 1,
        "training_loss": logs["loss"][0],
        "validation_loss": logs["val_loss"][0],
        "validation_accuracy": logs["val_accuracy"][0],
        "learning_rate": float(experiment_model.optimizer.learning_rate.numpy()),
    })

# After training, evaluate the model on the test data
test_loss, test_accuracy = experiment_model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})

# summary() prints its table and returns None, so logging its return value
# stored nothing useful. Capture the text through print_fn instead, show it,
# and attach it to the run summary.
summary_lines = []
experiment_model.summary(print_fn=lambda line, **_: summary_lines.append(line))
summary_text = "\n".join(summary_lines)
print(summary_text)
wandb.run.summary["model_summary"] = summary_text

# Finish the W&B run
wandb.finish()
