<a href="https://colab.research.google.com/github/rizkyprofs/ML_DL/blob/main/week4task1_imdbRNN_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
import pandas as pd
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
import time
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda:0


In [2]:
# Constants
NUM_WORDS = 40000  # Vocabulary size
MAXLEN = 400       # Maximum review length
EMBEDDING_DIM = 128
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001
VALIDATION_SPLIT = 0.2

# Load the IMDb dataset
print("Loading IMDB dataset...")
(X_train_full, y_train_full), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS)

# Pad sequences to ensure uniform length
X_train_full = pad_sequences(X_train_full, maxlen=MAXLEN)
X_test = pad_sequences(X_test, maxlen=MAXLEN)

# Split train into train and validation
val_size = int(VALIDATION_SPLIT * len(X_train_full))
X_train, X_val = X_train_full[val_size:], X_train_full[:val_size]
y_train, y_val = y_train_full[val_size:], y_train_full[:val_size]

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")
print(f"Testing data shape: {X_test.shape}")

Loading IMDB dataset...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Training data shape: (20000, 400)
Validation data shape: (5000, 400)
Testing data shape: (25000, 400)


In [3]:
# Create a PyTorch dataset
class IMDbDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.LongTensor(X)
        self.y = torch.FloatTensor(y)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create dataloaders
train_dataset = IMDbDataset(X_train, y_train)
val_dataset = IMDbDataset(X_val, y_val)
test_dataset = IMDbDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [4]:
# Define RNN Model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim=128, output_dim=1, dropout=0.5):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=0.3, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        # Use the output of the last time step from both directions
        output = output[:, -1, :]
        output = self.layer_norm(output)
        output = self.dropout(output)
        output = self.relu(self.fc1(output))
        output = self.dropout(output)
        output = self.relu(self.fc2(output))
        output = self.fc3(output)
        return self.sigmoid(output)

# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim=128, output_dim=1, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=0.3, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim * 2, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        hidden = self.layer_norm(hidden)
        hidden = self.dropout(hidden)
        output = self.relu(self.fc1(hidden))
        output = self.dropout(output)
        output = self.relu(self.fc2(output))
        output = self.dropout(output)
        output = self.relu(self.fc3(output))
        output = self.fc4(output)
        return self.sigmoid(output)

# Define GRU Model
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim=128, output_dim=1, dropout=0.5):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=0.3, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim * 2, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded)
        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        hidden = self.layer_norm(hidden)
        hidden = self.dropout(hidden)
        output = self.relu(self.fc1(hidden))
        output = self.dropout(output)
        output = self.relu(self.fc2(output))
        output = self.dropout(output)
        output = self.relu(self.fc3(output))
        output = self.fc4(output)
        return self.sigmoid(output)

In [5]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, model_name):
    model.to(device)
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []

    for epoch in range(epochs):
        start_time = time.time()
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        # Training loop
        for batch_idx, (data, target) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data).squeeze()
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_total += target.size(0)
            predicted = (output >= 0.5).float()
            train_correct += (predicted == target).sum().item()

        # Validation loop
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data).squeeze()
                loss = criterion(output, target)

                val_loss += loss.item()
                val_total += target.size(0)
                predicted = (output >= 0.5).float()
                val_correct += (predicted == target).sum().item()

        # Calculate average losses and accuracies
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        train_accuracy = train_correct / train_total
        val_accuracy = val_correct / val_total

        # Save metrics
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), f"{model_name}_best.pt")
            print(f"Model saved as {model_name}_best.pt")

        # Print progress
        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs:.2f}s')
        print(f'\tTrain Loss: {avg_train_loss:.3f} | Train Acc: {train_accuracy:.3f}')
        print(f'\tVal. Loss: {avg_val_loss:.3f} | Val. Acc: {val_accuracy:.3f}')

    # Plot training history
    plt.figure(figsize=(12, 5))

    # Plot loss
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(f'{model_name} - Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Plot accuracy
    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Training Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.title(f'{model_name} - Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f"{model_name}_training_history_pytorch.png")
    plt.close()

    return model, {'train_losses': train_losses, 'val_losses': val_losses,
                  'train_accs': train_accs, 'val_accs': val_accs}

In [6]:
# Evaluation function
def evaluate_model(model, test_loader, model_name):
    model.eval()
    model.to(device)

    all_predictions = []
    all_probabilities = []
    all_targets = []

    with torch.no_grad():
        for data, target in tqdm(test_loader, desc="Evaluating"):
            data, target = data.to(device), target.to(device)
            output = model(data).squeeze()
            predicted = (output >= 0.5).float()

            all_predictions.extend(predicted.cpu().numpy())
            all_probabilities.extend(output.cpu().numpy())
            all_targets.extend(target.cpu().numpy())

    # Convert to numpy arrays
    all_predictions = np.array(all_predictions)
    all_probabilities = np.array(all_probabilities)
    all_targets = np.array(all_targets)

    # Calculate metrics
    accuracy = accuracy_score(all_targets, all_predictions)
    precision = precision_score(all_targets, all_predictions)
    recall = recall_score(all_targets, all_predictions)
    f1 = f1_score(all_targets, all_predictions)

    # Calculate ROC curve and AUC
    fpr, tpr, _ = roc_curve(all_targets, all_probabilities)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve
    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name} (PyTorch)')
    plt.legend(loc="lower right")
    plt.savefig(f"{model_name}_roc_curve_pytorch.png")
    plt.close()

    # Plot confusion matrix
    cm = confusion_matrix(all_targets, all_predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title(f'Confusion Matrix - {model_name} (PyTorch)')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f"{model_name}_confusion_matrix_pytorch.png")
    plt.close()

    # Return all metrics
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': roc_auc,
        'fpr': fpr,
        'tpr': tpr
    }

In [7]:
# Instantiate the models
rnn_model = RNNModel(NUM_WORDS, EMBEDDING_DIM)
lstm_model = LSTMModel(NUM_WORDS, EMBEDDING_DIM)
gru_model = GRUModel(NUM_WORDS, EMBEDDING_DIM)

# Loss function and optimizers
criterion = nn.BCELoss()
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=LEARNING_RATE)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)
gru_optimizer = optim.Adam(gru_model.parameters(), lr=LEARNING_RATE)

# Dictionary to store all results
all_results = {}

In [8]:
# Train and evaluate RNN model
print("\n=== Training RNN Model ===")
trained_rnn_model, rnn_history = train_model(
    rnn_model, train_loader, val_loader, criterion,
    rnn_optimizer, EPOCHS, "RNN_PyTorch"
)
rnn_results = evaluate_model(trained_rnn_model, test_loader, "RNN_PyTorch")
all_results['RNN'] = rnn_results


=== Training RNN Model ===


Epoch 1/10: 100%|██████████| 157/157 [00:05<00:00, 29.07it/s]


Model saved as RNN_PyTorch_best.pt
Epoch: 01 | Time: 0.0m 5.77s
	Train Loss: 0.672 | Train Acc: 0.570
	Val. Loss: 0.626 | Val. Acc: 0.645


Epoch 2/10: 100%|██████████| 157/157 [00:04<00:00, 38.45it/s]


Model saved as RNN_PyTorch_best.pt
Epoch: 02 | Time: 0.0m 4.44s
	Train Loss: 0.598 | Train Acc: 0.676
	Val. Loss: 0.615 | Val. Acc: 0.672


Epoch 3/10: 100%|██████████| 157/157 [00:04<00:00, 37.83it/s]


Model saved as RNN_PyTorch_best.pt
Epoch: 03 | Time: 0.0m 4.50s
	Train Loss: 0.553 | Train Acc: 0.723
	Val. Loss: 0.565 | Val. Acc: 0.729


Epoch 4/10: 100%|██████████| 157/157 [00:04<00:00, 38.25it/s]


Model saved as RNN_PyTorch_best.pt
Epoch: 04 | Time: 0.0m 4.45s
	Train Loss: 0.550 | Train Acc: 0.712
	Val. Loss: 0.544 | Val. Acc: 0.733


Epoch 5/10: 100%|██████████| 157/157 [00:04<00:00, 38.00it/s]


Epoch: 05 | Time: 0.0m 4.46s
	Train Loss: 0.464 | Train Acc: 0.790
	Val. Loss: 0.555 | Val. Acc: 0.743


Epoch 6/10: 100%|██████████| 157/157 [00:04<00:00, 37.79it/s]


Epoch: 06 | Time: 0.0m 4.46s
	Train Loss: 0.487 | Train Acc: 0.770
	Val. Loss: 0.635 | Val. Acc: 0.624


Epoch 7/10: 100%|██████████| 157/157 [00:04<00:00, 38.05it/s]


Epoch: 07 | Time: 0.0m 4.43s
	Train Loss: 0.542 | Train Acc: 0.724
	Val. Loss: 0.575 | Val. Acc: 0.743


Epoch 8/10: 100%|██████████| 157/157 [00:04<00:00, 37.51it/s]


Epoch: 08 | Time: 0.0m 4.51s
	Train Loss: 0.465 | Train Acc: 0.787
	Val. Loss: 0.593 | Val. Acc: 0.677


Epoch 9/10: 100%|██████████| 157/157 [00:04<00:00, 38.01it/s]


Model saved as RNN_PyTorch_best.pt
Epoch: 09 | Time: 0.0m 4.48s
	Train Loss: 0.409 | Train Acc: 0.825
	Val. Loss: 0.510 | Val. Acc: 0.776


Epoch 10/10: 100%|██████████| 157/157 [00:04<00:00, 37.95it/s]


Epoch: 10 | Time: 0.0m 4.44s
	Train Loss: 0.408 | Train Acc: 0.822
	Val. Loss: 0.534 | Val. Acc: 0.755


Evaluating: 100%|██████████| 196/196 [00:01<00:00, 123.31it/s]


In [9]:
# Train and evaluate LSTM model
print("\n=== Training LSTM Model ===")
trained_lstm_model, lstm_history = train_model(
    lstm_model, train_loader, val_loader, criterion,
    lstm_optimizer, EPOCHS, "LSTM_PyTorch"
)
lstm_results = evaluate_model(trained_lstm_model, test_loader, "LSTM_PyTorch")
all_results['LSTM'] = lstm_results


=== Training LSTM Model ===


Epoch 1/10: 100%|██████████| 157/157 [00:14<00:00, 10.70it/s]


Model saved as LSTM_PyTorch_best.pt
Epoch: 01 | Time: 0.0m 16.01s
	Train Loss: 0.672 | Train Acc: 0.563
	Val. Loss: 0.606 | Val. Acc: 0.678


Epoch 2/10: 100%|██████████| 157/157 [00:15<00:00, 10.42it/s]


Model saved as LSTM_PyTorch_best.pt
Epoch: 02 | Time: 0.0m 16.42s
	Train Loss: 0.555 | Train Acc: 0.728
	Val. Loss: 0.493 | Val. Acc: 0.761


Epoch 3/10: 100%|██████████| 157/157 [00:15<00:00, 10.23it/s]


Model saved as LSTM_PyTorch_best.pt
Epoch: 03 | Time: 0.0m 16.74s
	Train Loss: 0.458 | Train Acc: 0.792
	Val. Loss: 0.434 | Val. Acc: 0.799


Epoch 4/10: 100%|██████████| 157/157 [00:15<00:00,  9.95it/s]


Epoch: 04 | Time: 0.0m 17.11s
	Train Loss: 0.424 | Train Acc: 0.818
	Val. Loss: 0.507 | Val. Acc: 0.773


Epoch 5/10: 100%|██████████| 157/157 [00:15<00:00,  9.93it/s]


Epoch: 05 | Time: 0.0m 17.16s
	Train Loss: 0.375 | Train Acc: 0.842
	Val. Loss: 0.446 | Val. Acc: 0.798


Epoch 6/10: 100%|██████████| 157/157 [00:15<00:00,  9.93it/s]


Epoch: 06 | Time: 0.0m 17.15s
	Train Loss: 0.314 | Train Acc: 0.877
	Val. Loss: 0.529 | Val. Acc: 0.785


Epoch 7/10: 100%|██████████| 157/157 [00:15<00:00,  9.93it/s]


Model saved as LSTM_PyTorch_best.pt
Epoch: 07 | Time: 0.0m 17.22s
	Train Loss: 0.274 | Train Acc: 0.893
	Val. Loss: 0.339 | Val. Acc: 0.854


Epoch 8/10: 100%|██████████| 157/157 [00:16<00:00,  9.80it/s]


Epoch: 08 | Time: 0.0m 17.48s
	Train Loss: 0.206 | Train Acc: 0.924
	Val. Loss: 0.366 | Val. Acc: 0.857


Epoch 9/10: 100%|██████████| 157/157 [00:15<00:00,  9.92it/s]


Epoch: 09 | Time: 0.0m 17.21s
	Train Loss: 0.168 | Train Acc: 0.941
	Val. Loss: 0.349 | Val. Acc: 0.869


Epoch 10/10: 100%|██████████| 157/157 [00:15<00:00,  9.92it/s]


Epoch: 10 | Time: 0.0m 17.23s
	Train Loss: 0.138 | Train Acc: 0.953
	Val. Loss: 0.355 | Val. Acc: 0.873


Evaluating: 100%|██████████| 196/196 [00:06<00:00, 28.28it/s]


In [10]:
# Train and evaluate GRU model
print("\n=== Training GRU Model ===")
trained_gru_model, gru_history = train_model(
    gru_model, train_loader, val_loader, criterion,
    gru_optimizer, EPOCHS, "GRU_PyTorch"
)
gru_results = evaluate_model(trained_gru_model, test_loader, "GRU_PyTorch")
all_results['GRU'] = gru_results


=== Training GRU Model ===


Epoch 1/10: 100%|██████████| 157/157 [00:11<00:00, 14.21it/s]


Model saved as GRU_PyTorch_best.pt
Epoch: 01 | Time: 0.0m 12.01s
	Train Loss: 0.670 | Train Acc: 0.577
	Val. Loss: 0.609 | Val. Acc: 0.673


Epoch 2/10: 100%|██████████| 157/157 [00:11<00:00, 14.07it/s]


Model saved as GRU_PyTorch_best.pt
Epoch: 02 | Time: 0.0m 12.13s
	Train Loss: 0.489 | Train Acc: 0.768
	Val. Loss: 0.388 | Val. Acc: 0.839


Epoch 3/10: 100%|██████████| 157/157 [00:11<00:00, 14.01it/s]


Epoch: 03 | Time: 0.0m 12.13s
	Train Loss: 0.295 | Train Acc: 0.880
	Val. Loss: 0.396 | Val. Acc: 0.837


Epoch 4/10: 100%|██████████| 157/157 [00:11<00:00, 13.97it/s]


Model saved as GRU_PyTorch_best.pt
Epoch: 04 | Time: 0.0m 12.21s
	Train Loss: 0.216 | Train Acc: 0.918
	Val. Loss: 0.274 | Val. Acc: 0.889


Epoch 5/10: 100%|██████████| 157/157 [00:11<00:00, 13.93it/s]


Epoch: 05 | Time: 0.0m 12.20s
	Train Loss: 0.146 | Train Acc: 0.947
	Val. Loss: 0.344 | Val. Acc: 0.885


Epoch 6/10: 100%|██████████| 157/157 [00:11<00:00, 13.89it/s]


Epoch: 06 | Time: 0.0m 12.23s
	Train Loss: 0.099 | Train Acc: 0.967
	Val. Loss: 0.341 | Val. Acc: 0.893


Epoch 7/10: 100%|██████████| 157/157 [00:11<00:00, 13.85it/s]


Epoch: 07 | Time: 0.0m 12.27s
	Train Loss: 0.063 | Train Acc: 0.980
	Val. Loss: 0.398 | Val. Acc: 0.889


Epoch 8/10: 100%|██████████| 157/157 [00:11<00:00, 13.81it/s]


Epoch: 08 | Time: 0.0m 12.30s
	Train Loss: 0.041 | Train Acc: 0.989
	Val. Loss: 0.431 | Val. Acc: 0.887


Epoch 9/10: 100%|██████████| 157/157 [00:11<00:00, 13.80it/s]


Epoch: 09 | Time: 0.0m 12.31s
	Train Loss: 0.030 | Train Acc: 0.991
	Val. Loss: 0.473 | Val. Acc: 0.880


Epoch 10/10: 100%|██████████| 157/157 [00:11<00:00, 13.75it/s]


Epoch: 10 | Time: 0.0m 12.35s
	Train Loss: 0.026 | Train Acc: 0.993
	Val. Loss: 0.533 | Val. Acc: 0.871


Evaluating: 100%|██████████| 196/196 [00:04<00:00, 41.49it/s]


In [14]:
# Compare models
model_names = list(all_results.keys())
metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']
comparison_data = {}

for metric in metrics:
    comparison_data[metric] = [all_results[model][metric] for model in model_names]

comparison_df = pd.DataFrame(comparison_data, index=model_names)
print("\n=== Model Comparison ===")
print(comparison_df)

# Plot comparison
plt.figure(figsize=(14, 8))
comparison_df.plot(kind='bar', figsize=(12, 6))
plt.title('PyTorch Model Comparison')
plt.ylabel('Score')
plt.xlabel('Model')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.savefig("model_comparison_pytorch.png")
plt.close()

# Plot ROC curves for all models
plt.figure(figsize=(10, 8))
for model_name in model_names:
    plt.plot(
        all_results[model_name]['fpr'],
        all_results[model_name]['tpr'],
        lw=2,
        label=f'{model_name} (AUC = {all_results[model_name]["auc"]:.3f})'
    )

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tingkat Positif Palsu (False Positive Rate)')
plt.ylabel('Tingkat Positif Benar (True Positive Rate)')
plt.title('Perbandingan Kurva ROC (PyTorch)')
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.savefig("roc_curves_comparison_pytorch.png")
plt.close()



=== Model Comparison ===
      accuracy  precision   recall        f1       auc
RNN    0.75516   0.734885  0.79832  0.765290  0.821134
LSTM   0.85740   0.870409  0.83984  0.854851  0.930569
GRU    0.85672   0.919631  0.78176  0.845109  0.940992


<Figure size 1400x800 with 0 Axes>

# Penjelasan Matematika Model RNN, LSTM, dan GRU

Dokumen ini memberikan penjelasan matematika dari tiga arsitektur jaringan saraf berulang yang digunakan dalam tugas analisis sentimen: RNN Sederhana (Simple RNN), LSTM (Long Short-Term Memory), dan GRU (Gated Recurrent Unit).

## 1. RNN Sederhana (Simple RNN)

### Unit Berulang Dasar
RNN Sederhana memproses sekuens input dan mempertahankan keadaan tersembunyi (hidden state) yang diperbarui pada setiap langkah waktu. Untuk setiap langkah waktu $t$, keadaan tersembunyi $h_t$ dihitung sebagai:

$$h_t = \sigma(W_{xh} x_t + W_{hh} h_{t-1} + b_h)$$

Dimana:
- $x_t$ adalah input pada langkah waktu $t$
- $h_{t-1}$ adalah keadaan tersembunyi dari langkah waktu sebelumnya
- $W_{xh}$ adalah matriks bobot untuk koneksi input-ke-tersembunyi
- $W_{hh}$ adalah matriks bobot untuk koneksi berulang tersembunyi-ke-tersembunyi
- $b_h$ adalah term bias
- $\sigma$ adalah fungsi aktivasi (biasanya tanh atau ReLU)

Untuk output, kita menggunakan keadaan tersembunyi terakhir:

$$y = \sigma(W_{hy} h_T + b_y)$$

Dimana:
- $h_T$ adalah keadaan tersembunyi pada langkah waktu terakhir
- $W_{hy}$ adalah matriks bobot untuk koneksi tersembunyi-ke-output
- $b_y$ adalah bias output
- $\sigma$ adalah fungsi aktivasi (sigmoid untuk klasifikasi biner)

### Masalah Gradien yang Menghilang/Meledak
RNN sederhana menderita masalah gradien yang menghilang atau meledak selama backpropagation melalui waktu. Saat menghitung gradien, kita mengalikan dengan $W_{hh}$ berulang kali:

$$\frac{\partial L}{\partial h_t} = \frac{\partial L}{\partial h_{t+1}} \cdot \frac{\partial h_{t+1}}{\partial h_t} = \frac{\partial L}{\partial h_{t+1}} \cdot W_{hh} \cdot \text{diag}(\sigma'(W_{xh} x_t + W_{hh} h_{t-1} + b_h))$$

Jika nilai eigen dari $W_{hh}$ kurang dari 1, gradien menghilang pada sekuens panjang. Jika lebih besar dari 1, gradien meledak.

## 2. LSTM (Long Short-Term Memory)

LSTM menyelesaikan masalah gradien yang menghilang dengan memperkenalkan keadaan sel dan tiga mekanisme gerbang:

### Gerbang dan Keadaan Sel
Untuk setiap langkah waktu $t$:

1. **Gerbang Lupa (Forget Gate)**: Memutuskan informasi apa yang akan dibuang dari keadaan sel
   $$f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f)$$

2. **Gerbang Input**: Memutuskan informasi baru apa yang akan disimpan dalam keadaan sel
   $$i_t = \sigma(W_i \cdot [h_{t-1}, x_t] + b_i)$$
   $$\tilde{C}_t = \tanh(W_C \cdot [h_{t-1}, x_t] + b_C)$$

3. **Pembaruan Keadaan Sel**: Memperbarui keadaan sel lama menjadi keadaan sel baru
   $$C_t = f_t \odot C_{t-1} + i_t \odot \tilde{C}_t$$

4. **Gerbang Output**: Memutuskan bagian mana dari keadaan sel yang akan dioutputkan
   $$o_t = \sigma(W_o \cdot [h_{t-1}, x_t] + b_o)$$
   $$h_t = o_t \odot \tanh(C_t)$$

Dimana:
- $\odot$ merepresentasikan perkalian elemen-wise
- $[h_{t-1}, x_t]$ merepresentasikan penggabungan keadaan tersembunyi sebelumnya dan input saat ini
- $\sigma$ adalah fungsi aktivasi sigmoid
- $W_f, W_i, W_C, W_o$ adalah matriks bobot
- $b_f, b_i, b_C, b_o$ adalah term bias

Keadaan sel $C_t$ bertindak sebagai jalan raya yang dapat membawa informasi melintasi banyak langkah waktu dengan perubahan minimal, memungkinkan jaringan untuk mempelajari dependensi jangka panjang.

### LSTM Dua Arah (Bidirectional LSTM)

Dalam implementasi kita, kita menggunakan LSTM dua arah yang memproses sekuens input dalam arah maju dan mundur:

$$\overrightarrow{h_t} = \text{LSTM}_{\text{maju}}(x_t, \overrightarrow{h_{t-1}})$$
$$\overleftarrow{h_t} = \text{LSTM}_{\text{mundur}}(x_t, \overleftarrow{h_{t+1}})$$
$$h_t = [\overrightarrow{h_t}, \overleftarrow{h_t}]$$

Ini memungkinkan model untuk menangkap konteks dari kedua keadaan masa lalu dan masa depan untuk langkah waktu tertentu.

## 3. GRU (Gated Recurrent Unit)

GRU adalah versi yang disederhanakan dari LSTM dengan parameter yang lebih sedikit:

### Gerbang Pembaruan dan Reset

Untuk setiap langkah waktu $t$:

1. **Gerbang Pembaruan (Update Gate)**: Memutuskan berapa banyak informasi masa lalu yang akan disimpan
   $$z_t = \sigma(W_z \cdot [h_{t-1}, x_t] + b_z)$$

2. **Gerbang Reset**: Memutuskan berapa banyak informasi masa lalu yang akan dilupakan
   $$r_t = \sigma(W_r \cdot [h_{t-1}, x_t] + b_r)$$

3. **Keadaan Tersembunyi Kandidat**: Menghitung keadaan tersembunyi kandidat
   $$\tilde{h}_t = \tanh(W_h \cdot [r_t \odot h_{t-1}, x_t] + b_h)$$

4. **Pembaruan Keadaan Tersembunyi**: Memperbarui keadaan tersembunyi
   $$h_t = (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t$$

Dimana:
- $\odot$ merepresentasikan perkalian elemen-wise
- $[h_{t-1}, x_t]$ merepresentasikan penggabungan keadaan tersembunyi sebelumnya dan input saat ini
- $\sigma$ adalah fungsi aktivasi sigmoid
- $W_z, W_r, W_h$ adalah matriks bobot
- $b_z, b_r, b_h$ adalah term bias

GRU menggabungkan gerbang lupa dan input menjadi satu gerbang pembaruan, dan menggabungkan keadaan sel dan keadaan tersembunyi. Ini membuatnya lebih efisien secara komputasi sambil tetap mengatasi masalah gradien yang menghilang.

## 4. Metrik Evaluasi

### Loss Binary Cross-Entropy
Untuk klasifikasi biner, kita menggunakan loss binary cross-entropy:

$$L = -\frac{1}{N} \sum_{i=1}^{N} [y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i)]$$

Dimana:
- $N$ adalah jumlah sampel
- $y_i$ adalah label sebenarnya (0 atau 1)
- $\hat{y}_i$ adalah probabilitas yang diprediksi

### Akurasi (Accuracy)
Proporsi instance yang diklasifikasikan dengan benar:

$$\text{Akurasi} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}$$

### Presisi (Precision)
Proporsi prediksi positif benar di antara semua prediksi positif:

$$\text{Presisi} = \frac{\text{TP}}{\text{TP} + \text{FP}}$$

### Recall
Proporsi prediksi positif benar di antara semua positif aktual:

$$\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}$$

### F1-Score
Rata-rata harmonik dari presisi dan recall:

$$\text{F1} = 2 \times \frac{\text{Presisi} \times \text{Recall}}{\text{Presisi} + \text{Recall}}$$

### AUC-ROC
Area di Bawah Kurva Karakteristik Operasi Penerima (Receiver Operating Characteristic). Kurva ROC memplot Tingkat Positif Benar (Recall) terhadap Tingkat Positif Palsu:

$$\text{TPR} = \frac{\text{TP}}{\text{TP} + \text{FN}}$$
$$\text{FPR} = \frac{\text{FP}}{\text{FP} + \text{TN}}$$

AUC mengukur area di bawah kurva ini, dengan nilai yang lebih tinggi menunjukkan diskriminasi yang lebih baik.

## 5. Fitur Model Tambahan

### Lapisan Embedding
Mentransformasi indeks kata menjadi vektor padat:

$$e_t = W_e x_t$$

Dimana:
- $x_t$ adalah vektor one-hot encoded atau indeks
- $W_e$ adalah matriks embedding
- $e_t$ adalah vektor embedding yang dihasilkan

### Normalisasi Lapisan (Layer Normalization)
Menormalkan aktivasi dari lapisan sebelumnya untuk memiliki rata-rata nol dan varians satu:

$$\mu = \frac{1}{H} \sum_{i=1}^{H} a_i$$
$$\sigma^2 = \frac{1}{H} \sum_{i=1}^{H} (a_i - \mu)^2$$
$$\hat{a}_i = \frac{a_i - \mu}{\sqrt{\sigma^2 + \epsilon}}$$
$$b_i = \gamma \hat{a}_i + \beta$$

Dimana:
- $a_i$ adalah aktivasi dari neuron ke-i
- $H$ adalah jumlah neuron dalam lapisan
- $\gamma$ dan $\beta$ adalah parameter yang dipelajari
- $\epsilon$ adalah nilai kecil untuk stabilitas numerik