## Imports

In [1]:
from load_combined_model import load_combined_model
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from d2l import torch as d2l 
import os

## Initialization: configuration, vocabulary, and model loading

In [None]:
# --- Input / output locations ---
# Checkpoint file handed to load_combined_model() in a later cell.
combined_model_path = "./weights/best_combined_model.pth"
# CSV consumed by evaluate_and_save_predictions(); it reads the 'text' and 'label' columns.
test_file_path = "./test_data_movie.csv"
# Destination CSV for the per-review predictions.
output_csv = "predictions.csv"
# Dataset root; read_imdb() lists <data_dir>/{train,test}/{pos,neg}.
data_dir = "aclImdb"

# --- Batching / sequence length ---
# NOTE(review): not passed to any call in the visible cells; the evaluation uses its own batch size of 64.
batch_size = 64
# NOTE(review): passed to load_vocab(), which does not use it; the evaluation pads/truncates
# to its own length of 500.
num_steps = 500 

# --- Model hyperparameters forwarded to load_combined_model() ---
# Presumably these must match the architecture stored in the checkpoint — TODO confirm
# (764 in particular is an unusual width; verify against the training code).
combined_dim = 764  
hidden_dim = 128
output_dim = 2
dropout_rate = 0.5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def read_imdb(data_dir, is_train):
    """Load the raw IMDb reviews and their sentiment labels from disk.

    Args:
        data_dir: Dataset root containing ``train/`` and ``test/`` folders,
            each with ``pos/`` and ``neg/`` sub-folders of review files.
        is_train: ``True`` to read the ``train`` split, ``False`` for ``test``.

    Returns:
        A ``(reviews, sentiments)`` pair of equal-length lists. All positive
        reviews (label 1) come first, followed by all negative reviews
        (label 0); within a class the order is whatever ``os.listdir`` yields.
    """
    split = "train" if is_train else "test"
    reviews, sentiments = [], []
    for sentiment, class_name in ((1, "pos"), (0, "neg")):
        class_dir = os.path.join(data_dir, split, class_name)
        for entry in os.listdir(class_dir):
            review_path = os.path.join(class_dir, entry)
            with open(review_path, "r", encoding="utf-8") as handle:
                review = handle.read()
            reviews.append(review)
            sentiments.append(sentiment)
    return reviews, sentiments

def load_vocab(data_dir, num_steps):
    """Build the word-level vocabulary from the IMDb training split.

    Args:
        data_dir: Dataset root passed through to ``read_imdb``.
        num_steps: Accepted for call compatibility; not used when building
            the vocabulary.

    Returns:
        A ``d2l.Vocab`` over the training reviews, keeping tokens that occur
        at least 5 times and reserving a ``<pad>`` token.
    """
    train_reviews, _ = read_imdb(data_dir, is_train=True)
    return d2l.Vocab(
        d2l.tokenize(train_reviews, token="word"),
        min_freq=5,
        reserved_tokens=["<pad>"],
    )

# Rebuild the vocabulary from the training split. Its size is forwarded to
# load_combined_model() in a later cell, and the vocab itself is used to encode the test texts.
# NOTE(review): presumably this must reproduce the vocabulary used at training time — confirm.
vocab = load_vocab(data_dir, num_steps)
vocab_size = len(vocab)
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 49347


In [4]:
# Restore the trained combined model from its checkpoint onto the selected device.
combined_model = load_combined_model(
    combined_model_path,
    vocab_size,
    combined_dim,
    hidden_dim,
    output_dim,
    dropout_rate,
    device,
)
print("Combined model successfully loaded!")


Combined model successfully loaded!


  combined_model.load_state_dict(torch.load(combined_model_path, map_location=device))


## Evaluation

In [7]:
def evaluate_and_save_predictions(model, test_file_path, vocab, device, output_csv,
                                  max_len=500, batch_size=64):
    """
    Evaluate the combined model on the provided test set and save predictions to a CSV file.

    The model is switched to evaluation mode for the duration of the call (so
    layers such as dropout behave deterministically) and its previous
    training/eval mode is restored afterwards.

    Args:
        model: Combined PyTorch model (``torch.nn.Module``) to evaluate.
        test_file_path: Path to the test dataset CSV file with ``text`` and
            ``label`` columns (labels 0 = negative, 1 = positive).
        vocab: Vocabulary object used to map tokens to indices; must contain ``<pad>``.
        device: Device (CPU or GPU) to run the evaluation.
        output_csv: Filename for the output CSV file
            (columns: ``text``, ``predicted``, ``truth``).
        max_len: Number of tokens each review is truncated/padded to. Defaults to 500.
        batch_size: Number of reviews scored per forward pass. Defaults to 64.

    Returns:
        metrics: Dictionary containing accuracy, precision, recall, and F1 score.
    """
    test_data = pd.read_csv(test_file_path)
    texts = test_data['text'].tolist()
    labels = test_data['label'].tolist()

    # Tokenize on words, map to vocabulary indices, then force a fixed length.
    token_ids = [vocab[tokens] for tokens in d2l.tokenize(texts, token='word')]
    pad_id = vocab['<pad>']
    features = torch.tensor([d2l.truncate_pad(ids, max_len, pad_id) for ids in token_ids])
    labels_tensor = torch.tensor(labels)

    # No shuffling: predictions come back in the same order as `texts`.
    test_dataset = torch.utils.data.TensorDataset(features, labels_tensor)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    y_true, y_pred = [], []

    # Dropout must be disabled while scoring; remember the caller's mode so it can be restored.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                outputs = model(X_batch.to(device))
                y_pred.extend(torch.argmax(outputs, dim=1).cpu().tolist())
                # Labels are only needed for the metrics, so they never leave the CPU.
                y_true.extend(y_batch.tolist())
    finally:
        model.train(was_training)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="binary")
    recall = recall_score(y_true, y_pred, average="binary")
    f1 = f1_score(y_true, y_pred, average="binary")

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=["Negative", "Positive"]))

    label_map = {0: 'negative', 1: 'positive'}
    predicted_labels = [label_map[label] for label in y_pred]
    true_labels = [label_map[label] for label in y_true]

    results_df = pd.DataFrame({
        'text': texts,
        'predicted': predicted_labels,
        'truth': true_labels
    })
    results_df.to_csv(output_csv, index=False)
    print(f"\nPredictions saved to {output_csv}")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}


In [None]:

# Score the test set, write predictions.csv, and collect the headline metrics.
metrics = evaluate_and_save_predictions(
    combined_model,
    test_file_path,
    vocab,
    device,
    output_csv,
)

# Assemble the summary first so it is emitted with a single print call.
summary_lines = [
    "\nCombined Model Evaluation Metrics:",
    f"Accuracy: {metrics['accuracy']:.4f}",
    f"Precision: {metrics['precision']:.4f}",
    f"Recall: {metrics['recall']:.4f}",
    f"F1 Score: {metrics['f1_score']:.4f}",
]
print(*summary_lines, sep="\n")



Classification Report:
              precision    recall  f1-score   support

    Negative       0.95      0.93      0.94     20019
    Positive       0.93      0.95      0.94     19981

    accuracy                           0.94     40000
   macro avg       0.94      0.94      0.94     40000
weighted avg       0.94      0.94      0.94     40000


Predictions saved to predictions.csv

Combined Model Evaluation Metrics:
Accuracy: 0.9405
Precision: 0.9300
Recall: 0.9526
F1 Score: 0.9411
