In [3]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [4]:
# Load IMDB text files manually

def load_imdb_split(folder):
    """Read one IMDB split from disk.

    Expects ``folder`` to contain the sub-directories ``pos`` and ``neg``;
    every ``*.txt`` file inside them is read as one review.

    Parameters
    ----------
    folder : str
        Path of the split directory (the one holding ``pos/`` and ``neg/``).

    Returns
    -------
    texts : list of str
        File contents, all ``pos`` reviews first, then all ``neg`` reviews,
        each group ordered by file name.
    labels : numpy.ndarray of float32
        ``1`` for reviews from ``pos`` and ``0`` for reviews from ``neg``,
        aligned with ``texts``.

    Raises
    ------
    FileNotFoundError
        If ``pos`` or ``neg`` is missing under ``folder`` (from ``os.listdir``).
    """
    texts = []
    labels = []

    for label_type in ["pos", "neg"]:
        path = os.path.join(folder, label_type)
        label = 1 if label_type == "pos" else 0

        # os.listdir returns entries in arbitrary, filesystem-dependent order.
        # Sorting fixes the sample order so that any seeded shuffle/split
        # applied to the result is reproducible across machines and runs.
        for fname in sorted(os.listdir(path)):
            if fname.endswith(".txt"):
                with open(os.path.join(path, fname), encoding="utf-8") as f:
                    texts.append(f.read())
                labels.append(label)

    return texts, np.array(labels, dtype=np.float32)


base_dir = "./aclImdb"  # <-- change if needed

# Read the "train" and "test" sub-folders of base_dir, in that order; each
# call yields a (texts, labels) pair that is unpacked into its own names.
(train_texts, train_labels), (test_texts, test_labels) = (
    load_imdb_split(os.path.join(base_dir, split_name))
    for split_name in ("train", "test")
)

In [5]:
# Bag-of-words vectorization (10k words)

max_words = 10000

# binary=True records presence/absence of a word rather than its count;
# max_features caps the vocabulary at max_words entries and English stop
# words are dropped.
vectorizer = CountVectorizer(
    max_features=max_words,
    binary=True,
    stop_words="english"
)

# The vocabulary is fitted on the training texts only and then reused for the
# test texts. The cast to float32 is done while the matrix is still sparse so
# that .toarray() allocates the dense float32 array directly, instead of first
# materialising a dense array in the vectorizer's default integer dtype and
# then copying it.
x_train = vectorizer.fit_transform(train_texts).astype(np.float32).toarray()
x_test = vectorizer.transform(test_texts).astype(np.float32).toarray()

In [6]:
# Validation split
# Hold out 10000 training samples for validation. The split is seeded and
# stratified on the labels.

x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    train_labels,
    test_size=10000,
    stratify=train_labels,
    random_state=42,
)

In [7]:
# Convert to tensors

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torch.as_tensor accepts NumPy arrays as well as tensors and skips the extra
# host-side copy that torch.tensor always makes. reshape(-1, 1) yields an
# (N, 1) label column whether the input is (N,) or already (N, 1), so running
# this cell a second time does not stack another dimension onto the labels
# (which .unsqueeze(1) would do).
x_train = torch.as_tensor(x_train, dtype=torch.float32, device=device)
y_train = torch.as_tensor(y_train, dtype=torch.float32, device=device).reshape(-1, 1)

x_val = torch.as_tensor(x_val, dtype=torch.float32, device=device)
y_val = torch.as_tensor(y_val, dtype=torch.float32, device=device).reshape(-1, 1)