**Getting the data**

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:33 --:--:--     0
  0 80.2M    0     0    0     0      0      0 --:--:--  0:00:34 --:--:--     0
  0 80.2M    0 98304    0     0   2788      0  8:22:54  0:00:35  8:22:19  2788
  0 80.2M    0  560k    0     0  15824      0  1:28:36  0:00:36  1:28:00 15825
  2 80.2M    2 2368k    0     0  65124      0  0:21:31  0:00:37  0:20:54 65831
  5 80.2M    5 4320k    0     0   112k      0  0:12:07  0:00:38  0:11:29  957k
  9 80.2M    9 7744k    0     0   197k      0  0:06:56  0:00:39  0:06:17 1561k
 13 80.2M   13 10.4M    0     0   265k      0  0:05:09  0:00:40  0:04:29 2126k
 16 80.2M   16 13.5M    0     0   335k      0  0:04

**Preparing the data**

In [None]:
import os, pathlib, shutil, random
from tensorflow import keras

In [None]:
# Mini-batch size used by the datasets built in the cells below.
batch_size = 32

# Root of the extracted archive and the two split directories beneath it.
base_dir = pathlib.Path("aclImdb")
train_dir = base_dir.joinpath("train")
val_dir = base_dir.joinpath("val")

In [None]:
# Carve a validation split out of the training data by moving 20% of the
# files of each class from train/<class> to val/<class>.
for category in ("neg", "pos"):
    category_train_dir = train_dir / category
    category_val_dir = val_dir / category

    # Re-running the cell must not move a further 20% of the training files:
    # a populated val/<class> directory means the split was already made.
    if category_val_dir.exists() and any(category_val_dir.iterdir()):
        continue
    os.makedirs(category_val_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort first so that the
    # seeded shuffle selects the same files on every machine.
    files = sorted(os.listdir(category_train_dir))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-0:] would select every file
    # when the 20% share rounds down to zero.
    val_files = files[len(files) - num_val_samples:]

    for fname in val_files:
        shutil.move(category_train_dir / fname,
                    category_val_dir / fname)

In [None]:
def _load_split(split_path):
    """Build a batched text dataset from the class sub-folders of split_path."""
    return keras.utils.text_dataset_from_directory(
        split_path, batch_size=batch_size
    )


# One dataset per split directory.
train_ds = _load_split("aclImdb/train")
val_ds = _load_split("aclImdb/val")
test_ds = _load_split("aclImdb/test")

# Text-only view of the training data (labels dropped).
text_only_train_ds = train_ds.map(lambda text, label: text)

Found 61520 files belonging to 3 classes.
Found 2880 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [None]:
import tensorflow as tf

In [None]:
def sample_dataset(dataset, percent, total_size_estimate=20000, seed=None):
    """Randomly sample elements of a tf.data.Dataset without materializing it as a list.

    A set of positions is drawn from ``range(total_size_estimate)`` and only
    the elements of ``dataset`` found at those positions are kept.

    Args:
        dataset: An unbatched ``tf.data.Dataset``.
        percent: Fraction of ``total_size_estimate`` to keep (0.01 keeps 1%).
        total_size_estimate: Assumed number of elements in ``dataset``.
            Positions at or beyond this value are never selected, and if the
            dataset holds fewer elements than this, fewer than the requested
            number of samples come back.
        seed: Optional seed for the index draw. ``None`` (the default) draws
            from the global ``random`` state, as before.

    Returns:
        The sampled elements, batched with the module-level ``batch_size``.

    NOTE(review): if the upstream dataset reshuffles on every iteration, the
    chosen positions would point at different elements on each pass -- confirm
    whether a ``cache()`` is wanted by the callers.
    """
    # At least one sample, but never more than there are candidate positions
    # (random.sample raises ValueError when asked for more).
    sample_size = min(total_size_estimate,
                      max(1, int(percent * total_size_estimate)))
    rng = random if seed is None else random.Random(seed)
    indices = sorted(rng.sample(range(total_size_estimate), sample_size))
    # int64 matches the counter dtype produced by Dataset.enumerate().
    index_tensor = tf.constant(indices, dtype=tf.int64)

    # Named functions instead of two same-signature lambdas on one line:
    # AutoGraph cannot tell such lambdas apart when it parses their source
    # and emits a "could not parse the source code" warning.
    def is_selected(position, element):
        return tf.reduce_any(tf.equal(position, index_tensor))

    def drop_position(position, element):
        return element

    sampled = dataset.enumerate().filter(is_selected).map(drop_position)
    return sampled.batch(batch_size)

In [None]:
# Draw a small random subset of every split; each call is given its own
# estimate of how many examples the split contains.
small_train_ds = sample_dataset(
    train_ds.unbatch(),
    percent=0.01,
    total_size_estimate=20000,
)
small_val_ds = sample_dataset(
    val_ds.unbatch(),
    percent=0.01,
    total_size_estimate=5000,
)
small_test_ds = sample_dataset(
    test_ds.unbatch(),
    percent=0.01,
    total_size_estimate=10000,
)

# Text-only view of the reduced training set (labels dropped).
text_only_small_train_ds = small_train_ds.map(lambda text, label: text)

Cause: could not parse the source code of <function sample_dataset.<locals>.<lambda> at 0x000001CF60E7DEA0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda i, data: data

Match 1:
lambda i, data: tf.reduce_any(i == indices)

Cause: could not parse the source code of <function sample_dataset.<locals>.<lambda> at 0x000001CF60E7DEA0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda i, data: data

Match 1:
lambda i, data: tf.reduce_any(i == indices)

Cause: could not parse the source code of <function sample_dataset.<locals>.<lambda> at 0x000001CF60E7DAB0>: found multiple definitions with identical signatures at the location. This error may be avoided by defi

In [None]:
def count_samples(dataset):
    """Return how many individual examples `dataset` yields once unbatched.

    The dataset is iterated in full, so the cost grows with its size.
    """
    total = 0
    for _example in dataset.unbatch():
        total += 1
    return total

# Count the examples of each split, full dataset and reduced dataset side by side.
num_train_full = count_samples(train_ds)
num_small_train = count_samples(small_train_ds)

num_val_full = count_samples(val_ds)
num_small_val = count_samples(small_val_ds)

num_test_full = count_samples(test_ds)
num_small_test = count_samples(small_test_ds)

# Report the counts.
print(f"Train (full): {num_train_full} samples")
print(f"Train (small): {num_small_train} samples")
print(f"Validation (full): {num_val_full} samples")
print(f"Validation (small): {num_small_val} samples")
print(f"Test (full): {num_test_full} samples")
print(f"Test (small): {num_small_test} samples")

Train (full): 61520 samples
Train (small): 200 samples
Validation (full): 2880 samples
Validation (small): 29 samples
Test (full): 25000 samples
Test (small): 100 samples


**Vectorizing the data**

In [None]:
from tensorflow.keras import layers

# Each review becomes a sequence of exactly `max_length` integer token ids,
# drawn from a vocabulary capped at `max_tokens` entries.
max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
# The vocabulary is learned from the reduced training texts.
text_vectorization.adapt(text_only_small_train_ds)


def _vectorize(text, label):
    """Replace the raw text by its token ids; the label passes through."""
    return text_vectorization(text), label


int_train_ds = small_train_ds.map(_vectorize, num_parallel_calls=4)
int_val_ds = small_val_ds.map(_vectorize, num_parallel_calls=4)
int_test_ds = small_test_ds.map(_vectorize, num_parallel_calls=4)