In [18]:
# EuroSAT — Step 1: Data Preprocessing & Augmentation
# Objectives:
#   - load EuroSAT,
#   - inspect the class balance,
#   - implement model-aware preprocessing,
#   - apply augmentation,
#   - demonstrate that the augmentation worked,
#   - provide two imbalance-handling strategies (class weights and targeted oversampling).


In [19]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import collections
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import random

In [20]:
# Seed every RNG this notebook imports (Python, NumPy, TensorFlow) from a
# single constant, so randomness drawn from any of them is repeatable after
# a kernel restart -- previously only TensorFlow was seeded.
SEED = 50
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [21]:
# Load EuroSAT through TFDS as (image, label) tuples together with its
# metadata; only the 'train' split is used in this notebook.
dataset, info = tfds.load('eurosat', with_info=True, as_supervised=True)
full_ds = dataset['train']

# Look up the two feature descriptors once instead of indexing repeatedly.
label_feature = info.features['label']
image_feature = info.features['image']

num_examples = info.splits['train'].num_examples
num_classes = label_feature.num_classes
class_names = label_feature.names
image_shape = image_feature.shape
image_dtype = image_feature.tf_dtype

# One print call, one summary line per argument.
print(
    "===== EuroSAT Dataset Info =====",
    f"Number of examples: {num_examples}",
    f"Number of classes: {num_classes}",
    f"Class names: {class_names}",
    f"Image shape: {image_shape}",
    f"Image dtype: {image_dtype}",
    sep="\n",
)

===== EuroSAT Dataset Info =====
Number of examples: 27000
Number of classes: 10
Class names: ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']
Image shape: (64, 64, 3)
Image dtype: <dtype: 'uint8'>


In [22]:
# Resize and normalize

def resize_and_rescale(image, label, target=224):
    """Resize an image to a square and divide its pixel values by 255.

    Args:
        image: image tensor accepted by `tf.image.resize`.
        label: label belonging to `image`; returned unchanged.
        target: side length, in pixels, of the square output image.

    Returns:
        A `(image, label)` tuple where `image` is float32 with spatial size
        `target` x `target`.
    """
    output_size = [target, target]
    resized = tf.image.resize(image, output_size)  # e.g. 64x64 -> 224x224 with the default target
    rescaled = tf.cast(resized, tf.float32) / 255.0  # float32, divided by 255
    return rescaled, label

In [23]:
# Build the preprocessed dataset from the raw split rather than from `full_ds`
# itself, so re-running this cell cannot resize/rescale the images a second
# time (a second pass would divide the pixels by 255 again).
# The map is pure per-element work, so let tf.data parallelise it; element
# order is preserved because `deterministic` is left at its default.
full_ds = dataset['train'].map(resize_and_rescale, num_parallel_calls=tf.data.AUTOTUNE)

In [24]:
# Positional 85% / 15% split: the first `train_size` elements form the
# training set and everything after them the validation set.
# NOTE(review): take/skip split by position, so the two sets only stay
# disjoint if `full_ds` yields its elements in the same order every pass.
num_examples = full_ds.cardinality().numpy()
train_size = int(0.85 * num_examples)

train_ds, val_ds = full_ds.take(train_size), full_ds.skip(train_size)

# Report how many examples landed in each split.
train_count, val_count = (
    split.cardinality().numpy() for split in (train_ds, val_ds)
)
print(f"Train examples: {train_count}, Val examples: {val_count}")

Train examples: 22950, Val examples: 4050


In [25]:
# Inspect class counts (detect imbalance)
# Only the labels are needed here, so drop the images inside the pipeline and
# fetch the labels in batches instead of converting every (image, label)
# pair to NumPy one element at a time.
labels_only = train_ds.map(lambda _image, label: label).batch(1024)

y_train_list = []  # integer label of every training example, in dataset order
for label_batch in tfds.as_numpy(labels_only):
    y_train_list.extend(int(lbl) for lbl in label_batch)

# class id -> number of training images with that label
counts = collections.Counter(y_train_list)

print("Per-class counts (train):")
for cls in sorted(counts):
    print(f"  class {cls}: {counts[cls]} images")


Per-class counts (train):
  class 0: 2563 images
  class 1: 2564 images
  class 2: 2527 images
  class 3: 2121 images
  class 4: 2128 images
  class 5: 1699 images
  class 6: 2107 images
  class 7: 2546 images
  class 8: 2147 images
  class 9: 2548 images


In [26]:
# mild class imbalance: class 5 has 1699 training images vs. 2107-2564 for the other classes;
# we apply light augmentation to improve generalization and reduce overfitting

#minimal light augmentation

# Light augmentation pipeline: random horizontal flip, then a small random
# rotation, then a small random zoom. Per the Keras docs, the rotation factor
# is a fraction of a full turn and the zoom factor a fraction of the image height.
augmentation_layers = [
    tf.keras.layers.RandomFlip(mode="horizontal"),
    tf.keras.layers.RandomRotation(factor=0.1),
    tf.keras.layers.RandomZoom(height_factor=0.1),
]
data_augmentation = tf.keras.Sequential(augmentation_layers)

In [27]:
#apply augmentation to the training dataset only

def _augment_example(image, label):
    """Pass one training image through `data_augmentation`; the label is returned unchanged."""
    return data_augmentation(image, training=True), label


# NOTE(review): this rebinds `train_ds`, so re-running the cell stacks a
# second augmentation pass on top of the first.
train_ds = train_ds.map(_augment_example)