## **<h3 align="center"> Deep Learning - Project </h3>**
# **<h3 align="center">Baseline Model</h3>**
**Group 4 members:**<br>
Alexandra Pinto - 20211599@novaims.unl.pt - 20211599<br>
Steven Carlson - 20240554@novaims.unl.pt - 20240554<br>
Sven Goerdes - 20240503@novaims.unl.pt - 20240503<br>
Tim Straub - 20240505@novaims.unl.pt - 20240505<br>
Zofia Wojcik  - 20240654@novaims.unl.pt - 20240654<br>

# Table of Contents
* [1. Introduction](#intro)
* [2. Setup](#setup)
* [3. Data Loading](#load)
* [4. Train a Baseline Model](#train)

# 1. Introduction <a class="anchor" id="intro"></a>
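This notebook builds a baseline image classifier for the rare species dataset. It loads the predefined train/test splits, creates Keras data generators (with augmentation for the training images), and trains a dense classification head on top of a frozen, ImageNet-pretrained VGG16 to predict the `family` label of each image (202 classes). The result serves as a reference point for later, more advanced models.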

# 2. Setup <a class="anchor" id="setup"></a>
In this section, we will import the necessary libraries that will be used throughout the notebook. 

In [1]:
import os
from glob import glob

import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.applications import VGG16
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

2025-04-23 15:05:10.862690: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 3. Data Loading <a class="anchor" id="load"></a>

In [2]:
# Load and concatenate all train/test CSVs.
# glob() returns paths in arbitrary, OS-dependent order, so sort them to make the
# row order of the combined DataFrames reproducible across machines and runs.
train_csvs = sorted(glob('train_test_splits/*_train.csv'))
test_csvs = sorted(glob('train_test_splits/*_test.csv'))

# Fail early with a readable message instead of pd.concat's "No objects to concatenate"
for split_name, csv_paths in (('train', train_csvs), ('test', test_csvs)):
    if not csv_paths:
        raise FileNotFoundError(
            f"No '*_{split_name}.csv' files found in 'train_test_splits/' "
            f"(working directory: {os.getcwd()})"
        )

# Load all CSVs into a list of DataFrames
train_dfs = [pd.read_csv(csv_path) for csv_path in train_csvs]
test_dfs = [pd.read_csv(csv_path) for csv_path in test_csvs]

# Concatenate them all
combined_train_df = pd.concat(train_dfs, ignore_index=True)
combined_test_df = pd.concat(test_dfs, ignore_index=True)

# NOTE(review): Keras' `validation_split` takes a contiguous slice of rows by position
# rather than a random sample. Shuffle the training rows once (fixed seed) so the
# validation slice is not dominated by whichever CSV files happen to come first.
combined_train_df = combined_train_df.sample(frac=1, random_state=4).reset_index(drop=True)

In [3]:
# Turn each relative 'file_path' entry into a path below the image root folder
# and store it in a new 'filepath' column (used later as x_col by the generators).
image_root_dir = "rare_species"
combined_train_df['filepath'] = [
    os.path.join(image_root_dir, relative_path)
    for relative_path in combined_train_df['file_path']
]
combined_test_df['filepath'] = [
    os.path.join(image_root_dir, relative_path)
    for relative_path in combined_test_df['file_path']
]

In [4]:
# Fraction of the training DataFrame that is held out for validation.
# The training and validation generators must share this value so that their
# 'training' and 'validation' subsets are complementary slices of the same DataFrame.
VALIDATION_SPLIT = 0.2

# Data generator for TRAINING images (includes augmentation)
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,                  # Normalize pixel values from [0, 255] → [0, 1]
    rotation_range=15,                  # Randomly rotate images by up to 15 degrees
    zoom_range=0.1,                     # Randomly zoom in/out by up to 10%
    horizontal_flip=True,               # Randomly mirror images horizontally
    validation_split=VALIDATION_SPLIT   # Reserve part of the training data for validation
)

# Data generator for VALIDATION images (no augmentation).
# Validation metrics should be computed on unmodified images; drawing the validation
# subset from `train_datagen` would randomly rotate/zoom/flip them as well.
val_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    validation_split=VALIDATION_SPLIT
)

# Data generator for TEST images (no augmentation)
test_datagen = ImageDataGenerator(rescale=1.0 / 255)


# Function to create generators based on image size and batch size
def create_generators(image_size=(128, 128), batch_size=16):
    """Create the train, validation and test image generators.

    Reads the module-level DataFrames `combined_train_df` and `combined_test_df`,
    which must provide a 'filepath' column (image path) and a 'family' column (label).

    Args:
        image_size: (height, width) every image is resized to.
        batch_size: Number of images per batch for the training and validation generators.

    Returns:
        Tuple `(train_generator, val_generator, test_generator)`. The test generator
        always yields one image per batch, in DataFrame order.
    """
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=combined_train_df,      # DataFrame with image filepaths and labels
        x_col='filepath',                 # Column with full path to images
        y_col='family',                   # Column with class labels (family)
        target_size=image_size,           # Resize all images to this size
        class_mode='categorical',         # One-hot encode class labels
        batch_size=batch_size,            # Load this many images per batch
        subset='training',                # Use the training part of the split
        shuffle=True,                     # Shuffle the data for training
        seed=4                            # For reproducibility
    )

    val_generator = val_datagen.flow_from_dataframe(
        dataframe=combined_train_df,      # Same DataFrame as above, other part of the split
        x_col='filepath',                 # Column with full path to images
        y_col='family',                   # Column with class labels (family)
        target_size=image_size,           # Resize all images to this size
        class_mode='categorical',         # One-hot encode class labels
        batch_size=batch_size,            # Load this many images per batch
        subset='validation',              # Use the held-out validation part of the split
        shuffle=False                     # Order does not affect validation metrics; keep it stable
    )

    test_generator = test_datagen.flow_from_dataframe(
        dataframe=combined_test_df,
        x_col='filepath',
        y_col='family',
        target_size=image_size,
        class_mode='categorical',
        batch_size=1,                     # Load one image at a time for testing
        shuffle=False                     # Do not shuffle so predictions align with input order
    )

    return train_generator, val_generator, test_generator

In [5]:
# Checkpoint callback shared by the training cells.
# The file name embeds the epoch number and the validation accuracy, so training must
# run with validation data and the 'accuracy' metric for the template to be filled in.
checkpoint = ModelCheckpoint(
    # Target file; placeholders are formatted by Keras when the file is written
    filepath='model_checkpoints/epoch_{epoch:02d}_valacc_{val_accuracy:.2f}.keras',
    # Metric to track
    monitor='val_loss',
    # False: write a file on every save; True would keep only the best model
    save_best_only=False,
    # False: store the full model; True would store only the weights
    save_weights_only=False,
    verbose=1,
)

# 4. Train a Baseline Model <a class="anchor" id="train"></a>

In [6]:
# Input resolution and batch size for the baseline run
image_size = (224, 224)
batch_size = 32

# Build the train / validation / test generators with these settings
train_generator, val_generator, test_generator = create_generators(image_size, batch_size)

Found 7566 validated image filenames belonging to 202 classes.
Found 1891 validated image filenames belonging to 202 classes.
Found 2367 validated image filenames belonging to 202 classes.


In [7]:
# Derive the model dimensions from the data pipeline instead of hard-coding them,
# so the network always matches what the generators actually produce.
input_shape = tuple(image_size) + (3,)              # (height, width, RGB channels)
num_classes = len(train_generator.class_indices)    # one output unit per 'family' label

# Load VGG16 as a frozen feature extractor
# NOTE(review): the generators rescale pixels to [0, 1], whereas the Keras documentation
# states that the ImageNet VGG16 weights expect `vgg16.preprocess_input` (RGB→BGR,
# zero-centred, no scaling). Consider switching the generators to that preprocessing.
base_model = VGG16(
    input_shape=input_shape,
    include_top=False,       # Don't include the original classifier
    weights='imagenet'
)
base_model.trainable = False  # freeze all layers for baseline

# Build full model: frozen convolutional base + small trainable classification head
model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),       # More efficient than Flatten
    layers.Dense(512, activation='relu'),  # Larger dense layer for higher class count
    layers.Dropout(0.5),
    layers.Dense(num_classes, activation='softmax')  # One probability per class
])

# Compile the model
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',       # Matches the one-hot labels from class_mode='categorical'
    metrics=['accuracy']
)

In [8]:
# Fit the model on the training generator and evaluate on the validation generator
# after each epoch. Early stopping (patience of 3 epochs, best weights restored) ends
# the run before the 30-epoch limit if needed; `checkpoint` saves the model along the way.
history = model.fit(
    x=train_generator,
    epochs=30,
    validation_data=val_generator,
    verbose=1,
    callbacks=[
        EarlyStopping(patience=3, restore_best_weights=True),
        checkpoint,
    ],
)

Epoch 1/30
 46/237 [====>.........................] - ETA: 48:29 - loss: 5.4719 - accuracy: 0.0076

KeyboardInterrupt: 

In [None]:
# Resume from the most recent checkpoint instead of a hand-edited placeholder path.
# Checkpoint files are named 'epoch_<NN>_valacc_<acc>.keras', so the second
# '_'-separated token of the file name is the epoch number.
checkpoint_paths = glob('model_checkpoints/epoch_[0-9]*_valacc_*.keras')
if not checkpoint_paths:
    raise FileNotFoundError(
        "No checkpoint found in 'model_checkpoints/'; run the training cell first."
    )

latest_checkpoint = max(
    checkpoint_paths,
    key=lambda path: int(os.path.basename(path).split('_')[1]),
)
print(f"Loading checkpoint: {latest_checkpoint}")
model = load_model(latest_checkpoint)

In [None]:
# Continue training from where the checkpoints left off.
# Keras numbers the epochs in checkpoint file names starting at 1, while `initial_epoch`
# is 0-based: the highest saved epoch number is therefore the index of the next epoch.
# This assumes `model` was loaded from the most recent checkpoint; with no checkpoint
# on disk, training simply starts at epoch 0.
completed_epochs = max(
    (
        int(os.path.basename(path).split('_')[1])
        for path in glob('model_checkpoints/epoch_[0-9]*_valacc_*.keras')
    ),
    default=0,
)

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=30,                       # Total number of epochs, not the number of additional ones
    initial_epoch=completed_epochs,  # Skip the epochs that were already trained
    callbacks=[EarlyStopping(patience=3, restore_best_weights=True), checkpoint],
    verbose=1
)

In [None]:
# Plot training vs. validation curves, one figure per metric.
# `history` only covers the epochs of the most recent `model.fit` call.
# Each entry: (history key, y-axis label, legend suffix, figure title)
curve_specs = [
    ('accuracy', 'Accuracy', 'Acc', 'Training vs. Validation Accuracy'),
    ('loss', 'Loss', 'Loss', 'Loss over Epochs'),
]

for metric, axis_label, legend_suffix, title in curve_specs:
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label=f'Train {legend_suffix}')
    ax.plot(history.history[f'val_{metric}'], label=f'Val {legend_suffix}')
    ax.set(xlabel='Epoch', ylabel=axis_label, title=title)
    ax.legend()
    plt.show()