# Final Project

## Setup


1. In Colab, open **Runtime > Change runtime type** and choose *Python 3* with a *T4 GPU*.
2. Run the following cell to set up the environment (takes about 1.5 minutes).



In [95]:
# Environment setup. Each line shells out to pip from the notebook.
# Installs IPython pinned to 7.34.0, setuptools constrained to >=68.0.0,<68.3.0, and
# torch/torchvision/torchaudio with the CUDA 12.1 wheel index added as an extra index.
! pip install --quiet ipython[notebook]==7.34.0 "setuptools>=68.0.0,<68.3.0" torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
! pip install --quiet --upgrade matplotlib
# NOTE(review): this upgrade may replace the ipython==7.34.0 pin requested above -- confirm which version is intended.
! pip install --quiet --upgrade ipython
# numpy is pinned to 1.26.4.
! pip install --quiet numpy==1.26.4
# NOTE(review): validators and pandas are installed unpinned and without --quiet, unlike the lines above.
! pip install validators pandas 



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Let's start with importing our standard set of libraries.

In [96]:
import torch
from torch import nn, optim, autograd
import torchvision.models as models
import torchvision.utils as vutils
import matplotlib.pyplot as plt
import time
import sys
from dataclasses import dataclass

%matplotlib inline
# Runtime configuration: one intra-op CPU thread and a fixed RNG seed.
torch.set_num_threads(1)
torch.manual_seed(1)

# Show the CUDA architectures this PyTorch build reports.
print(torch.cuda.get_arch_list())

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
  # Running without a GPU is acceptable. In that case training the model for
  # 2 epochs is sufficient for full marks (note that 2 epochs take roughly 1.5 hours on a CPU).
  print('GPU is not detected. Make sure you have chosen the right runtime type')
else:
  print('Everything looks good; continue')



['sm_50', 'sm_60', 'sm_61', 'sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90']
Everything looks good; continue


In [97]:
import numpy as np
from torch.utils.data import Dataset
import os
import pandas as pd
from PIL import Image

# Normalize the CSV file
def normalize(df):
    """Log-transform, outlier-filter and min-max scale the trait columns.

    The first column is treated as the sample ID and the last six columns as
    the traits. The caller's DataFrame is never modified; a new frame is
    returned.

    Steps applied to the six trait columns:
      1. cast to float64 (the ID column is cast to int64),
      2. ``log10(x + 1e-4)``,
      3. drop every row where any trait lies more than 3 standard deviations
         from that trait's mean (rows with NaN traits are dropped too, since
         the comparison is False for NaN),
      4. min-max scale the surviving rows to ``[0, 1]``.

    Args:
        df: pandas DataFrame whose first column is the ID and whose last six
            columns are the traits.

    Returns:
        Tuple ``(normalized_df, min_train, max_train)`` where ``min_train`` and
        ``max_train`` are pandas Series (indexed by trait column name) holding
        the per-trait minimum and maximum of the log-transformed, filtered
        data. They are needed to invert the scaling at prediction time.
    """
    id_col = df.columns[0]
    trait_cols = list(df.columns[-6:])

    # ``astype`` returns a new frame, so the dtypes are really enforced (assigning
    # an ``astype`` result back through ``iloc`` may keep the old column dtype) and
    # the caller's DataFrame is left untouched.
    df = df.astype({id_col: 'int64', **{col: 'float64' for col in trait_cols}})

    # Apply log-10 transformation; the small offset keeps zeros finite.
    log_traits = np.log10(df[trait_cols] + 1e-4)

    # Remove outliers (more than 3 standard deviations from the mean)
    within_bounds = (
        (log_traits - log_traits.mean()).abs() <= 3 * log_traits.std()
    ).all(axis=1)
    # Explicit copy: the filtered frame is written to below, and writing into a
    # slice of another frame triggers pandas' SettingWithCopy behaviour.
    df = df.loc[within_bounds].copy()
    log_traits = log_traits.loc[within_bounds]

    # Normalize the traits to [0, 1] using statistics of the filtered rows
    min_train = log_traits.min()
    max_train = log_traits.max()
    df[trait_cols] = (log_traits - min_train) / (max_train - min_train)

    # Report (but do not remove) rows that share an ID.
    duplicate_ids = df[df.duplicated(subset=[id_col], keep=False)]
    if not duplicate_ids.empty:
        print(f"Duplicate IDs found: {duplicate_ids[id_col]}")
    else:
        print("No duplicate IDs found")

    return df, min_train, max_train


# ImageDataset Class
class ImageDataset(Dataset):
    """Dataset over every regular file in a directory, yielding ``(image, id)``.

    Args:
        img_dir: directory containing the image files (sub-directories are ignored).
        transform: optional callable applied to the loaded PIL image.
    """

    def __init__(self, img_dir, transform=None):
        self.image_dir = img_dir
        # os.listdir returns entries in arbitrary order; sorting makes the
        # index -> file mapping reproducible across runs and machines.
        self.img_labels = sorted(
            f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))
        )
        self.transform = transform

    def __len__(self):
        """Number of image files found in ``img_dir``."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return ``(image, id)`` for the ``idx``-th file.

        ``id`` is the file name up to its first ``'.'`` (a string).
        """
        file_name = self.img_labels[idx]
        img_name = os.path.join(self.image_dir, file_name)
        # Context manager closes the underlying file as soon as the pixels
        # have been converted into an independent RGB image.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')
        # ID needs the extension (e.g. .jpeg) removed
        sample_id = file_name.split('.')[0]

        if self.transform:
            image = self.transform(image)

        return image, sample_id

# Baseline Dataset Class (Inherits from ImageDataset, adds trait labels)
class BaselineDataset(ImageDataset):
    """Image dataset driven by a DataFrame of IDs and six trailing label columns.

    Images are looked up as ``<img_dir>/<id>.jpeg``. With ``is_train=True`` each
    item is ``(image, id, traits)``; otherwise it is ``(image, id)``.
    """

    def __init__(self, dataframe, img_dir, transform=None, is_train=True):
        super().__init__(img_dir, transform)
        self.is_train = is_train

        # Keep column 0 (the ID) together with the final six columns (the traits).
        kept_positions = [0, *range(-6, 0)]
        self.dataframe = dataframe.iloc[:, kept_positions]

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Row layout: [id] [trait1] [trait2] ... [trait6]
        sample_id = self.dataframe.iloc[idx, 0].astype(int)
        image_path = os.path.join(self.image_dir, str(sample_id) + '.jpeg')
        image = Image.open(image_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        if not self.is_train:
            return image, sample_id

        # Everything after the ID column is a trait label (6 values).
        traits = self.dataframe.iloc[idx, 1:].values.astype('float32')
        return image, sample_id, traits
    

# Plasticity Dataset Class (inherits from BaselineDataset, adds every feature column except the six climate columns)
class PlasticityDataset(BaselineDataset):
    """BaselineDataset extended with the non-climate feature columns.

    The incoming ``dataframe`` is read positionally: column 0 is the ID,
    columns 1-6 are skipped here, and columns 7 up to (but excluding) the last
    six are taken as features. After construction ``self.dataframe`` is laid
    out as ``[id] [feature1..featureN] [last six columns]``.

    Items are ``(image, id, features, traits)`` when ``is_train`` is true and
    ``(image, id, features)`` otherwise. ``__len__`` is inherited unchanged.
    """

    def __init__(self, dataframe, img_dir, transform=None, is_train=True):
        super(PlasticityDataset, self).__init__(dataframe, img_dir, transform, is_train)
        # After the call to super, self.dataframe holds the ID column and the last 6 columns.
        # Use the actual name of the first column as the join key instead of a
        # hard-coded "id", matching how WorldClimateDataset performs its merge.
        id_col = dataframe.columns[0]

        # ID plus every column from position 7 up to the six trailing columns.
        non_climate_features = dataframe.iloc[:, [0] + list(range(7, dataframe.shape[1] - 6))]

        original_df = self.dataframe
        merged = pd.merge(original_df, non_climate_features, on=id_col)

        # Reorder columns so that the features appear before the six trailing columns.
        ordered_columns = [id_col] + list(non_climate_features.columns[1:]) + list(original_df.columns[1:])
        self.dataframe = merged[ordered_columns]

    def __getitem__(self, idx):
        # Row layout: [id] [feature1] [feature2] ... [featureN] [trait1] [trait2] ... [trait6]
        id = self.dataframe.iloc[idx, 0].astype(int)
        img_name = os.path.join(self.image_dir, str(id) + '.jpeg')
        image = Image.open(img_name).convert('RGB')

        if self.transform:
            image = self.transform(image)

        if self.is_train:
            features = self.dataframe.iloc[idx, 1:-6].values.astype('float32')
            # Load trait labels (last 6 columns)
            traits = self.dataframe.iloc[idx, -6:].values.astype('float32')

            return image, id, features, traits

        # Without labels, every column after the ID (including the six trailing
        # columns kept by BaselineDataset) is returned as a feature.
        features = self.dataframe.iloc[idx, 1:].values.astype('float32')
        return image, id, features

# World Climate Dataset Class (inherits from PlasticityDataset, adds the six climate columns)
class WorldClimateDataset(PlasticityDataset):
    """PlasticityDataset extended with columns 1-6 of the input dataframe.

    After construction ``self.dataframe`` is laid out as
    ``[id] [climate1..climate6] [feature1..featureN] [last six columns]``.

    ``__len__`` and ``__getitem__`` are inherited from PlasticityDataset: they
    index relative to the first and last columns only, so the wider layout
    needs no override (the previous overrides were verbatim copies).
    """

    def __init__(self, dataframe, img_dir, transform=None, is_train=True):
        super(WorldClimateDataset, self).__init__(dataframe, img_dir, transform, is_train)
        # After the call to super, self.dataframe holds the ID, the non-climate
        # features and the six trailing columns.
        # ID plus the six columns that directly follow it.
        climate_features = dataframe.iloc[:, 0:7]

        # Join on the ID column (first column of the current dataframe).
        original_df = self.dataframe
        id_col = original_df.columns[0]
        merged = pd.merge(original_df, climate_features, on=id_col)

        # Reorder columns so that the climate columns come right after the ID.
        ordered_columns = [id_col] + list(climate_features.columns[1:]) + list(original_df.columns[1:])
        self.dataframe = merged[ordered_columns]

## Dataloaders for Each Model

In [98]:
### Dataloaders for CNNs (Images) ###
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

TRAIN_IMAGE_DIR = 'data/images/train_images/'
TEST_IMAGE_DIR = 'data/images/test_images/'
IMAGE_SIZE = (512, 512)

# Training transform: resize to 512x512 plus light random augmentation.
train_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),  # Adjust to [0.9, 1.1]
    transforms.ToTensor()
])

# Evaluation transform: resize only. Random flips / colour jitter on validation
# or test images would make the validation loss and the submission
# non-reproducible, so they are applied to training samples only.
eval_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor()
])

# Backward-compatible alias for code that still refers to `transform`.
transform = train_transform

# CSV files
train_df = pd.read_csv('data/train.csv')
print(f"Number of training samples before normalization: {len(train_df)}")
train_df, min_train, max_train = normalize(train_df)
print(f"Number of training samples after normalization: {len(train_df)}")

test_df = pd.read_csv('data/test.csv')
batch_size = 16

# 80/20 train/validation split. One seeded permutation is shared by all three
# dataset variants so that the models are trained and validated on the same samples.
num_train_images = len(train_df)
train_size = int(0.8 * num_train_images)
val_size = num_train_images - train_size

split_generator = torch.Generator().manual_seed(1)
shuffled_indices = torch.randperm(num_train_images, generator=split_generator).tolist()
train_indices = shuffled_indices[:train_size]
val_indices = shuffled_indices[train_size:]


def build_loaders(dataset_cls, label):
    """Create (train, val, test) DataLoaders for one dataset variant.

    Args:
        dataset_cls: BaselineDataset, PlasticityDataset or WorldClimateDataset.
        label: prefix used when printing the dataset's column index.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``. Only the training
        loader shuffles and uses the augmenting transform.

    Raises:
        ValueError: if the dataset length differs from ``len(train_df)``, in
            which case the shared split indices would not be valid.
    """
    augmented_dataset = dataset_cls(dataframe=train_df, img_dir=TRAIN_IMAGE_DIR, transform=train_transform)
    print(f"{label} Cols: ", augmented_dataset.dataframe.columns)
    if len(augmented_dataset) != num_train_images:
        raise ValueError(
            f"{label} dataset has {len(augmented_dataset)} rows, expected {num_train_images}"
        )

    # Same rows, but without augmentation, for validation.
    plain_dataset = dataset_cls(dataframe=train_df, img_dir=TRAIN_IMAGE_DIR, transform=eval_transform)

    train_subset = torch.utils.data.Subset(augmented_dataset, train_indices)
    val_subset = torch.utils.data.Subset(plain_dataset, val_indices)
    test_set = dataset_cls(dataframe=test_df, img_dir=TEST_IMAGE_DIR, transform=eval_transform, is_train=False)

    return (
        DataLoader(train_subset, batch_size=batch_size, shuffle=True),
        DataLoader(val_subset, batch_size=batch_size, shuffle=False),
        DataLoader(test_set, batch_size=batch_size, shuffle=False),
    )


############################################################################
# Baseline (images only)
baseline_train_loader, baseline_val_loader, baseline_test_loader = build_loaders(BaselineDataset, "Base line")

############################################################################
# Plasticity (images + non-climate features)
plasticity_train_loader, plasticity_val_loader, plasticity_test_loader = build_loaders(PlasticityDataset, "Plasticity")

############################################################################
# World climate (images + climate and non-climate features)
world_climate_train_loader, world_climate_val_loader, world_climate_test_loader = build_loaders(WorldClimateDataset, "World Climate")

# Kept for backward compatibility: these names previously ended up bound to the
# world-climate datasets after the last section ran.
train_dataset = world_climate_train_loader.dataset
val_dataset = world_climate_val_loader.dataset
test_dataset = world_climate_test_loader.dataset


Number of training samples before normalization: 43363
No duplicate IDs found
Number of training samples after normalization: 39556
Base line Cols:  Index(['id', 'X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean',
       'X3112_mean'],
      dtype='object')
Plasticity Cols:  Index(['id', 'SOIL_bdod_0.5cm_mean_0.01_deg',
       'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg',
       'SOIL_bdod_30.60cm_mean_0.01_deg', 'SOIL_bdod_5.15cm_mean_0.01_deg',
       'SOIL_bdod_60.100cm_mean_0.01_deg', 'SOIL_cec_0.5cm_mean_0.01_deg',
       'SOIL_cec_100.200cm_mean_0.01_deg', 'SOIL_cec_15.30cm_mean_0.01_deg',
       ...
       'VOD_X_1997_2018_multiyear_mean_m09',
       'VOD_X_1997_2018_multiyear_mean_m10',
       'VOD_X_1997_2018_multiyear_mean_m11',
       'VOD_X_1997_2018_multiyear_mean_m12', 'X4_mean', 'X11_mean', 'X18_mean',
       'X26_mean', 'X50_mean', 'X3112_mean'],
      dtype='object', length=164)
World Climate Cols:  Index(['id', 'WORLDCLIM_BIO1_annual_m

In [104]:
from torchvision.models import inception_v3, Inception_V3_Weights
import torch.nn as nn
import torch

# Define the number of traits and features
# NUM_TRAITS: number of regression targets predicted per image (the six trait columns).
NUM_TRAITS = 6
# NUM_FEATURES: number of non-climate feature columns fed to the model. The Plasticity
# dataframe printed earlier has 164 columns = 1 id + 157 features + 6 traits.
NUM_FEATURES = 157 

class CombinedModel(nn.Module):
    """Image backbone fused with a small branch for extra numeric features.

    The backbone's ``fc`` head is replaced (in place, on the object passed in)
    by ``nn.Identity`` so that it emits its embedding. The numeric features are
    projected to 128 dimensions, concatenated with the embedding and mapped to
    ``num_traits`` outputs by one linear layer.

    Args:
        base_model: module exposing an ``fc`` layer with an ``in_features`` attribute.
        num_features: width of the additional feature vector.
        num_traits: number of values to predict.
    """

    def __init__(self, base_model, num_features, num_traits):
        super().__init__()
        # The embedding width has to be read before the head is swapped out.
        embedding_dim = base_model.fc.in_features
        base_model.fc = nn.Identity()  # Drop the final classification layer
        self.base_model = base_model

        # Branch for the additional features
        self.feature_fc = nn.Linear(num_features, 128)

        # Head operating on [image embedding | projected features]
        self.combined_fc = nn.Linear(embedding_dim + 128, num_traits)

    def forward(self, images, features):
        """Return trait predictions for a batch of images and feature vectors."""
        embedding = self.base_model(images)

        # A tuple-like backbone output carries the main output in `.logits`.
        if isinstance(embedding, tuple):
            embedding = embedding.logits

        projected = self.feature_fc(features)
        fused = torch.cat([embedding, projected], dim=1)
        return self.combined_fc(fused)

# Load the InceptionV3 model
base_model = inception_v3(weights=Inception_V3_Weights.DEFAULT)

# Initialize the Plasticity Model
# Note: CombinedModel replaces base_model.fc with nn.Identity in place, so after this
# line base_model itself no longer has its original final layer.
plasticity_model = CombinedModel(base_model, num_features=NUM_FEATURES, num_traits=NUM_TRAITS)

# Initialize the World Climate Model
# NOTE(review): re-using the same base_model here would fail, because its fc is already
# nn.Identity (which has no in_features); pass a freshly loaded inception_v3 instead.
# world_climate_model = CombinedModel(base_model, num_features=NUM_FEATURES + 6, num_traits=NUM_TRAITS)

## Before Training 

Next we define the loss function and the optimizer. We train with mean-squared-error loss and the [RMSprop](https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html) algorithm.



In [100]:
# Training loss: mean squared error between predictions and the normalized traits.
criterion = nn.MSELoss()
# Optimize the parameters of the model that is actually trained. plasticity_model
# contains base_model as a sub-module plus the feature_fc and combined_fc layers;
# building the optimizer from base_model.parameters() alone would leave those two
# layers at their random initial values for the whole run.
optimizer = optim.RMSprop(plasticity_model.parameters(), lr=0.001, weight_decay=0.0001)

## Training pipeline (6 points)


Finally, we train the network. Each epoch consists of two steps: (1) a training pass over the training loader, in which the model's predictions are compared with the normalized traits using the mean-squared-error loss and the weights are updated by the optimizer; (2) a validation pass over the validation loader with gradients disabled, in which the mean absolute error (L1 loss) is reported. The function returns the per-epoch training and validation losses.


In [105]:
def _forward_batch(model, data, has_features):
    """Run the forward pass for one batch and return ``(outputs, traits)`` on ``device``.

    ``data`` is ``(images, ids, features, traits)`` when ``has_features`` is
    true, otherwise ``(images, ids, traits)``. The ids are not needed for the
    loss, so they are left where they are.
    """
    if has_features:
        images, _ids, features, traits = data
        outputs = model(images.to(device), features.to(device))
    else:
        images, _ids, traits = data
        outputs = model(images.to(device))

    # Some models (e.g. torchvision's InceptionV3 in training mode) return a
    # (main, auxiliary) pair; only the main output is used.
    if isinstance(outputs, tuple):
        outputs, _ = outputs

    return outputs, traits.to(device)


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, has_features=False):
    """Train ``model`` and evaluate it on ``val_loader`` after every epoch.

    Args:
        model: network to train; moved to the module-level ``device``.
        train_loader: yields ``(images, ids, traits)`` batches, or
            ``(images, ids, features, traits)`` when ``has_features`` is true.
        val_loader: same batch layout as ``train_loader``.
        criterion: loss minimized during training.
        optimizer: optimizer stepping the model's parameters.
        num_epochs: number of passes over ``train_loader``.
        has_features: whether batches carry an extra feature tensor that is
            passed to the model as a second argument.

    Returns:
        Tuple ``(losses, val_losses)``: per-epoch mean training loss (as
        measured by ``criterion``) and per-epoch mean validation MAE (L1 loss).

    Raises:
        ValueError: if either loader yields no batches (the per-epoch averages
            would otherwise divide by zero).
    """
    num_train_batches = len(train_loader)
    num_val_batches = len(val_loader)
    if num_train_batches == 0 or num_val_batches == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    model = model.to(device)  # Ensure the model is on the GPU if available
    losses = []
    val_losses = []

    # Loop invariants, computed once instead of once per epoch / per batch.
    total_train_batches = num_train_batches * num_epochs
    total_val_batches = num_val_batches * num_epochs
    mae = nn.L1Loss()  # validation metric

    for epoch in range(num_epochs):
        # Training Loop
        model.train()  # Set the model to training mode
        running_loss = 0.0
        for batch, data in enumerate(train_loader):
            # Overall progress across all epochs
            progress = 100 * (epoch * num_train_batches + batch + 1) / total_train_batches
            print(f'Epoch [{epoch+1}/{num_epochs}] Progress: [{progress:.2f}%]', end='\r')

            optimizer.zero_grad()  # Clear previous gradients
            outputs, traits = _forward_batch(model, data, has_features)
            loss = criterion(outputs, traits)  # Calculate loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model weights

            running_loss += loss.item()

        # Append the average loss for this epoch
        losses.append(running_loss / num_train_batches)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/num_train_batches:.4f}')

        # clear cache
        torch.cuda.empty_cache()

        # Validation Loop
        model.eval()  # Set the model to evaluation mode
        val_loss = 0.0
        with torch.no_grad():  # Disable gradient calculation for validation
            for batch, data in enumerate(val_loader):
                progress = 100 * (epoch * num_val_batches + batch + 1) / total_val_batches
                print(f'Epoch [{epoch+1}/{num_epochs}] Progress: [{progress:.2f}%]', end='\r')

                outputs, traits = _forward_batch(model, data, has_features)
                val_loss += mae(outputs, traits).item()  # Use MAE (L1 loss) for validation

        val_losses.append(val_loss / num_val_batches)  # Average validation loss for this epoch
        print(f'Validation Loss: {val_loss/num_val_batches:.4f}')
        # clear cache
        torch.cuda.empty_cache()

    return losses, val_losses


# Train the model
# Exactly one of the three variants below is meant to be active at a time.
# NOTE(review): base_model.fc is replaced by nn.Identity when plasticity_model is built,
# so the baseline variant would need a freshly loaded model with a 6-output head -- confirm before enabling.
# training_loss, validation_loss = train_model(base_model, baseline_train_loader, baseline_val_loader, criterion, optimizer, num_epochs=1)
# has_features=True because the plasticity loaders yield (image, id, features, traits) batches.
training_loss, validation_loss = train_model(plasticity_model, plasticity_train_loader, plasticity_val_loader, criterion, optimizer, num_epochs=1, has_features=True)
# training_loss, validation_loss = train_model(world_climate_model, world_climate_train_loader, world_climate_val_loader, criterion, optimizer, num_epochs=1, has_features=True)

# Save the trained model
# Only the state_dict (parameters and buffers) is written, to the current working directory.
# torch.save(base_model.state_dict(), 'inception_v3_baseline.pth')
torch.save(plasticity_model.state_dict(), 'inception_v3_plasticity.pth')
# torch.save(world_climate_model.state_dict(), 'inception_v3_world_climate.pth')


Epoch [1/1] Progress: [5.71%]

In [103]:
import torch
import numpy as np

def test_model(model, test_loader, min_train, max_train, has_features=False):
    """Predict traits for every sample in ``test_loader`` in the original trait scale.

    Model outputs are assumed to be in the normalized space produced by
    ``normalize`` (min-max scaled ``log10(x + 1e-4)``) and are mapped back by
    applying the inverse of each step.

    Args:
        model: trained network; moved to the module-level ``device``.
        test_loader: yields ``(images, ids)`` batches, or
            ``(images, ids, features)`` when ``has_features`` is true.
        min_train: pandas Series of per-trait minima returned by ``normalize``.
        max_train: pandas Series of per-trait maxima returned by ``normalize``.
        has_features: whether batches carry an extra feature tensor.

    Returns:
        List of rows ``[id, trait1, ..., traitK]`` (``id`` as ``int``, traits as
        ``float``), one per distinct ID in first-seen order.
    """
    model = model.to(device)
    model.eval()  # Set the model to evaluation mode
    predictions = []
    seen_ids = set()  # Track seen IDs to avoid duplicates

    # Scaling statistics on the same device as the model outputs
    min_train_tensor = torch.tensor(min_train.values).to(device)
    max_train_tensor = torch.tensor(max_train.values).to(device)
    trait_range = max_train_tensor - min_train_tensor

    with torch.no_grad():
        num_batches = len(test_loader)
        duplicate_ids = 0
        for batch, data in enumerate(test_loader):
            print(f'Progress: [{100*(batch+1)/num_batches:.2f}%]', end='\r')
            if has_features:
                images, ids, features = data
                outputs = model(images.to(device), features.to(device))
            else:
                images, ids = data
                outputs = model(images.to(device))

            # Undo the min-max scaling, then the log10(x + 1e-4) transform.
            # The offset must be subtracted again for an exact inverse.
            outputs = outputs * trait_range + min_train_tensor
            outputs = torch.pow(10, outputs) - 1e-4

            # One device-to-host transfer per batch instead of one per element;
            # the ids never need to be on the GPU.
            batch_ids = ids.tolist() if torch.is_tensor(ids) else list(ids)
            batch_rows = outputs.cpu().numpy().tolist()

            # Group id with predicted traits (output)
            for raw_id, row in zip(batch_ids, batch_rows):
                sample_id = int(raw_id)
                if sample_id not in seen_ids:
                    seen_ids.add(sample_id)
                    predictions.append([sample_id] + row)
                else:
                    duplicate_ids += 1
        print(f'Number of duplicate IDs: {duplicate_ids}')

    return predictions

# Assuming min_train and max_train were computed earlier during the training data preprocessing
# Exactly one of the three variants below is meant to be active; it should match the model that was trained.
# predictions = test_model(base_model, baseline_test_loader, min_train, max_train)
predictions = test_model(plasticity_model, plasticity_test_loader, min_train, max_train, has_features=True)
# predictions = test_model(world_climate_model, world_climate_test_loader, min_train, max_train, has_features=True)

# Convert to numpy array and save to CSV
# Each row is [id, trait1, ..., trait6]; the header names the columns in that order.
predictions = np.array(predictions, dtype=object)  # Ensure proper handling of mixed types (int and float)
# comments='' stops np.savetxt from prefixing the header line with '# '.
np.savetxt('submission.csv', predictions, delimiter=',', fmt='%s', header='id,X4,X11,X18,X26,X50,X3112', comments='')
print('Submission file saved')


Number of duplicate IDs: 0
Submission file saved
