# Final Project

## Setup


1. In Colab, open tab Runtime > Change runtime type, choose *python3* and *T4 GPU*.
2. Run the following command to set up the environment. (Takes ~ 1.5 min)



In [10]:
! pip install tensorflow[and-cuda]==2.16.1

Collecting tensorflow==2.16.1 (from tensorflow[and-cuda]==2.16.1)
  Using cached tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow==2.16.1->tensorflow[and-cuda]==2.16.1)
  Using cached tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting nvidia-cublas-cu12==12.3.4.1 (from tensorflow[and-cuda]==2.16.1)
  Using cached nvidia_cublas_cu12-12.3.4.1-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.3.101 (from tensorflow[and-cuda]==2.16.1)
  Using cached nvidia_cuda_cupti_cu12-12.3.101-py3-none-win_amd64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvcc-cu12==12.3.107 (from tensorflow[and-cuda]==2.16.1)
  Using cached nvidia_cuda_nvcc_cu12-12.3.107-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.3.107 (from tensorflow[and-cuda]==2.16.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.3.107-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting nvidia-cu

ERROR: Could not find a version that satisfies the requirement nvidia-nccl-cu12==2.19.3; extra == "and-cuda" (from tensorflow[and-cuda]) (from versions: 0.0.1.dev5)
ERROR: No matching distribution found for nvidia-nccl-cu12==2.19.3; extra == "and-cuda"

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Let's start by checking that TensorFlow is installed and whether it can see a GPU.

In [11]:
# Report whether TensorFlow can see a GPU and which TensorFlow build is active.
import tensorflow as tf

# `tf.config.list_physical_devices` is the stable endpoint; the
# `tf.config.experimental` alias used previously is the legacy spelling.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print("CUDA available:", tf.test.is_built_with_cuda())
# tensorflow version
print("Tensorflow version: ", tf.__version__)

Num GPUs Available:  0
CUDA available: False
Tensorflow version:  2.17.0


In [None]:
import numpy as np
from torch.utils.data import Dataset
import os
import pandas as pd
from PIL import Image

# Normalize the CSV file
def normalize(df, num_traits=6):
    """Log-transform, outlier-filter and min-max scale the trailing trait columns.

    The input frame is left untouched; all work happens on a copy.

    Steps, applied to the last ``num_traits`` columns:
      1. cast to float64 and apply ``log10(x + 1e-4)``;
      2. drop every row in which any trait lies more than three standard
         deviations from that trait's mean (rows with NaN traits are dropped
         too, because the comparison is False for NaN);
      3. min-max scale the surviving rows to ``[0, 1]``.

    The first column is treated as the sample ID and cast to int64. Duplicate
    IDs are reported on stdout but not removed.

    Args:
        df: DataFrame laid out as ``[id, ..., trait_1, ..., trait_n]``.
        num_traits: number of trailing trait columns (default 6).

    Returns:
        ``(normalized_df, min_train, max_train)`` where the last two are Series
        holding the per-trait minimum / maximum in log space *before* scaling,
        i.e. the values needed to invert the scaling on predictions.
    """
    out = df.copy()  # never mutate the caller's frame
    id_col = out.columns[0]
    trait_cols = out.columns[-num_traits:]

    # Fix dtypes up front so later arithmetic cannot silently change them.
    out[id_col] = out[id_col].astype('int64')

    # Log-10 transform; the small offset keeps zeros finite.
    out[trait_cols] = np.log10(out[trait_cols].astype('float64') + 1e-4)

    # Keep only rows whose traits are all within 3 standard deviations.
    traits = out[trait_cols]
    within_bounds = ((traits - traits.mean()).abs() <= 3 * traits.std()).all(axis=1)
    # Explicit copy: the filtered frame is written to below.
    out = out.loc[within_bounds].copy()

    # Min-max scale using statistics of the filtered rows.
    min_train = out[trait_cols].min()
    max_train = out[trait_cols].max()
    # A constant column has zero span; divide by 1 there so it maps to 0
    # instead of NaN. The returned min/max are left unmodified.
    span = (max_train - min_train).replace(0, 1)
    out[trait_cols] = (out[trait_cols] - min_train) / span

    duplicate_ids = out[out.duplicated(subset=[id_col], keep=False)]
    if not duplicate_ids.empty:
        print(f"Duplicate IDs found: {duplicate_ids[id_col]}")
    else:
        print("No duplicate IDs found")

    return out, min_train, max_train


# ImageDataset Class
class ImageDataset(Dataset):
    """Dataset over every regular file in a directory, yielding ``(image, id)``.

    The ID is the file name with its final extension removed (``"123.jpeg"``
    becomes ``"123"``) and is returned as a string.

    Args:
        img_dir: directory containing the image files.
        transform: optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, img_dir, transform=None):
        self.image_dir = img_dir
        # os.listdir returns entries in arbitrary order; sort so that index ->
        # file is stable across runs and machines.
        self.img_labels = sorted(
            entry for entry in os.listdir(img_dir)
            if os.path.isfile(os.path.join(img_dir, entry))
        )
        self.transform = transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        file_name = self.img_labels[idx]

        # convert() returns a fully loaded copy, so the underlying file handle
        # can be released as soon as the block exits.
        with Image.open(os.path.join(self.image_dir, file_name)) as raw_image:
            image = raw_image.convert('RGB')

        # Strip only the final extension so names containing dots stay intact.
        image_id = os.path.splitext(file_name)[0]

        if self.transform is not None:
            image = self.transform(image)

        return image, image_id

# Baseline Dataset Class (Inherits from ImageDataset, adds trait labels)
class BaselineDataset(ImageDataset):
    """Image dataset driven by a DataFrame of IDs, tabular features and traits.

    Expected row layout:
    ``[id] [feature_1 ... feature_N] [trait_1 ... trait_6]`` when
    ``is_train`` is True, and ``[id] [feature_1 ... feature_N]`` otherwise.
    The image for a row is read from ``<img_dir>/<id>.jpeg``.

    Args:
        dataframe: table described above; rows are addressed positionally.
        img_dir: directory holding the ``<id>.jpeg`` files.
        transform: optional callable applied to the loaded RGB PIL image.
        is_train: whether the last six columns are trait labels.
    """

    def __init__(self, dataframe, img_dir, transform=None, is_train=True):
        super().__init__(img_dir, transform)
        self.is_train = is_train
        self.dataframe = dataframe

    def __len__(self):
        # Length follows the table, not the directory listing of the parent.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # int() works for NumPy and plain Python scalars alike, whereas
        # `.astype` only exists on NumPy scalars.
        sample_id = int(self.dataframe.iloc[idx, 0])

        image_path = os.path.join(self.image_dir, str(sample_id) + '.jpeg')
        # convert() returns a loaded copy, so the file handle can be closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        if self.is_train:
            # Everything between the ID and the six trailing trait columns.
            features = self.dataframe.iloc[idx, 1:-6].values.astype('float32')
            # Plant trait labels (last 6 columns).
            traits = self.dataframe.iloc[idx, -6:].values.astype('float32')
            return image, sample_id, features, traits

        # No trait columns at inference time: all non-ID columns are features.
        features = self.dataframe.iloc[idx, 1:].values.astype('float32')
        return image, sample_id, features
    

## Dataloaders for Each Model

In [None]:
### Dataloaders for CNNs (Images) ###
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import KFold

# Fixed seed so the cross-validation split is identical on every run.
SEED = 42
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Training-time transformations: resize plus light random augmentation.
transform = transforms.Compose([
    transforms.Resize((512, 512)),  # Upscale images to 512x512
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),  # Adjust to [0.9, 1.1]
    transforms.ToTensor()
])

# Inference-time transformations: deterministic, no augmentation.
test_transform = transforms.Compose([
    transforms.Resize((512, 512)),  # Resize images to 512x512
    transforms.ToTensor()           # Convert images to PyTorch tensors
])

# CSV files
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

print(f"Number of training samples before normalization: {len(train_df)}")
train_df, min_train, max_train = normalize(train_df)
print(f"Number of training samples after normalization: {len(train_df)}")

# Training data gets the augmenting pipeline. Validation folds are Subsets of
# this same dataset, so they see the same augmentation.
train_dataset = BaselineDataset(dataframe=train_df, img_dir='data/images/train_images/', transform=transform)
# print columns in train_dataset df
print("Base line Cols: ", train_dataset.dataframe.columns)

# Test data must use the deterministic pipeline; random flips / colour jitter
# here would make the submitted predictions non-reproducible.
test_dataset = BaselineDataset(dataframe=test_df, img_dir='data/images/test_images/', transform=test_transform, is_train=False)

batch_size = 16
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
from torchvision.models import inception_v3, Inception_V3_Weights
import torch.nn as nn
import torch
from xgboost import XGBRegressor
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Number of plant-trait regression targets. Mirrors the six trailing trait
# columns that `normalize` and `BaselineDataset` slice with `-6:`.
# NOTE(review): not referenced by any code visible in this notebook.
NUM_TRAITS = 6

class FeatureExtractorCNN(nn.Module):
    """Wrap a classifier backbone so it emits pooled features instead of logits.

    The backbone's ``fc`` layer is replaced in place with ``nn.Identity`` and
    dropout is applied to whatever the backbone then returns.

    Args:
        base_model: module exposing an ``fc`` layer with an ``in_features``
            attribute (e.g. torchvision's InceptionV3). It is modified in place.
        dropout: dropout probability applied to the extracted features.

    Attributes:
        out_features: width of the feature vector, taken from the original
            ``fc.in_features``.
    """

    def __init__(self, base_model, dropout=0.5):
        super().__init__()
        self.base_model = base_model
        # Record the feature width before the classification layer is removed
        # so downstream code can size its inputs without a forward pass.
        self.out_features = base_model.fc.in_features
        self.base_model.fc = nn.Identity()  # Remove the final classification layer to extract features

        # Dropout on the extracted image features before they are returned.
        self.dropout = nn.Dropout(dropout)

    def forward(self, images):
        """Return dropout-regularised backbone features for a batch of images."""
        image_features = self.base_model(images)

        # Some backbones return a tuple of outputs (torchvision's InceptionV3
        # does so in training mode); only the first entry is used.
        if isinstance(image_features, tuple):
            image_features = image_features[0]

        return self.dropout(image_features)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is read by the training and inference cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the InceptionV3 model with its default pretrained weights
base_model = inception_v3(weights=Inception_V3_Weights.DEFAULT)

# Freeze the whole backbone, then unfreeze only the last block (Mixed_7c)
# so that fine-tuning touches just those parameters.
for param in base_model.parameters():
    param.requires_grad = False
for param in base_model.Mixed_7c.parameters():
    param.requires_grad = True

feature_extractor = FeatureExtractorCNN(base_model).to(device)

# Initialize optimizer (trainable parameters only) and learning rate scheduler
trainable_params = [p for p in feature_extractor.parameters() if p.requires_grad]
optimizer = torch.optim.RMSprop(trainable_params, lr=1e-4, weight_decay=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
# Spelled `criterion` because that is the name the training loop reads.
criterion = nn.MSELoss()

## Training pipeline (6 points)


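**Note:** the paragraph below is left over from a GAN assignment template and does not describe the code in this notebook. The cell that follows instead runs k-fold cross-validation in which an InceptionV3 feature extractor produces image features, these are concatenated with the tabular features, and an XGBoost regressor is fitted on the result to predict the six plant traits.

Finally, we perform training on the two networks. The training consists of two steps: (1) Updating the discriminator (critic) for `n_critic` steps, so that we have a near-optimal critic. Here we use the sum of three loss terms: (a) the real loss (the critic's scalar output for real images); (b) the fake loss (the same value for fake images); (c) the [gradient penalty](https://arxiv.org/pdf/1704.00028). (2) Updating the generator using only the fake loss (to fool the critic).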


In [None]:
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

def _extract_features(feature_extractor, loader, desc=None):
    """Return stacked (combined_features, traits) NumPy arrays for a loader.

    Runs in eval mode under ``torch.no_grad()`` so dropout is disabled and the
    features match what the extractor produces at validation / test time.
    """
    feature_extractor.eval()
    feature_chunks = []
    trait_chunks = []
    with torch.no_grad():
        for images, _, additional_features, traits in tqdm(loader, desc=desc, unit='batch'):
            image_features = feature_extractor(images.to(device))
            # Image features and tabular features side by side, one row per sample.
            feature_chunks.append(
                np.hstack((image_features.cpu().numpy(), additional_features.cpu().numpy()))
            )
            trait_chunks.append(traits.cpu().numpy())
    return np.vstack(feature_chunks), np.vstack(trait_chunks)


def train_model(feature_extractor, training_dataset, num_epochs=10):
    """Cross-validate a fine-tuned CNN feature extractor + XGBoost regressor.

    For every fold and epoch:
      1. fine-tune the trainable layers of ``feature_extractor`` with an MSE
         loss computed through a small auxiliary linear head;
      2. extract eval-mode features for the training split and fit XGBoost on
         them;
      3. score XGBoost on the validation split (RMSE) and step the scheduler.

    XGBoost predictions are plain NumPy arrays with no autograd graph, so a
    loss built from them cannot be back-propagated into the CNN. The auxiliary
    head provides the differentiable path instead.

    The best epoch of each fold is saved to
    ``best_feature_extractor_fold_<k>.pth`` / ``best_xgb_model_fold_<k>.json``.
    When training finishes, the weights of the best fold overall are loaded
    back into ``feature_extractor``.

    Relies on the notebook-level ``kfold``, ``k_folds``, ``batch_size``,
    ``device``, ``optimizer``, ``scheduler`` and ``criterion``.

    NOTE: the extractor, optimizer and scheduler are shared across folds, so
    weights carry over from one fold to the next and later folds are not
    independent of earlier ones.

    Args:
        feature_extractor: module mapping an image batch to a feature batch.
        training_dataset: dataset yielding ``(image, id, features, traits)``.
        num_epochs: epochs to run per fold.

    Returns:
        The XGBRegressor matching the weights left in ``feature_extractor``,
        or ``None`` if no epoch was run.
    """
    fold_results = []
    best_overall_rmse = float("inf")
    best_overall_fold = None

    # Auxiliary regression head; created lazily once the feature width and
    # number of targets are known from the first batch.
    aux_head = None
    head_optimizer = None

    # Split over plain indices of the dataset that was passed in.
    sample_indices = np.arange(len(training_dataset))

    for fold, (train_idx, val_idx) in enumerate(kfold.split(sample_indices)):
        print(f'Fold {fold+1}/{k_folds}')

        # Subset datasets for current fold
        train_subset = Subset(training_dataset, train_idx)
        val_subset = Subset(training_dataset, val_idx)

        # Shuffled loader for gradient steps, ordered loaders for extraction.
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        train_eval_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=False)
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

        # Initialize XGBoost model
        xgb_model = XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1)

        best_fold_rmse = float("inf")

        for epoch in range(num_epochs):
            print(f'Epoch {epoch+1}/{num_epochs}')

            # --- Fine-tuning phase -------------------------------------
            feature_extractor.train()
            progress = tqdm(train_loader, desc=f'Fold {fold+1}, Epoch {epoch+1}/{num_epochs}', unit='batch')
            for images, _, _, traits in progress:
                images, traits = images.to(device), traits.to(device)

                image_features = feature_extractor(images)
                if aux_head is None:
                    aux_head = nn.Linear(image_features.shape[1], traits.shape[1]).to(device)
                    head_optimizer = torch.optim.Adam(aux_head.parameters(), lr=1e-3)

                optimizer.zero_grad()
                head_optimizer.zero_grad()
                loss = criterion(aux_head(image_features), traits)
                loss.backward()
                optimizer.step()
                head_optimizer.step()

            # --- XGBoost on eval-mode features -------------------------
            # Costs a second pass over the training split, but keeps the
            # features XGBoost is fitted on consistent with validation/test.
            train_features, train_traits = _extract_features(
                feature_extractor, train_eval_loader, desc='Extracting train features'
            )
            xgb_model.fit(train_features, train_traits)

            # --- Validation phase --------------------------------------
            val_features, val_traits = _extract_features(
                feature_extractor, val_loader, desc='Extracting val features'
            )
            val_predictions = xgb_model.predict(val_features)
            val_rmse = float(np.sqrt(mean_squared_error(val_traits, val_predictions)))
            print(f'Fold {fold+1}, Epoch {epoch+1}/{num_epochs}, Validation RMSE: {val_rmse:.4f}')

            scheduler.step(val_rmse)

            # Save the best performing models of this fold
            if val_rmse < best_fold_rmse:
                best_fold_rmse = val_rmse
                print(f'New best model found at Fold {fold+1}, Epoch {epoch+1}, Saving the model...')
                torch.save(feature_extractor.state_dict(), f'best_feature_extractor_fold_{fold+1}.pth')
                xgb_model.save_model(f'best_xgb_model_fold_{fold+1}.json')

        # Record this fold's own best score, not the running global best.
        fold_results.append(best_fold_rmse)
        if best_fold_rmse < best_overall_rmse:
            best_overall_rmse = best_fold_rmse
            best_overall_fold = fold + 1

    print(f'Cross-Validation Results: {fold_results}')
    print(f'Mean RMSE across folds: {np.mean(fold_results):.4f}, Std Dev: {np.std(fold_results):.4f}')

    if best_overall_fold is None:
        # num_epochs == 0: nothing was trained or saved.
        return None

    # Leave the extractor and the returned regressor as a matching pair.
    feature_extractor.load_state_dict(torch.load(f'best_feature_extractor_fold_{best_overall_fold}.pth'))
    best_xgb_model = XGBRegressor()
    best_xgb_model.load_model(f'best_xgb_model_fold_{best_overall_fold}.json')
    return best_xgb_model


# Train the model
xgb_model = train_model(feature_extractor, train_dataset, num_epochs=10)

# Save the models under the names the inference cell loads
torch.save(feature_extractor.state_dict(), 'feature_extractor.pth')
xgb_model.save_model('xgb_model.json')




In [None]:
def test_model(feature_extractor, regression, test_loader, train_min, train_max):
    feature_extractor.eval()
    test_features = []
    test_ids = []
    total_batches = len(test_loader)
    
    with torch.no_grad():
        for batch, data in enumerate(test_loader):
            # Unpack data
            images, ids, additional_features = data
            images, additional_features = images.to(device), additional_features.to(device)
            
            # Extract features from images
            image_features = feature_extractor(images)
            combined_features = torch.cat((image_features, additional_features), dim=1).cpu().numpy()
            test_features.append(combined_features)
            test_ids.extend(ids.numpy().astype(int))  # Store the IDs as integers
    
            print(f"Testing Progress: [{batch + 1}/{total_batches}] Complete", end='\r')
    
    test_features = np.vstack(test_features)
    test_predictions = regression.predict(test_features)
    
    # Renormalize the predictions
    train_min = train_min.values
    train_max = train_max.values
    test_predictions = test_predictions * (train_max - train_min) + train_min
    
    # Undo the log10 transformation
    test_predictions = 10**test_predictions
    
    # Combine the IDs with the predictions
    results = np.column_stack((test_ids, test_predictions))
    
    return results


# Load the trained weights. Mapping to CPU first makes the checkpoint loadable
# on machines without a GPU; load_state_dict then copies the values onto
# whatever device the model's parameters already live on.
feature_extractor.load_state_dict(torch.load('feature_extractor.pth', map_location='cpu'))

# Build the regressor here so this cell does not depend on a variable that
# may only exist inside the training function.
xgb_model = XGBRegressor()
xgb_model.load_model('xgb_model.json')

# Test the model
test_predictions = test_model(feature_extractor, xgb_model, test_loader, min_train, max_train)

# Define the header as specified
header = "id,X4,X11,X18,X26,X50,X3112"

# One integer ID column followed by six trait columns with 6 decimals.
row_format = '%d,' + ','.join(['%.6f'] * 6)

# Save the predictions with the specified header and ensure IDs are integers
np.savetxt('submission.csv', test_predictions, delimiter=',', header=header, comments='', fmt=row_format)
