In [2]:
import os
from PIL import Image
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import numpy as np
from PIL import Image
#from transformers import DINOFeatureExtractor, DINOModel
from transformers import AutoImageProcessor, AutoModel
from torch.utils.data import Subset

#import timm



In [3]:
# Fetch the 'dinov2_vits14' entry point of the facebookresearch/dinov2 repository
# through torch.hub (the repository archive and checkpoint are downloaded on first use).
# NOTE(review): the variable name ends in `_lc`, but the entry point requested is
# 'dinov2_vits14', not 'dinov2_vits14_lc' — confirm which one is intended.
dinov2_vits14_lc = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')


Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
100%|██████████| 84.2M/84.2M [00:02<00:00, 33.7MB/s]


In [4]:
class PlantDataset(Dataset):
    """Training dataset pairing each image with its tabular inputs and targets.

    The CSV is read once at construction. Column 0 holds the image id (the
    image is looked up as ``<image_folder>/<id>.jpeg``), the trailing
    ``num_labels`` columns hold the regression targets, and every column in
    between is returned as ancillary input.

    Args:
        image_folder: Directory containing the ``.jpeg`` images.
        csv_file: Path to the CSV described above.
        transform: Optional callable applied to the RGB image.
        num_labels: Number of trailing target columns (default 6, the value
            that used to be hard-coded).

    Raises:
        ValueError: If ``num_labels`` is smaller than 1 or leaves no room for
            the id column.
    """

    def __init__(self, image_folder, csv_file, transform=None, num_labels=6):
        self.image_folder = image_folder
        self.csv_file = csv_file
        self.transform = transform
        self.data = pd.read_csv(csv_file)

        # Guard the negative slices used in __getitem__: ``-0`` would select
        # nothing/everything, and a too-narrow CSV would leak the id column
        # into the labels.
        n_columns = self.data.shape[1]
        if num_labels < 1 or num_labels >= n_columns:
            raise ValueError(
                f"num_labels must be between 1 and {n_columns - 1} "
                f"for a CSV with {n_columns} columns, got {num_labels}"
            )
        self.num_labels = num_labels

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, ancillary_data, labels)`` for row ``idx``.

        ``ancillary_data`` and ``labels`` are float32 tensors; ``image`` is the
        RGB image after ``transform`` (if any) has been applied.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Index the id cell directly (not via the whole row) so an integer id
        # is not upcast to float and rendered as e.g. "12.0".
        img_name = os.path.join(self.image_folder, str(self.data.iloc[idx, 0]) + '.jpeg')
        # The context manager releases the file handle deterministically;
        # convert() returns an independent, fully loaded image.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        n_labels = self.num_labels
        ancillary_data = self.data.iloc[idx, 1:-n_labels].values
        ancillary_data = torch.tensor(ancillary_data, dtype=torch.float32)

        labels = self.data.iloc[idx, -n_labels:].values
        labels = torch.tensor(labels, dtype=torch.float32)

        return image, ancillary_data, labels
        


In [27]:
class TestDataset(Dataset):
    """Inference dataset yielding an image, its tabular inputs and an id.

    The CSV is read once at construction. Column 0 names the image file
    (``<image_folder>/<value>.jpeg``) and all remaining columns are returned
    as ancillary input. ``ids`` is a separate, positionally aligned sequence
    whose ``idx``-th element is returned alongside row ``idx``.

    Args:
        image_folder: Directory containing the ``.jpeg`` images.
        csv_file: Path to the CSV described above.
        ids: Indexable sequence with one entry per CSV row.
        transform: Optional callable applied to the RGB image.

    Raises:
        ValueError: If ``ids`` and the CSV do not have the same length.
    """

    def __init__(self, image_folder, csv_file, ids, transform=None):
        self.image_folder = image_folder
        self.csv_file = csv_file
        self.transform = transform
        self.ids = ids
        self.data = pd.read_csv(csv_file)

        # __len__ follows the CSV while __getitem__ indexes `ids`; fail early
        # instead of raising mid-iteration or silently dropping ids.
        if len(self.ids) != len(self.data):
            raise ValueError(
                f"ids has {len(self.ids)} entries but {csv_file!r} has "
                f"{len(self.data)} rows; they must be aligned one-to-one"
            )

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, ancillary_data, id_)`` for row ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        id_ = self.ids[idx]

        img_name = os.path.join(self.image_folder, str(self.data.iloc[idx, 0]) + '.jpeg')
        # The context manager releases the file handle deterministically;
        # convert() returns an independent, fully loaded image.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        ancillary_data = self.data.iloc[idx, 1:].values
        ancillary_data = torch.tensor(ancillary_data, dtype=torch.float32)

        return image, ancillary_data, id_

In [29]:
# Locations of the competition inputs, named once so each path is spelled a single time.
train_image_dir = '/kaggle/input/cs-480-2024-spring/data/train_images'
train_csv_path = '/kaggle/input/cs-480-2024-spring/data/train.csv'
test_image_dir = '/kaggle/input/cs-480-2024-spring/data/test_images'
test_csv_path = '/kaggle/input/cs-480-2024-spring/data/test.csv'

# Training-image preprocessing: fixed 224x224 resize, tensor conversion, then
# per-channel normalisation (presumably the usual ImageNet statistics — confirm).
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Test-image preprocessing: the same three steps with the same parameters.
test_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Ids that accompany the test rows, taken from the 'id' column of the test CSV.
ids_df = pd.read_csv(test_csv_path)
ids_input = ids_df['id'].values

# Datasets for the two splits.
train_dataset = PlantDataset(
    image_folder=train_image_dir,
    csv_file=train_csv_path,
    transform=transform,
)
test_dataset = TestDataset(
    image_folder=test_image_dir,
    csv_file=test_csv_path,
    ids=ids_input,
    transform=test_transform,
)

# Unshuffled loaders: batches come out in CSV row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)


In [30]:
# Use the network loaded from torch.hub as the image feature extractor.
# NOTE(review): the name `model` is rebound to the XGBoost MultiOutputRegressor in a
# later cell. Once that cell has run, re-running either feature-extraction cell
# passes the regressor instead of this network.
model = dinov2_vits14_lc
model.eval()  # Set the model to evaluation mode


def extract_dino_features(loader, model, device=None):
    """Run ``model`` over every batch of ``loader`` and collect NumPy arrays.

    Args:
        loader: Iterable yielding ``(images, ancillaries, labels)`` batches of
            tensors.
        model: Callable mapping an image batch to a feature tensor. It is not
            switched to eval mode here; the caller is responsible for that.
        device: Optional device to run the model on. When omitted, images are
            sent to the device of the model's first parameter (CPU if the
            model exposes no parameters). When given, the model is moved to
            that device as well (``nn.Module.to`` works in place).

    Returns:
        Tuple ``(features, ancillaries, labels)`` of arrays concatenated along
        the first axis, in the order the loader produced the batches.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    if device is None:
        # Follow the model: feed inputs to wherever its weights already live.
        params = model.parameters() if hasattr(model, "parameters") else ()
        first_param = next(iter(params), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    else:
        device = torch.device(device)
        if hasattr(model, "to"):
            model = model.to(device)

    features_list = []
    ancillary_list = []
    labels_list = []

    with torch.no_grad():  # Disable gradient calculation for inference
        for batch_num, (images, ancillaries, labels) in enumerate(loader, start=1):
            features = model(images.to(device))  # Extract features with DINOv2

            # Bring the features back to host memory before converting.
            features_list.append(features.cpu().numpy())
            ancillary_list.append(ancillaries.numpy())
            labels_list.append(labels.numpy())

            print("Gone through batch " + str(batch_num))

    if not features_list:
        raise ValueError("loader yielded no batches; there is nothing to extract")

    return np.concatenate(features_list), np.concatenate(ancillary_list), np.concatenate(labels_list)

In [32]:
def extract_dino_features_test(loader, model, device=None):
    """Run ``model`` over every test batch and collect features, inputs and ids.

    Args:
        loader: Iterable yielding ``(images, ancillaries, ids)`` batches.
        model: Callable mapping an image batch to a feature tensor. It is not
            switched to eval mode here; the caller is responsible for that.
        device: Optional device to run the model on. When omitted, images are
            sent to the device of the model's first parameter (CPU if the
            model exposes no parameters). When given, the model is moved to
            that device as well (``nn.Module.to`` works in place).

    Returns:
        Tuple ``(features, ancillaries, ids_list)``. The first two are arrays
        concatenated along the first axis in loader order. ``ids_list`` is a
        flat list holding whatever iterating each ``ids`` batch yields (0-d
        tensors when the batch is a tensor); elements are not converted.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    if device is None:
        # Follow the model: feed inputs to wherever its weights already live.
        params = model.parameters() if hasattr(model, "parameters") else ()
        first_param = next(iter(params), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    else:
        device = torch.device(device)
        if hasattr(model, "to"):
            model = model.to(device)

    features_list = []
    ancillary_list = []
    ids_list = []

    with torch.no_grad():  # Disable gradient calculation for inference
        for batch_num, (images, ancillaries, ids) in enumerate(loader, start=1):
            features = model(images.to(device))  # Extract features with DINOv2

            # Bring the features back to host memory before converting.
            features_list.append(features.cpu().numpy())
            ancillary_list.append(ancillaries.numpy())
            ids_list.extend(ids)

            print("Gone through batch " + str(batch_num))

    if not features_list:
        raise ValueError("loader yielded no batches; there is nothing to extract")

    return np.concatenate(features_list), np.concatenate(ancillary_list), ids_list

In [9]:
# Push the whole training loader through the network once; the call prints one
# progress line per batch.
train_image_features, train_ancillary_data, labels_data = extract_dino_features(train_loader, model)



# Write the image features to CSV. The ancillary columns and labels returned
# above are kept in memory only and are not part of this file.
df_features = pd.DataFrame(train_image_features)
df_features.to_csv('train_image_features.csv', index=False)



Gone through batch 1
Gone through batch 2
Gone through batch 3
Gone through batch 4
Gone through batch 5
Gone through batch 6
Gone through batch 7
Gone through batch 8
Gone through batch 9
Gone through batch 10
Gone through batch 11
Gone through batch 12
Gone through batch 13
Gone through batch 14
Gone through batch 15
Gone through batch 16
Gone through batch 17
Gone through batch 18
Gone through batch 19
Gone through batch 20
Gone through batch 21
Gone through batch 22
Gone through batch 23
Gone through batch 24
Gone through batch 25
Gone through batch 26
Gone through batch 27
Gone through batch 28
Gone through batch 29
Gone through batch 30
Gone through batch 31
Gone through batch 32
Gone through batch 33
Gone through batch 34
Gone through batch 35
Gone through batch 36
Gone through batch 37
Gone through batch 38
Gone through batch 39
Gone through batch 40
Gone through batch 41
Gone through batch 42
Gone through batch 43
Gone through batch 44
Gone through batch 45
Gone through batch 

In [33]:


# Push the whole test loader through the network once; the call prints one
# progress line per batch. `ids` is the flat list of per-sample ids it collected.
test_image_features, test_ancillary_data, ids  = extract_dino_features_test(test_dataloader, model)

# Write the test image features to CSV (this rebinds `df_features`, which held the
# training features before).
df_features = pd.DataFrame(test_image_features)
df_features.to_csv('test_image_features.csv', index=False)

Gone through batch 1
Gone through batch 2
Gone through batch 3
Gone through batch 4
Gone through batch 5
Gone through batch 6
Gone through batch 7
Gone through batch 8
Gone through batch 9
Gone through batch 10
Gone through batch 11
Gone through batch 12
Gone through batch 13
Gone through batch 14
Gone through batch 15
Gone through batch 16
Gone through batch 17
Gone through batch 18
Gone through batch 19
Gone through batch 20
Gone through batch 21
Gone through batch 22
Gone through batch 23
Gone through batch 24
Gone through batch 25
Gone through batch 26
Gone through batch 27
Gone through batch 28
Gone through batch 29
Gone through batch 30
Gone through batch 31
Gone through batch 32
Gone through batch 33
Gone through batch 34
Gone through batch 35
Gone through batch 36
Gone through batch 37
Gone through batch 38
Gone through batch 39
Gone through batch 40
Gone through batch 41
Gone through batch 42
Gone through batch 43
Gone through batch 44
Gone through batch 45
Gone through batch 

In [34]:
# Load the ancillary data from CSV files
# Raw training table.
# NOTE(review): `train_df` is not read again in the cells visible here.
train_df = pd.read_csv('/kaggle/input/cs-480-2024-spring/data/train.csv')

# Build the design matrices: image features first, ancillary columns appended
# to the right of them (column-wise join of two 2-D arrays).
train_combined_data = np.concatenate([train_image_features, train_ancillary_data], axis=1)
test_combined_data = np.concatenate([test_image_features, test_ancillary_data], axis=1)

# Regression targets, in the same row order as the training features.
train_labels = labels_data

In [13]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [35]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor



In [36]:
# Squared-error XGBoost regressor wrapped in MultiOutputRegressor so a single
# fit/predict call covers every target column of `train_labels`.
# NOTE(review): this rebinds `model`, which previously referred to the DINOv2
# network. Re-running a feature-extraction cell after this one passes the
# regressor to it instead.
base_model = xgb.XGBRegressor(objective='reg:squarederror')
model = MultiOutputRegressor(base_model)
model.fit(train_combined_data, train_labels)


In [39]:
# The ids collected during test feature extraction are expected to be 0-d tensors;
# unwrap them to plain Python scalars. Values without `.item()` are kept as they
# are, so re-running this cell after `ids` has already been converted no longer
# raises AttributeError.
ids = [id_.item() if hasattr(id_, 'item') else id_ for id_ in ids]

predictions = model.predict(test_combined_data)
print(f"Predictions shape: {predictions.shape}")  

# Submission column names, applied positionally to the regressor outputs.
desired_order = ['X4', 'X11', 'X18', 'X26', 'X50', 'X3112']
predictions_df = pd.DataFrame(predictions, columns=desired_order)
predictions_df.insert(0, 'id', ids)
predictions_df.to_csv('submission.csv', index=False)

Predictions shape: (6391, 6)
