In [1]:
import os

import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import Pool, CatBoostRegressor
from torchvision import transforms

tqdm.pandas()




In [2]:
class Config():
    TARGET_COLUMNS = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean' ,'X50_mean', 'X3112_mean']
    # Dataset
    RECOMPUTE_DATAFRAMES_TRAIN = True
    RECOMPUTE_DATAFRAMES_TEST = True
    RECOMPUTE_IMAGE_EMBEDDINGS = False
    N_VAL_SAMPLES0 = 4096
    # Others
    SEED = 42
    DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    
def seed_everything(seed: int):    
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
        
CONFIG = Config()
seed_everything(CONFIG.SEED)
CONFIG.DEVICE

'cuda:0'

In [3]:

BASE_DIR = os.path.join(os.getcwd() , 'data')
train_df = pd.read_csv(BASE_DIR  +  '/train.csv')
# load pickled dataframes from a public dataset; split to train-val
if CONFIG.RECOMPUTE_DATAFRAMES_TRAIN:
    train0 = pd.read_csv(BASE_DIR + '/train.csv')
    train0['file_path'] = train0['id'].apply(lambda s: f'{BASE_DIR}/train_images/{s}.jpeg')
else:
    train0 = pd.read_pickle('/kaggle/input/planttraits2024-eda-training-pub-dataset/train.pkl')
    
if CONFIG.RECOMPUTE_DATAFRAMES_TEST:
    test = pd.read_csv(BASE_DIR + '/test.csv')
    test['file_path'] = test['id'].apply(lambda s: f'{BASE_DIR}/test_images/{s}.jpeg')
else:
    test = pd.read_pickle('/kaggle/input/planttraits2024-eda-training-pub-dataset/test.pkl')
CONFIG.FEATURE_COLUMNS = test.columns.values[1:-2]

train, val = train_test_split(train0, test_size=CONFIG.N_VAL_SAMPLES0, shuffle=True, random_state=CONFIG.SEED)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [4]:
def get_mask(df, labels_describe_df):
    lower = []
    higher = []
    mask = np.empty(shape=df[CONFIG.TARGET_COLUMNS].shape, dtype=bool)
    for idx, t in enumerate(CONFIG.TARGET_COLUMNS):
        labels = df[t].values
        v_min, v_max = labels_describe_df.loc[t]['0.1%'], labels_describe_df.loc[t]['98%']
        mask[:,idx] = ((labels > v_min) & (labels < v_max))
    return mask.min(axis=1)

labels_describe_df = train[CONFIG.TARGET_COLUMNS].describe(percentiles=[0.001, 0.98]).round(3).T
# Masks
mask_train = get_mask(train, labels_describe_df)
mask_val = get_mask(val, labels_describe_df)
# Masked DataFrames
train_mask = train[mask_train].reset_index(drop=True)
val_mask = val[mask_val].reset_index(drop=True)

for m, subset, full in zip([train_mask, val_mask], ['train', 'val'], [train, val]):
    print(f'===== {subset} shape: {m.shape} =====')
    n_masked = len(full) - len(m)
    perc_masked = (n_masked / len(full)) * 100
    print(f'{subset} \t| # Masked Samples: {n_masked}')
    print(f'{subset} \t| % Masked Samples: {perc_masked:.3f}%')

===== train shape: (34884, 171) =====
train 	| # Masked Samples: 4383
train 	| % Masked Samples: 11.162%
===== val shape: (3623, 171) =====
val 	| # Masked Samples: 473
val 	| % Masked Samples: 11.548%


In [5]:
# Standard Scaler for Features
FEATURE_SCALER = StandardScaler()
# Fit and transform on training features
train_features_mask = FEATURE_SCALER.fit_transform(train_mask[CONFIG.FEATURE_COLUMNS].values.astype(np.float32))
# Transform val/test features using scaler fitted on train data
val_features_mask = FEATURE_SCALER.transform(val_mask[CONFIG.FEATURE_COLUMNS].values.astype(np.float32))
test_features = FEATURE_SCALER.transform(test[CONFIG.FEATURE_COLUMNS].values.astype(np.float32))

y_train_mask = train_mask[CONFIG.TARGET_COLUMNS].values
y_val_mask = val_mask[CONFIG.TARGET_COLUMNS].values

In [6]:
def get_image_embeddings_dino(model, preprocess, batch_size, df):
    image_embeddings = []
    for i in tqdm(range(0, len(df), batch_size)):
        paths = df['file_path'][i:i + batch_size]
        image_tensor = torch.stack([preprocess(Image.open(path)) for path in paths]).to(CONFIG.DEVICE)
        with torch.no_grad():
            curr_image_embeddings = model(image_tensor)
        image_embeddings.extend(curr_image_embeddings.cpu().numpy())
    return image_embeddings

In [7]:
if CONFIG.RECOMPUTE_IMAGE_EMBEDDINGS:
    model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg').to(CONFIG.DEVICE)
    model.eval()
    # the preprocessing differs from the original code, originally it was resize + crop
    # but we lose info while cropping, so here we use only resize to 224
    preprocess = transforms.Compose([
        transforms.Resize(224, interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    
    batch_size = 64
    suffix = 'image_embs_dinov2_vitg14_reg'
    train_image_embeddings = get_image_embeddings_dino(model, preprocess, batch_size, train_mask)
    np.save(f'train_{suffix}', np.array(train_image_embeddings))
    val_image_embeddings = get_image_embeddings_dino(model, preprocess, batch_size, val_mask)
    np.save(f'val_{suffix}', np.array(val_image_embeddings))
    test_image_embeddings = get_image_embeddings_dino(model, preprocess, batch_size, test)
    np.save(f'test_{suffix}', np.array(test_image_embeddings))
else:
    suffix = 'image_embs_dinov2_vitg14_reg'
    train_image_embeddings = np.load(f'{BASE_DIR}/train_{suffix}.npy')
    val_image_embeddings = np.load(f'{BASE_DIR}/val_{suffix}.npy')
    test_image_embeddings = np.load(f'{BASE_DIR}/test_{suffix}.npy')
    print(f'Embeddings {suffix} loaded from dataset.')

Embeddings image_embs_dinov2_vitg14_reg loaded from dataset.


In [23]:
# we can potentially use all the polynomial features but it would take an etenriny to train the models
first_n_poly_feats = 1000
train_features_mask_all = np.concatenate(
    (PolynomialFeatures(2).fit_transform(train_features_mask)[:, :first_n_poly_feats], train_image_embeddings), axis=1
)
val_features_mask_all = np.concatenate(
    (PolynomialFeatures(2).fit_transform(val_features_mask)[:, :first_n_poly_feats], val_image_embeddings), axis=1
)
test_features_all = np.concatenate(
    (PolynomialFeatures(2).fit_transform(test_features)[:, :first_n_poly_feats], test_image_embeddings), axis=1
)

In [24]:
train_features_mask_df = pd.DataFrame(train_features_mask_all)
train_features_mask_df['emb'] = list(train_image_embeddings)

val_features_mask_df = pd.DataFrame(val_features_mask_all)
val_features_mask_df['emb'] = list(val_image_embeddings)

test_features_mask_df = pd.DataFrame(test_features_all)
test_features_mask_df['emb'] = list(test_image_embeddings)

In [17]:
%%time
models = {}
scores = {}
# for i, col in tqdm(enumerate(CONFIG.TARGET_COLUMNS), total=len(CONFIG.TARGET_COLUMNS)):
#     y_curr = y_train_mask[:, i]
#     y_curr_val = y_val_mask[:, i]
#     train_pool = Pool(train_features_mask_df, y_curr, embedding_features=['emb'])
#     val_pool = Pool(val_features_mask_df, y_curr_val, embedding_features=['emb'])
    
    # tried to tune these parameters but without real success 
model = CatBoostRegressor(iterations=1500, learning_rate=0.06, loss_function='RMSE', verbose=0, random_state=CONFIG.SEED)
#     model.fit(train_pool)
#     models[col] = model
    
#     y_curr_val_pred = model.predict(val_pool)
    
#     r2_col = r2_score(y_curr_val, y_curr_val_pred)
#     scores[col] = r2_col
#     print(f'Target: {col}, R2: {r2_col:.3f}')
# # this val score somewhat correlates with submission score bit I didn't really bother
# print(f'Mean R2: {np.mean(list(scores.values())):.3f}')

CPU times: user 109 μs, sys: 15 μs, total: 124 μs
Wall time: 130 μs


In [10]:
# from_file = CatBoostRegressor()

In [18]:
model.load(f"{BASE_DIR}/dino_model.pth")

AttributeError: 'CatBoostRegressor' object has no attribute 'load'

In [28]:
model.to(device = "cpu")
torch.save(model.state_dict(), f"{BASE_DIR}/dinoV2.pth")

AttributeError: 'CatBoostRegressor' object has no attribute 'to'

In [29]:
submission = pd.DataFrame({'id': test['id']})
submission[CONFIG.TARGET_COLUMNS] = 0
submission.columns = submission.columns.str.replace('_mean', '')

In [30]:
for i, col in enumerate(CONFIG.TARGET_COLUMNS):
    test_pool = Pool(test_features_mask_df, embedding_features=['emb'])
    col_pred = models[col].predict(test_pool)
    submission[col.replace('_mean', '')] = col_pred

submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,X4,X11,X18,X26,X50,X3112
0,154220505,1.150634,145.307883,19707.700772,3490.907981,15.133894,400094.893415
1,195736552,0.984673,152.914871,19699.674946,3461.206384,14.976181,398848.975075
2,182701773,0.976492,149.251464,19699.565957,3459.253743,15.040446,398182.331739
3,27688500,0.95777,139.824763,19699.918337,3478.239522,15.942455,398248.174168
4,195825045,0.891497,153.176586,19699.167833,3460.367853,14.839305,398791.282423
