In [2]:
!pip install kaggle
!echo '{"username":"sarahali1223","key":"39d9946ae36527c2440edf25fbdc9ccf"}' > kaggle.json
!mkdir -p /root/.kaggle
!mv kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c cs-480-2024-spring
!unzip *.zip

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3061, in _parsed_pkg_info
    return self._pkg_info
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _pkg_info. Did you mean: 'egg_info'?

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py",

In [1]:
# Install CatBoost; Pool and CatBoostRegressor are imported from it further down.
!pip install catboost



In [17]:
import os

import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import Pool, CatBoostRegressor
from torchvision import transforms
from scipy import stats

# Register tqdm's progress-bar helpers (e.g. progress_apply) on pandas objects.
tqdm.pandas()

# Config

In [26]:
# Columns predicted by this notebook.
TARGET_COLUMNS = [
    'X4_mean',
    'X11_mean',
    'X18_mean',
    'X50_mean',
    'X26_mean',
    'X3112_mean',
]

# A single seed feeds every random number generator seeded here.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

device

'cuda:0'

# Load and split DataFrames

In [5]:
# Read the public competition tables, attach the image path of every row, and
# hold out 20% of the training rows for validation.
train0 = pd.read_csv('/content/data/train.csv')
test = pd.read_csv('/content/data/test.csv')

train0['file_path'] = [f'/content/data/train_images/{s}.jpeg' for s in train0['id']]
test['file_path'] = [f'/content/data/test_images/{s}.jpeg' for s in test['id']]

# Tabular feature names: every test column except the first one and the
# 'file_path' column appended just above.
# NOTE(review): the first column is presumably 'id' — confirm against test.csv.
FEATURE_COLUMNS = test.columns.values[1:-1]

# Shuffled, seeded split; both parts get a fresh 0..n-1 index.
train, val = (
    part.reset_index(drop=True)
    for part in train_test_split(train0, test_size=0.2, shuffle=True, random_state=seed)
)

# Filter outliers

In [27]:
def get_mask(df, columns=None, threshold=3):
    """Return a boolean row mask that is True where no column holds an outlier.

    A row is kept only when, for every column in ``columns``, the absolute
    z-score of its value (computed over ``df`` itself with ``scipy.stats.zscore``)
    is strictly below ``threshold``.

    Args:
        df: DataFrame containing all of ``columns``.
        columns: Column names to check. Defaults to ``TARGET_COLUMNS``, however
            many entries that list has.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        A boolean numpy array with one entry per row of ``df``.
    """
    if columns is None:
        columns = TARGET_COLUMNS

    within = [np.abs(stats.zscore(df[col])) < threshold for col in columns]
    if not within:
        # Nothing to check: keep every row instead of reducing an empty list
        # (which would yield a scalar rather than a per-row mask).
        return np.ones(len(df), dtype=bool)
    return np.logical_and.reduce(within)

# Drop outlier rows from each split; z-scores are computed within the split itself.
train_keep = get_mask(train)
val_keep = get_mask(val)
train_mask = train.loc[train_keep]
val_mask = val.loc[val_keep]

# Scale features

In [7]:
def _float32_features(frame):
    """Return the FEATURE_COLUMNS of ``frame`` as a float32 numpy array."""
    return frame[FEATURE_COLUMNS].values.astype(np.float32)


# The scaler is fitted on the filtered training rows only and then reused,
# unchanged, for the validation and test rows.
FEATURE_SCALER = StandardScaler()
train_features_mask = FEATURE_SCALER.fit_transform(_float32_features(train_mask))
val_features_mask = FEATURE_SCALER.transform(_float32_features(val_mask))
test_features = FEATURE_SCALER.transform(_float32_features(test))

# Regression targets, one column per entry of TARGET_COLUMNS.
y_train_mask = train_mask[TARGET_COLUMNS].to_numpy()
y_val_mask = val_mask[TARGET_COLUMNS].to_numpy()

# Get DINO image embeddings

In [8]:
def get_image_embeddings_dino(model, preprocess, batch_size, df):
    """Embed every image listed in ``df['file_path']`` with ``model``.

    Images are read in batches of ``batch_size`` rows (in row order), converted
    to RGB, passed through ``preprocess``, stacked into one tensor, moved to the
    notebook-level ``device`` and fed to ``model`` without gradient tracking.

    Args:
        model: Callable taking the stacked batch tensor; assumed to return one
            embedding per image along the first axis — TODO confirm.
        preprocess: Callable turning a PIL image into a tensor; every tensor in
            a batch must have the same shape so the batch can be stacked.
        batch_size: Number of rows per forward pass.
        df: DataFrame with a ``file_path`` column.

    Returns:
        A list of numpy arrays, one per entry along the first axis of each
        model output, in row order. Empty when ``df`` has no rows.
    """
    paths = df['file_path']
    image_embeddings = []
    for start in tqdm(range(0, len(df), batch_size)):
        batch = []
        # .iloc makes the positional slice explicit: the caller's index is not
        # necessarily 0..n-1 (e.g. after rows were filtered out).
        for path in paths.iloc[start:start + batch_size]:
            # The context manager closes the file once the pixels are read;
            # convert('RGB') guarantees three channels for the normalisation step.
            with Image.open(path) as image:
                batch.append(preprocess(image.convert('RGB')))
        image_tensor = torch.stack(batch).to(device)
        with torch.no_grad():
            curr_image_embeddings = model(image_tensor)
        image_embeddings.extend(curr_image_embeddings.cpu().numpy())
    return image_embeddings

In [9]:
# True: run DINOv2 over every image and save the embeddings.
# False: load embeddings saved by an earlier run.
RECOMPUTE_IMAGE_EMBEDDINGS = True

# Shared part of the embedding file names (train_/val_/test_ + suffix + .npy).
suffix = 'image_embs'

if RECOMPUTE_IMAGE_EMBEDDINGS:
    model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg').to(device)
    model.eval()
    preprocess = transforms.Compose([
        # Bicubic resize, spelled with the enum: the integer code 3 means the
        # same filter but integer codes are deprecated in torchvision.
        transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), #using ImageNet normalization
    ])

    batch_size = 64
    # Embeddings are converted to arrays once, so this branch leaves the same
    # kind of object behind as the np.load branch below.
    # NOTE(review): files are saved relative to the working directory but loaded
    # from /content below; the two agree only when the notebook runs in /content.
    train_image_embeddings = np.array(get_image_embeddings_dino(model, preprocess, batch_size, train_mask))
    np.save(f'train_{suffix}', train_image_embeddings)
    val_image_embeddings = np.array(get_image_embeddings_dino(model, preprocess, batch_size, val_mask))
    np.save(f'val_{suffix}', val_image_embeddings)
    test_image_embeddings = np.array(get_image_embeddings_dino(model, preprocess, batch_size, test))
    np.save(f'test_{suffix}', test_image_embeddings)
else:
    train_image_embeddings = np.load(f'/content/train_{suffix}.npy')
    val_image_embeddings = np.load(f'/content/val_{suffix}.npy')
    test_image_embeddings = np.load(f'/content/test_{suffix}.npy')
    print(f'Embeddings {suffix} loaded from dataset.')

Embeddings image_embs loaded from dataset.


# Add polynomial features

In [10]:
def _poly_plus_embeddings(features, embeddings):
    """Expand ``features`` to degree-2 polynomial terms and append ``embeddings``.

    Only the first 1000 polynomial columns are kept; the result is the
    column-wise concatenation of those columns and ``embeddings``.
    """
    expanded = PolynomialFeatures(2).fit_transform(features)
    return np.concatenate((expanded[:, :1000], embeddings), axis=1)


# Same expansion applied to each split.
train_features_mask_all = _poly_plus_embeddings(train_features_mask, train_image_embeddings)
val_features_mask_all = _poly_plus_embeddings(val_features_mask, val_image_embeddings)
test_features_all = _poly_plus_embeddings(test_features, test_image_embeddings)

In [11]:
def _frame_with_embedding_column(features, embeddings):
    """Wrap ``features`` in a DataFrame and add an 'emb' column.

    Each cell of 'emb' holds the whole embedding vector of that row.
    """
    frame = pd.DataFrame(features)
    frame['emb'] = list(embeddings)
    return frame


# One frame per split, built the same way.
train_features_mask_df = _frame_with_embedding_column(train_features_mask_all, train_image_embeddings)
val_features_mask_df = _frame_with_embedding_column(val_features_mask_all, val_image_embeddings)
test_features_mask_df = _frame_with_embedding_column(test_features_all, test_image_embeddings)

# Train CatBoost

In [13]:
%%time
models = {}
scores = {}
for i, col in tqdm(enumerate(TARGET_COLUMNS), total=len(TARGET_COLUMNS)):
    y_curr = y_train_mask[:, i]
    y_curr_val = y_val_mask[:, i]
    train_pool = Pool(train_features_mask_df, y_curr, embedding_features=['emb'])
    val_pool = Pool(val_features_mask_df, y_curr_val, embedding_features=['emb'])

    model = CatBoostRegressor(iterations=2000, learning_rate = 0.07, depth=8, loss_function='RMSE',
                              verbose = 0, random_state=seed, task_type="GPU", devices='0')
    model.fit(train_pool)
    models[col] = model

    y_curr_val_pred = model.predict(val_pool)

    r2_col = r2_score(y_curr_val, y_curr_val_pred)
    scores[col] = r2_col
    print(f'Target: {col}, R2: {r2_col:.3f}')

  0%|          | 0/6 [00:00<?, ?it/s]

Target: X4_mean, R2: 0.520
Target: X11_mean, R2: 0.482
Target: X18_mean, R2: 0.620
Target: X50_mean, R2: 0.390
Target: X26_mean, R2: 0.355
Target: X3112_mean, R2: 0.501
Mean R2: 0.478
CPU times: user 2h 38min 54s, sys: 35.6 s, total: 2h 39min 30s
Wall time: 2h 34min 51s


# Submission

In [14]:
# The test features are the same for every target, so the Pool is built once
# and shared by all per-target models.
test_pool = Pool(test_features_mask_df, embedding_features=['emb'])

submission = pd.DataFrame({'id': test['id']})
for col in TARGET_COLUMNS:
    # Submission columns drop the '_mean' suffix, e.g. 'X4_mean' -> 'X4'.
    submission[col.replace('_mean', '')] = models[col].predict(test_pool)

# Fix the column order of the output table.
submission = submission.reindex(columns=["id", "X4", "X11", "X18", "X26", "X50", "X3112"])
submission.head()

Unnamed: 0,id,X4,X11,X18,X26,X50,X3112
0,154220505,1.126255,145.265036,19708.192033,3494.700804,15.089887,399722.501434
1,195736552,0.946652,153.215798,19699.742205,3460.716326,14.857638,399036.977354
2,182701773,0.980872,146.319944,19699.069683,3459.370964,14.958562,397981.216567
3,27688500,0.988727,138.462792,19699.264606,3480.267833,16.017378,398039.947268
4,195825045,0.900524,153.715954,19699.59655,3462.907895,14.845607,398988.570426


In [15]:
# Write the predictions to CSV without the DataFrame index column.
submission.to_csv('20948283_Ali.csv', index=False)

In [9]:
# Bundle every saved *image_embs.npy file in the working directory into one archive.
!zip downloaded_embeddings.zip *image_embs.npy

  adding: test_image_embs.npy (deflated 7%)
  adding: train_image_embs.npy (deflated 7%)
  adding: val_image_embs.npy (deflated 7%)
