<a href="https://colab.research.google.com/github/rajaonsonella/crosstalk-q2-2025/blob/main/notebooks/3_1_train_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

⚙️ Step 1: Set your notebook to GPU

The next two cells take about 2 minutes to run — start them now while we talk! 👇👇

In [2]:
# get workshop code
import os
import sys
IN_COLAB = os.getenv("COLAB_RELEASE_TAG")
if IN_COLAB:
    !git clone https://github.com/rajaonsonella/crosstalk-q2-2025
    sys.path.append('./crosstalk-q2-2025')
else:
    sys.path.append('..')
!pip install -r crosstalk-q2-2025/requirements.txt

Cloning into 'crosstalk-q2-2025'...
remote: Enumerating objects: 343, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 343 (delta 59), reused 37 (delta 24), pack-reused 248 (from 1)[K
Receiving objects: 100% (343/343), 36.85 MiB | 17.49 MiB/s, done.
Resolving deltas: 100% (183/183), done.
Collecting catboost (from -r crosstalk-q2-2025/requirements.txt (line 5))
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pympler (from -r crosstalk-q2-2025/requirements.txt (line 6))
  Downloading Pympler-1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdkit (from -r crosstalk-q2-2025/requirements.txt (line 8))
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:0

In [1]:
# Download data from google drive
import gdown
import os

# Google Drive file id for each dataset split; the key becomes part of the
# local file name (crosstalk_<key>.parquet).
file_ids = {
    'test_inputs': '1Gyv_ldUTi0Ymy6wVMfruAO0UraCQ70CR',
    'train': '11S5p0QgP1X9rOFiIjNSLydLenJwm7hle',
}

for name, file_id in file_ids.items():
    filename = 'crosstalk_' + name + '.parquet'
    # A file that is already present is not downloaded again.
    if os.path.exists(filename):
        continue
    gdown.download(id=file_id, output=filename, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1Gyv_ldUTi0Ymy6wVMfruAO0UraCQ70CR
From (redirected): https://drive.google.com/uc?id=1Gyv_ldUTi0Ymy6wVMfruAO0UraCQ70CR&confirm=t&uuid=c28887f5-ba17-4200-a922-480fc9b4e33f
To: /content/crosstalk_test.parquet
100%|██████████| 1.52G/1.52G [00:23<00:00, 65.9MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=11S5p0QgP1X9rOFiIjNSLydLenJwm7hle
From (redirected): https://drive.google.com/uc?id=11S5p0QgP1X9rOFiIjNSLydLenJwm7hle&confirm=t&uuid=1f198066-0136-4816-a487-0744e6de1cdb
To: /content/crosstalk_train.parquet
100%|██████████| 1.97G/1.97G [00:30<00:00, 63.8MB/s]


Or, if you already have the data files in your Google Drive, mount the drive instead:

In [40]:
# Mount Google Drive at /content/drive/ so files stored there can be read by path.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Load the train datasets

See the bonus content from the last notebook for a peek under the hood of the data loaders.

Or check it out in the files you downloaded to colab on the left 👈

In [5]:
import pandas as pd
import numpy as np
from dataset import basic_dataloader

In [6]:
import os

# The training file can live in two places: on a mounted Google Drive, or in
# the working directory (where the gdown download cell saves it). Use the
# Drive copy when it exists, otherwise fall back to the local download.
train_path = '/content/drive/My Drive/crosstalk_train.parquet'
if not os.path.exists(train_path):
    train_path = 'crosstalk_train.parquet'

# x_col selects the fingerprint column used as features; fingerprints available:
# 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'
# y_col is the label column; max_to_load presumably caps the number of rows read.
X_train, y_train = basic_dataloader(train_path, x_col="AVALON", y_col='DELLabel', max_to_load=100000)

Loading chunks:   0%|          | 0/20 [00:00<?, ?it/s]

In [7]:
# Sanity check: dimensions of the loaded training features
X_train.shape

(100000, 2048)

In [8]:
# Sanity check: the labels should have one entry per row of X_train
y_train.shape

(100000,)

In [9]:
# Peek at the raw label values
print(y_train)

[0 0 0 ... 0 1 0]


# Let's train catboost classifier and see how well it fits the training data

🐞 Do you see a CUDA error? Raise your hand now and brag about it! (It usually means the runtime is not set to GPU — see Step 1 at the top.)

In [10]:
%%time
import catboost as cb
from eval import BinaryEvaluator
params = {
                'random_strength': 2, # only non-default hyperparam, default is 1
                'random_seed': 1234,
                'verbose': 0,
                'loss_function': 'Logloss',
                'task_type': 'GPU',
                'devices': '0'
            }
model = cb.CatBoostClassifier(**params)
model.fit(X_train, y_train)
yp = model.predict_proba(X_train)[:, 1] # or validation

CPU times: user 20.8 s, sys: 9.15 s, total: 29.9 s
Wall time: 19.1 s


In [11]:
# Build the evaluator on the training data and score the in-sample predictions.
# NOTE(review): the name `eval` shadows Python's built-in eval(); later cells
# call `eval.CV_model`, so a rename would have to be applied there as well.
eval = BinaryEvaluator(X_train, y_train)
metric_dict = eval.compute_metrics(yt=y_train, yp=yp) # or validation

In [12]:
# One line per metric: name left-aligned in 20 characters, value with two decimals
for metric_name in metric_dict:
    metric_value = metric_dict[metric_name]
    print('{:20s}: {:.2f}'.format(metric_name, metric_value))

accuracy            : 0.95
balanced_accuracy   : 0.71
roc_auc             : 0.95
precision           : 0.92
recall              : 0.42
mean_reciprocal_rank: 0.00
positives           : 7645.00
predicted_positives : 3510.00
hits_at_5           : 0.00
precision_at_5      : 1.00
hits_at_10          : 0.00
precision_at_10     : 1.00
hits_at_30          : 0.00
precision_at_30     : 1.00
hits_at_7645        : 0.71
precision_at_7645   : 0.71


# How well does it generalize though? Let's try 5-fold cross-validation

In [13]:
%%time
# A fresh, unfitted classifier with the same hyperparameters as the model above
# is handed to the evaluator's CV_model, which returns the cross-validation metrics
# (the 'mean' entry of the result is printed in the next cell).
model_cv = cb.CatBoostClassifier(**params)
metric_dict_cv = eval.CV_model(model_cv)

CPU times: user 1min 26s, sys: 38.6 s, total: 2min 4s
Wall time: 1min 14s


In [14]:
# Print the 'mean' entry of the cross-validation results, one metric per line
cv_means = metric_dict_cv['mean']
for metric_name in cv_means:
    metric_value = cv_means[metric_name]
    print('{:20s}: {:.2f}'.format(metric_name, metric_value))

accuracy            : 0.95
balanced_accuracy   : 0.69
roc_auc             : 0.94
precision           : 0.90
recall              : 0.38
mrr                 : 0.00
precision_at_k_5    : 1.00
hits_at_k_5         : 0.00
precision_at_k_10   : 0.98
hits_at_k_10        : 0.01
precision_at_k_30   : 0.99
hits_at_k_30        : 0.02


# Submit predictions

Update the next cell with your team name

In [51]:
# Used as the file name (<team_name>.csv) of the submission written further down
team_name = 'demo'

In [52]:
%%time
X_test = basic_dataloader('/content/drive/My Drive/crosstalk_test_inputs.parquet', x_col="AVALON", y_col = None, max_to_load = None, chunk_size = 20000)

Loading chunks:   0%|          | 0/17 [00:00<?, ?it/s]

 18%|█▊        | 274M/1.52G [06:29<29:27, 704kB/s] 


CPU times: user 3min 12s, sys: 11.6 s, total: 3min 23s
Wall time: 3min 31s


In [53]:
# Sanity check: dimensions of the loaded test features
X_test.shape

(339258, 2048)

In [54]:
# Keep the second probability column (presumably the positive class) for every test row.
# NOTE(review): this rebinds `yp`, which earlier held the training-set predictions;
# re-running the training metrics cell after this one would use the wrong array.
yp = model.predict_proba(X_test)[:,1]

Build the submission file with the cells below, then upload this baseline to Kaggle and check out the leaderboard!

In [79]:
import pyarrow as pa
from pyarrow import parquet as pq

In [80]:
import os

# Resolve the test file the same way as when the test features were loaded:
# Drive copy when it exists, otherwise the local gdown download. The IDs read
# from this file must come from the same file as the features.
test_path = '/content/drive/My Drive/crosstalk_test_inputs.parquet'
if not os.path.exists(test_path):
    test_path = 'crosstalk_test_inputs.parquet'

pf = pq.ParquetFile(test_path)

In [81]:
# Read only the ID column from the test file, then attach the predicted
# probabilities as the DELLabel column.
random_ids = pf.read(columns=['RandomID']).to_pandas()
preds = random_ids.assign(DELLabel=yp)
display(preds)

Unnamed: 0,RandomID,DELLabel
0,ID_0,0.038460
1,ID_1,0.036216
2,ID_2,0.005991
3,ID_3,0.018435
4,ID_4,0.018095
...,...,...
339253,ID_339253,0.043736
339254,ID_339254,0.003604
339255,ID_339255,0.012205
339256,ID_339256,0.026451


In [82]:
# Write the submission as <team_name>.csv in the working directory, without the index column
submission_path = f'{team_name}.csv'
preds.to_csv(submission_path, index=False)

# Let's compare it against some sklearn baselines

⚠️ These next cells are slow to run! Start them now and come back in 5 minutes.

In [2]:
%%time
from eval import get_baseline_models

eval = BinaryEvaluator(X_train, y_train)
baselines = get_baseline_models()
baselines_res = {}

for m in baselines:
    baselines_res[m] = eval.CV_model(baselines[m])

NameError: name 'BinaryEvaluator' is not defined

In [None]:
# Add the CatBoost cross-validation results next to the baselines, then show
# one row per model with its mean metrics rounded to two decimals.
baselines_res['catboost'] = metric_dict_cv
mean_metrics_by_model = {
    model_name: model_metrics['mean']
    for model_name, model_metrics in baselines_res.items()
}
pd.DataFrame(mean_metrics_by_model).T.round(2)