<a href="https://colab.research.google.com/github/rajaonsonella/crosstalk-q2-2025/blob/main/notebooks/3_1_train_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

⚙️ Step 1: Set your notebook to GPU

The next two cells take about 2 minutes to run, so start them now while we talk! 👇👇

In [None]:
# get workshop code
import os
import sys
IN_COLAB = os.getenv("COLAB_RELEASE_TAG")
if IN_COLAB:
    !git clone https://github.com/rajaonsonella/crosstalk-q2-2025
    sys.path.append('./crosstalk-q2-2025')
else:
    sys.path.append('..')
!pip install -r crosstalk-q2-2025/requirements.txt

Cloning into 'crosstalk-q2-2025'...
remote: Enumerating objects: 297, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 297 (delta 28), reused 20 (delta 14), pack-reused 248 (from 1)[K
Receiving objects: 100% (297/297), 32.35 MiB | 16.79 MiB/s, done.
Resolving deltas: 100% (152/152), done.
Collecting catboost (from -r crosstalk-q2-2025/requirements.txt (line 5))
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pympler (from -r crosstalk-q2-2025/requirements.txt (line 6))
  Downloading Pympler-1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdkit (from -r crosstalk-q2-2025/requirements.txt (line 8))
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:0

In [None]:
# Download data from google drive
# (files that are already present in the working directory are not fetched again)
import os

import gdown

# dataset split name -> Google Drive file id
file_ids = {
    'test': '19oR_A2UNUvy7pyL3J11oixkmF1m-bZBl',
    'train': '11S5p0QgP1X9rOFiIjNSLydLenJwm7hle',
}

for name, file_id in file_ids.items():
    filename = f'crosstalk_{name}.parquet'
    if os.path.exists(filename):
        continue  # already on disk from an earlier run
    gdown.download(id=file_id, output=filename, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=19oR_A2UNUvy7pyL3J11oixkmF1m-bZBl
From (redirected): https://drive.google.com/uc?id=19oR_A2UNUvy7pyL3J11oixkmF1m-bZBl&confirm=t&uuid=1fa0f77c-41d6-4a5f-9f5c-0f8a8369dd7c
To: /content/crosstalk_test.parquet
100%|██████████| 1.52G/1.52G [00:31<00:00, 47.9MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=11S5p0QgP1X9rOFiIjNSLydLenJwm7hle
From (redirected): https://drive.google.com/uc?id=11S5p0QgP1X9rOFiIjNSLydLenJwm7hle&confirm=t&uuid=77a6c693-cb3f-4606-a6a3-1fc7a1f16c91
To: /content/crosstalk_train.parquet
100%|██████████| 1.97G/1.97G [00:46<00:00, 42.9MB/s]


# Load the train datasets

See the bonus content from the last notebook to get a peek under the hood of the data loaders

Or check it out in the files you downloaded to colab on the left 👈

In [None]:
import pandas as pd
import numpy as np
from dataset import basic_dataloader

In [None]:
# fingerprints available: 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'
X_train, y_train = basic_dataloader(
    'crosstalk_train.parquet',
    x_col="AVALON",     # fingerprint column used as the features
    y_col='DELLabel',   # column holding the labels
    max_to_load=1000,   # small cap on loaded rows to keep the demo quick
)

Loading chunks:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Dimensions of the loaded training feature matrix
X_train.shape

(1000, 2048)

In [None]:
# Dimensions of the loaded training label array
y_train.shape

(1000,)

In [None]:
# Print the raw training labels to eyeball how the label values are distributed
print(y_train)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 

# Let's train a CatBoost classifier and see how well it fits the training data

🐞 do you see a CUDA error? raise your hand now and brag about it

In [None]:
%%time
import catboost as cb
from eval import BinaryEvaluator
params = {
                'random_strength': 2, # only non-default hyperparam, default is 1
                'random_seed': 1234,
                'verbose': 0,
                'loss_function': 'Logloss',
                'task_type': 'GPU',
                'devices': '0'
            }
model = cb.CatBoostClassifier(**params)
eval = BinaryEvaluator(X_train.toarray(), y_train)
model.fit(X_train.toarray(), y_train)
yp = model.predict_proba(X_train)[:, 1] # or validation
metric_dict = eval.compute_metrics(yt=y_train, yp=yp) # or validation

CPU times: user 36.9 s, sys: 3.54 s, total: 40.4 s
Wall time: 36 s


In [None]:
# One line per training-set metric: name padded to 20 characters, value with two decimals
for metric_key in metric_dict:
    print(f'{metric_key:20s}: {metric_dict[metric_key]:.2f}')

accuracy            : 1.00
balanced_accuracy   : 1.00
roc_auc             : 1.00
precision           : 1.00
recall              : 1.00
mean_reciprocal_rank: 0.08
positives           : 59.00
predicted_positives : 59.00
hits_at_5           : 0.08
precision_at_5      : 1.00
hits_at_10          : 0.17
precision_at_10     : 1.00
hits_at_30          : 0.51
precision_at_30     : 1.00
hits_at_59          : 1.00
precision_at_59     : 1.00


# How well does it generalize though? Let's try 5-fold cross-validation

⚠️ these next cells are slow to run! Start them now and come back in 5 minutes

In [None]:
%%time
# Fresh, unfitted classifier built from the same `params` as the model trained above
model_cv = cb.CatBoostClassifier(**params)
# Cross-validate it with the evaluator created in the training cell; the fold
# setup lives inside BinaryEvaluator.CV_model and is not shown in this notebook.
metric_dict_cv = eval.CV_model(model_cv)

CPU times: user 3min 7s, sys: 20.9 s, total: 3min 28s
Wall time: 3min


In [None]:
# One line per cross-validation metric, taken from the 'mean' entry of the CV results
cv_mean_metrics = metric_dict_cv['mean']
for metric_key in cv_mean_metrics:
    print(f'{metric_key:20s}: {cv_mean_metrics[metric_key]:.2f}')

accuracy            : 0.95
balanced_accuracy   : 0.60
roc_auc             : 0.79
precision           : 0.82
recall              : 0.20
mrr                 : 0.16
precision_at_k_5    : 0.56
hits_at_k_5         : 0.24
precision_at_k_10   : 0.46
hits_at_k_10        : 0.39
precision_at_k_30   : 0.23
hits_at_k_30        : 0.59


# Let's compare it against simpler sklearn baselines

In [None]:
%%time
from eval import get_baseline_models

eval = BinaryEvaluator(X_train.toarray(), y_train)
baselines = get_baseline_models()
baselines_res = {}

for m in baselines:
    baselines_res[m] = eval.CV_model(baselines[m])

CPU times: user 43.3 s, sys: 45.9 ms, total: 43.3 s
Wall time: 26.9 s


In [None]:
# display all the models results
# (CatBoost's cross-validation results are added next to the baselines first)
baselines_res['catboost'] = metric_dict_cv

# one row per model, one column per metric, using each model's 'mean' entry
mean_by_model = {name: result['mean'] for name, result in baselines_res.items()}
pd.DataFrame(mean_by_model).T.round(2)

Unnamed: 0,accuracy,balanced_accuracy,roc_auc,precision,recall,mrr,precision_at_k_5,hits_at_k_5,precision_at_k_10,hits_at_k_10,precision_at_k_30,hits_at_k_30
stratified_dummy,0.89,0.5,0.5,0.05,0.05,0.13,0.0,0.05,0.04,0.25,0.03,1.0
most_frequent_dummy,0.94,0.5,0.5,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.01,1.0
uniform_dummy,0.06,0.5,0.5,0.06,1.0,1.0,0.0,1.0,0.0,1.0,0.01,1.0
logistic_regression,0.92,0.62,0.74,0.32,0.29,0.09,0.36,0.15,0.3,0.25,0.19,0.47
decision_tree,0.92,0.51,0.51,0.13,0.05,0.16,0.24,0.15,0.16,0.75,0.06,0.9
random_forest,0.94,0.51,0.79,0.2,0.02,0.11,0.44,0.19,0.34,0.29,0.19,0.49
linear_svc,0.94,0.5,0.73,0.0,0.0,0.08,0.24,0.1,0.26,0.22,0.17,0.44
catboost,0.95,0.6,0.79,0.82,0.2,0.16,0.56,0.24,0.46,0.39,0.23,0.59


# Submit predictions

Update the next cell with your team name

In [None]:
# Used as the base name of the predictions file ('<team_name>.txt') written at the end of the notebook
team_name = 'demo'

In [None]:
%%time
X_test = basic_dataloader('crosstalk_test.parquet', x_col="AVALON", y_col = None, max_to_load = 400000, chunk_size = 20000)

Loading chunks:   0%|          | 0/20 [00:00<?, ?it/s]

CPU times: user 3min 11s, sys: 10.9 s, total: 3min 22s
Wall time: 3min 21s


In [None]:
# Dimensions of the loaded test feature matrix
X_test.shape

(339204, 2048)

In [None]:
# Score the test set with the model fitted on the training data and keep
# column 1 of the predicted probabilities as the submission score.
test_proba = model.predict_proba(X_test)
yp = test_proba[:, 1]

# Write the scores to '<team_name>.txt' as plain text.
submission_path = f'{team_name}.txt'
np.savetxt(submission_path, yp)

Upload this baseline to Kaggle and check out the leaderboard!