<a href="https://colab.research.google.com/github/rajaonsonella/crosstalk-q2-2025/blob/main/notebooks/3_1_train_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

⚙️ Step 1: Set your notebook to GPU (in Colab: Runtime → Change runtime type → GPU)

The next two cells take ~2 min.... start running them now while we talk! 👇👇

In [None]:
# get workshop code
import os
import sys
IN_COLAB = os.getenv("COLAB_RELEASE_TAG")
if IN_COLAB:
    !git clone https://github.com/rajaonsonella/crosstalk-q2-2025
    sys.path.append('./crosstalk-q2-2025')
else:
    sys.path.append('..')
!pip install -r crosstalk-q2-2025/requirements.txt

In [None]:
# Download data from google drive
import gdown
import os

# Google Drive file id for each dataset split
file_ids = {
    'test': '19oR_A2UNUvy7pyL3J11oixkmF1m-bZBl',
    'train': '11S5p0QgP1X9rOFiIjNSLydLenJwm7hle',
}

for name, file_id in file_ids.items():
    filename = f'crosstalk_{name}.parquet'
    if os.path.exists(filename):
        # already present in the working directory, nothing to fetch
        continue
    gdown.download(id=file_id, output=filename, quiet=False)

# Load the training dataset

See the bonus content from the last notebook to get a peek under the hood of the data loaders

Or check it out in the files you downloaded to colab on the left 👈

In [None]:
import pandas as pd
import numpy as np
from dataset import basic_dataloader

In [None]:
# fingerprints available: 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'
X_train, y_train = basic_dataloader(
    'crosstalk_train.parquet',
    x_col="AVALON",
    y_col='DELLabel',
    max_to_load=1000,
)

In [None]:
# Sanity check: dimensions of the training feature matrix returned by the loader
X_train.shape

In [None]:
# Sanity check: dimensions of the training labels returned by the loader
y_train.shape

In [None]:
# Show a short preview of the labels rather than dumping the whole vector,
# then the number of examples per label value to reveal class balance.
print(y_train[:20])
label_values, label_counts = np.unique(y_train, return_counts=True)
print(dict(zip(label_values.tolist(), label_counts.tolist())))

# Let's train a CatBoost classifier and see how well it fits the training data

🐞 do you see a CUDA error? raise your hand now and brag about it

In [None]:
%%time
import catboost as cb
from eval import BinaryEvaluator
params = {
                'random_strength': 2, # only non-default hyperparam, default is 1
                'random_seed': 1234,
                'verbose': 0,
                'loss_function': 'Logloss',
                'task_type': 'GPU',
                'devices': '0'
            }
model = cb.CatBoostClassifier(**params)
eval = BinaryEvaluator(X_train.toarray(), y_train)
model.fit(X_train.toarray(), y_train)
yp = model.predict_proba(X_train)[:, 1] # or validation
metric_dict = eval.compute_metrics(yt=y_train, yp=yp) # or validation

In [None]:
# One metric per line: name left-aligned in a 20-character column, value
# rounded to two decimals.
for metric_name, metric_value in metric_dict.items():
    print(f'{metric_name:20s}: {metric_value:.2f}')

# How well does it generalize though? Let's try 5-fold cross-validation

⚠️ these next cells are slow to run! Start them now and come back in 5 minutes

In [None]:
%%time
# Fresh, unfitted classifier built from the same hyperparameters as `model`.
model_cv = cb.CatBoostClassifier(**params)
# Cross-validate it with the evaluator created in the training cell. The
# result is indexed by 'mean' in the next cell; the number of folds is
# presumably fixed inside `CV_model` (5 per the heading) - confirm in eval.py.
metric_dict_cv = eval.CV_model(model_cv)

In [None]:
# Print the 'mean' entry of the cross-validation results, one metric per line:
# name left-aligned in a 20-character column, value rounded to two decimals.
for metric_name, metric_value in metric_dict_cv['mean'].items():
    print(f'{metric_name:20s}: {metric_value:.2f}')

# Let's compare it against simpler sklearn baselines

In [None]:
%%time
from eval import get_baseline_models

eval = BinaryEvaluator(X_train.toarray(), y_train)
baselines = get_baseline_models()
baselines_res = {}

for m in baselines:
    baselines_res[m] = eval.CV_model(baselines[m])

In [None]:
# display all the models results
# Add the CatBoost cross-validation results next to the sklearn baselines
baselines_res['catboost'] = metric_dict_cv
# Keep only the 'mean' entry of each model's results: one row per model,
# one column per metric, rounded to two decimals.
mean_scores = {name: res['mean'] for name, res in baselines_res.items()}
pd.DataFrame(mean_scores).T.round(2)

# Submit predictions

Update the next cell with your team name

In [None]:
# Used as the stem of the predictions file written below: '<team_name>.txt'
team_name = 'demo'

In [None]:
%%time
X_test = basic_dataloader('crosstalk_test.parquet', x_col="AVALON", y_col = None, max_to_load = 400000, chunk_size = 20000)

In [None]:
# Sanity check: dimensions of the test feature matrix returned by the loader
X_test.shape

In [None]:
# Score the test set with the classifier fitted on the training data and keep
# the second probability column (index 1) as the submitted score.
test_proba = model.predict_proba(X_test)
yp = test_proba[:, 1]

# One prediction per line, written to '<team_name>.txt'
np.savetxt(f'{team_name}.txt', yp)

Upload this baseline to Kaggle and check out the leaderboard!