# Set up environment for Google Colab

In [None]:
!git clone https://github.com/cottascience/crosstalk-q1-2025.git
%cd crosstalk-q1-2025
!pip install -r requirements.txt

# Download the train and test data

In [None]:
import gdown
import os

# Google Drive id of the training parquet, and the local path it is saved to.
file_id = '1iDn6HEO6oXO5lI5cQscxh_VQjpSAcGac'
train_file = 'crosstalk_train.parquet'

# Reuse a previously downloaded copy; only hit the network when it is missing.
already_downloaded = os.path.exists(train_file)
if not already_downloaded:
    gdown.download(
        id=file_id,
        output=train_file,
        quiet=False,
    )

# Load the training dataset

In [13]:
from dataset import Dataset

# X_col selects which fingerprint is used as the feature matrix.
# Fingerprints available: 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4',
# 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'
train_dataset = Dataset(
    filename=train_file,
    X_col="AVALON",
)

# Get a smaller subset to make it faster to debug

In [5]:
import numpy as np

SUBSET_SIZE = 1000  # number of rows kept for quick debugging runs
SUBSET_SEED = 1234  # fixed seed so the same rows are selected on every run

# Use a dedicated seeded generator instead of the global NumPy RNG so the
# subset is reproducible after a kernel restart.
rng = np.random.default_rng(SUBSET_SEED)
n_rows = len(train_dataset.X)

# Clamp the sample size: sampling without replacement raises when asked for
# more rows than the dataset contains.
random_indices = rng.choice(n_rows, size=min(SUBSET_SIZE, n_rows), replace=False)

# NOTE: this overwrites the dataset's X and y in place; reload the dataset
# (re-run the loading cell) to get the full data back.
train_dataset.X = train_dataset.X[random_indices]
train_dataset.y = train_dataset.y[random_indices]

# Let's train a CatBoost classifier and see how well it fits the training data

In [10]:
import catboost as cb
from catboost.utils import get_gpu_device_count
from eval import BinaryEvaluator

# Shared CatBoost settings; `params` is reused by the cross-validation cell below.
params = {
    'random_strength': 2,  # only non-default hyperparam, default is 1
    'random_seed': 1234,
    'verbose': 0,
    'loss_function': 'Logloss',
}

# Train on the GPU when CatBoost can see one; otherwise fall back to the CPU so
# this cell still runs on a runtime without a (compatible) GPU instead of crashing.
if get_gpu_device_count() > 0:
    params['task_type'] = 'GPU'
    params['devices'] = '0'
else:
    params['task_type'] = 'CPU'

model = cb.CatBoostClassifier(**params)

# NOTE: `eval` shadows Python's builtin of the same name. The name is kept
# because the cells below call `eval.CV_model(...)`.
eval = BinaryEvaluator(train_dataset.X, train_dataset.y)

model.fit(train_dataset.X, train_dataset.y)

# Column 1 of predict_proba is used as the score. These are predictions on the
# training data itself; pass a validation split here to measure generalization.
yp = model.predict_proba(train_dataset.X)[:, 1]
print(eval.compute_metrics(yt=train_dataset.y, yp=yp))

{'accuracy': 0.999, 'balanced_accuracy': 0.993421052631579, 'roc_auc': 0.9999928799270904, 'precision': 1.0, 'recall': 0.9868421052631579, 'mean_reciprocal_rank': 0.06466465164855116, 'positives': 76, 'predicted_positives': 75, 'hits_at_5': 0.06578947368421052, 'precision_at_5': 1.0, 'hits_at_10': 0.13157894736842105, 'precision_at_10': 1.0, 'hits_at_30': 0.39473684210526316, 'precision_at_30': 1.0, 'hits_at_76': 1.0, 'precision_at_76': 1.0}


# How well does it generalize though? Let's try 5-fold cross-validation

In [11]:
# Build a new classifier from the same `params` so that cross-validation starts
# from a model that has not been fitted on the training data.
model = cb.CatBoostClassifier(**params)
# `eval` is the BinaryEvaluator created in the previous cell (not the builtin).
# CV_model is defined in the project's eval module; the printed result holds
# per-metric 'mean' and 'std' entries.
res = eval.CV_model(model)
print(res)

{'mean': {'accuracy': 0.917, 'balanced_accuracy': 0.5203011163337251, 'roc_auc': 0.7455324862906385, 'precision': 0.26666666666666666, 'recall': 0.052500000000000005, 'mrr': 0.09705604222418686, 'precision_at_k_5': 0.5199999999999999, 'hits_at_k_5': 0.17083333333333334, 'precision_at_k_10': 0.43999999999999995, 'hits_at_k_10': 0.28833333333333333, 'precision_at_k_30': 0.2733333333333333, 'hits_at_k_30': 0.5375}, 'std': {'accuracy': 0.008717797887081356, 'balanced_accuracy': 0.025732054327889897, 'roc_auc': 0.04922048975262177, 'precision': 0.27080128015453203, 'recall': 0.04969350505291858, 'mrr': 0.03131774861306332, 'precision_at_k_5': 0.20396078054371142, 'hits_at_k_5': 0.06718548123582124, 'precision_at_k_10': 0.13564659966250536, 'hits_at_k_10': 0.08491826135237998, 'precision_at_k_30': 0.06463573143221772, 'hits_at_k_30': 0.11672617529928753}}


# Let's compare it against simpler sklearn baselines

In [12]:
from eval import get_baseline_models

# Evaluator over the current training data (`eval` shadows the builtin here).
eval = BinaryEvaluator(train_dataset.X, train_dataset.y)
baselines = get_baseline_models()

# Cross-validate each baseline and store its result under the baseline's own key.
baselines_res = {
    name: eval.CV_model(estimator)
    for name, estimator in baselines.items()
}

print(baselines_res)

{'stratified_dummy': {'mean': {'accuracy': 0.8639999999999999, 'balanced_accuracy': 0.5161158441049746, 'roc_auc': 0.5161158441049746, 'precision': 0.09960317460317461, 'recall': 0.10583333333333333, 'mrr': 0.16562091503267978, 'precision_at_k_5': 0.12000000000000002, 'hits_at_k_5': 0.10583333333333333, 'precision_at_k_10': 0.06000000000000001, 'hits_at_k_10': 0.2925, 'precision_at_k_30': 0.12, 'hits_at_k_30': 1.0}, 'std': {'accuracy': 0.013928388277184131, 'balanced_accuracy': 0.03638891170364818, 'roc_auc': 0.036388911703648184, 'precision': 0.048861921803878675, 'recall': 0.0804328567467577, 'mrr': 0.0688195754532735, 'precision_at_k_5': 0.09797958971132713, 'hits_at_k_5': 0.0804328567467577, 'precision_at_k_10': 0.048989794855663564, 'hits_at_k_10': 0.3622499137335991, 'precision_at_k_30': 0.05811865258054232, 'hits_at_k_30': 0.0}}, 'most_frequent_dummy': {'mean': {'accuracy': 0.924, 'balanced_accuracy': 0.5, 'roc_auc': 0.5, 'precision': 0.0, 'recall': 0.0, 'mrr': 1.0, 'precision_a