# Set up the environment for Google Colab

In [None]:
# Check if directory exists and remove it if it does
import os
import shutil
if os.path.exists('crosstalk-q1-2025'):
    shutil.rmtree('crosstalk-q1-2025')
!git clone https://github.com/cottascience/crosstalk-q1-2025.git
%cd crosstalk-q1-2025
!pip install -r requirements.txt

# Download the training data

### Download the training file and upload it to the root folder of your Google Drive
https://drive.google.com/file/d/11S5p0QgP1X9rOFiIjNSLydLenJwm7hle/view?usp=drive_link

In [1]:
# Mount Google Drive and make the training file available in the working directory.
import os
import shutil

from google.colab import drive

drive.mount('/content/drive')
train_file = 'crosstalk_train.parquet'
file_path = '/content/drive/My Drive/crosstalk_train.parquet'

# The loading cell refers to the file by its bare name ('crosstalk_train.parquet'),
# so place a copy next to the notebook code. The copy is skipped when the file is
# already present, which keeps re-runs of this cell cheap.
# NOTE(review): assumes Dataset resolves `filename` relative to the current
# working directory — confirm against dataset.py.
if not os.path.exists(train_file):
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"{file_path} not found: upload {train_file} to the root folder of your Google Drive"
        )
    shutil.copy(file_path, train_file)

# Load the training dataset

In [1]:
from dataset import Dataset

# Fingerprints available for `x_col`:
#   'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'
train_dataset = Dataset(
    filename='crosstalk_train.parquet',
    x_col="AVALON",
)

# Get a smaller subset to make it faster to debug

In [5]:
import numpy as np

# Debug-subset settings.
SUBSET_SIZE = 1000  # maximum number of rows to keep
SUBSET_SEED = 1234  # fixed seed so a fresh run draws the same rows every time

n_rows = len(train_dataset.X)
rng = np.random.default_rng(SUBSET_SEED)
# Sample row indices without replacement. The size is capped at n_rows because
# requesting more rows than exist raises ValueError when replace=False.
random_indices = rng.choice(n_rows, size=min(SUBSET_SIZE, n_rows), replace=False)

# Features and labels are indexed with the same indices so they stay aligned.
train_dataset.X = train_dataset.X[random_indices]
train_dataset.y = train_dataset.y[random_indices]

# Let's train a CatBoost classifier and see how well it fits the training data

In [None]:
import catboost as cb
from catboost.utils import get_gpu_device_count
from eval import BinaryEvaluator

# Hyperparameters shared by every CatBoost model built in this notebook.
params = {
    'random_strength': 2, # only non-default hyperparam, default is 1
    'random_seed': 1234,
    'verbose': 0,
    'loss_function': 'Logloss',
}
# Request GPU training only when a GPU is actually visible; otherwise CatBoost
# is left on its default device so the cell still runs on a runtime without an
# accelerator instead of failing at fit time.
if get_gpu_device_count() > 0:
    params['task_type'] = 'GPU'
    params['devices'] = '0'

model = cb.CatBoostClassifier(**params)
# NOTE: `eval` shadows the Python builtin of the same name; the name is kept
# because the cross-validation cell further down calls `eval.CV_model`.
eval = BinaryEvaluator(train_dataset.X, train_dataset.y)
model.fit(train_dataset.X, train_dataset.y)
# Column 1 of predict_proba is used as the predicted score.
yp = model.predict_proba(train_dataset.X)[:, 1] # or validation
print( eval.compute_metrics(yt=train_dataset.y, yp=yp) ) # or validation

# How well does it generalize though? Let's try 5-fold cross-validation

In [None]:
# Build a new, unfitted classifier from the shared `params` and hand it to the
# evaluator's cross-validation routine.
model = cb.CatBoostClassifier(**params)
res = eval.CV_model(model)

# Show the cross-validation result.
print(res)

# Let's compare it against simpler sklearn baselines

In [None]:
from eval import get_baseline_models

# Evaluator bound to the (sub-sampled) training features and labels.
eval = BinaryEvaluator(train_dataset.X, train_dataset.y)

# Run every baseline through the evaluator's CV routine, keyed by baseline name.
baselines = get_baseline_models()
baselines_res = {
    name: eval.CV_model(baselines[name])
    for name in baselines
}

print(baselines_res)