This notebook was used to get predictions `lightgbm_secfp_1024_train_no_test_wide_50M` and `lightgbm_secfp:6_2048_train_no_test_wide_40M`.

In [1]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy import sparse

In [None]:
# Log the pandas and LightGBM versions (one per line) so the run can be reproduced.
print(pd.__version__, lgb.__version__, sep='\n')

Set up the variables

In [3]:
# Experiment configuration. These values are interpolated into the data,
# feature and submission file names built in the next cell.
PROTEINS = ['BRD4', 'HSA', 'sEH']  # label columns; one model is fitted per entry
FEATURES = 'secfp:6'               # feature-set tag used in the feature file names
LENGTH = '2048'                    # second file-name tag (presumably the fingerprint length -- TODO confirm)
TRAIN_SIZE = '40M'                 # suffix identifying which training-set files to use

In [4]:
# All input/output locations are derived from the configuration constants.
DATA_PATH = '../data'

# Name fragments shared by several of the paths below.
_feature_tag = f'{FEATURES}_{LENGTH}'
_train_tag = f'train_no_test_wide_{TRAIN_SIZE}'

# Training inputs. TRAIN_FEATURES_PATH intentionally has no extension:
# '.npz' or '.bin' is appended where the file is actually opened.
TARGETS_PATH = f'{DATA_PATH}/{_train_tag}.parquet'
TRAIN_FEATURES_PATH = f'{DATA_PATH}/{_feature_tag}_{_train_tag}'

# Test inputs.
TEST_FILE = f'{DATA_PATH}/test.csv'
TEST_FEATURES_PATH = f'{DATA_PATH}/{_feature_tag}_test.npz'

# Where the final predictions are written.
SUBMIT_PATH = f'../submits/lightgbm_{_feature_tag}_{_train_tag}.csv'

Prepare the features once: load them from the .npz file and save them as a LightGBM Dataset binary file. Then restart the kernel before continuing.

In [None]:
# One-off conversion: build the LightGBM Dataset from the sparse .npz features
# and cache it as a binary file. Skipped when the cache already exists.
_train_bin_path = f'{TRAIN_FEATURES_PATH}.bin'
if not os.path.exists(_train_bin_path):
    # The sparse matrix is passed straight into the Dataset rather than bound to
    # a name, so this cell does not keep its own reference to it.
    dtrain = lgb.Dataset(sparse.load_npz(f'{TRAIN_FEATURES_PATH}.npz'))
    dtrain.save_binary(_train_bin_path)

In [None]:
# Open the cached binary Dataset written by the previous cell.
dtrain = lgb.Dataset(TRAIN_FEATURES_PATH + '.bin')
dtrain

Load targets

In [None]:
# Read only the label columns listed in PROTEINS instead of loading every column
# of the training table and discarding the rest afterwards. The trailing
# [PROTEINS] keeps the column order identical to PROTEINS regardless of engine.
targets = pd.read_parquet(TARGETS_PATH, columns=PROTEINS)[PROTEINS]
targets

Split into train and validation subsets

In [None]:
# Hold out a fixed-size random validation subset. The global NumPy RNG is seeded
# first so the same split is produced on every run.
np.random.seed(1)
all_indices = np.arange(len(targets))
np.random.shuffle(all_indices)

valid_idx = np.random.choice(all_indices, size=200_000, replace=False)
# Everything that was not drawn for validation is used for training.
train_idx = np.setdiff1d(all_indices, valid_idx)

print(f"Number of samples for training {len(train_idx)}")
print(f"Number of samples for validation: {len(valid_idx)}")

# The two index sets must be disjoint, so an empty array is expected here.
overlap = np.intersect1d(train_idx, valid_idx)
print("Sanity check: intersection between train_idx and val_idx:", overlap)

Fit models

In [9]:
# LightGBM training parameters, shared by the per-protein models.
lgb_params = {
    # Task and evaluation
    'objective': 'binary',
    'metric': 'average_precision',

    # Tree shape
    'max_depth': 11,
    'num_leaves': 490,
    'min_data_in_leaf': 50,

    # Boosting step size
    'learning_rate': 0.05,

    # Row / column sampling
    # NOTE(review): no 'bagging_freq' is set in this dict. The LightGBM parameter
    # docs state that bagging requires a non-zero bagging_freq, so
    # 'bagging_fraction' may currently have no effect -- confirm.
    'bagging_fraction': 0.9,
    'colsample_bytree': 1,
    'colsample_bynode': 0.5,

    # Regularisation
    'lambda_l1': 1,
    'lambda_l2': 1.5,

    # Runtime
    'device': 'cpu',
    'verbose': -1,
}

In [None]:
models = {}  # retained from the original cell; nothing below stores into it

# Fit one binary model per protein, reusing the same feature Dataset and
# swapping only the label column. Each fitted booster is written to disk and
# then released before the next protein is trained.
for protein in PROTEINS:
    dtrain.set_label(targets[protein])

    model_file = os.path.join(DATA_PATH, f'lightgbm_model_{protein}.txt')

    # The subsets and callbacks are created inline on every iteration, so no
    # extra references to them outlive the lgb.train call.
    bst = lgb.train(
        params=lgb_params,
        train_set=dtrain.subset(train_idx),
        valid_sets=dtrain.subset(valid_idx),
        num_boost_round=5000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(50),
        ],
    )

    bst.save_model(model_file)
    del bst

Prepare submit

In [15]:
# Load the test table and discard the building-block SMILES columns if present;
# errors='ignore' skips any of them that the file does not contain.
columns_to_drop = ['buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles']
test_df = pd.read_csv(TEST_FILE).drop(columns=columns_to_drop, errors='ignore')

# Precomputed sparse feature matrix for the test set.
features = sparse.load_npz(TEST_FEATURES_PATH)

In [None]:
# Build the submission table: one predicted `binds` value per test `id`.
#
# The feature matrix is matched to test_df by row position, so the two must have
# the same number of rows. Fail loudly instead of silently misaligning them.
if features.shape[0] != len(test_df):
    raise ValueError(
        f'features has {features.shape[0]} rows but test_df has {len(test_df)}; '
        'they must line up row by row'
    )

# Score every test row with each per-protein model saved by the training cell.
predictions = {}
for protein in PROTEINS:
    model = lgb.Booster(model_file = os.path.join(DATA_PATH, f'lightgbm_model_{protein}.txt'))
    # num_iteration is passed exactly as before.
    # NOTE(review): confirm what best_iteration holds on a Booster loaded from a file.
    predictions[protein] = model.predict(features, num_iteration = model.best_iteration)

# Wide table: one row per test row, one prediction column per protein.
wide_preds = pd.DataFrame(predictions, columns = PROTEINS)
wide_preds.insert(0, 'molecule_smiles', test_df['molecule_smiles'].to_numpy())

# Long table keyed by (molecule_smiles, protein_name).
long_preds = wide_preds.melt(
    id_vars = ['molecule_smiles'],
    value_vars = PROTEINS,
    value_name = 'binds',
    var_name = 'protein_name'
)
# A molecule can occur in several test rows; keep a single prediction per key so
# the merge below cannot multiply rows.
long_preds = long_preds.drop_duplicates(subset = ['molecule_smiles', 'protein_name'])

# Attach to each test row the prediction for the protein that row asks about.
# validate raises if the right-hand side is not unique per key.
submit = pd.merge(
    test_df,
    long_preds,
    how = 'inner',
    on = ['molecule_smiles', 'protein_name'],
    validate = 'many_to_one'
)
# Keep exactly one row per id in the final table.
submit = submit[['id', 'binds']].drop_duplicates(subset = 'id')
submit

In [None]:
# Summary statistics of the predicted values, as a quick sanity check.
submit.loc[:, 'binds'].describe()

In [18]:
# Create the output directory first, so writing does not fail when it is missing.
_submit_dir = os.path.dirname(SUBMIT_PATH)
if _submit_dir:
    os.makedirs(_submit_dir, exist_ok=True)

# Write the submission without the DataFrame index column.
submit.to_csv(SUBMIT_PATH, index=False)