- trained model
    - https://www.kaggle.com/datasets/motono0223/belka-autogluon-gpu-3m  
        This model was trained on the following data:
            binds=1 data : 1.5M records (all positive records in the training data)
            binds=0 data : 1.5M records (randomly sampled)

In [None]:
# !pip install -q autogluon==1.1.0
# !pip install -q ray==2.6.3
# tips: https://github.com/autogluon/autogluon/issues/3365

# !pip install rdkit
# !pip install duckdb

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# executing each piece of code.
%load_ext autoreload
%autoreload 2

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
import gc

import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
import duckdb
import pickle


import os
import pathlib

In [None]:
# Root directories used by this notebook.
PATH_DATA = pathlib.Path("/mnt/d/Data/kaggle/leash/")
PATH_BASELINE_INPUT = pathlib.Path("/mnt/d/Data/kaggle/leash/automl-baseline-input")
PATH_PROJECT = pathlib.Path("/mnt/d/Data/kaggle/leash/leash-BELKA/")

# Print the directory entries of the project folder.
project_entries = os.listdir(PATH_PROJECT)
print(project_entries)

# Files located directly inside the project folder.
PATH_TRAIN = PATH_PROJECT / "train.csv"
PATH_SUBMISSION_EXAMPLE = PATH_PROJECT / "sample_submission.csv"

In [None]:
# Run-mode switches.
DEBUG = True  # when true, the per-class sample count below is reduced
TRAIN = True  # gates the sampling, feature-building and training cells

# Maximum number of rows drawn per class by the sampling query.
N_SAMPLES = 100 if DEBUG else 30000

In [None]:
# Parquet files inside the project directory.
train_path = f"{PATH_PROJECT}/train.parquet"
test_path = f"{PATH_PROJECT}/test.parquet"

if TRAIN:
    # Draw up to N_SAMPLES randomly ordered rows with binds = 0 and up to
    # N_SAMPLES with binds = 1, stacked into a single result set.
    sampling_query = f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT {N_SAMPLES})
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT {N_SAMPLES})"""

    con = duckdb.connect()
    try:
        # Materialize the query result as a pandas DataFrame.
        df = con.query(sampling_query).df()
    finally:
        # Release the connection even when the query raises.
        con.close()

In [None]:
# Plot the distribution of the `binds` label in the sampled frame.
# `df` is only created when TRAIN is enabled, so the plot is guarded the same
# way to avoid a NameError when training is switched off.
if TRAIN:
    df["binds"].hist()

In [None]:
# Shell escape: list train_dicts/ in long format with human-readable sizes,
# sorted by modification time in reverse (oldest first).
!ls -tlhr "$PATH_DATA/automl-baseline-input/train_dicts/"

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder

# Assuming your DataFrame is named 'df' with columns 'molecule_smiles', 'protein_name', and 'binds'

# Convert SMILES to RDKit molecules


# Generate ECFPs
def generate_ecfp(molecule, radius=2, bits=1024):
    """Return the Morgan bit-vector fingerprint of ``molecule`` as a list.

    Args:
        molecule: RDKit molecule object, or ``None``.
        radius: Radius passed to the Morgan fingerprint routine.
        bits: Length of the bit vector (forwarded as ``nBits``).

    Returns:
        The fingerprint converted to a Python list, or ``None`` when
        ``molecule`` is ``None``.
    """
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(
            molecule, radius, nBits=bits
        )
        return list(bit_vector)
    # Nothing to fingerprint when no molecule was supplied.
    return None


# Load the pickled one-hot encoder for protein_name from disk.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load encoder files from a trusted source.
encoder_pickle_path = f"{PATH_DATA}/leash-bio-onehot-encoder/protein_name_encoder.pkl"
with open(encoder_pickle_path, "rb") as encoder_file:
    onehot_encoder = pickle.load(encoder_file)

if TRAIN:
    # Parse every SMILES string into an RDKit molecule (None when parsing fails).
    df["molecule"] = df["molecule_smiles"].apply(Chem.MolFromSmiles)

    # generate_ecfp returns None for a missing molecule, and a None fingerprint
    # cannot be combined with the protein features, so drop those rows first.
    parsed = df["molecule"].notna()
    if not parsed.all():
        print(f"Dropping {int((~parsed).sum())} rows with unparsable SMILES")
        df = df[parsed].reset_index(drop=True)

    df["ecfp"] = df["molecule"].apply(generate_ecfp)

    # NOTE(review): a fresh encoder is fitted here and replaces whatever
    # `onehot_encoder` held before this cell — confirm that is intended.
    onehot_encoder = OneHotEncoder(sparse_output=False)
    protein_onehot = onehot_encoder.fit_transform(
        df["protein_name"].values.reshape(-1, 1)
    )

    # Feature matrix: fingerprint bits followed by the one-hot protein columns.
    # Stacking arrays avoids building one long Python list per row.
    X = np.hstack([np.array(df["ecfp"].tolist()), protein_onehot])

    data = pd.DataFrame(X, columns=[f"col{i:04d}" for i in range(X.shape[1])])
    # Assign labels by position: `data` carries a fresh RangeIndex, so relying
    # on index alignment with `df` would be fragile.
    data["binds"] = df["binds"].to_numpy()

    # 80/20 split with a fixed seed so the split is reproducible.
    train, valid = train_test_split(data, test_size=0.2, random_state=42)

    train = TabularDataset(train)
    valid = TabularDataset(valid)

    # Free the intermediate feature containers before training.
    del X, data
    gc.collect()

In [None]:
if TRAIN:
    # Binary classifier on the `binds` label, evaluated with average
    # precision; "predictor" is the path handed to AutoGluon.
    predictor_settings = {
        "label": "binds",
        "problem_type": "binary",
        "eval_metric": "average_precision",
        "path": "predictor",
    }
    predictor = TabularPredictor(**predictor_settings)

In [None]:
%%time
if TRAIN:
    # Fit on the training split, passing the validation split as tuning data.
    fit_options = {
        "tuning_data": valid,
        "save_space": True,
        "presets": "optimize_for_deployment",
        "use_bag_holdout": True,
    }
    predictor.fit(train, **fit_options)

In [None]:
# Choose the model directory to load. Previously the predictor_3m path was
# assigned and then immediately overwritten by "predictor" (a dead store).
# Now the freshly trained "predictor" directory is used when TRAIN is enabled
# and the predictor_3m directory under PATH_DATA otherwise.
# NOTE(review): with TRAIN disabled this now loads predictor_3m instead of a
# local "predictor" directory — confirm this matches the intended workflow.
MODEL_PATH = "predictor" if TRAIN else f"{PATH_DATA}/predictor_3m"

# require_version_match=False permits loading a model saved by a different
# AutoGluon version.
predictor = TabularPredictor.load(path=MODEL_PATH, require_version_match=False)

In [None]:
# Fetch the predictor's leaderboard and display it as the cell result.
model_leaderboard = predictor.leaderboard()
model_leaderboard

# Inference

In [None]:
import os
from tqdm import tqdm

# Score the test CSV chunk by chunk and stream the predictions to a CSV file.
test_file = f"{PATH_PROJECT}/test.csv"
output_file = "submission.csv"  # path of the prediction file that is written

# Read the test CSV in chunks of 100,000 rows so only one chunk is held at a time.
for seq, df_test in enumerate(tqdm(pd.read_csv(test_file, chunksize=100000))):
    print(seq)
    # Parse the SMILES strings and compute a fingerprint per molecule.
    # NOTE(review): an unparsable SMILES yields a None fingerprint, which makes
    # the stacking below fail — confirm the test data cannot contain one.
    df_test["molecule"] = df_test["molecule_smiles"].apply(Chem.MolFromSmiles)
    df_test["ecfp"] = df_test["molecule"].apply(generate_ecfp)

    # One-hot encode protein_name with the already-fitted encoder.
    protein_onehot = onehot_encoder.transform(
        df_test["protein_name"].values.reshape(-1, 1)
    )

    # Feature matrix: fingerprint bits followed by the one-hot protein columns.
    features = np.hstack([np.array(df_test["ecfp"].tolist()), protein_onehot])
    X_test = pd.DataFrame(
        features, columns=[f"col{i:04d}" for i in range(features.shape[1])]
    )
    X_test = TabularDataset(X_test)

    # Take the second probability column as the `binds` score.
    probabilities = predictor.predict_proba(X_test).iloc[:, 1].values

    # Pair each row id with its predicted probability.
    output_df = pd.DataFrame({"id": df_test["id"], "binds": probabilities})

    # The first chunk overwrites the file and writes the header; later chunks
    # append. This prevents a leftover file from an earlier run from
    # accumulating duplicate, header-less rows.
    is_first_chunk = seq == 0
    output_df.to_csv(
        output_file,
        index=False,
        mode="w" if is_first_chunk else "a",
        header=is_first_chunk,
    )