# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer
from keras.losses import binary_crossentropy

# Load and Format the dataset

In [None]:
# Read the four competition CSVs in one pass: train features, scored train
# targets, test features, and the sample submission that is later filled in.
tr_feats, tr_scored, ts_feats, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
        '/kaggle/input/lish-moa/test_features.csv',
        '/kaggle/input/lish-moa/sample_submission.csv',
    )
)

In [None]:
# Boolean masks of the 'ctl_vehicle' rows. They are taken before cp_type is
# re-encoded below, while the column still holds its string labels.
tr_mask = tr_feats['cp_type'] == 'ctl_vehicle'
ts_mask = ts_feats['cp_type'] == 'ctl_vehicle'

# Encode the two string columns as 0/1 integers.
# Series.map builds the numeric column directly. Series.replace on a string
# column only became numeric through implicit object->int downcasting, which
# pandas has deprecated; without it the feature matrix would turn into an
# object-dtype array.
# NOTE(review): unlike replace, map turns any label missing from these tables
# into NaN - confirm that only the listed labels occur in the data.
dose_codes = {'D1': 0, 'D2': 1}
type_codes = {'trt_cp': 0, 'ctl_vehicle': 1}
for feats in (tr_feats, ts_feats):
    feats['cp_dose'] = feats['cp_dose'].map(dose_codes)
    feats['cp_type'] = feats['cp_type'].map(type_codes)

# Exclude rows where cp_type is "ctl_vehicle" from the training data.
# NOTE(review): applying tr_mask to tr_scored assumes both frames list the
# same samples in the same row order - verify against the input files.
X = tr_feats[~tr_mask]
y = tr_scored[~tr_mask]

# Convert to ndarray, dropping the first column of each frame
# (presumably the sample identifier - TODO confirm).
X = X.iloc[:, 1:].to_numpy()
X_test = ts_feats.iloc[:, 1:].to_numpy()
y = y.iloc[:, 1:].to_numpy()

# Cross Validation

In [None]:
%%time
# Number of cross-validation folds; the same count of fitted models is
# averaged when predicting on the test set.
NFOLDS = 4


def logloss(y_true, y_pred):
    """Return the binary cross-entropy of y_pred against y_true as one scalar.

    y_pred is cast to float before being passed to Keras'
    ``binary_crossentropy``; the resulting loss tensor is converted to a
    NumPy array and averaged over all of its entries.
    """
    losses = binary_crossentropy(y_true, y_pred.astype('float'))
    return losses.numpy().mean()

# CV
# MultiOutputClassifier wraps a single XGBClassifier so that y, which has one
# column per target, can be fitted in one call.
# NOTE(review): newer XGBoost releases prefer tree_method='hist' together with
# device='cuda' over 'gpu_hist' - confirm against the installed version.
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist'))


def logloss_scorer(estimator, X_val, y_val):
    """Score a fitted multi-output classifier by log loss on probabilities.

    A scorer built with ``make_scorer(logloss)`` passes the hard 0/1 labels
    from ``estimator.predict`` to the metric, so the reported value is not a
    log loss of the predicted probabilities. This callable uses
    ``predict_proba`` instead.

    Parameters:
        estimator: the fitted classifier for the current fold.
        X_val: held-out feature rows of the fold.
        y_val: held-out target matrix of the fold.

    Returns:
        The value of ``logloss`` on the held-out rows (lower is better; it is
        only reported here, not used for model selection).
    """
    # Stack the per-target probability arrays, keep index 1 of the last axis
    # (assumed to be the positive class) and transpose so rows are samples
    # and columns are targets, matching the layout of y_val.
    proba = np.array(estimator.predict_proba(X_val))[:, :, 1].T
    return logloss(y_val, proba)


score_funcs = {
    'custom_logloss': logloss_scorer
}
k_fold = KFold(n_splits=NFOLDS, shuffle=True, random_state=10)
# return_estimator=True keeps the per-fold fitted models for later prediction.
cv_results = cross_validate(classifier, X, y, cv=k_fold, scoring=score_funcs, return_estimator=True, verbose=1)
cv_results['test_custom_logloss']

# Prediction

In [None]:
%%time
# Average the test-set probabilities of the models fitted during CV.
fold_models = cv_results['estimator']
n_rows, n_targets = X_test.shape[0], y.shape[1]
test_proba = np.zeros((n_rows, n_targets))
for fold_idx in range(NFOLDS):
    per_target = fold_models[fold_idx].predict_proba(X_test)
    # Stack the returned arrays and keep index 1 of the last axis, then
    # transpose so the result lines up with test_proba's (rows, targets) shape.
    positive = np.array(per_target)[:, :, 1]
    # Each fold contributes an equal 1/NFOLDS share of the final estimate.
    test_proba += positive.T / NFOLDS

# Create Submission File

In [None]:
# Zero out every predicted probability for the test rows whose cp_type
# was 'ctl_vehicle'.
test_proba[ts_mask.to_numpy(), :] = 0

In [None]:
# Overwrite every column after the first with the averaged predictions;
# the first column of the sample submission is left untouched.
submission.iloc[:,1:] = test_proba
# Write the result to 'submission.csv' without the DataFrame index.
submission.to_csv('submission.csv', index=False)
# Last expression of the cell: displays the first rows as a quick check.
submission.head()