# Multi-Regression CatBoost Test

CatBoost has its own MultiRMSE loss that supports multi-regression tasks. In this notebook, I test its performance.

**Update:** Changing the learning rate to 0.03 and the number of iterations to 1000 improves the CV score to 0.0168; the corresponding LB score is 0.02016.

In [None]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import os
import gc
import datetime
import numpy as np
import pandas as pd
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm
from time import time

In [None]:
def create_folds(num_starts, num_splits, data_dir = '../input/lish-moa', drug_thresh = 18):
    """Build drug-aware, multilabel-stratified fold assignments.

    Only ``trt_cp`` rows are kept. Drugs that occur at most ``drug_thresh``
    times are stratified at drug level (on their mean target vector), so all
    rows of such a drug share one fold. Rows of more frequent drugs are
    stratified individually.

    Args:
        num_starts: number of repetitions; repetition ``i`` uses
            ``random_state = i``.
        num_splits: number of folds per repetition.
        data_dir: directory containing ``train_features.csv``,
            ``train_targets_scored.csv`` and ``train_drug.csv``.
        drug_thresh: largest row count for which a drug is kept in one fold.

    Returns:
        ``int8`` array of shape ``(num_starts, n_trt_cp_rows)``; entry
        ``[i, j]`` is the validation fold of row ``j`` in repetition ``i``.
    """
    folds = []

    # LOAD FILES
    train_feats = pd.read_csv(f'{data_dir}/train_features.csv')
    scored = pd.read_csv(f'{data_dir}/train_targets_scored.csv')
    drug = pd.read_csv(f'{data_dir}/train_drug.csv')
    is_treated = train_feats['cp_type'] == 'trt_cp'
    scored = scored.loc[is_treated, :]
    drug = drug.loc[is_treated, :]
    targets = scored.columns[1:]
    scored = scored.merge(drug, on = 'sig_id', how = 'left')

    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= drug_thresh].index.sort_values()
    vc2 = vc.loc[vc > drug_thresh].index.sort_values()

    # Neither frame depends on the seed, so build them once.
    rare_drugs = scored.groupby('drug_id')[targets].mean().loc[vc1]
    frequent_rows = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop = True)

    for seed in range(num_starts):

        # STRATIFY RARE DRUGS (drug_id -> fold)
        dct1 = {}
        skf = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        for fold, (_, idxV) in enumerate(skf.split(rare_drugs, rare_drugs[targets])):
            dct1.update({k: fold for k in rare_drugs.index[idxV].values})

        # STRATIFY ROWS OF FREQUENT DRUGS (sig_id -> fold)
        dct2 = {}
        skf = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        for fold, (_, idxV) in enumerate(skf.split(frequent_rows, frequent_rows[targets])):
            dct2.update({k: fold for k in frequent_rows.sig_id[idxV].values})

        # ASSIGN FOLDS
        # Rows whose drug is not in dct1 come out as NaN and are filled from
        # the per-row mapping; a row found in neither makes the cast fail.
        fold_of_row = scored.drug_id.map(dct1)
        fold_of_row = fold_of_row.fillna(scored.sig_id.map(dct2))
        folds.append(fold_of_row.astype('int8').values)

    return np.stack(folds)

# Data Preparation

In [None]:
# Raw competition tables.
train_features, train_targets, train_targets_nonscored, test_features = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/train_targets_nonscored.csv',
        '../input/lish-moa/test_features.csv',
    )
]

# Two copies of the submission template: `ss` receives the CatBoost
# predictions, `ss_lr` the stacked logistic-regression predictions.
ss = pd.read_csv('../input/lish-moa/sample_submission.csv')
ss_lr = ss.copy()

# Target columns of the submission and the two feature families.
cols = [name for name in ss.columns if name != 'sig_id']
GENES = train_features.columns[train_features.columns.str.startswith('g-')].tolist()
CELLS = train_features.columns[train_features.columns.str.startswith('c-')].tolist()

In [None]:
def preprocess(df):
    """Encode the categorical columns as numbers and drop ``sig_id``.

    ``cp_type`` becomes 0 (``trt_cp``) / 1 (``ctl_vehicle``) and ``cp_dose``
    becomes 0 (``D1``) / 1 (``D2``); any other value becomes NaN. ``cp_time``
    is left as it is.

    The frame is modified in place and also returned.
    """
    # Assign whole columns instead of `df.loc[:, col] = ...`: recent pandas
    # writes a `.loc` assignment into the existing column, so the mapped
    # integers would stay in an object-dtype column and `df.values` would
    # become an object array.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

# def log_loss_metric(y_true, y_pred):
#     metrics = []
#     for _target in train_targets.columns:
#         metrics.append(log_loss(y_true.loc[:, _target], y_pred.loc[:, _target].astype(float), labels = [0,1]))
#     return np.mean(metrics)

def log_loss_metric(y_true, y_pred):
    """Return the binary log loss averaged over the target columns.

    Both arguments are 2-D numpy arrays of identical shape (rows are samples,
    columns are targets). Predictions are clipped to
    ``[1e-15, 1 - 1e-15]`` before the logarithm is taken.
    """
    n_targets = y_true.shape[1]
    probs = np.clip(y_pred, 1e-15, 1 - 1e-15)

    def column_loss(j):
        # Mean binary cross-entropy of a single target column.
        truth, prob = y_true[:, j], probs[:, j]
        return - np.mean(truth * np.log(prob) + (1 - truth) * np.log(1 - prob))

    return sum(column_loss(j) for j in range(n_targets)) / n_targets

# Encode the categorical columns of both feature tables (in place).
train, test = preprocess(train_features), preprocess(test_features)

# The id column is not a target; remove it from both target tables.
for target_frame in (train_targets, train_targets_nonscored):
    del target_frame['sig_id']

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every gene / cell feature to a normal distribution. The transformer is
# fitted on train and test rows together, then applied to each table.
feature_cols = GENES + CELLS
qt = QuantileTransformer(output_distribution = 'normal', random_state = 42)
qt.fit(pd.concat([train[feature_cols], test[feature_cols]]))
train[feature_cols] = qt.transform(train[feature_cols])
test[feature_cols] = qt.transform(test[feature_cols])

In [None]:
# Keep only the treated rows (cp_type == 0) in the features and both target
# tables. The mask is taken before `train` itself is overwritten.
is_treated = train['cp_type'] == 0
train_targets = train_targets.loc[is_treated].reset_index(drop = True)
train_targets_nonscored = train_targets_nonscored.loc[is_treated].reset_index(drop = True)
train = train.loc[is_treated].reset_index(drop = True)

print(train.shape)

In [None]:
# Positional indices of the feature columns fed to the models: every column of
# `train` except the first one (presumably `cp_type`, which is constant once
# the control rows are dropped — TODO confirm the column order).
top_feats = np.arange(1, train.shape[1])
print(top_feats)

In [None]:
# Quick look at the first rows of the preprocessed training features.
train.head()

In [None]:
# Number of repeated runs; run i uses seed i.
N_STARTS = 1
# Number of cross-validation folds per run.
N_SPLITS = 5
# Label-smoothing strength applied to the training labels of each fold.
LBS = 0.0008
# Fold id of every treated training row, one array row per run.
folds = create_folds(N_STARTS, N_SPLITS)
print(folds)

# Multi-Regression CatBoost

In [None]:
# CatBoost settings shared by every fold. `random_state` is added per seed by
# the training loop, and the early-stopping patience is passed to `fit` there.
# The note at the top of the notebook reports better scores with
# learning_rate 0.03 and 1000 iterations than with the values below.
params = {'learning_rate': 0.3, 
          'depth': 6, 
          'l2_leaf_reg': 3, 
          # MultiRMSE fits all target columns jointly (multi-regression).
          'loss_function': 'MultiRMSE', 
          'eval_metric': 'MultiRMSE', 
          'task_type': 'CPU', 
          'iterations': 150,
          'od_type': 'Iter', 
          'boosting_type': 'Plain', 
          'bootstrap_type': 'Bernoulli', 
          'allow_const_label': True, 
         }

In [None]:
# Out-of-fold (`res`) and test (`ss`) predictions are accumulated as floats.
# Both start from float zeros so that float predictions are never written into
# integer-typed columns (deprecated in recent pandas, and the warning is
# silenced by the global warning filter).
target_cols = train_targets.columns
res = pd.DataFrame(0.0, index = train_targets.index, columns = target_cols)
ss[target_cols] = 0.0

# Fold-independent arrays, built once instead of on every fold.
x_all = train.values[:, top_feats]
y_all = train_targets.astype(float).values
x_tt = test.values[:, top_feats]

for nums, seed in enumerate(range(N_STARTS)):

    # Sorted so the fold numbering in the log is stable.
    for n, foldno in enumerate(sorted(set(folds[nums]))):
        start_time = time()
        tr = folds[nums] != foldno
        te = folds[nums] == foldno

        x_tr, x_val = x_all[tr], x_all[te]
        y_tr, y_val = y_all[tr], y_all[te]

        # Label Smoothing (training labels only; validation labels stay 0/1)
        y_tr = y_tr * (1 - LBS) + 0.5 * LBS

        cat_tr = Pool(x_tr, label = y_tr)
        cat_val = Pool(x_val, label = y_val)

        params['random_state'] = seed
        model = CatBoostRegressor(**params)
        fit_model = model.fit(cat_tr, eval_set = cat_val, early_stopping_rounds = 5, 
                              use_best_model = True, verbose = 0)

        # Test predictions are averaged over all folds and seeds; each
        # training row receives its out-of-fold prediction once per seed.
        ss.loc[:, target_cols] += fit_model.predict(x_tt) / (N_SPLITS * N_STARTS)
        fold_pred = fit_model.predict(x_val)
        res.loc[te, target_cols] += fold_pred / N_STARTS
        fold_score = log_loss_metric(y_val, fold_pred)
        print(f'[{str(datetime.timedelta(seconds = time() - start_time))[0:7]}] CatBoost: Seed {seed}, Fold {n}:', fold_score)

        del model, fit_model
        gc.collect()

In [None]:
# Mean column-wise log loss of the out-of-fold CatBoost predictions.
print(f'CatBoost OOF Metric: {log_loss_metric(train_targets.values, res.values)}')

# Logistic Regression Stacked on Regressor

https://www.kaggle.com/gogo827jz/rapids-svm-on-gpu-6000-models-in-1-hour

In [None]:
# Inputs of the second-stage model: the out-of-fold CatBoost predictions for
# the training rows and the fold-averaged CatBoost predictions for the test rows.
X_new = res[cols].values
x_tt_new = ss[cols].values

In [None]:
from sklearn.linear_model import LogisticRegression

# One logistic regression per target, fitted on that target's CatBoost
# prediction only. Column `tar` of X_new / x_tt_new is assumed to belong to
# train_targets.columns[tar] — TODO confirm `cols` and the target columns
# are in the same order.
target_cols = train_targets.columns
# Float accumulators: probabilities must not be written into integer columns.
res_lr = pd.DataFrame(0.0, index = train_targets.index, columns = target_cols)
ss_lr[target_cols] = 0.0

for tar in tqdm(range(train_targets.shape[1])):

    col_name = target_cols[tar]
    targets = train_targets.values[:, tar]
    oof_col = np.zeros(len(targets))
    test_col = np.zeros(x_tt_new.shape[0])

    # Targets with fewer positives than folds are skipped; their predictions
    # stay at 0.
    if targets.sum() >= N_SPLITS:

        x_stack = X_new[:, tar].reshape(-1, 1)
        x_stack_test = x_tt_new[:, tar].reshape(-1, 1)

        for seed in range(N_STARTS):

            # 42 + seed: seed 0 reproduces the former fixed split, later
            # seeds get different splits instead of repeating the same one.
            skf = StratifiedKFold(n_splits = N_SPLITS, random_state = 42 + seed, shuffle = True)

            for tr, te in skf.split(targets, targets):

                model = LogisticRegression(random_state = seed)
                model.fit(x_stack[tr], targets[tr])
                test_col += model.predict_proba(x_stack_test)[:, 1] / (N_SPLITS * N_STARTS)
                oof_col[te] += model.predict_proba(x_stack[te])[:, 1] / N_STARTS

    res_lr[col_name] = oof_col
    ss_lr[col_name] = test_col

    # Per-target out-of-fold score, kept for inspection; `labels` makes the
    # call valid even when a target column holds a single class.
    score = log_loss(targets, oof_col, labels = [0, 1])

In [None]:
# Mean column-wise log loss of the out-of-fold logistic-regression predictions.
print(f'LR OOF Metric: {log_loss_metric(train_targets.values, res_lr.values)}')

# Submit

In [None]:
# Force every prediction of the control rows (cp_type == 1, i.e. ctl_vehicle)
# to zero, then write the CatBoost-only submission.
is_control = test['cp_type'] == 1
ss.loc[is_control, train_targets.columns] = 0
ss.to_csv('submission_cat.csv', index = False)

In [None]:
# Same zeroing of the control rows (cp_type == 1) for the stacked
# logistic-regression predictions, written as the main submission file.
control_rows = test['cp_type'] == 1
ss_lr.loc[control_rows, train_targets.columns] = 0
ss_lr.to_csv('submission.csv', index = False)