I took the name **self-stacking** from [@Yirun Zhang](https://www.kaggle.com/gogo827jz) as the idea is similar but not exactly the same.

I also took some of the code from https://www.kaggle.com/sishihara/moa-lgbm-benchmark.

## Main Idea:
* Train the targets that contain lots of positive samples (=1) first
* Save the OOF (out-of-fold) predictions and add them as features when training the targets with fewer positive samples

## Results:
* With a slight twist of this code you can get a CV around `0.015` and LB around `0.019` with LightGBM

## Imports

In [None]:
reset -fs

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
import lightgbm as lgb
import optuna
import warnings
from tqdm import tqdm
import time
from sklearn.decomposition import TruncatedSVD, PCA
warnings.filterwarnings('ignore')

In [None]:
# Read every competition table in one pass; the names on the left line up
# one-to-one, in order, with the paths on the right.
(
    train,
    test,
    train_targets_scored,
    train_targets_nonscored,
    train_drug,
    sub,
) = map(
    pd.read_csv,
    (
        "../input/lish-moa/train_features.csv",
        "../input/lish-moa/test_features.csv",
        "../input/lish-moa/train_targets_scored.csv",
        "../input/lish-moa/train_targets_nonscored.csv",
        '../input/lish-moa/train_drug.csv',
        "../input/lish-moa/sample_submission.csv",
    ),
)

## Data Prep

In [None]:
# cp_time is cast to str so it can later be concatenated with the other
# cp_* columns when the combined categorical features are built.
train["cp_time"] = train["cp_time"].astype(str)
test["cp_time"] = test["cp_time"].astype(str)

# Labels of the columns whose name contains 'g-' (gs) and 'c-' (cs); these
# two groups are the inputs of the per-row summary statistics.
gs = train.filter(regex='g-').columns
cs = train.filter(regex='c-').columns

In [None]:
def feature_engineering(train, test, n_components=117, n_quantiles=500, random_state=42):
    """Add PCA, rank-gauss, combined-categorical and row-statistic features.

    Both frames are modified in place and also returned. Every transformer
    is fitted on ``train`` only and then applied to ``test``.

    Steps, in order:
      1. Standard-scale the columns from position 4 onward and append the
         PCA components as ``pca1`` .. ``pcaN``.
      2. Replace every column from position 4 onward (PCA columns included)
         with its quantile-transformed ("rank gauss") version.
      3. Add ``combined_cat0`` .. ``combined_cat3`` by string-concatenating
         ``cp_type`` / ``cp_time`` / ``cp_dose``.
      4. Add per-row mean / median / max / min / 25% / 75% quantile over the
         ``gs`` and ``cs`` column groups.

    Args:
        train: training feature frame.
        test: test feature frame with the same columns as ``train``.
        n_components: number of PCA components to append.
        n_quantiles: ``n_quantiles`` passed to ``QuantileTransformer``.
        random_state: seed shared by PCA and ``QuantileTransformer``.

    Returns:
        The ``(train, test)`` pair (the same objects that were passed in).

    Note:
        Reads the module-level ``gs`` and ``cs`` column indexes.
    """
    # scaling
    scaler = preprocessing.StandardScaler()
    train_n = scaler.fit_transform(train.iloc[:, 4:])
    test_n = scaler.transform(test.iloc[:, 4:])

    # pca
    pca = PCA(n_components=n_components, random_state=random_state)
    train_pca = pca.fit_transform(train_n)
    test_pca = pca.transform(test_n)
    for i in range(train_pca.shape[1]):
        train[f'pca{i+1}'] = train_pca[:, i]
        test[f'pca{i+1}'] = test_pca[:, i]

    # rank gauss: one transformer per column, fitted on train only
    for col in train.columns[4:]:
        transformer = QuantileTransformer(n_quantiles=n_quantiles,
                                          random_state=random_state,
                                          output_distribution="normal")
        train[col] = transformer.fit_transform(
            train[col].to_numpy().reshape(-1, 1)).ravel()
        test[col] = transformer.transform(
            test[col].to_numpy().reshape(-1, 1)).ravel()

    # categorical variable concat
    for frame in (train, test):
        frame['combined_cat0'] = frame.cp_type + frame.cp_time
        frame['combined_cat1'] = frame.cp_type + frame.cp_dose
        frame['combined_cat2'] = frame.cp_time + frame.cp_dose
        frame['combined_cat3'] = frame.cp_type + frame.cp_dose + frame.cp_time

    # other features: per-row summary statistics of each column group.
    # Each frame's statistics are computed from that frame's own rows; the
    # previous code filled most test columns from train, which index
    # alignment turned into the first len(test) train rows.
    row_stats = (
        ('mean', lambda block: block.mean(axis=1)),
        ('median', lambda block: block.median(axis=1)),
        ('max', lambda block: block.max(axis=1)),
        ('min', lambda block: block.min(axis=1)),
        ('q25', lambda block: block.quantile(0.25, axis=1)),
        ('q75', lambda block: block.quantile(0.75, axis=1)),
    )
    column_groups = (('g', gs), ('c', cs))
    # Statistic-major order keeps the resulting column order unchanged:
    # mean_g, mean_c, median_g, median_c, ...
    for stat_name, stat in row_stats:
        for suffix, cols in column_groups:
            for frame in (train, test):
                frame[f'{stat_name}_{suffix}'] = stat(frame[cols])
    return train, test

def label_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_cols):
    """Label-encode ``encode_cols`` consistently across train and test.

    The two frames are stacked so that each column gets a single encoder
    fitted on the union of its train and test values, then split back at the
    original train length. ``sig_id`` is dropped from both results. The
    input frames are not modified.

    Args:
        train: training frame; must contain ``sig_id``.
        test: test frame; must contain ``sig_id``.
        encode_cols: iterable of column names to replace by integer codes.

    Returns:
        ``(train, test)`` as new frames, each with a fresh 0-based index.

    Raises:
        KeyError: if ``sig_id`` is missing from the stacked frame.
    """
    n_train = len(train)
    combined = pd.concat([train, test], sort=False).reset_index(drop=True)
    for col in encode_cols:
        try:
            encoder = preprocessing.LabelEncoder()
            combined[col] = encoder.fit_transform(list(combined[col].values))
        except Exception as err:
            # Stay best-effort (the column is left as-is) but say why, and
            # no longer swallow KeyboardInterrupt / SystemExit.
            print(f'label_encoding: could not encode column {col!r}: {err}')
    # drop id; drop() returns new frames, so nothing is mutated through a
    # slice of `combined`.
    train = combined.iloc[:n_train].drop(columns=['sig_id'])
    test = combined.iloc[n_train:].reset_index(drop=True).drop(columns=['sig_id'])
    return train, test

In [None]:
# Build the engineered features first, then integer-encode every categorical
# column (raw and combined) and drop sig_id.
train, test = feature_engineering(train, test)
train, test = label_encoding(
    train,
    test,
    [
        'cp_type',
        'cp_dose',
        'combined_cat0',
        'combined_cat1',
        'combined_cat2',
        'combined_cat3',
        'cp_time',
    ],
)

## LGB Setup

In [None]:
def run_lgbm(target_col):
    """Train 5-fold LightGBM models for one target column.

    Reads the module-level ``train``, ``test``, ``train_targets_scored``,
    ``train_drug`` and ``params``.

    Folds are stratified on a composite key built from ``combined_cat3``,
    the target value and the drug id.

    Args:
        target_col: name of a column of ``train_targets_scored``.

    Returns:
        ``(oof_train, test_pred)``: the out-of-fold predictions for every
        training row, and the test predictions averaged over the folds.
    """
    X_train = train
    y_train = train_targets_scored[target_col]
    X_test = test

    # Composite stratification key. Only its values are needed by the
    # splitter, so the feature frame is no longer copied for every target.
    strat_key = (train.combined_cat3.astype(str)
                 + y_train.astype(str)
                 + train_drug.drug_id.astype(str).values).values

    y_preds = []
    oof_train = np.zeros((len(X_train),))

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold_id, (train_index, valid_index) in enumerate(kf.split(X=X_train, y=strat_key)):
        print(f'-----------running fold {fold_id} for {target_col}-----------')
        print('*'*50)
        # kf.split yields positional indices, so select by position rather
        # than by label.
        X_tr = X_train.iloc[train_index]
        X_val = X_train.iloc[valid_index]
        y_tr = y_train.iloc[train_index]
        y_val = y_train.iloc[valid_index]
        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases; newer releases expect
        # callbacks instead - confirm against the installed version.
        model = lgb.train(params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_eval],
                          verbose_eval=-1,
                          num_boost_round=5000, # 5000
                          early_stopping_rounds=100
                         )

        oof_train[valid_index] = model.predict(X_val,
                                               num_iteration=model.best_iteration)
        y_preds.append(model.predict(X_test,
                                     num_iteration=model.best_iteration))

    return oof_train, sum(y_preds) / len(y_preds)

In [None]:
## things to change here
# LightGBM settings shared by every per-target model.
# NOTE(review): a tree limited to max_depth=2 can have at most 4 leaves, so
# num_leaves=19 presumably never binds - confirm before tuning it.
params = {
    "num_leaves": 19,
    "max_depth": 2,
    "lambda_l1": 1,
    "lambda_l2": 1,
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.01,
    "verbosity": -1,
    "feature_fraction": 0.6,
}

# Out-of-fold prediction table: starts as a copy of the scored targets and
# has each target column overwritten as that target is trained.
oof = train_targets_scored.copy()

## Stage 1: Training 'Easy' Target, more positive samples

In [None]:
# Number of targets, ranked by how many positive samples they have, that are
# trained first as the 'easy' group; all remaining targets form the 'hard'
# group.
top_k = 75

In [None]:
# Rank the targets (every column after sig_id) by positive-sample count,
# most frequent first, and cut the ranking once at top_k.
easy_tar, hard_tar = np.split(
    train_targets_scored.iloc[:, 1:]
    .sum(axis=0)
    .sort_values(ascending=False)
    .index.values,
    [top_k],
)
assert len(easy_tar) + len(hard_tar) == 206

In [None]:
# Stage 1: fit the frequent targets; store their out-of-fold predictions in
# `oof` and their fold-averaged test predictions in `sub`.
print(f'Training {top_k} easy targets.....')
start_time = time.time()
for target_col in tqdm(easy_tar):
    if target_col == "sig_id":
        continue
    oof[target_col], sub[target_col] = run_lgbm(target_col)
end_time = time.time()
print('*' * 100)
print(f'Total time for training Easy Targets: {(end_time- start_time)/60:.1f} minutes. Kaggle CPU sucks.')

## Stage 2: Training 'hard' targets, fewer positive samples

In [None]:
# Stage 2 inputs: widen the feature frames with the stage-1 outputs - the
# out-of-fold predictions for train, the fold-averaged predictions for test.
train = pd.concat([train, oof.loc[:, easy_tar]], axis="columns")
test = pd.concat([test, sub.loc[:, easy_tar]], axis="columns")

In [None]:
# Stage 2: fit the remaining targets on the augmented feature frames.
print(f'Training {206-top_k} hard targets.....')
start_time = time.time()
for target_col in tqdm(hard_tar):
    if target_col == "sig_id":
        continue
    oof[target_col], sub[target_col] = run_lgbm(target_col)
end_time = time.time()
print('*' * 100)
print(f'Total time for training hard targets: {(end_time- start_time)/60:.1f} minutes')

## OOF Check

In [None]:
# Per-target log loss of the out-of-fold predictions, and their mean.
# Each loss is computed once and reused for both the dict and the list.
score_for_each_tar = {}
for target_col in train_targets_scored.columns:
    if target_col == "sig_id":
        continue
    score_for_each_tar[target_col] = log_loss(train_targets_scored[target_col],
                                              oof[target_col])
# dicts preserve insertion order, so this matches the column order.
scores = list(score_for_each_tar.values())
print('Score without PP:')
print(np.mean(scores))

In [None]:
# Post-processing check: zero the predictions of the rows whose encoded
# cp_type is 0 and re-score. Only the target columns are overwritten, so the
# sig_id column of `oof` stays intact.
# NOTE(review): cp_type == 0 is presumably the vehicle/control class after
# label encoding - confirm against the encoder's class order.
# NOTE(review): `sub` does not receive the same post-processing - confirm
# whether the submission should.
pp_target_cols = [col for col in train_targets_scored.columns if col != "sig_id"]
is_vehicle = (train['cp_type'] == 0).to_numpy()
oof.loc[is_vehicle, pp_target_cols] = 0
scores_pp = [log_loss(train_targets_scored[col], oof[col]) for col in pp_target_cols]
print('Score when setting vehicle equal to 0:')
print(np.mean(scores_pp))

## Submission

In [None]:
# Persist the averaged test predictions in the sample-submission layout.
print('Writing to submission')
sub.to_csv('submission.csv', index=False)