## About this Competition

Scientists seek to identify a protein target associated with a disease and develop a molecule that can modulate that protein target. As a shorthand to describe the biological activity of a given molecule, scientists assign a label referred to as mechanism-of-action, or MoA for short.

Hence, our task is to use the training dataset to develop an algorithm that automatically labels each case in the test set as one or more MoA classes. Note that since drugs can have multiple MoA annotations, the task is formally a multi-label classification problem.

Based on the MoA annotations, the accuracy of solutions will be evaluated on the average value of the logarithmic loss function applied to each drug-MoA annotation pair.

***train_features.csv*** / ***test_features.csv*** - Features for the training and test sets. 
<br>Features g- signify gene expression data, and 
c- signify cell viability data. 
cp_type indicates samples treated with a compound (trt_cp) or with a control perturbation (ctl_vehicle); control perturbations have no MoAs; 
cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).
<br>***train_targets_scored.csv*** - The binary MoA targets that are scored.
<br>***sample_submission.csv*** - A submission file in the correct format

## References
*  https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/
*  https://www.kaggle.com/yasufuminakama/moa-pytorch-nn-starter
*  https://www.kaggle.com/nroman/moa-lightgbm-206-models
*  https://www.kaggle.com/fchmiel/xgboost-baseline-multilabel-classification
*  https://www.kaggle.com/kushal1506/moa-pytorch-feature-engineering-0-01846


I would be grateful for any corrections, suggestions or discussion :)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import log_loss

from logging import getLogger, StreamHandler, FileHandler, INFO, Formatter

from time import time
import datetime
import os

import lightgbm as lgb
import gc
import warnings
warnings.simplefilter('ignore')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as f

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the competition tables: train/test features, scored and non-scored
# training targets, and the sample submission.
trainF = pd.read_csv('../input/lish-moa/train_features.csv')
test  = pd.read_csv('../input/lish-moa/test_features.csv')
trainTs = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
trainTn = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
sub = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Print a structural summary of the train and test feature tables.
trainF.info()
print('\n')
test.info()

In [None]:
# Descriptive statistics of the training features, the scored targets and the
# test features.
display(trainF.describe())
display(trainTs.describe())
test.describe()

In [None]:
# Peek at a few rows of every table (training features shown transposed).
display(trainF.head().T)
display(trainTs.sample(6))
display(test.head())
sub.head()

In [None]:
# Sanity check: total number of NaNs in the training feature table.
n_missing = trainF.isna().sum().sum()
print('total missing values in dataset = ', n_missing)

# Categorical features are the columns stored with object dtype.
cat_feat = [name for name, dtype in trainF.dtypes.items() if dtype == 'object']
cat_feat

In [None]:
# Every column of the scored-target table except the row identifier is a label.
target_cols = [name for name in trainTs.columns if name != 'sig_id']

# Attach the labels to the features, joining on the sample id.
train = pd.merge(trainF, trainTs, on='sig_id')
train.shape, trainTs.shape

## Analysing cp- features

In [None]:
# For each treatment feature, plot the total number of positive MoA labels
# observed at every level of that feature.
target_cols = [col for col in trainTs.columns if col != 'sig_id']
c_feats = ['cp_type', 'cp_time', 'cp_dose']
for feat in c_feats:
    col = target_cols + [feat]
    # Sum each target within a level of `feat`, then across all targets.
    c_sumTs = train[col].groupby(feat).sum().sum(axis=1)
    # One bar per level. The former `sns.countplot(c_sumTs)` call counted the
    # sums themselves and drew on the same axes, so it was dropped.
    # Keyword arguments are required by seaborn >= 0.12.
    sns.barplot(x=c_sumTs.index, y=c_sumTs.values)
    plt.xlabel(feat)
    plt.ylabel('positive MoA labels')
    plt.show()

In [None]:
# Total positive labels per cp_type. The columns are selected explicitly: the
# loop variable `col` left over from the plotting cell also carries the last
# plotted feature ('cp_dose', still a string column at this point).
train[target_cols + ['cp_type']].groupby('cp_type').sum().sum(axis=1)

In [None]:
def cat2num(df):
    """Encode the treatment columns of ``df`` as small integers.

    ``cp_time`` (24/48/72) becomes 1/2/3, ``cp_type`` ('trt_cp'/'ctl_vehicle')
    becomes 0/1 and ``cp_dose`` ('D1'/'D2') becomes 0/1. Values missing from a
    mapping become NaN.

    The frame is modified in place and also returned, so both
    ``cat2num(df)`` and ``df = cat2num(df)`` work.
    """
    encodings = {
        'cp_time': {24: 1, 48: 2, 72: 3},
        'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
        'cp_dose': {'D1': 0, 'D2': 1},
    }
    for column, mapping in encodings.items():
        # Plain column assignment replaces the column together with its dtype.
        # ``df.loc[:, column] = ...`` writes into the existing array instead,
        # which on pandas >= 2.0 leaves the string columns as ``object`` dtype
        # and later breaks the conversion to float tensors.
        df[column] = df[column].map(mapping)
    return df
# Encode the treatment columns of both tables (cat2num returns the frame it
# was given).
train = cat2num(train)
test = cat2num(test)

In [None]:
print('Number of different labels:', len(target_cols))

# Keep a full copy of the encoded test set before control rows are removed.
X_test = test.copy()

# Drop control-vehicle rows (cp_type == 1) from both sets and renumber rows.
train = train.loc[train['cp_type'] != 1].reset_index(drop=True)
test = test.loc[test['cp_type'] != 1].reset_index(drop=True)

# Model inputs: every train column that is not a column of the target table.
num_feat = [name for name in train.columns if name not in trainTs.columns]
targets = train[target_cols].values
print(train.shape, test.shape, targets.shape)

## Feature engineering

In [None]:
# Row-wise summary statistics over the gene-expression (g-) and cell-viability
# (c-) columns, computed once on train + test and appended as model inputs.
df = pd.concat([train[num_feat], test[num_feat]], axis=0)

# Select the two feature families by name prefix rather than by hard-coded
# column positions, so the cell does not depend on the exact column layout.
features_g = [c for c in num_feat if c.startswith('g-')]
features_c = [c for c in num_feat if c.startswith('c-')]
half_g = len(features_g) // 2
half_c = len(features_c) // 2

# Every statistic is collected here; `gc_fe` is derived from the keys so that
# a computed feature can no longer be left out of the list by accident
# (previously g_std and the four c_initials/c_finals statistics were computed
# and then silently discarded).
new_feats = {}

stat_groups = {'g': features_g, 'c': features_c, 'gc': features_g + features_c}
for prefix, cols in stat_groups.items():
    block = df[cols]
    new_feats[f'{prefix}_sum'] = block.sum(axis=1).values
    new_feats[f'{prefix}_mean'] = block.mean(axis=1).values
    new_feats[f'{prefix}_std'] = block.std(axis=1).values
    new_feats[f'{prefix}_kurt'] = block.kurtosis(axis=1).values
    new_feats[f'{prefix}_skew'] = block.skew(axis=1).values

# Mean / std of the first and second half of each feature family.
half_groups = {
    'g_initials': features_g[:half_g],
    'g_finals': features_g[half_g:],
    'c_initials': features_c[:half_c],
    'c_finals': features_c[half_c:],
}
for prefix, cols in half_groups.items():
    new_feats[f'{prefix}_mean'] = df[cols].mean(axis=1).values
    new_feats[f'{prefix}_std'] = df[cols].std(axis=1).values

gc_fe = list(new_feats)

# `df` stacks train on top of test, so split the arrays positionally. Plain
# arrays are assigned to avoid index alignment on the duplicated row labels.
n_train = train.shape[0]
for name, values in new_feats.items():
    train[name] = values[:n_train]
    test[name] = values[n_train:]
num_feat = num_feat + gc_fe

del df, new_feats

In [None]:
def get_logger(filename='log'):
    """Return the module logger, writing to the console and ``<filename>.log``.

    The function is safe to call repeatedly (for example when a notebook cell
    is re-run): a console handler and a file handler for a given path are
    attached at most once, so log lines are not duplicated.

    Args:
        filename: Log file path without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` registered under this module's ``__name__``.
    """
    import os
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler subclasses StreamHandler, hence the exact type check.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)

    # FileHandler stores the absolute path of its file in `baseFilename`.
    log_path = os.path.abspath(f"{filename}.log")
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)

    return logger

# Notebook-wide logger; with the default argument the log file is "log.log".
logger = get_logger()

In [None]:
def seed_everything(seed = 42):
    """Seed every random number generator the notebook relies on.

    Covers Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    # The original seeded NumPy twice and never seeded the `random` module.
    random.seed(seed)
    # Exported for child processes; it does not affect the running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuned kernel selection can differ between runs, so keep it off.
    torch.backends.cudnn.benchmark = False
    
# Seed the generators once for the cells that follow.
seed_everything(7)

In [None]:
import sys
sys.path.append('/kaggle/input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

## CV Split

In [None]:
# Assign every training row to one of five validation folds, stratified on the
# full multi-label target matrix.
folds = train.copy()
splitter = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold_id, (_, holdout_rows) in enumerate(splitter.split(folds, folds[target_cols])):
    folds.loc[holdout_rows, 'fold'] = int(fold_id)
# The column is created as float by the partial assignments above.
folds['fold'] = folds['fold'].astype('int')

In [None]:
class TrainDataset(Dataset):
    """Dataset yielding ``(features, label)`` pairs of float tensors.

    Args:
        df: Frame holding the feature columns.
        num_features: Names of the columns used as model inputs.
        labels: Array-like of labels, indexable by row position.
    """

    def __init__(self, df, num_features, labels):
        self.labels = labels
        # Extract the selected columns once as a NumPy array.
        self.cont_values = df[num_features].values

    def __len__(self):
        return self.cont_values.shape[0]

    def __getitem__(self, idx):
        features = torch.FloatTensor(self.cont_values[idx])
        target = torch.tensor(self.labels[idx]).float()
        return features, target
    

class TestDataset(Dataset):
    """Dataset yielding feature tensors only (no labels).

    Args:
        df: Frame holding the feature columns.
        num_features: Names of the columns used as model inputs.
    """

    def __init__(self, df, num_features):
        # Extract the selected columns once as a NumPy array.
        self.cont_values = df[num_features].values

    def __len__(self):
        return self.cont_values.shape[0]

    def __getitem__(self, idx):
        row = self.cont_values[idx]
        return torch.FloatTensor(row)

In [None]:
class CFG:
    """Settings shared by the neural-network training cells."""

    # --- data -------------------------------------------------------------
    num_features = num_feat        # input column names
    target_cols = target_cols      # label column names

    # --- architecture -----------------------------------------------------
    hidden_size = 512
    dropout = 0.5

    # --- optimisation -----------------------------------------------------
    lr = 1e-2
    weight_decay = 1e-6
    batch_size = 32
    epochs = 20
    max_grad_norm = 5
    gradient_accumulation_steps = 1

In [None]:
class TabularNN(nn.Module):
    """Two-hidden-layer MLP that returns one logit per target column.

    ``cfg`` must provide ``num_features``, ``target_cols``, ``hidden_size``
    and ``dropout``.
    """

    def __init__(self, cfg):
        super().__init__()
        n_inputs = len(cfg.num_features)
        n_outputs = len(cfg.target_cols)
        width = cfg.hidden_size

        # Modules are created in the same order as the flat definition they
        # replace, so parameter names and initialisation order are unchanged.
        layers = []
        for in_dim in (n_inputs, width):
            layers += [
                nn.Linear(in_dim, width),
                nn.BatchNorm1d(width),
                nn.Dropout(cfg.dropout),
                nn.PReLU(),
            ]
        layers.append(nn.Linear(width, n_outputs))
        self.mlp = nn.Sequential(*layers)

    def forward(self, cont_x):
        return self.mlp(cont_x)

In [None]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        train_loader: Iterable of ``(features, labels)`` batches.
        model: Network returning logits.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Current epoch index (unused; kept for call compatibility).
        scheduler: Learning-rate scheduler stepped once per optimizer update.
        device: Device the batches are moved to.

    Returns:
        Mean binary cross-entropy over all samples seen in this epoch.
    """
    losses = AverageMeter()
    model.train()
    criterion = nn.BCEWithLogitsLoss()
    accumulation_steps = CFG.gradient_accumulation_steps

    # Start from clean gradients in case a previous epoch ended mid-accumulation.
    optimizer.zero_grad()

    for step, (cont_x, y) in enumerate(train_loader):
        cont_x, y = cont_x.to(device), y.to(device)
        batch_size = cont_x.size(0)
        pred = model(cont_x)
        loss = criterion(pred, y)
        # Track the unscaled loss so the reported value ignores accumulation.
        losses.update(loss.item(), batch_size)

        if accumulation_steps > 1:
            loss = loss / accumulation_steps
        loss.backward()

        if (step + 1) % accumulation_steps == 0:
            # Clip the fully accumulated gradient right before the update.
            # clip_grad_norm_ is the in-place successor of clip_grad_norm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            # PyTorch expects optimizer.step() before scheduler.step();
            # the reverse order skips the first value of the schedule.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    return losses.avg

def validate_fn(valid_loader, model, device):
    """Evaluate ``model`` on a validation loader.

    Args:
        valid_loader: Iterable of ``(features, labels)`` batches.
        model: Network returning logits.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, val_preds)``: the sample-weighted mean binary
        cross-entropy and a NumPy array of sigmoid probabilities, with rows in
        loader order.
    """
    losses = AverageMeter()
    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    val_preds = []

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for cont_x, y in valid_loader:
            cont_x, y = cont_x.to(device), y.to(device)
            batch_size = cont_x.size(0)
            pred = model(cont_x)
            loss = criterion(pred, y)
            # Accumulate the loss over all batches, weighted by batch size.
            losses.update(loss.item(), batch_size)
            val_preds.append(pred.sigmoid().cpu().numpy())

    # The former gradient-accumulation rescaling was removed: its result was
    # never used and it referenced a misspelled CFG attribute.
    val_preds = np.concatenate(val_preds)

    return losses.avg, val_preds

def inference_fn(test_loader, model, device):
    """Return sigmoid probabilities for every batch of ``test_loader``.

    Args:
        test_loader: Iterable of feature batches (no labels).
        model: Network returning logits.
        device: Device the batches are moved to.

    Returns:
        NumPy array with the per-batch predictions concatenated in loader
        order.
    """
    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for features in test_loader:
            logits = model(features.to(device))
            batch_outputs.append(logits.sigmoid().detach().cpu().numpy())

    return np.concatenate(batch_outputs)


class AverageMeter(object):
    """Running weighted average that also remembers the latest value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def run_single_nn(cfg, train, test, folds, num_features, target, device, fold_num=0, seed=42):
    """Train the tabular NN on one CV fold and predict the test set.

    The weights of the epoch with the lowest validation loss are saved to
    ``fold{fold_num}_seed{seed}.pth`` and reloaded for test inference.

    Args:
        cfg: Configuration object (batch size, learning rate, epochs, ...).
        train: Training frame; its row labels must match those of ``folds``.
        test: Test frame.
        folds: Frame with a ``fold`` column assigning each row to a fold.
        num_features: Names of the input columns.
        target: Label array with rows aligned to ``train``.
        device: Device used for training and inference.
        fold_num: Fold held out for validation.
        seed: Random seed applied before the loaders and model are built.

    Returns:
        Tuple ``(oof, predictions)``. ``oof`` has one row per training sample
        and is zero outside the validation rows of this fold;
        ``predictions`` holds the test-set probabilities.

    Raises:
        RuntimeError: If no epoch produced a finite validation loss, so that
            no checkpoint was written.
    """
    # Set seed
    logger.info(f'Set seed {seed}')
    seed_everything(seed=seed)

    # loader
    trn_idx = folds[folds['fold'] != fold_num].index
    val_idx = folds[folds['fold'] == fold_num].index
    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)
    train_target = target[trn_idx]
    valid_target = target[val_idx]
    train_dataset = TrainDataset(train_folds, num_features, train_target)
    valid_dataset = TrainDataset(valid_folds, num_features, valid_target)
    train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True,
                              num_workers=4, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=False,
                              num_workers=4, pin_memory=True, drop_last=False)

    # model
    model = TabularNN(cfg)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    # The peak rate follows cfg.lr instead of a second hard-coded constant.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=cfg.lr, epochs=cfg.epochs,
                                              steps_per_epoch=len(train_loader))

    checkpoint_path = f"fold{fold_num}_seed{seed}.pth"

    # train & validate
    best_loss = np.inf
    # Allocated up front so it exists even if no epoch improves on best_loss.
    oof = np.zeros((len(train), len(cfg.target_cols)))
    for epoch in range(cfg.epochs):
        train_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, device)
        valid_loss, val_preds = validate_fn(valid_loader, model, device)
        # Replaces the per-epoch DataFrame.append() log, which was never read
        # and fails on pandas >= 2.0. Hidden at the default INFO level.
        logger.debug('epoch %d train_loss %s valid_loss %s', epoch, train_loss, valid_loss)

        if valid_loss < best_loss:
            logger.info(f'epoch{epoch} save best model... {valid_loss}')
            best_loss = valid_loss
            oof[val_idx] = val_preds
            torch.save(model.state_dict(), checkpoint_path)

    if not np.isfinite(best_loss):
        # Without this guard a stale checkpoint from an earlier run could be
        # loaded silently.
        raise RuntimeError(
            f'fold {fold_num}, seed {seed}: validation loss never became finite; '
            'no checkpoint was saved'
        )

    # predictions
    test_dataset = TestDataset(test, num_features)
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False,
                             num_workers=4, pin_memory=True)
    model = TabularNN(cfg)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)
    predictions = inference_fn(test_loader, model, device)

    # del
    torch.cuda.empty_cache()

    return oof, predictions


def run_kfold_nn(cfg, train, test, folds, num_features, target, device, n_fold=5, seed=42):
    """Train one network per fold and combine their outputs.

    Out-of-fold predictions are summed across folds (each fold fills only its
    own validation rows); test predictions are averaged over the folds. The
    mean column-wise log loss of the out-of-fold predictions is logged.

    Returns:
        Tuple ``(oof, predictions)``.
    """
    n_targets = len(cfg.target_cols)
    oof = np.zeros((len(train), n_targets))
    predictions = np.zeros((len(test), n_targets))

    for fold_id in range(n_fold):
        logger.info("Fold {}".format(fold_id))
        fold_oof, fold_pred = run_single_nn(
            cfg, train, test, folds, num_features, target, device,
            fold_num=fold_id, seed=seed,
        )
        oof += fold_oof
        predictions += fold_pred / n_fold

    # Average the per-target log loss over all target columns.
    n_cols = target.shape[1]
    score = 0
    for col in range(n_cols):
        score += log_loss(target[:, col], oof[:, col]) / n_cols
    logger.info(f"CV score: {score}")

    return oof, predictions

In [None]:
# Repeat the 5-fold run with several seeds and average the results.
SEED = [0, 1, 2]
n_seeds = len(SEED)
n_targets = len(CFG.target_cols)

oof = np.zeros((len(train), n_targets))
predictions = np.zeros((len(test), n_targets))

for seed in SEED:
    seed_oof, seed_pred = run_kfold_nn(
        CFG, train, test, folds, num_feat, targets, device,
        n_fold=5, seed=seed,
    )
    oof += seed_oof / n_seeds
    predictions += seed_pred / n_seeds

# Mean column-wise log loss of the seed-averaged out-of-fold predictions.
score = 0
for i in range(targets.shape[1]):
    score += log_loss(targets[:, i], oof[:, i]) / targets.shape[1]
logger.info(f'saved average CV score: {score}')

In [None]:
# NN out-of-fold predictions keyed by sample id.
train1 = train[['sig_id']].copy()
train1[target_cols] = oof
train1[['sig_id'] + target_cols].to_csv('oof.csv', index=False)

# NN test predictions keyed by sample id.
test1 = test[['sig_id']].copy()
test1[target_cols] = predictions
test1[['sig_id'] + target_cols].to_csv('pred.csv', index=False)

In [None]:
# Score on the complete training set: rows removed as 'ctl_vehicle' have no
# NN prediction, so the left merge leaves them empty and they are filled with 0.
nn_oof = train1[['sig_id'] + target_cols]
result = trainTs.drop(columns=target_cols).merge(nn_oof, on='sig_id', how='left').fillna(0)

y_true = trainTs[target_cols].values
y_pred = result[target_cols].values
n_scored = y_true.shape[1]

score = 0
for j in range(n_scored):
    score += log_loss(y_true[:, j], y_pred[:, j]) / n_scored
logger.info(f"Final result: {score}")

del result, y_true, y_pred
gc.collect()

In [None]:
# Put the NN predictions into the submission layout; ids without a prediction
# get 0 for every target.
nn_test_preds = test1[['sig_id'] + target_cols]
sub = sub.drop(columns=target_cols)
sub = sub.merge(nn_test_preds, on='sig_id', how='left').fillna(0)
sub.to_csv('submission1.csv', index=False)
sub.head()

In [None]:
# LightGBM settings shared by every per-target binary model.
params = {
    # task
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    # tree shape
    'num_leaves': 491,
    'max_depth': 3,
    'min_child_weight': 0.03,
    'min_data_in_leaf': 106,
    # sampling
    'feature_fraction': 0.3,
    'bagging_fraction': 0.4,
    'bagging_seed': 11,
    # regularisation and learning rate
    'reg_alpha': 0.4,
    'reg_lambda': 0.6,
    'learning_rate': 0.01,
    # runtime
    'verbosity': 0,
    'save_binary': True,
    'num_threads': 4,
    'random_state': 47,
}

In [None]:

# One LightGBM binary model per target, trained with 3-fold stratified CV.
# Collects out-of-fold predictions (train2) and fold-averaged test
# predictions (test2), optionally zeroing very small probabilities when that
# lowers the out-of-fold log loss.
accumulative_loss = 0
skf = StratifiedKFold(n_splits = 3, random_state= 47, shuffle= True)
n_splits = skf.get_n_splits()
print('Execution time | Model number | logloss | new logloss | best coeff')

train2 = train[['sig_id']].copy()
test2 = test[['sig_id']].copy()

train_folds = train[num_feat]
test_feats = test[num_feat]

# Candidate divisors: predictions below mean / coeff are set to 0.
coeffs = [3, 2, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9, 0.8, 0.7]

for model, target in enumerate(target_cols, 1):
    y = train[target]
    start_time = time()

    preds = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])

    for fold_num, (trn_idx, test_idx) in enumerate(skf.split(train_folds, y)):

        trn_data = lgb.Dataset(train_folds.iloc[trn_idx], label = y.iloc[trn_idx])
        val_data = lgb.Dataset(train_folds.iloc[test_idx], label = y.iloc[test_idx])
        # NOTE(review): `verbose_eval` and `early_stopping_rounds` were
        # removed from lgb.train in LightGBM 4.0; on such versions pass
        # callbacks=[lgb.early_stopping(20), lgb.log_evaluation(0)] instead.
        clf = lgb.train(params, trn_data, 10000, valid_sets = val_data,
                        verbose_eval=0, early_stopping_rounds= 20)
        # Save only the best iteration and reload it, so the predictions below
        # use the early-stopped model.
        clf.save_model(f'lgb_{target}_{fold_num}.txt', num_iteration= clf.best_iteration)
        clf = lgb.Booster(model_file = f'lgb_{target}_{fold_num}.txt')
        # The split indices are positions, so index with iloc.
        oof[test_idx] = clf.predict(train_folds.iloc[test_idx])
        # Average over folds; previously the fold predictions were summed,
        # giving values up to n_splits times too large.
        preds += clf.predict(test_feats) / n_splits
    loss = log_loss(y, oof)

    #Hacking the metrics
    # Keep the divisor with the lowest out-of-fold loss. Each candidate is
    # compared with the best loss so far, not just the baseline.
    best_coeff = 0
    best_loss = loss
    best_oof = oof
    for coeff in coeffs:
        new_oof = oof.copy()
        new_oof[new_oof < new_oof.mean() / coeff] = 0
        new_loss = log_loss(y, new_oof)
        if new_loss < best_loss:
            best_coeff = coeff
            best_loss = new_loss
            best_oof = new_oof

    # Apply the chosen threshold to the test predictions exactly once. The
    # former in-loop version divided by best_coeff while it was still 0, which
    # wiped out every prediction.
    if best_coeff:
        preds[preds < preds.mean() / best_coeff] = 0

    # Store the out-of-fold predictions that match the reported best loss.
    train2[f'{target}_2'] = best_oof
    test2[f'{target}_2'] = preds

    accumulative_loss += best_loss
    print('{}\t\t{}\t\t{:.5f}\t\t{:.5f}\t\t{}'.format(str(datetime.timedelta(seconds = time() - start_time))[:-7],
                                                   model, loss, best_loss, best_coeff))
    del preds, oof, start_time, y, loss, best_loss, new_oof, best_oof
    gc.collect()


In [None]:
# Persist the LightGBM out-of-fold (train2) and test (test2) predictions.
train2.to_csv('train2.csv', index = False) 
test2.to_csv('test2.csv', index = False)

In [None]:
# Average over the actual number of targets rather than a hard-coded 206.
print('Overall mean loss: {:.5f}'.format(accumulative_loss / len(target_cols)))

In [None]:
# LightGBM predictions live in the '<target>_2' columns of test2.
target_cols2 = [f'{col}_2' for col in target_cols]
lgb_test_preds = test2[['sig_id'] + target_cols2]
sub1 = sub.drop(columns=target_cols)
sub1 = sub1.merge(lgb_test_preds, on='sig_id', how='left').fillna(0)
sub1.to_csv('submission2.csv', index = False)

## stacking

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier

# NOTE(review): `kf` is not referenced by any later cell visible here (the
# stacking loop iterates `skf`) — confirm whether it is still needed.
kf = KFold(n_splits = 5, random_state= 42, shuffle=True)
n_folds = 5

# Meta-learner for the stacking step, created with default hyper-parameters.
clf = XGBClassifier()

In [None]:
# Level-1 features: NN predictions ('<target>') next to LightGBM predictions
# ('<target>_2'). Both inputs carry a 'sig_id' column; both copies are dropped.
targets = train[target_cols].values
df_train = pd.concat([train1, train2], axis=1).drop(['sig_id'], axis=1)
df_test = pd.concat([test1, test2], axis=1).drop(['sig_id'], axis=1)

In [None]:
#stacking
# For every target, fit the meta-learner on the two base-model predictions
# (NN and LightGBM) and average its test predictions over the CV folds.
accumulative_loss = 0
test3 = np.zeros((test.shape[0], len(target_cols)))
# Must equal the number of splits `skf` yields, otherwise the averaged test
# predictions are scaled down (previously 5 was used with a 3-fold splitter).
n_folds = skf.get_n_splits()
for target_i, target in enumerate(target_cols):
    oof = np.zeros(train.shape[0])
    y = train[target].values
    stack_cols = [target, f'{target}_2']
    X = df_train[stack_cols].values
    X_test2 = df_test[stack_cols].values
    for trn_idx, test_idx in skf.split(X, y):
        clf.fit(X[trn_idx], y[trn_idx])
        # Column 1 of predict_proba holds the positive-class probability.
        oof[test_idx] = clf.predict_proba(X[test_idx])[:, 1].astype('float64')
        test3[:, target_i] += clf.predict_proba(X_test2)[:, 1] / n_folds
    # Log loss of the pooled out-of-fold predictions for this target.
    oof_loss = log_loss(y, oof)
    accumulative_loss += oof_loss
    print(f'loss {target}', oof_loss)

    gc.collect()
    

In [None]:
# Average over the actual number of targets rather than a hard-coded 206.
print('overall mean loss =', accumulative_loss / len(target_cols))

In [None]:
# Build the final submission by joining the stacked predictions on sig_id,
# like the earlier submission cells. Rows of `test3` follow `test` (control
# rows already removed); ids missing from `test` are control perturbations and
# get 0 for every target. This replaces the boolean-Series `.iloc` indexing,
# which pandas rejects and which also relied on row order.
stacked = pd.DataFrame(test3, columns=target_cols)
stacked.insert(0, 'sig_id', test['sig_id'].values)
sub3 = sub[['sig_id']].merge(stacked, on='sig_id', how='left').fillna(0)
# Keep the column order of the submission template.
sub3 = sub3[sub.columns]
sub3.to_csv('submission.csv', index = False)

In [None]:
# Preview the first rows of the final submission.
sub3.head()