In [None]:
import sys
sys.path.append('../input/iterstat-proxy/')

import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.decomposition import PCA
from tqdm import tqdm
import torch as tc
import pytorch_lightning as pl

In [None]:
def _read_indexed(csv_path):
    """Read one competition CSV and return it indexed and sorted by ``sig_id``.

    Raises AssertionError if any ``sig_id`` value appears more than once.
    """
    frame = pd.read_csv(csv_path)
    assert frame.sig_id.duplicated().sum() == 0
    return frame.set_index('sig_id').sort_index()


# All three frames are sorted by sig_id, so rows of `features` and `target`
# are ordered by the same key.
features = _read_indexed('../input/lish-moa/train_features.csv')
target = _read_indexed('../input/lish-moa/train_targets_scored.csv')
submit_features = _read_indexed('../input/lish-moa/test_features.csv')

In [None]:
def score(prob, true):
    """Mean binary log-loss of ``prob`` against integer 0/1 labels ``true``.

    Probabilities are cast to float64 and clipped to [1e-15, 1 - 1e-15]
    before the logarithm is taken, so exact 0 or 1 never yields an infinity.
    ``true`` is used as an integer index, so it must hold integer labels.
    """
    eps = 1e-15
    clipped = np.clip(prob.astype(np.float64), eps, 1 - eps)

    # Row 0 holds the log-likelihood of label 0, row 1 that of label 1.
    log_likelihood = np.stack([
        np.log(1 - clipped),
        np.log(clipped),
    ])

    # Select, for each position, the row that matches its true label.
    labels = true.reshape(-1)
    positions = np.arange(np.prod(true.shape))
    chosen = log_likelihood[labels, positions]
    return -chosen.mean()

In [None]:
def features_to_matrix(f):
    """Convert a feature frame into a 2-D NumPy matrix.

    The first two columns are numeric encodings of ``cp_time``
    (24/48/72 -> -1/0/1) and ``cp_dose`` (D1/D2 -> -1/1). The remaining
    columns are every other column of ``f`` except ``cp_type``, in frame order.
    Values missing from the encoding tables become NaN (pandas ``map``).
    """
    time_codes = {24: -1, 48: 0, 72: 1}
    dose_codes = {'D1': -1, 'D2': 1}

    encoded = np.column_stack((
        f.cp_time.map(time_codes).values,
        f.cp_dose.map(dose_codes).values,
    ))

    remaining = f.drop(columns=['cp_type', 'cp_time', 'cp_dose']).values
    return np.concatenate((encoded, remaining), axis=1)

def target_to_matrix(t):
    """Return the values of target frame ``t`` as a NumPy array."""
    matrix = t.values
    return matrix

In [None]:
class Mean:
    """Baseline model that predicts the per-target mean of its training labels."""

    def name(self):
        """Return the display name used when models are compared."""
        return 'Mean()'

    def fit(self, x, y, **kwargs):
        """Store the column-wise mean of ``y``; ``x`` and extra kwargs are ignored."""
        self.mean = y.mean(axis=0)
        return self

    def predict_proba(self, x):
        """Return a ``(len(x), n_targets)`` array where every row is the stored mean."""
        n_rows, n_targets = len(x), len(self.mean)
        baseline = np.zeros((n_rows, n_targets))
        # Broadcast the stored mean vector over all rows.
        baseline += self.mean[None, :]
        return baseline

In [None]:
# Display the shape of the training feature matrix as a quick sanity check.
# NOTE(review): the column count presumably has to equal NN's default
# input_dim (874) — confirm against the cell output.
features_to_matrix(features).shape

In [None]:
# Display the shape of the training target matrix as a quick sanity check.
# NOTE(review): the column count presumably has to equal the last entry of
# NN's default hidden_dims (206) — confirm against the cell output.
target_to_matrix(target).shape

In [None]:
class RotationBlock(tc.nn.Module):
    """One fully connected layer followed by an activation.

    ``activation`` is a module class (not an instance); it is instantiated
    with no arguments. Defaults to ``tc.nn.ReLU``.
    """

    def __init__(self, input_dim, output_dim, activation=tc.nn.ReLU):
        super().__init__()

        linear = tc.nn.Linear(input_dim, output_dim)
        nonlinearity = activation()
        self.features = tc.nn.Sequential(linear, nonlinearity)

    def forward(self, x):
        """Apply the linear layer and then the activation to ``x``."""
        out = self.features(x)
        return out
    
class NN(tc.nn.Module):
    """Feed-forward network built from a chain of ``RotationBlock`` layers.

    Parameters
    ----------
    input_dim : int
        Width of the input vectors.
    hidden_dims : iterable of int
        Output width of each successive layer; the last entry is the network's
        output width. Every layer but the last uses ReLU; the last uses
        ``Identity``, so the network outputs raw (un-activated) values.
        Any iterable is accepted (list, tuple, ...). The default is an
        immutable tuple so it cannot be shared or mutated between instances.
    """

    def __init__(self, input_dim=874, hidden_dims=(1024, 512, 206)):
        super().__init__()

        # Materialise once so tuples and generators work as well as lists.
        widths = list(hidden_dims)
        last_idx = len(widths) - 1

        # zip() pairs each layer's input width with its output width; the
        # surplus final entry of the left-hand list is dropped by zip.
        self.features = tc.nn.Sequential(*[
            RotationBlock(
                n_in, n_out,
                activation=tc.nn.ReLU if idx < last_idx else tc.nn.Identity,
            )
            for idx, (n_in, n_out) in enumerate(zip([input_dim] + widths, widths))
        ])

    def forward(self, x):
        """Run ``x`` through every block in order and return the final output."""
        return self.features(x)
    
class NNPL(pl.LightningModule):
    """Lightning wrapper that trains ``NN`` with binary cross-entropy on logits.

    All constructor arguments are forwarded unchanged to ``NN``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

        self.model = NN(*args, **kwargs)
        self.criterion = tc.nn.BCEWithLogitsLoss()

    def forward(self, x):
        """Return sigmoid-activated outputs of the wrapped network."""
        logits = self.model(x)
        return tc.sigmoid(logits)

    def _bce(self, batch):
        """Compute the BCE-with-logits loss for one ``(inputs, labels)`` batch."""
        inputs, labels = batch
        return self.criterion(self.model(inputs), labels)

    def training_step(self, batch, batch_idx):
        """Log the batch loss as ``train_bce`` and return it for backprop."""
        loss = self._bce(batch)
        self.log('train_bce', loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log the batch loss as ``test_bce`` and return it."""
        loss = self._bce(batch)
        self.log('test_bce', loss)

        return loss

    def configure_optimizers(self):
        """Adam over the wrapped network's parameters with lr=1e-4."""
        return tc.optim.Adam(self.model.parameters(), lr=1e-4)
    
class NNSklearn:
    """Adapter exposing ``NNPL`` through the ``name``/``fit``/``predict_proba``
    interface shared with ``Mean``.

    Constructor arguments are forwarded unchanged to ``NNPL``.
    """

    def __init__(self, *args, **kwargs):
        self.nn_pl = NNPL(*args, **kwargs)

    def name(self):
        """Return the display name used when models are compared."""
        return 'NN()'

    @staticmethod
    def _make_loader(x, y, shuffle):
        """Wrap two NumPy arrays as float32 tensors in a batch-128 DataLoader."""
        dataset = tc.utils.data.TensorDataset(
            tc.from_numpy(x).to(tc.float32),
            tc.from_numpy(y).to(tc.float32),
        )
        return tc.utils.data.DataLoader(
            dataset, batch_size=128, num_workers=4, shuffle=shuffle
        )

    def fit(self, x, y, x_val, y_val, fold, cl, **kwargs):
        """Train on ``(x, y)`` and keep the weights with the lowest ``test_bce``.

        Parameters
        ----------
        x, y : numpy arrays with the training inputs and labels.
        x_val, y_val : numpy arrays evaluated by the validation loop; the
            checkpoint callback monitors the ``test_bce`` metric logged there.
        fold, cl : used only to build the checkpoint and logger directory names.
        **kwargs : accepted and ignored.

        Returns
        -------
        self
        """
        # Reshuffle the training set each epoch; DataLoader's default is a
        # fixed order. Validation order does not matter, so it is left as is.
        train_loader = self._make_loader(x, y, shuffle=True)
        val_loader = self._make_loader(x_val, y_val, shuffle=False)

        chk_callback = pl.callbacks.ModelCheckpoint(
            filepath='CHK_classic_nn_{}_{}'.format(cl, fold) + '/NN-{epoch:02d}-{test_bce:.2f}',
            save_top_k=1,
            verbose=True,
            monitor='test_bce',
            mode='min'
        )
        trainer = pl.Trainer(
            gpus=1, max_epochs=48,
            checkpoint_callback=chk_callback,
            logger=pl.loggers.TensorBoardLogger(
                save_dir='classic_nn_{}_{}'.format(cl, fold),
                # The template has a single placeholder, so only `cl` is passed.
                name='classic_nn_{}'.format(cl)
            )
        )
        trainer.fit(self.nn_pl, train_loader, val_loader)

        # Restore the best weights into the existing module. Rebuilding it via
        # NNPL.load_from_checkpoint would call NNPL() without the constructor
        # arguments given to this wrapper (they are not stored in the
        # checkpoint), which breaks any non-default architecture.
        checkpoint = tc.load(chk_callback.best_model_path, map_location='cpu')
        self.nn_pl.load_state_dict(checkpoint['state_dict'])
        self.nn_pl.to('cpu')

        return self

    def predict_proba(self, x):
        """Return sigmoid outputs for NumPy array ``x`` as a NumPy array."""
        self.nn_pl.eval()
        # Run on whatever device the weights live on, then bring the result
        # back to the CPU so the NumPy conversion always succeeds.
        first_param = next(self.nn_pl.parameters(), None)
        device = first_param.device if first_param is not None else tc.device('cpu')
        with tc.no_grad():
            batch = tc.from_numpy(x).to(device=device, dtype=tc.float32)
            return self.nn_pl(batch).cpu().numpy()

In [None]:
def get_class(f):
    """Return the ``cp_type`` column of ``f``, used as the per-row class label."""
    cp_type = f.cp_type
    return cp_type
    
def class_to_models(cl):
    """Return fresh, unfitted candidate models for class label ``cl``.

    ``'ctl_vehicle'`` gets only the ``Mean`` baseline; every other class gets
    three ``NNSklearn`` instances followed by a ``Mean`` baseline.
    """
    if cl != 'ctl_vehicle':
        networks = [NNSklearn() for _ in range(3)]
        return networks + [Mean()]
    return [Mean()]

In [None]:
!rm -r classic_*
!rm -r CHK_classic_*

n_folds = 7
folder = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=91)

# fold index -> class label -> list of fitted models
fold_class_model = dict()

# fold index -> class label -> list of per-model predictions / true labels
# on that fold's held-out rows
fold_class_model_predict = dict()
fold_class_true = dict()

print('training')
for i, (train_idx, test_idx) in enumerate(folder.split(features, target)):
    print('  [fold {}]'.format(i))
    fold_train = features.iloc[train_idx]
    fold_train_target = target.iloc[train_idx]

    fold_test = features.iloc[test_idx]
    fold_test_target = target.iloc[test_idx]

    fold_class_model[i] = dict()
    fold_class_model_predict[i] = dict()
    fold_class_true[i] = dict()

    classes = get_class(fold_train)
    test_classes = get_class(fold_test)

    print('    fold classes={}'.format(list(classes.unique())))

    for cl in classes.unique():
        print('    fitting class={}'.format(cl))
        class_mask = classes == cl
        test_class_mask = test_classes == cl

        # Build the matrices once per class; every candidate model is fitted
        # on the same arrays, so there is no need to rebuild them per model.
        cl_train_x = features_to_matrix(fold_train[class_mask])
        cl_train_y = target_to_matrix(fold_train_target[class_mask])
        cl_val_x = features_to_matrix(fold_test[test_class_mask])
        cl_val_y = target_to_matrix(fold_test_target[test_class_mask])

        models = [
            m.fit(
                x=cl_train_x,
                y=cl_train_y,
                x_val=cl_val_x,
                y_val=cl_val_y,
                fold=i,
                cl=cl
            )
            for m in class_to_models(cl)
        ]
        fold_class_model[i][cl] = models

    for cl in test_classes.unique():
        print('    testing class={}'.format(cl))
        class_mask = test_classes == cl
        cl_test_x = features_to_matrix(fold_test[class_mask])
        preds = [m.predict_proba(cl_test_x) for m in fold_class_model[i][cl]]
        fold_class_model_predict[i][cl] = preds
        fold_class_true[i][cl] = target_to_matrix(fold_test_target[class_mask])
        
print('comparing models')
# fold -> class -> array of shape (n_models, n_targets) with per-target losses
fold_class_model_score = dict()
# fold -> class -> number of held-out rows of that class
fold_class_size = dict()
for i in range(n_folds):
    fold_class_model_score[i] = dict()
    fold_class_size[i] = dict()
    for cl, cl_preds in fold_class_model_predict[i].items():
        true = fold_class_true[i][cl]
        n_cols = true.shape[1]
        scores = [
            [score(pred[:, j], true[:, j]) for j in range(n_cols)]
            for pred in cl_preds
        ]
        fold_class_model_score[i][cl] = np.array(scores)
        fold_class_size[i][cl] = len(true)

# fold -> class -> per-target index of the model with the lowest loss
fold_class_best_model = dict()
for i in range(n_folds):
    fold_class_best_model[i] = dict()
    for cl, scores in fold_class_model_score[i].items():
        fold_class_best_model[i][cl] = list(scores.argmin(axis=0))
        model_names = np.array([m.name() for m in fold_class_model[i][cl]])
        winner_counts = pd.Series(model_names[fold_class_best_model[i][cl]]).value_counts()
        print('  [fold {}, cl {}] {}'.format(i, cl, winner_counts))
        
print('calculating final score')
# fold -> class -> share of that fold's held-out rows belonging to the class
fold_class_weight = dict()
for i, cl_to_size in fold_class_size.items():
    fold_total = sum(cl_to_size.values())
    fold_class_weight[i] = {cl: s / fold_total for cl, s in cl_to_size.items()}

fold_scores = []
for i in range(n_folds):
    # Per class: take the best model per target, average over targets,
    # then weight by the class's share of the fold.
    s = sum(
        fold_class_model_score[i][cl].min(axis=0).mean() * w
        for cl, w in fold_class_weight[i].items()
    )
    fold_scores.append(s)
    print('  [fold {}] score={:.4f}'.format(i, s))
print('  final score = {:.4f}'.format(np.mean(fold_scores)))
    
print('generating submit prediction')

# The test matrix and its class labels do not depend on the fold, so they are
# computed once instead of once per fold.
x = features_to_matrix(submit_features)
submit_classes = get_class(submit_features)
n_targets = len(target.columns)

fold_submit_pred = dict()
for i in range(n_folds):
    print('  [fold {}]'.format(i))

    pred = np.zeros((len(x), n_targets))
    for cl in submit_classes.unique():
        print('    predicting class = {}'.format(cl))
        # Plain boolean array so NumPy indexing does not rely on a pandas Series.
        class_mask = (submit_classes == cl).values
        cl_x = x[class_mask]

        preds = [m.predict_proba(cl_x) for m in fold_class_model[i][cl]]

        # For each target column use the model that scored best on this
        # fold's held-out rows for this class.
        best = fold_class_best_model[i][cl]
        pred[class_mask] = np.stack([
            preds[best[j]][:, j]
            for j in range(n_targets)
        ]).T

    fold_submit_pred[i] = pd.DataFrame(data=pred, index=submit_features.index, columns=target.columns)

# Average the per-fold predictions.
submission = sum(fold_submit_pred.values()) / len(fold_submit_pred)
submission.head()

In [None]:
# Write the fold-averaged predictions to disk. The frame's index is sig_id
# (taken from submit_features), and to_csv writes the index as the first column.
submission.to_csv('submission.csv')