In [None]:
import sys
# Extend the module search path with a local input directory; presumably this is
# where the `iterstrat` package imported in the next cell lives (verify against the dataset).
sys.path.append('../input/iterstat-proxy')

In [None]:
import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.decomposition import PCA
from tqdm import tqdm

In [None]:
# Load the training features, the scored training targets and the test features.
features = pd.read_csv('../input/lish-moa/train_features.csv')
target = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
submit_features = pd.read_csv('../input/lish-moa/test_features.csv')

# sig_id becomes the index below, so it must identify each row uniquely.
assert features.sig_id.is_unique, 'duplicate sig_id in train features'
assert target.sig_id.is_unique, 'duplicate sig_id in train targets'
assert submit_features.sig_id.is_unique, 'duplicate sig_id in test features'

# Index by sig_id and sort so that row order is deterministic.
features = features.set_index('sig_id').sort_index()
target = target.set_index('sig_id').sort_index()
submit_features = submit_features.set_index('sig_id').sort_index()

# Later cells pair features and targets purely by position (iloc with fold
# indices, boolean masks), so the two tables must contain exactly the same
# sig_ids in the same order. Fail loudly here instead of training on
# silently misaligned labels.
assert features.index.equals(target.index), \
    'train features and train targets do not cover the same sig_ids'

In [None]:
def score(prob, true):
    """Mean binary log loss of predicted probabilities against 0/1 labels.

    Parameters
    ----------
    prob : array-like of float
        Predicted probability of the positive label. Any shape; it is
        flattened before scoring.
    true : array-like
        Binary labels with the same number of elements as ``prob``. Entries
        equal to 1 are treated as positive, everything else as negative.
        Integer and float dtypes are both accepted.

    Returns
    -------
    numpy.float64
        ``-mean(log(p))`` over positive entries and ``log(1 - p)`` over
        negative entries, with ``p`` clipped to ``[1e-15, 1 - 1e-15]``.
    """
    # Clip so that log() never sees exactly 0 or 1.
    prob = np.clip(np.asarray(prob, dtype=float), 1e-15, 1 - 1e-15).reshape(-1)
    true = np.asarray(true).reshape(-1)
    # Select the log-likelihood term that matches each label. Unlike integer
    # fancy indexing this works for float labels and for inputs of any rank.
    log_likelihood = np.where(true == 1, np.log(prob), np.log(1 - prob))
    return -log_likelihood.mean()

In [None]:
def features_to_matrix(f):
    """Return the feature frame as a plain array without the cp_* columns.

    The ``cp_type``, ``cp_time`` and ``cp_dose`` columns are dropped (a
    missing column raises ``KeyError``); every remaining column is kept in
    its original order. The input frame is not modified.
    """
    meta_columns = ['cp_type', 'cp_time', 'cp_dose']
    measurements = f.drop(columns=meta_columns)
    return measurements.values

def target_to_matrix(t):
    """Return the underlying array of the target frame ``t`` (its ``.values``)."""
    label_matrix = t.values
    return label_matrix

In [None]:
import warnings


class PCAProjectorSingle:
    """Single-target model based on PCA reconstruction error.

    A PCA is fitted on the rows whose label equals 1. A sample is then scored
    as ``exp(-alpha * d)``, where ``d`` is the mean squared difference between
    the sample and its PCA round trip (``transform`` then
    ``inverse_transform``). If no row had label 1 during fitting, every
    sample gets ``d = inf``.
    """

    def __init__(self, alpha=1):
        # Scale applied to the reconstruction error inside the exponential.
        self.alpha = alpha

    def name(self):
        """Return a label identifying the model and its ``alpha``."""
        template = 'PCA(alpha={})'
        return template.format(self.alpha)

    def fit(self, x, y):
        """Fit a PCA on the rows of ``x`` where ``y == 1``; return ``self``.

        ``self.pca`` is set to ``None`` when there are no such rows.
        """
        positives = x[y == 1]
        if len(positives) == 0:
            self.pca = None
            return self
        # RuntimeWarnings raised while fitting are silenced (presumably
        # triggered by very small positive sets - TODO confirm).
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=RuntimeWarning)
            self.pca = PCA().fit(positives)
        return self

    def predict_proba(self, x):
        """Return ``exp(-alpha * d)`` for every row of ``x`` as a 1-D array."""
        if self.pca is not None:
            round_trip = self.pca.inverse_transform(self.pca.transform(x))
            d = ((round_trip - x) ** 2).mean(axis=1)
        else:
            # No fitted subspace: infinite distance for every row.
            d = np.full(len(x), float('inf'))
        return np.exp(-self.alpha * d)
    
class PCAProjector:
    """Multi-target wrapper holding one ``PCAProjectorSingle`` per target column."""

    def __init__(self, alpha=1):
        # Passed unchanged to every per-target model.
        self.alpha = alpha

    def name(self):
        """Return a label identifying the model and its ``alpha``."""
        template = 'PCA(alpha={})'
        return template.format(self.alpha)

    def fit(self, x, y):
        """Fit one single-target model per column of ``y``; return ``self``."""
        n_targets = y.shape[1]
        self.proj = [
            PCAProjectorSingle(alpha=self.alpha).fit(x, y[:, column])
            for column in range(n_targets)
        ]
        return self

    def predict_proba(self, x):
        """Stack the per-target predictions and transpose the result."""
        per_target = [single.predict_proba(x) for single in self.proj]
        return np.transpose(np.stack(per_target))
    
class Mean:
    """Baseline that predicts the per-column mean of the training labels."""

    def name(self):
        """Return the label of this model."""
        return 'Mean()'

    def fit(self, x, y):
        """Store the column means of ``y``; ``x`` is not used. Returns ``self``."""
        self.mean = y.mean(axis=0)
        return self

    def predict_proba(self, x):
        """Return a ``(len(x), n_columns)`` array repeating the stored means."""
        n_rows = len(x)
        n_columns = len(self.mean)
        blank = np.zeros((n_rows, n_columns))
        # Broadcasting copies the single row of means onto every output row.
        return blank + self.mean[np.newaxis, :]

In [None]:
def get_class(f):
    """Return the ``cp_type`` attribute of ``f``; it is used as the class label of each row."""
    treatment_type = f.cp_type
    return treatment_type
    
def class_to_models(cl):
    """Return fresh, unfitted candidate models for the class label ``cl``.

    ``'ctl_vehicle'`` gets only the ``Mean`` baseline. Every other class gets
    the baseline followed by ten ``PCAProjector`` models whose ``alpha``
    values are evenly spaced from 10 to 20 inclusive.
    """
    candidates = [Mean()]
    if cl != 'ctl_vehicle':
        candidates.extend(PCAProjector(a) for a in np.linspace(10, 20, 10))
    return candidates

In [None]:
# Cross-validated training: for every fold and every cp_type class, fit all
# candidate models on the training part and store their predictions for the
# held-out part.
n_folds = 7
folder = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=91)

# fold index -> class label -> list of fitted candidate models
fold_class_model = dict()

# fold index -> class label -> list of held-out predictions (one per model)
fold_class_model_predict = dict()
# fold index -> class label -> held-out label matrix
fold_class_true = dict()

print('training')
for i, (train_idx, test_idx) in enumerate(folder.split(features, target)):
    print('  [fold {}]'.format(i))
    fold_train = features.iloc[train_idx]
    fold_train_target = target.iloc[train_idx]

    fold_test = features.iloc[test_idx]
    fold_test_target = target.iloc[test_idx]

    fold_class_model[i] = dict()
    fold_class_model_predict[i] = dict()
    fold_class_true[i] = dict()

    classes = get_class(fold_train)
    test_classes = get_class(fold_test)

    print('    fold classes={}'.format(list(classes.unique())))

    for cl in classes.unique():
        print('    fitting class={}'.format(cl))
        class_mask = classes == cl
        # Build the matrices once per class; they are identical for every
        # candidate model, so rebuilding them per model is wasted work.
        train_x = features_to_matrix(fold_train[class_mask])
        train_y = target_to_matrix(fold_train_target[class_mask])
        models = [m.fit(train_x, train_y) for m in class_to_models(cl)]
        fold_class_model[i][cl] = models

    for cl in test_classes.unique():
        print('    testing class={}'.format(cl))
        class_mask = test_classes == cl
        # Same reasoning as above: one held-out matrix shared by all models.
        # A class that was absent from the training part raises KeyError here.
        test_x = features_to_matrix(fold_test[class_mask])
        preds = [m.predict_proba(test_x) for m in fold_class_model[i][cl]]
        fold_class_model_predict[i][cl] = preds
        fold_class_true[i][cl] = target_to_matrix(fold_test_target[class_mask])
        
# Score every candidate model on every target column of the held-out data.
print('comparing models')
# fold index -> class label -> array of shape (n_models, n_targets)
fold_class_model_score = dict()
# fold index -> class label -> number of held-out rows
fold_class_size = dict()
for i in range(n_folds):
    score_by_class = dict()
    size_by_class = dict()
    for cl, model_preds in fold_class_model_predict[i].items():
        true = fold_class_true[i][cl]
        n_targets = true.shape[1]
        score_by_class[cl] = np.array([
            [score(model_pred[:, j], true[:, j]) for j in range(n_targets)]
            for model_pred in model_preds
        ])
        size_by_class[cl] = len(true)
    fold_class_model_score[i] = score_by_class
    fold_class_size[i] = size_by_class

# For every fold, class and target column remember the index of the model
# with the lowest held-out score, and report how often each model wins.
fold_class_best_model = dict()
for i in range(n_folds):
    fold_class_best_model[i] = dict()
    for cl, scores in fold_class_model_score[i].items():
        # Column-wise argmin over the model axis; ties go to the first model.
        winners = list(scores.argmin(axis=0))
        fold_class_best_model[i][cl] = winners
        model_names = np.array([m.name() for m in fold_class_model[i][cl]])
        win_counts = pd.Series(model_names[winners]).value_counts()
        print('  [fold {}, cl {}] {}'.format(i, cl, win_counts))
        
# Combine the per-class scores of each fold, weighting every class by its
# share of the held-out rows.
print('calculating final score')
fold_class_weight = dict()
for i, cl_to_size in fold_class_size.items():
    fold_total = sum(cl_to_size.values())
    fold_class_weight[i] = {cl: s / fold_total for cl, s in cl_to_size.items()}

# NOTE(review): the per-target minimum is taken over scores measured on the
# same held-out rows that are being reported, so this number is optimistic.
fold_scores = []
for i in range(n_folds):
    s = sum(
        fold_class_model_score[i][cl].min(axis=0).mean() * w
        for cl, w in fold_class_weight[i].items()
    )
    fold_scores.append(s)
    print('  [fold {}] score={:.4f}'.format(i, s))
print('  final score = {:.4f}'.format(np.mean(fold_scores)))
    
# Predict the test set with every fold's models, taking for each target
# column the model that won on that fold, then average across folds.
print('generating submit prediction')

# The test matrix and class labels do not depend on the fold, so they are
# built once instead of once per fold.
x = features_to_matrix(submit_features)
submit_classes = get_class(submit_features)

fold_submit_pred = dict()
for i in range(n_folds):
    print('  [fold {}]'.format(i))

    pred = np.zeros((len(x), len(target.columns)))
    for cl in submit_classes.unique():
        print('    predicting class = {}'.format(cl))
        # Plain boolean array so numpy indexing does not depend on how a
        # pandas Series is coerced.
        class_mask = (submit_classes == cl).to_numpy()
        class_x = x[class_mask]

        # One full prediction matrix per candidate model of this class.
        preds = [m.predict_proba(class_x) for m in fold_class_model[i][cl]]

        # Column j comes from the model that scored best on target j.
        pred[class_mask] = np.stack([
            preds[fold_class_best_model[i][cl][j]][:, j]
            for j in range(len(target.columns))
        ]).T

    fold_submit_pred[i] = pd.DataFrame(data=pred, index=submit_features.index, columns=target.columns)

# Unweighted average of the per-fold predictions.
submission = sum(fold_submit_pred.values()) / len(fold_submit_pred)
submission.head()

In [None]:
# Write the averaged predictions; the frame's index is kept as the first column.
submission.to_csv('submission.csv', index=True)