In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from tqdm import tqdm

In [None]:
# Load the training features, the scored training targets and the features
# of the rows that have to be predicted for the submission.
features = pd.read_csv('../input/lish-moa/train_features.csv')
target = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
submit_features = pd.read_csv('../input/lish-moa/test_features.csv')

# sig_id becomes the index below, so it has to identify every row uniquely.
assert features.sig_id.is_unique, 'duplicated sig_id in train_features.csv'
assert target.sig_id.is_unique, 'duplicated sig_id in train_targets_scored.csv'
assert submit_features.sig_id.is_unique, 'duplicated sig_id in test_features.csv'

# Index by sig_id and sort so that every table is in the same row order.
features = features.set_index('sig_id').sort_index()
target = target.set_index('sig_id').sort_index()
submit_features = submit_features.set_index('sig_id').sort_index()

# The training cell pairs feature rows with target rows, so both tables must
# describe exactly the same samples in exactly the same order.
assert features.index.equals(target.index), \
    'train features and train targets do not cover the same sig_ids'

In [None]:
def score(prob, true):
    """Mean binary log loss of predicted probabilities against 0/1 labels.

    Parameters
    ----------
    prob : array-like of float
        Predicted probability of the positive class. Any shape; it is
        flattened before scoring. Values are clipped to
        ``[1e-15, 1 - 1e-15]`` so the logarithm stays finite.
    true : array-like
        Ground-truth labels with the same number of elements as ``prob``.
        Integer, boolean and float encodings of 0/1 are all accepted.

    Returns
    -------
    float
        ``-mean(log(prob))`` over positive elements and ``log(1 - prob)``
        over negative ones (``nan`` for empty input).

    Raises
    ------
    ValueError
        If ``prob`` and ``true`` differ in size, or ``true`` holds a value
        other than 0 or 1.
    """
    # Work in float64 so that the 1e-15 clipping bound is representable.
    p = np.clip(np.asarray(prob, dtype=float).reshape(-1), 1e-15, 1 - 1e-15)
    labels = np.asarray(true).reshape(-1)

    if p.shape != labels.shape:
        raise ValueError(
            'prob and true must have the same number of elements, got {} and {}'.format(
                p.size, labels.size))
    if not np.isin(labels, (0, 1)).all():
        raise ValueError('true must contain only 0/1 labels')

    # Pick log(p) where the label is 1 and log(1 - p) where it is 0.
    log_likelihood = np.where(labels == 1, np.log(p), np.log(1 - p))
    return -log_likelihood.mean()

In [None]:
def features_to_matrix(f):
    """Return the feature frame as an array without the experiment-setup columns.

    The ``cp_type``, ``cp_time`` and ``cp_dose`` columns are dropped from a
    copy of ``f``; the remaining columns are returned via ``.values``.
    """
    setup_columns = ['cp_type', 'cp_time', 'cp_dose']
    measurements = f.drop(columns=setup_columns)
    return measurements.values

def target_to_matrix(t):
    """Return the underlying array (``.values``) of a target Series/DataFrame."""
    matrix = t.values
    return matrix

In [None]:
import warnings

class PCAProjectorSingle:
    """Scores rows by how well a PCA fitted on the positive rows reconstructs them.

    ``fit`` learns a PCA from the rows of ``x`` whose label equals 1.
    ``predict_proba`` maps the mean squared reconstruction error ``d`` of
    each row to ``exp(-alpha * d)``.
    """

    def __init__(self, alpha=1):
        # Multiplier applied to the reconstruction error inside the exponent.
        self.alpha = alpha

    def name(self):
        """Return a printable label that includes the alpha setting."""
        template = 'PCA(alpha={})'
        return template.format(self.alpha)

    def fit(self, x, y):
        """Fit a PCA on the rows of ``x`` where ``y == 1`` and return ``self``.

        When no row is labelled 1, ``self.pca`` is set to ``None``.
        """
        positives = x[y == 1]
        if len(positives) == 0:
            self.pca = None
            return self

        # RuntimeWarnings emitted while fitting are deliberately silenced
        # (presumably triggered by very small positive sets - TODO confirm).
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=RuntimeWarning)
            self.pca = PCA().fit(positives)
        return self

    def predict_proba(self, x):
        """Return ``exp(-alpha * error)`` for every row of ``x``.

        ``error`` is the mean squared difference between a row and its PCA
        reconstruction. Without a fitted PCA the error is infinite for
        every row.
        """
        if self.pca is None:
            error = np.full(len(x), np.inf)
        else:
            projected = self.pca.transform(x)
            restored = self.pca.inverse_transform(projected)
            error = ((restored - x) ** 2).mean(axis=1)
        return np.exp(-self.alpha * error)

class Mean:
    """Baseline that predicts the mean training label for every row."""

    def name(self):
        """Return the printable label of this model."""
        return 'Mean()'

    def fit(self, x, y):
        """Remember ``y.mean()``; ``x`` is not used. Returns ``self``."""
        self.mean = y.mean()
        return self

    def predict_proba(self, x):
        """Return one copy of the stored mean per row of ``x``."""
        n_rows = len(x)
        return np.full(n_rows, self.mean, dtype=np.float64)

In [None]:
def get_cl(f):
    """Return the ``cp_type`` column, used as the per-row class label."""
    return f['cp_type']
    
def cl_to_models(cl):
    """Build fresh, unfitted candidate models for the class ``cl``.

    Every class gets a ``Mean`` baseline. Classes other than
    ``'ctl_vehicle'`` additionally get ten ``PCAProjectorSingle`` models
    whose alphas are evenly spaced from 10 to 20.
    """
    candidates = [Mean()]
    if cl != 'ctl_vehicle':
        candidates.extend(PCAProjectorSingle(alpha) for alpha in np.linspace(10, 20, 10))
    return candidates

In [None]:
# namedtuple lives in collections; functools only exposes it by accident.
from collections import namedtuple


n_folds = 7
verbose = False


folder = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=91)

# Key of one (class, target column position, fold) cell and the result of
# one candidate model evaluated in that cell.
ID = namedtuple('ID', ['cl', 'target', 'fold'])
Record = namedtuple('Record', ['model', 'score'])


def _best_record(records):
    """Return the record with the lowest held-out score."""
    return min(records, key=lambda r: r.score)


# ID -> list of Record, one per candidate model.
models = dict()

print('training')

obj_cl = get_cl(features)
cles = obj_cl.unique()
print('  classes = {}'.format(list(cles)))
for cl in cles:
    print('  [cl {}]'.format(cl))
    cl_mask = obj_cl == cl
    cl_features = features[cl_mask]
    # Loop-invariant work: slice the class rows and build the feature matrix
    # once per class instead of once per model / per target.
    cl_matrix = features_to_matrix(cl_features)
    cl_targets = target[cl_mask]
    cl_best_scores = []
    for t in range(len(target.columns)):
        print('  [target ({}/{}) {}]'.format(t+1, len(target.columns), target.columns[t]))
        cl_target = cl_targets.iloc[:, t]
        cl_target_matrix = target_to_matrix(cl_target)
        target_best_scores = []

        for i, (train_idx, test_idx) in enumerate(folder.split(cl_features, cl_target)):
            if verbose:
                print('  [fold {}]'.format(i))
            rid = ID(cl, t, i)
            models[rid] = []

            x_train = cl_matrix[train_idx]
            y_train = cl_target_matrix[train_idx]
            x_test = cl_matrix[test_idx]
            true = cl_target_matrix[test_idx]

            for m in cl_to_models(cl):
                if verbose:
                    print('    [model {}]'.format(m.name()))
                    print('    [fit]')
                m.fit(x_train, y_train)
                if verbose:
                    print('    [predict]')
                pred = m.predict_proba(x_test)
                s = score(pred, true)
                if verbose:
                    print('    [score {:.5f}]'.format(s))
                models[rid].append(Record(m, s))

            # NOTE(review): the winner is picked on the same held-out fold it
            # is scored on, so every score reported below is optimistic.
            target_best_scores.append(_best_record(models[rid]).score)
        print('  [target score {:.5f}]'.format(np.mean(target_best_scores)))
        cl_best_scores.extend(target_best_scores)
    print('  class score = {:.5f}'.format(np.mean(cl_best_scores)))

# Keep only the winning model of every (class, target, fold) cell.
best_models = {
    rid: _best_record(records)
    for rid, records in models.items()
}

print('calculating final score')
# Fraction of training rows that belongs to each class.
cl_weight = obj_cl.value_counts(normalize=True)

# Class-weighted sum of the best scores, averaged over targets and folds.
final_score = np.sum([r.score*cl_weight[rid.cl] for rid,r in best_models.items()]) / len(target.columns) / n_folds
print('  [final score {:.5f}]'.format(final_score))

In [None]:
print('generating submit prediction')
submit_obj_cles = get_cl(submit_features)
submit_cles = submit_obj_cles.unique()
# One row per submission sample, one column per target.
pred = np.zeros((len(submit_features), len(target.columns)))
for cl in submit_cles:
    print('  [class {}]'.format(cl))
    # Loop-invariant work: the row mask and the feature matrix depend only on
    # the class, so build them once instead of once per target and fold.
    cl_mask = (submit_obj_cles == cl).values
    cl_features = submit_features[cl_mask]
    cl_matrix = features_to_matrix(cl_features)
    for t in tqdm(range(len(target.columns))):
        for i in range(n_folds):
            # Average the best model of every fold for this class/target.
            pred[cl_mask, t] += best_models[ID(cl, t, i)].model.predict_proba(cl_matrix) / n_folds


submission = pd.DataFrame(data=pred, index=submit_features.index, columns=target.columns)
submission.head()

In [None]:
# Write the predictions to disk; the sig_id index is kept as the first column.
submission.to_csv('submission.csv', index=True)