# Linear combinations of linear models

**Heavily regularised logistic models** work surprisingly well.

---
**Description**

List various plausible penalty parameters (l1, l2, varied C).

In each of *k* folds, fit logistic regression models for each penalty, for each MoA label.

Within each fold, every model's predictions are weighted by the likelihood it assigns to that fold's held-out split, i.e. exp(−log loss), where the log loss is summed (not averaged) over the held-out samples.

Results are not competitive, but pleasingly good for the model simplicity.

---
**Details**

Two MoAs have only one positive label in the training data:
`erbb2_inhibitor` and
`atp-sensitive_potassium_channel_antagonist`.

Since that single positive example cannot appear in both the training and the held-out split of a fold, these targets are simply fitted with one l1-regularised model, tuned by hand to use just a few features.


---
**Lessons**

Non-linear variable transforms hurt; all I have tried performed worse. The data are already whitened and appear to have meaningful linearity.

Categorical features hurt here. Tried one-hot encoding, but found worse results. Perhaps the risk of overfitting outweighs their information.


In [None]:
import joblib
import warnings

import numpy
import pandas
import sklearn

from matplotlib import pyplot

from numpy import log, exp

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [None]:
class data_container:
    """ Plain attribute bag; data_load() attaches the datasets to an instance. """


def data_load(data, input_dir='../input/lish-moa'):
    """ Load data from disk and store our target statistics.

        Reads train_features.csv, test_features.csv, train_targets_scored.csv
        and sample_submission.csv from `input_dir` and attaches to `data`:

        train_features, test_features -- the feature columns left after
            dropping sig_id, cp_type, cp_time and cp_dose, shifted by the
            mean of all training feature values
        train_targets -- the training targets without the sig_id column
        test_control -- boolean Series, True where the test row has
            cp_type == "ctl_vehicle"
        submission -- the sample submission frame, unchanged

        `input_dir` defaults to the previously hard-coded location, so
        existing calls behave as before.
    """
    import os  # local import: the file's import cell does not provide it

    def read_table(filename):
        return pandas.read_csv(os.path.join(input_dir, filename))

    def drop_useless(df):
        not_used = ["sig_id", "cp_type", "cp_time", "cp_dose"]
        return df.drop(not_used, axis=1)

    train_features = read_table('train_features.csv')
    test_features = read_table('test_features.csv')
    train_targets = read_table('train_targets_scored.csv')
    sample_submission = read_table('sample_submission.csv')

    # Must be computed before cp_type is dropped below.
    test_control = (test_features.cp_type == "ctl_vehicle")

    train_features = drop_useless(train_features)
    test_features = drop_useless(test_features)
    train_targets = train_targets.drop("sig_id", axis=1)

    # Centre with a single scalar: the mean over every training feature
    # value. The same shift is applied to the test features.
    mean = train_features.values.mean()
    train_features -= mean
    test_features -= mean

    # store
    data.train_features = train_features
    data.train_targets = train_targets
    data.test_features = test_features
    data.test_control = test_control
    data.submission = sample_submission


def data_fold(data, target_column_name, k=2, seed=None):
    """ Iterate over k folds of the training data.

        Folds are stratified on the `target_column_name` column of
        data.train_targets. Rows are shuffled only when a `seed` is given;
        with seed=None the split is unshuffled.

        Yields one (X_train, X_test, y_train, y_test) tuple per fold.
    """
    # Identity test: None is the "no shuffling" sentinel; 0 is a valid seed.
    kfold = StratifiedKFold(k, shuffle=(seed is not None), random_state=seed)
    features = data.train_features
    targets = data.train_targets[target_column_name]
    for itrain, itest in kfold.split(features, targets):
        X_train, X_test = features.iloc[itrain], features.iloc[itest]
        y_train, y_test = targets.iloc[itrain], targets.iloc[itest]
        yield X_train, X_test, y_train, y_test


def data_sumbission(data, test_predictions, filename="submission.csv"):
    """ Write our submission.

        Fills every column of data.submission after the first with
        `test_predictions`, zeroes the rows flagged in data.test_control,
        and writes the frame to `filename` without the index.
        data.submission is modified in place.

        A DataFrame of predictions is matched to the submission by column
        label, so its column order does not matter; a missing target column
        raises KeyError. Anything else (e.g. a 2-D array) is assigned by
        position, as before.
    """
    submission = data.submission
    target_columns = list(submission.columns[1:])

    if isinstance(test_predictions, pandas.DataFrame):
        # Positional `iloc[...] = DataFrame` does not align on column labels
        # in recent pandas versions, which would silently put predictions
        # under the wrong target names. Select by label, then assign values.
        submission[target_columns] = test_predictions[target_columns].to_numpy()
    else:
        submission.iloc[:, 1:] = test_predictions

    # Plain boolean array: a label-indexed boolean Series is not a reliable
    # positional mask.
    control = numpy.asarray(data.test_control, dtype=bool)
    submission.loc[control, target_columns] = 0.0

    submission.to_csv(filename, index=False)

In [None]:
# One shared container for the notebook; data_load() attaches the datasets to it.
data = data_container()
data_load(data)

In [None]:
def get_hitcount(data):
    """ Map each MoA name to its number of positive training rows, largest first. """
    positives = data.train_targets.sum(axis=0)
    ranked = positives.sort_values(ascending=False)
    return dict(ranked)

def print_hitcount(data):
    """ Print one line per MoA: its rank, name and positive count. """
    row_format = "%3d %-47s %d"
    ranked = get_hitcount(data).items()
    for rank, (moa_name, hits) in enumerate(ranked):
        print(row_format % (rank, moa_name, hits))

# print_hitcount(data) # Commented to shorten public notebook

In [None]:
def log_likelihood(model, X_test, y_test):
    """ Return log prob(y_test | X_test, model).

        This is the negated log loss, summed over the samples rather
        than averaged.
    """
    probabilities = model.predict_proba(X_test)
    summed_loss = sklearn.metrics.log_loss(y_test, probabilities, normalize=False)
    return -summed_loss

In [None]:
def logistic(**kwargs):
    """ Build a LogisticRegression from our defaults; keyword arguments override them. """
    defaults = {
        "C": 1e-2,
        "max_iter": 200,
        "intercept_scaling": 1e3,
    }
    return LogisticRegression(**{**defaults, **kwargs})


def make_base_models():
    """ Return the list of unfitted linear models to combine.

        The l1 models (liblinear solver) come first, followed by the models
        built from the logistic() defaults; each group sweeps C.
    """
    l1_strengths = (
        1e-4, 2e-4, 5e-4,
        1e-3, 2e-3, 5e-3,
        1e-2, 2e-2, 5e-2,
        1e-1, 2e-1, 5e-1,
    )
    default_strengths = (
        1e-6, 1e-5,
        1e-4, 2e-4, 5e-4,
        1e-3, 2e-3, 5e-3,
        1e-2, 2e-2, 5e-2,
        1e-1, 2e-1, 5e-1,
        1e0, 1e3,
    )

    l1_models = [
        logistic(penalty="l1", solver="liblinear", C=strength)
        for strength in l1_strengths
    ]
    default_models = [logistic(C=strength) for strength in default_strengths]
    return l1_models + default_models

In [None]:
# Linear combination structures
class mixture:
    """ Weighted sum of the predict_proba outputs of already fitted models. """

    def __init__(self, models, weights=None):
        """ Initialize with fitted models and their relative weights.

            `models` is any iterable of objects offering predict_proba(X).
            `weights` holds one relative weight per model; None means equal
            weights. The weights are stored normalised to sum to one.

            Raises ValueError when the number of weights does not match the
            number of models.
        """
        models = tuple(models)

        if weights is None:
            weights = numpy.ones(len(models))

        # Work on a float copy: dividing in place would modify the caller's
        # array and fails outright for integer weights.
        weights = numpy.array(weights, dtype=float)

        if len(models) != len(weights):
            raise ValueError(
                "got %d models but %d weights" % (len(models), len(weights))
            )

        self.models = models
        self.weights = weights / weights.sum()

    def predict_proba(self, X):
        """ Return the weighted sum of model results. """
        y_pred = numpy.array([
            model.predict_proba(X) * weight
            for model, weight in zip(self.models, self.weights)
        ])
        return y_pred.sum(axis=0)

    def prune(self, threshold):
        """ Remove models with weight below threshold, then renormalise.

            If every weight is below the threshold, the single heaviest
            model is kept so the mixture can still predict.
        """
        kept = [
            index
            for index, weight in enumerate(self.weights)
            if not (weight < threshold)
        ]
        if not kept and len(self.models) > 0:
            kept = [int(numpy.argmax(self.weights))]

        weights = numpy.array([self.weights[index] for index in kept], dtype=float)

        self.models = tuple(self.models[index] for index in kept)
        self.weights = weights / weights.sum()


class kfoldmixture:
    """ Per-fold, likelihood-weighted mixtures of base models, averaged over folds. """

    def __init__(self, base_models, prior=None):
        """ Initialize with models to fit and their relative weights.

            `base_models` is any iterable of unfitted estimators; each is
            cloned per fold and never fitted itself. `prior` holds one
            relative weight per base model; None means a uniform prior.
            The prior is stored normalised to sum to one.

            Raises ValueError when the number of prior weights does not
            match the number of base models.
        """
        base_models = tuple(base_models)

        if prior is None:
            prior = numpy.ones(len(base_models))

        # Work on a float copy: dividing in place would modify the caller's
        # array and fails outright for integer priors.
        prior = numpy.array(prior, dtype=float)

        if len(base_models) != len(prior):
            raise ValueError(
                "got %d base models but %d prior weights"
                % (len(base_models), len(prior))
            )

        self.base_models = base_models
        self.prior = prior / prior.sum()

        # mixture over folds, then over base models; set by fit()
        self.model = None

    def fit(self, fold_iter, verbose=False):
        """ Fit models to the k-folded data generated.

            `fold_iter` yields (X_train, X_test, y_train, y_test) tuples.
            In every fold each base model is cloned, fitted on the training
            split and weighted by prior * likelihood of the test split. The
            resulting per-fold mixtures are then combined with equal weight.
        """
        n_models = len(self.base_models)
        chars = str(len(str(n_models)))
        print_string = "\rfold %d, model %" + chars + "d/%d"
        log_prior = log(self.prior)

        fold_models = []
        for i, (X_train, X_test, y_train, y_test) in enumerate(fold_iter):
            models = []
            log_likelihoods = []
            for j, model in enumerate(self.base_models):
                if verbose:
                    print(print_string % (i, j, n_models), end='')
                model = sklearn.base.clone(model)
                model.fit(X_train, y_train)
                models.append(model)
                log_likelihoods.append(log_likelihood(model, X_test, y_test))

            if verbose:
                print(print_string % (i, n_models, n_models))

            log_prod = numpy.asarray(log_likelihoods) + log_prior
            # Subtract the maximum before exponentiating so the best model
            # gets weight 1 and the others do not all underflow to zero.
            weights = exp(log_prod - log_prod.max())

            fold_models.append(mixture(models, weights))

        self.model = mixture(fold_models)

    def prune(self, threshold):
        """ Remove components with weight below threshold.

            Applies to the base models inside every per-fold mixture; the
            folds themselves are all kept. Requires a prior call to fit().
        """
        for fold_model in self.model.models:
            fold_model.prune(threshold)

    def predict_proba(self, X):
        """ Predict label probabilities of new data X. """
        return self.model.predict_proba(X)

    def log_loss(self, fold_iter):
        """ Calculate log loss on the k-folded data generated.

            The i-th fold from `fold_iter` is scored with the i-th per-fold
            mixture, so the folds should be the ones passed to fit(), in the
            same order. Returns the loss averaged over all test samples.

            NOTE: when the same folds are used, the mixture weights were
            chosen on these very test splits, so the value is optimistic.
        """
        total_loss = 0.0
        n_total = 0

        for fold, fold_model in zip(fold_iter, self.model.models):
            _, X_test, _, y_test = fold
            total_loss -= log_likelihood(fold_model, X_test, y_test)
            n_total += len(y_test)

        return total_loss / n_total

In [None]:
def fit_target(data, column_name, k=2, seed=None, verbose=True):
    """ Fit the k-fold mixture of linear models to one target column.

        Returns the fitted, pruned mixture and its log loss over the folds.
    """
    ensemble = kfoldmixture(make_base_models())

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=sklearn.exceptions.ConvergenceWarning)
        ensemble.fit(data_fold(data, column_name, k=k, seed=seed), verbose=verbose)

    ensemble.prune(1e-15)

    # The fold generator used for fitting is spent; build it again with the
    # same k and seed for scoring.
    score = ensemble.log_loss(data_fold(data, column_name, k=k, seed=seed))
    return ensemble, score


def fit_target_sparse(data, column_name):
    """ Hack for target columns with almost no positive training rows.

        Fits a single hand-tuned model on the full training data, without
        any cross-validation, and returns it.
    """
    # l1 for sparsity, C tuned by hand to get ~a dozen columns in play
    sparse_model = logistic(penalty="l1", solver="liblinear", C=0.8)
    sparse_model.fit(data.train_features, data.train_targets[column_name])
    return sparse_model

In [None]:
def fit_all(data, k=2, seed=None, n_jobs=3):
    """ Run all our fitting and return fitted models.

        Targets with at least `k` positive training rows ("usual") get a
        k-fold mixture from fit_target(), fitted in `n_jobs` parallel
        workers. The remaining ("sparse") targets get the single model from
        fit_target_sparse().

        Returns a dict mapping each target column name to its fitted model,
        usual targets first. The printed mean log loss covers the usual
        targets only; the sparse ones are not cross-validated.
    """
    hitcount = get_hitcount(data)

    usual = tuple(
        column_name
        for column_name, count in hitcount.items()
        if count >= k
    )
    sparse = tuple(
        column_name
        for column_name, count in hitcount.items()
        if not (count >= k)
    )

    model_loss = joblib.Parallel(n_jobs, verbose=10)(
        joblib.delayed(fit_target)(data, column_name, k=k, seed=seed, verbose=True)
        for column_name in usual
    )

    log_loss = 0.0
    column_models = {}
    # Results are paired with `usual` by position.
    for i, (column_name, (model, column_log_loss)) in enumerate(zip(usual, model_loss)):
        log_loss += column_log_loss
        column_models[column_name] = model
        print("%3d/%d %-47s logloss: %.6f, mean %.6f" %
              (i + 1, len(usual), column_name, column_log_loss, log_loss/(i + 1)))

    for i, column_name in enumerate(sparse):
        # Report the actual count: any value below k ends up here, not only 1.
        print("%d/%d %-47s (sum(y) == %d)" %
              (i + 1, len(sparse), column_name, hitcount[column_name]))
        model = fit_target_sparse(data, column_name)
        column_models[column_name] = model

    # With no usual target there is nothing to average (and no division by zero).
    if usual:
        log_loss /= len(usual)
        print("log loss: %.6f" % log_loss)
    return column_models

In [None]:
# Three folds per target; targets with fewer than three positives take the
# non-cross-validated path inside fit_all().
all_fitted = fit_all(data, k=3)

In [None]:
def all_predict(data, all_fitted):
    """ Build a DataFrame of test predictions, one column per fitted target.

        Each column holds the second column of that model's predict_proba
        output on data.test_features; columns follow the order of
        `all_fitted`.
    """
    features = data.test_features
    second_column = {
        target_name: fitted.predict_proba(features)[:, 1]
        for target_name, fitted in all_fitted.items()
    }
    return pandas.DataFrame(second_column)

In [None]:
# One prediction column per fitted target, in the order of all_fitted.
predictions = all_predict(data, all_fitted)

In [None]:
# Writes submission.csv (the default filename) in the working directory.
data_sumbission(data, predictions)

In [None]:
# Persist the dict of fitted models to disk.
joblib.dump(all_fitted, "all_fitted.joblib")