In [None]:
from collections import namedtuple
from itertools import chain

# Cached result of a node plus the logical timestamp at which it was computed.
CacheEntry = namedtuple('CacheEntry', ['value', 'version'])


class CG:
    """A tiny lazy computation graph with version-based cache invalidation.

    Functions are registered as named nodes through the ``node`` / ``vnode``
    decorators. Calling the object returned by a decorator evaluates the node,
    reusing the cached value unless the node was re-registered or one of its
    declared dependencies was recomputed after the cached entry was produced.

    Attributes:
        nodes: maps node name -> the registered function.
        nodes_cache: maps node name -> ``CacheEntry`` of the last computation.
        timestamp: monotonically increasing counter used as the version of the
            next computed cache entry.
    """

    def __init__(self):
        self.nodes = dict()
        self.nodes_cache = dict()
        self.timestamp = 0

    def compute_node_(self, name, args, kwargs, need_version=False):
        """Evaluate node ``name``, recomputing only when its cache is stale.

        Args:
            name: name under which the node function is stored in ``self.nodes``.
            args: positional dependencies; each is a callable accepting
                ``need_version=True`` and returning ``(value, version)``.
            kwargs: keyword dependencies of the same kind, keyed by the
                argument name passed to the node function.
            need_version: when true return the whole ``CacheEntry``
                (value and version), otherwise only the value.

        Returns:
            The node value, or its ``CacheEntry`` when ``need_version`` is true.
        """
        # With no dependencies this stays -inf, so a cached entry is never stale.
        max_version = float('-inf')

        value_args = []
        for dependency in args:
            value, version = dependency(need_version=True)
            value_args.append(value)
            max_version = max(max_version, version)

        value_kwargs = dict()
        for key, dependency in kwargs.items():
            value, version = dependency(need_version=True)
            value_kwargs[key] = value
            max_version = max(max_version, version)

        cached = self.nodes_cache.get(name)
        if cached is None or cached.version <= max_version:
            print('cg: computing [{}]'.format(name))
            # Compute the value first: the node body may evaluate other nodes
            # and advance the timestamp before this entry gets its version.
            value = self.nodes[name](*value_args, **value_kwargs)
            self.nodes_cache[name] = CacheEntry(value=value, version=self.timestamp)
            self.timestamp += 1

        if need_version:
            return self.nodes_cache[name]
        return self.nodes_cache[name].value

    def node(self, *args, name=None, **kwargs):
        """Decorator factory registering a function as a graph node.

        Args:
            *args: dependency nodes whose values are passed positionally.
            name: node name; defaults to the decorated function's ``__name__``.
            **kwargs: dependency nodes whose values are passed by keyword.

        Returns:
            A decorator. Applying it stores the function, drops any cached
            value previously held under the same name, and returns a callable
            ``compute_node(need_version=False)`` that evaluates the node.
        """
        def add_node(f):
            name_ = name or f.__name__

            self.nodes[name_] = f
            # Re-registering a node invalidates whatever was cached for it.
            self.nodes_cache.pop(name_, None)

            def compute_node(need_version=False):
                return self.compute_node_(name_, args, kwargs, need_version=need_version)

            return compute_node
        return add_node

    def vnode(self, *args, name=None, **kwargs):
        """Decorator factory registering an element-wise ("vector") node.

        The decorated function is applied once per position ``i``: every
        dependency value that is a ``list`` contributes its ``i``-th element,
        any other value is passed through unchanged. The node value is the
        list of per-position results.

        Args:
            *args: dependency nodes passed positionally.
            name: node name; defaults to the decorated function's ``__name__``.
            **kwargs: dependency nodes passed by keyword.

        Raises:
            AssertionError: at evaluation time, when list arguments have
                different lengths or no argument is a list.
        """
        def add_vnode(f):
            def compute_vnode(*args_, **kwargs_):
                dim = None
                for a in chain(args_, kwargs_.values()):
                    if isinstance(a, list):
                        assert dim is None or dim == len(a), 'vector arguments should have same lengths'
                        dim = len(a)

                assert dim is not None, 'vector operation should operate on at least one vector'

                res = []
                for i in range(dim):
                    args_i = [a[i] if isinstance(a, list) else a for a in args_]
                    # Iterate over items(): iterating the dict itself yields
                    # only keys and breaks the (key, value) unpacking.
                    kwargs_i = {
                        k: (v[i] if isinstance(v, list) else v)
                        for k, v in kwargs_.items()
                    }
                    res.append(f(*args_i, **kwargs_i))

                return res

            return self.node(*args, name=name or f.__name__, **kwargs)(compute_vnode)
        return add_vnode


cg = CG()
# `namedtuple` is not used by the helpers below; the import is kept so the
# name remains available to later cells exactly as before.
from collections import namedtuple
from functools import partial as p
from functools import reduce as r


def c(*fs):
    """Compose functions right-to-left: ``c(f, g)(x) == f(g(x))``.

    With no functions the result behaves as the identity.
    """
    return p(r, (lambda x, f: f(x)), fs[::-1])


def lmap(f, a):
    """Eager ``map``: apply ``f`` to every item of ``a`` and return a list."""
    return list(map(f, a))


def lfilter(f, a):
    """Eager ``filter``: return the items of ``a`` accepted by ``f`` as a list."""
    return list(filter(f, a))


def nth(n):
    """Return a function that extracts ``v[n]`` from its argument."""
    return (lambda v: v[n])


def div(d):
    """Return a function that divides its argument by ``d``."""
    return (lambda x: x / d)


def fst(v):
    """Return the first item produced by iterating ``v``.

    Raises:
        StopIteration: if ``v`` is empty.
    """
    return next(iter(v))


# NOTE: despite the name this selects the LAST element (index -1), which is
# the second element only for pairs.
snd = nth(-1)


def infrange(start=0):
    """Yield ``start, start + 1, start + 2, ...`` without end."""
    i = start
    while 1:
        yield i
        i += 1


def srange(n):
    """Return a pandas Series holding ``0..n-1`` indexed by ``0..n-1``.

    Relies on ``pd`` being imported by the time the function is called.
    """
    return pd.Series(range(n), index=range(n))

In [None]:
import numpy as np
import torch as tc
import pandas as pd

import scipy.stats as stats
import statsmodels.api as sm

from tqdm import tqdm as tqdm

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
@cg.node()
def features():
    """Load the training feature table, indexed by ``sig_id`` and sorted by that index."""
    raw = pd.read_csv('../input/lish-moa/train_features.csv')
    indexed = raw.set_index('sig_id')
    return indexed.sort_index()
    
@cg.node()
def target():
    """Load the scored training targets, indexed by ``sig_id`` and sorted by that index."""
    raw = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    indexed = raw.set_index('sig_id')
    return indexed.sort_index()

# Sanity checks on the loaded tables: neither index may contain a repeated
# sig_id, and features and targets must be aligned row for row.
assert not features().index.duplicated().any()
assert not target().index.duplicated().any()

assert (features().index == target().index).all()

In [None]:
def prefix_filter(prefix, a):
    """Return the strings of ``a`` that start with ``prefix``.

    Args:
        prefix: prefix every returned string must start with.
        a: iterable of strings to filter (e.g. a DataFrame's column index).

    Returns:
        A list with the matching items, in their original order.
    """
    # Filter the iterable that was passed in; it used to be ignored in favour
    # of features().columns, which made the parameter meaningless.
    return [s for s in a if s.startswith(prefix)]

@cg.node()
def cp_columns():
    """Names of the feature columns whose name starts with ``cp_``."""
    all_columns = features().columns
    return prefix_filter('cp_', all_columns)

@cg.node()
def g_columns():
    """Names of the feature columns whose name starts with ``g-``."""
    all_columns = features().columns
    return prefix_filter('g-', all_columns)

@cg.node()
def c_columns():
    """Names of the feature columns whose name starts with ``c-``."""
    all_columns = features().columns
    return prefix_filter('c-', all_columns)

# The three prefix groups together must account for every feature column (by count).
assert len(cp_columns()) + len(g_columns()) + len(c_columns()) == len(features().columns)

In [None]:
@cg.node()
def train_mask():
    """Boolean row mask selecting the local training split.

    Each row gets a 0/1 draw from NumPy's global generator, which is reseeded
    with 21 on every computation; rows whose ``cp_type`` is ``'ctl_vehicle'``
    are always excluded.
    """
    np.random.seed(21)
    coin_flips = np.random.randint(2, size=len(features())).astype(bool)
    is_treated = features().cp_type != 'ctl_vehicle'
    return coin_flips & is_treated

@cg.node()
def test_mask():
    """Boolean row mask for the local hold-out split.

    Selects every row that is not in ``train_mask`` and whose ``cp_type`` is
    not ``'ctl_vehicle'``.
    """
    not_in_train = ~train_mask()
    is_treated = features().cp_type != 'ctl_vehicle'
    return not_in_train & is_treated

@cg.node()
def train_features():
    """Feature rows selected by ``train_mask``."""
    frame = features()
    mask = train_mask()
    return frame.loc[mask]

@cg.node()
def train_target():
    """Target rows selected by ``train_mask``."""
    frame = target()
    mask = train_mask()
    return frame.loc[mask]

@cg.node()
def test_features():
    """Feature rows selected by ``test_mask``."""
    frame = features()
    mask = test_mask()
    return frame.loc[mask]

@cg.node()
def test_target():
    """Target rows selected by ``test_mask``."""
    frame = target()
    mask = test_mask()
    return frame.loc[mask]

In [None]:
@cg.node()
def submit_train_mask():
    """Boolean row mask keeping every row whose ``cp_type`` is not ``'ctl_vehicle'``."""
    compound_type = features().cp_type
    return compound_type != 'ctl_vehicle'

@cg.node()
def submit_train_features():
    """Feature rows selected by ``submit_train_mask``."""
    frame = features()
    mask = submit_train_mask()
    return frame[mask]

@cg.node()
def submit_train_target():
    """Target rows selected by ``submit_train_mask``."""
    frame = target()
    mask = submit_train_mask()
    return frame[mask]

@cg.node()
def submit_test_features():
    """Load the competition test features, indexed by ``sig_id`` and sorted by that index."""
    raw = pd.read_csv('../input/lish-moa/test_features.csv')
    indexed = raw.set_index('sig_id')
    return indexed.sort_index()

# The test table must not contain a repeated sig_id.
assert not submit_test_features().index.duplicated().any()

In [None]:
@cg.node()
def num_columns():
    """Column names used as model inputs: the ``c-`` columns followed by the ``g-`` columns."""
    return [*c_columns(), *g_columns()]

In [None]:
class HistModel:
    """Histogram estimator for one feature: the mean of ``y`` per bin of ``x``.

    Values are binned with ``int((x + 10) / 20 * (nbins - 1))``, i.e. the
    interval [-10, 10] is spread over bin indices ``0 .. nbins - 1``. Indices
    that fall outside that range are clipped to the nearest edge bin, both
    when fitting and when predicting.

    Attributes (set by ``fit``):
        nbins: number of bins.
        bins: dict mapping bin index -> mean of ``y`` in that bin (0 for a
            bin that received no training sample).
    """

    def _to_bin(self, x):
        """Map an array of feature values to clipped integer bin indices."""
        raw = ((x + 10) / 20 * (self.nbins - 1)).astype(int)
        # Out-of-range values would otherwise index a bin that does not exist.
        return np.clip(raw, 0, self.nbins - 1)

    def fit(self, x, y, nbins=100):
        """Estimate the per-bin mean of ``y``.

        Args:
            x: 1-D array of feature values.
            y: 1-D array of targets aligned with ``x``.
            nbins: number of histogram bins.

        Returns:
            ``self``, to allow call chaining.
        """
        self.bins = dict()
        self.nbins = nbins
        x_to_bin = self._to_bin(x)
        for b in range(self.nbins):
            stat = y[x_to_bin == b]
            # Empty bins predict 0 rather than the NaN a mean of nothing yields.
            self.bins[b] = stat.mean() if len(stat) else 0
        return self

    def predict_proba(self, x):
        """Return the fitted bin mean for every value in ``x``.

        Args:
            x: array of feature values.

        Returns:
            Float array with the same shape as ``x``.
        """
        bin_index = self._to_bin(x)
        # One table lookup for the whole array instead of a Python loop per sample.
        lookup = np.array([self.bins[b] for b in range(self.nbins)], dtype=float)
        return lookup[bin_index]
    
class HistStack:
    """Averages one ``HistModel`` per feature column for a single target."""

    def fit(self, x, y):
        """Fit a ``HistModel`` on every column of the 2-D array ``x`` against ``y``; returns ``self``."""
        n_features = x.shape[1]
        self.hists = [HistModel().fit(x[:, col], y) for col in range(n_features)]
        return self

    def predict_proba(self, x):
        """Return the unweighted mean of the per-column model predictions for ``x``."""
        n_features = x.shape[1]
        return sum(
            self.hists[col].predict_proba(x[:, col]) / n_features
            for col in range(n_features)
        )
    
class Hist:
    """Multi-target model: one ``HistStack`` per column of the target matrix."""

    def fit(self, x, y):
        """Fit a ``HistStack`` on ``x`` for every column of the 2-D array ``y``; returns ``self``."""
        n_targets = y.shape[1]
        self.hists = []
        # extend() consumes the generator lazily, so models are stored one by
        # one as the progress bar advances.
        self.hists.extend(
            HistStack().fit(x, y[:, target_idx])
            for target_idx in tqdm(range(n_targets))
        )
        return self

    def predict_proba(self, x):
        """Stack the per-target predictions for ``x`` and transpose the result."""
        per_target = [stack.predict_proba(x) for stack in tqdm(self.hists)]
        return np.transpose(np.stack(per_target))

In [None]:
@cg.node()
def submit_prob():
    """Build the submission frame of predicted values for the test set.

    A ``Hist`` model is fitted on the ``num_columns`` features of the
    submission training rows. Test rows whose ``cp_type`` is not
    ``'ctl_vehicle'`` receive the model output; ``'ctl_vehicle'`` rows are
    set to 0. The frame is indexed like ``submit_test_features`` and has the
    columns of ``submit_train_target``.
    """
    test_frame = submit_test_features()
    prob = pd.DataFrame(index=test_frame.index, columns=submit_train_target().columns)

    train_inputs = submit_train_features()[num_columns()].values
    train_labels = submit_train_target().values
    model = Hist().fit(train_inputs, train_labels)

    is_treated = test_frame.cp_type != 'ctl_vehicle'
    is_control = test_frame.cp_type == 'ctl_vehicle'

    treated_inputs = test_frame[is_treated][num_columns()].values
    prob[is_treated] = model.predict_proba(treated_inputs)
    prob[is_control] = 0

    return prob

In [None]:
# Evaluate the submit_prob node (fits the model on first use) and show the
# resulting frame as the cell output.
submit_prob()

In [None]:
# Write the prediction frame to submission.csv in the working directory.
submit_prob().to_csv('submission.csv')