In [1]:
import sklearn as sk
from sklearn import ensemble
import operator
from sklearn import model_selection  as ms
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as rsc
from catboost import CatBoostRegressor
from autograd import elementwise_grad as egrad
import math

In [21]:
import theano.tensor as T
import theano
def make_grad_fn(loss_fun = lambda y_pred, y_ref: T.mean((y_pred - y_ref) ** 2),
                 hess_diag=True):
    """Compile a Theano function that evaluates a loss and its derivatives.

    Parameters
    ----------
    loss_fun : callable
        Takes symbolic ``(y_pred, y_ref)`` and returns a scalar loss.
        Defaults to the mean squared error.
    hess_diag : bool
        If True, the third output is obtained by applying ``loss_fun`` to each
        ``(y_pred[i], y_ref[i])`` pair separately and differentiating twice.
        If False, the full Hessian of the vector-level loss is built with
        ``theano.gradient.hessian``.

    Returns
    -------
    A compiled ``theano.function`` mapping ``(y_pred, y_ref)`` to
    ``[loss, grad, hess]``.

    Note: ``grad`` is the gradient of the loss over the whole vectors, whereas
    the ``hess_diag=True`` output comes from per-element losses, so for a
    mean-reduced loss the two are scaled differently (the gradient carries
    the 1/n factor, the per-element second derivative does not).
    """
    predicted = T.dvector('predicted')
    reference = T.dvector('reference')

    total_loss = loss_fun(predicted, reference)
    gradient = T.grad(total_loss, predicted)

    def _elementwise_second_derivative(pred_i, ref_i):
        # Differentiate the single-pair loss twice w.r.t. the prediction.
        first_order = T.grad(loss_fun(pred_i, ref_i), pred_i)
        return T.grad(first_order, pred_i)

    if not hess_diag:
        curvature = theano.gradient.hessian(total_loss, predicted)
    else:
        # theano.map returns (outputs, updates); only the outputs are needed.
        curvature = theano.map(_elementwise_second_derivative,
                               [predicted, reference])[0]

    return theano.function([predicted, reference],
                           [total_loss, gradient, curvature])
    

In [22]:
# Build the loss/gradient/Hessian evaluator with the default arguments and
# try it on a 30-element example; the cell displays [loss, grad, hess].
compute = make_grad_fn()
compute([1,2,3] * 10 , [-1, 5, 3] * 10)

[array(4.33333333),
 array([ 0.13333333, -0.2       ,  0.        ,  0.13333333, -0.2       ,
         0.        ,  0.13333333, -0.2       ,  0.        ,  0.13333333,
        -0.2       ,  0.        ,  0.13333333, -0.2       ,  0.        ,
         0.13333333, -0.2       ,  0.        ,  0.13333333, -0.2       ,
         0.        ,  0.13333333, -0.2       ,  0.        ,  0.13333333,
        -0.2       ,  0.        ,  0.13333333, -0.2       ,  0.        ]),
 array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
        2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])]

In [26]:
%%timeit
# Time a single evaluation of `compute` on 3000-element inputs.
compute([1,2,3] * 1000 , [-1, 5, 3] * 1000)

9.08 ms ± 557 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [2]:
import pandas as pd
import numpy as np
import re
import datetime
import catboost
# Let pandas show up to 300 columns before truncating wide frames.
pd.options.display.max_columns = 300

In [3]:
class ComplexObjective(object):
    """Custom CatBoost objective with one-sided handling of marked targets.

    For an ordinary target ``t`` and prediction ``a`` the first derivative is
    ``tanh(t - a)``. Targets for which ``has_dirt`` is true are cleaned with
    ``clean`` and only pull the prediction upwards: the residual is clamped at
    zero, so a prediction above the cleaned target gets no gradient.
    (In this notebook ``add_dirt`` is applied to the ``life_period`` of the
    ``alive`` table - presumably a lower bound on the true value; confirm.)
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Return per-object ``(der1, der2)`` pairs for CatBoost.

        approxes, targets, weights are indexed containers of floats
        (containers with only __len__ and __getitem__ defined).
        weights may be None.

        Returns a list of ``(der1, der2)`` tuples, one per object. Following
        CatBoost's custom-objective examples, these are derivatives of the
        function being maximised, so ``der2`` is non-positive.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            target = targets[index]
            residual = clean(target) - approxes[index]
            if has_dirt(target):
                # Element-wise clamp. The previous np.max(residual, 0) treated
                # 0 as the *axis* argument and returned the residual unchanged.
                residual = np.maximum(residual, 0)

            der1 = np.tanh(residual)
            # d/da tanh(t - a) = -(1 - tanh^2(t - a)); the sign matters for
            # Newton-style leaf estimation.
            der2 = -(1 - der1 * der1)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))

        return result

In [None]:
def date_to_days(date):
    """Convert a date-like value to a day count relative to a fixed reference date.

    Accepted inputs:
      * int/float in [0, 100): treated as a two-digit year;
      * 'dd.mm.yy' or 'dd.mm.yyyy' strings;
      * 'yyyy-mm-dd' strings;
      * a bare year string ('yy' or 'yyyy'), dated 30 December of that year.

    Two-digit years below 20 are placed in the 2000s, the rest in the 1900s.
    Years of 100 or more are taken literally (previously 1900 was added to
    them as well, e.g. '01.02.2005' became year 3905).

    Returns
    -------
    int number of days since the reference date; None for strings without any
    digit; the input unchanged when it is neither a string nor a small number
    (e.g. NaN).

    Raises ValueError for strings that contain digits but do not fit one of
    the formats above.
    """
    if type(date) in [int, float] and 0 <= date < 100:
        date = str(int(date))
    if type(date) != str:
        return date
    if not any(digit in date for digit in '0123456789'):
        return None

    # Reference date. The original code obtained it from
    # strptime('1011000', "%d%m%Y"), which parses as 10 January 1000; the
    # value is kept so existing day counts do not shift.
    point = datetime.datetime(1000, 1, 10)

    if '-' in date:
        y, m, d = map(int, date.split('-'))
    else:
        if '.' in date:
            d, m, y = map(int, date.split('.'))
        else:
            y = int(date)
            d, m = 30, 12
        if y < 100:
            # Expand two-digit years only.
            y += 2000 if y < 20 else 1900
    sd = datetime.datetime(y, m, d)
    return (sd - point).days
def date_booling(X):
    """Turn mostly-missing date columns into presence indicators.

    For each of the known date columns, when more than 45% of its values are
    null the column is replaced by the sign of its values with nulls filled
    by 0. Other columns are left as they are. Works on a copy of ``X``.
    """
    frame = X.copy()
    candidate_columns = ('Year', 'Birth', 'DiagDate', 'RemDate', 'MGM_Date',
                         'OVGMDate', 'GM_Date', 'FirstRH', 'Relapse', 'MetaAfter')
    total_rows = len(frame)
    for name in candidate_columns:
        series = frame[name]
        missing_share = int(series.isnull().sum()) / total_rows
        if missing_share > 0.45:
            frame[name] = np.sign(series.fillna(0))
    return frame
def columnFix(X):
    """Return a copy of ``X`` without bookkeeping columns and with canonical names.

    Any of the listed auxiliary columns that are present are dropped; the
    remaining columns are then renamed positionally to the fixed list of 22
    names (pandas raises ValueError if the count does not match).
    """
    unwanted = ['HID', 'OS', 'Encoded_FIO', 'Time (мес)', 'Дата последнего наблюдения ', 'DEATH',
                'Time (мес)', '916', 'DEATH_DATE', 'Time_delta', 'last_seen', 'Death_time',
                'MGM_start', 'life_period']
    canonical_names = ['Year', 'Sex', 'Birth', 'Diagnosis', 'DiagDate', 'RemDate', 'MGM_Date',
                       'Mutations', 'OVGMDate', 'GM_Date', 'RHNum', 'FirstRH', 'KInd',
                       'FocNum', 'FocSize', 'FocMax', 'MetaBefore', 'Cure',
                       'Relapse', 'MetaAfter', 'Progression', 'CureAfter']
    frame = X.copy()
    # dict.fromkeys de-duplicates the list while keeping its order.
    present = [label for label in dict.fromkeys(unwanted) if label in frame.columns]
    if present:
        frame = frame.drop(present, axis=1)
    frame.columns = canonical_names
    return frame
def cleanTable(X):
    """Turn a raw patient table into a model-ready feature frame.

    Steps, in order: rename columns with ``columnFix``, lowercase all string
    cells, map 'есть'/'нет' columns to 1/0, convert date columns with
    ``date_to_days``, blank out remaining 'нет' cells, fill missing values,
    one-hot encode the categorical columns, fold the combined progression
    dummy into its two components, apply ``date_booling`` and rescale KInd.

    The input frame is not modified; a new DataFrame is returned.
    """
    # Work on an index-ordered copy so the caller's frame stays untouched.
    x = X.copy().sort_index()
    x = columnFix(x)
    # Lowercase every string cell; non-string cells pass through unchanged.
    for i in x.columns:
        x[i] = x[i].map(lambda t: t if type(t)!=str else t.lower())
    # Columns that contain 'есть' become 1/0. Series.map with a dict turns any
    # value other than these two keys into NaN.
    for i in x.columns:
        if 'есть' in np.array(x[i]):
            x[i] = x[i].map({'есть':1, 'нет':0})
    date_columns = ['Year', 'Birth', 'DiagDate', 'RemDate','MGM_Date', 
                    'OVGMDate', 'GM_Date', 'FirstRH', 'Relapse', 'MetaAfter',
                    'CureAfter']
    for col in date_columns:
        x[col] = x[col].map(date_to_days)
    # Decimal comma -> decimal point. Assumes every FocMax value is a string
    # (a float/NaN would raise AttributeError) - TODO confirm. The values are
    # not cast to a numeric type here.
    x.FocMax = x.FocMax.map(lambda s: s.replace(',', '.'))
    # Replace the remaining 'нет' cells with None. Rebuilding the frame from an
    # array discards the column labels and the index, hence the second
    # columnFix call to restore the names.
    x = pd.DataFrame(np.where(np.array(x) == 'нет', None, x))
    x = columnFix(x)
    category_cols = ['Sex', 'Diagnosis', 'Cure', 'Progression', 'CureAfter']
    # Missing dates are imputed with the column mean. Note that 'CureAfter' is
    # both in date_columns and in category_cols, so it is imputed and then
    # one-hot encoded below.
    for col in date_columns:
        x[col] = x[col].fillna(x[col].mean())
    for col in ['Mutations', 'MetaBefore']:
        x[col] = x[col].fillna(0)
    x = pd.get_dummies(x, columns=category_cols)
    #for col in category_cols:
    #    x[col] = pd.factorize(x[col])[0]
    # The three lists below are only used by the commented-out delta() call.
    newcols = ['not_treat', 'rh_del_diag', 'rh_del_birth', 'diag_age', 'mgm_del_birth', 'mgm_del_diag']
    cols1 = ['Year', 'FirstRH', 'FirstRH', 'DiagDate', 'MGM_Date','MGM_Date']
    cols2 = ['DiagDate', 'DiagDate', 'Birth', 'Birth', 'Birth', 'DiagDate']
    #x = delta(x, newcols, cols1, cols2)
    # Rows flagged with the combined 'лр+дм' value count for both single
    # flags; the combined dummy is then dropped. Requires all three dummy
    # columns to exist, i.e. each value must occur in the data (KeyError otherwise).
    x['Progression_дм'] = x['Progression_дм'] | x['Progression_лр+дм']
    x['Progression_лр'] = x['Progression_лр'] | x['Progression_лр+дм']
    x = date_booling(x.drop('Progression_лр+дм', axis=1))
    # Drop the 'Sex_ж' dummy column; requires 'ж' to occur in the Sex column.
    x.drop(['Sex_ж'], axis=1, inplace=True)
    # Scale KInd by 1/100 (presumably a percentage - confirm).
    x.KInd /= 100
    return x

In [5]:
class MeanModel:
    """Weighted-average ensemble over several regressors.

    Construct with ``(model, weight)`` pairs, e.g.
    ``MeanModel((model_a, 1), (model_b, 3))``. Each model must provide
    ``fit(X, y)`` and ``predict(X)``.
    """
    def __init__(self, *mw):
        self.models = [i[0] for i in mw]
        self.w = [i[1] for i in mw]

    def fit(self, X_train, y_train):
        """Fit every member model on the same data; returns ``self``."""
        for model in self.models:
            model.fit(X_train, y_train)
        return self

    def predict(self, x_in):
        """Return the weight-normalised sum of the members' predictions.

        Raises ValueError if the ensemble was built without any model.
        """
        if not self.models:
            raise ValueError("MeanModel has no models to predict with")
        res = np.zeros(len(x_in))
        for model, w in zip(self.models, self.w):
            res += model.predict(x_in) * w
        # Normalise by the sum of ALL weights. The earlier code divided by
        # np.sum(w), i.e. only the last weight left over from the loop.
        return res / np.sum(self.w)
def mean_ans(*anss):
    """Weighted average of several answers given as ``(answer, weight)`` pairs."""
    weighted_terms = [answer * weight for answer, weight in anss]
    weight_total = sum(pair[1] for pair in anss)
    return np.sum(weighted_terms, axis=0) / weight_total

In [6]:
def delta(data, new_col, fcol1, fcol2):
    """Replace pairs of columns by their differences.

    For every position ``k`` a column ``new_col[k]`` is created as
    ``fcol1[k] - fcol2[k]``; missing differences are filled with the
    integer-truncated mean of that difference. All source columns named in
    ``fcol1`` and ``fcol2`` are removed afterwards. ``data`` is not modified.
    """
    result = data.copy()
    for position, target in enumerate(new_col):
        difference = result[fcol1[position]] - result[fcol2[position]]
        result[target] = difference
        fill_value = int(result[target].mean())
        result[target] = result[target].fillna(fill_value)
    source_columns = list(set(fcol1 + fcol2))
    return result.drop(source_columns, axis=1)

In [7]:
def add_dirt(y):
    """Return ``y`` as a NumPy array with 0.25 added to every value (the marker that ``has_dirt`` looks for)."""
    values = np.array(y)
    return values + 0.25
def has_dirt(y):
    """Tell whether the fractional part of ``y`` is exactly 0.25."""
    fractional_part = y % 1
    return fractional_part == 0.25
def clean(y):
    """Strip the fractional part of ``y`` by rounding down with ``np.floor``."""
    whole_part = np.floor(y)
    return whole_part

In [8]:
# Read the training features, the training targets (file has no header row)
# and the test features; all CSVs are Windows-1251 encoded.
X_in = pd.read_csv("X_train_new.csv", encoding="windows-1251")
y_in = pd.read_csv("y_train_new.csv", encoding="windows-1251", header=None)
X_final = pd.read_csv('X_test_new.csv', encoding = 'windows-1251')
# Additional patient table (presumably patients still alive, judging by the
# file name); its life_period column is turned into targets further down.
alive = pd.read_csv('data_pacientAlive.csv', encoding='windows-1251')
# NOTE(review): `params` is not used in the visible cells - confirm it is still needed.
params = np.load('bp.npy')

In [9]:
# Run the same cleaning pipeline over the train and test feature tables.
X_in = cleanTable(X_in)
X_final = cleanTable(X_final)
# Keep the leading integer of each life_period string; non-string cells
# (e.g. NaN) become None. Taken before `alive` is cleaned because columnFix
# drops the life_period column.
period = alive.life_period.map(lambda x: int(x.split()[0]) if type(x) == str else None)
alive = cleanTable(alive)
# Stack the alive rows on top of the training rows. Their targets get the
# +0.25 marker from add_dirt so the objective can tell them apart.
X_all = pd.concat([alive, X_in])
y_all = pd.concat([pd.DataFrame(add_dirt(period)), y_in])
# Fixed seed so the hold-out split is the same on every run.
X_train, X_test, y_train, y_test = ms.train_test_split(X_all,
                   y_all, test_size=0.3, random_state=42)



In [10]:
# A Python object is used as the objective, so an evaluation metric is named
# explicitly, as in CatBoost's custom-objective examples. RMSE is computed
# against the raw targets, including the +0.25 marker on the alive rows, so
# treat it as a progress indicator only.
ctb = catboost.CatBoostRegressor(iterations=100, loss_function=ComplexObjective(),
                                 eval_metric='RMSE')

In [None]:
# Train on the full stacked data set (X_all), not on the X_train split;
# np.ravel flattens the single-column target frame into a 1-D array.
ctb.fit(X_all, np.ravel(y_all), verbose=True)