# Task 1

## Preprocessing

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
import theano.tensor as T
import theano

In [2]:
def totuple(a):
    """Recursively convert a nested iterable (list, NumPy array, ...) to nested tuples.

    Anything that cannot be iterated is returned unchanged, which is what ends
    the recursion for scalars.

    Strings are treated as scalars: a one-character string iterates to itself,
    so descending into strings would recurse until RecursionError.
    """
    if isinstance(a, str):
        return a
    try:
        return tuple(totuple(i) for i in a)
    except TypeError:
        # Not iterable: this is a leaf value.
        return a

In [3]:
def has_dirt(y):
    """Return True (element-wise for arrays) where ``y`` has a fractional part."""
    return y % 1 != 0

In [4]:
def add_dirt(y):
    """Shift ``y`` by 0.25 so that whole numbers gain a fractional part."""
    return y + 0.25

In [5]:
# All four CSV files are read as windows-1251. Identifier columns are discarded
# right after loading; x_train additionally loses 'OS' and ap loses '916'.
x_train = (
    pd.read_csv("X_train_new.csv", encoding='windows-1251')
    .drop(columns=['Год', 'HID', 'Encoded_FIO', 'OS'])
)
# The target file has no header row.
y_train = pd.read_csv("y_train_new.csv", header=None, encoding='windows-1251')
x_test = (
    pd.read_csv("X_test_new.csv", encoding='windows-1251')
    .drop(columns=['Год', 'HID', 'Encoded_FIO'])
)
ap = (
    pd.read_csv("data_pacientAlive.csv", encoding='windows-1251')
    .drop(columns=['Год', 'HID', 'Encoded_FIO', '916'])
)

In [6]:
# Column names are assigned positionally. The three tables share the same
# leading feature columns; x_train and ap carry three extra follow-up columns,
# and ap additionally ends with 'life_period'.
_feature_names = [
    'sex', 'birth_date', 'diag', 'diag_date', 'first_site', 'bmd_date',
    'mutations', 'rad_date', 'surg_date', 'n_gamma', 'gamma_date', 'ecog',
    'n_sites', 'sites_volume', 'max_volume', 'extra_m', 'treatment',
    'local_relapse', 'distant_m', 'intra_prog', 'ir_treatment',
]
_followup_names = ['last_exam', 'death', 'time']

x_train.columns = _feature_names + _followup_names
x_test.columns = list(_feature_names)
ap.columns = _feature_names + _followup_names + ['life_period']

In [7]:
def _leading_int(value):
    """Return the first whitespace-separated token of a string as an int, else None."""
    if type(value) == str:
        return int(value.split()[0])
    return None


# Non-string entries become None here.
ap.life_period = ap.life_period.map(_leading_int)

In [8]:
# Discard rows whose life_period is missing. The frame is modified in place,
# so its index keeps the original row labels (with gaps).
ap.dropna(subset=['life_period'], inplace=True)

In [9]:
# pop() removes the column from `ap`, so it cannot be used as a feature.
# The 0.25 offset added by add_dirt gives these targets a fractional part,
# which CustomObjective later detects through has_dirt (its `is_alive` mask).
life_period = ap.pop('life_period')
y_ap = add_dirt(life_period)

In [10]:
def preprocess(df):
    """Clean and encode one raw patient table and return it as a new DataFrame.

    Steps, in order:
      1. ``sex`` is mapped 'М' -> 1, 'Ж' -> 0.
      2. The placeholder strings 'нет', 'Без лечения' and '#ССЫЛКА!' are
         replaced by NaN in every column.
      3. ``intra_prog`` is replaced by two 0/1 flags, ``intra_prog_ЛР`` and
         ``intra_prog_ДМ``.
      4. ``mutations`` and ``extra_m`` become 1 for 'есть' and 0 otherwise.
      5. ``ecog`` is divided by 100.
      6. ``max_volume`` is parsed as a float, accepting a decimal comma.
      7. Remaining NaNs are filled with the string 'unkn' and the categorical
         columns are replaced by their integer category codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sex', 'diag', 'mutations', 'extra_m',
        'intra_prog', 'ecog', 'max_volume', 'treatment' and 'ir_treatment'.

    Returns
    -------
    pandas.DataFrame
        A processed copy; the input frame is left untouched and its row index
        is preserved.

    Notes
    -----
    Category codes are derived from the values present in *this* frame only, so
    the same label can receive different codes in different frames.
    """
    missing_markers = ['нет', 'Без лечения', '#ССЫЛКА!']
    categorical_cols = ['sex', 'diag', 'mutations', 'treatment', 'ir_treatment']

    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    df['sex'] = df['sex'].map({'М': 1, 'Ж': 0})

    # Blank out every placeholder in a single pass. Casting to object first keeps
    # the column dtypes uniform, while index and column labels stay intact.
    # Missing 'mutations' / 'extra_m' entries need no special handling: both the
    # 'нет' marker and NaN end up as NaN here and become 0 further down.
    df = df.astype(object)
    df = df.mask(df.isin(missing_markers))

    # The flags rely on lexicographic string comparison against 'ЛР':
    #   ЛР flag -> value sorts at or after 'ЛР'
    #   ДМ flag -> value is present and differs from 'ЛР'
    # Missing values compare False everywhere, giving 0 for both flags.
    intra = df['intra_prog']
    df['intra_prog_ЛР'] = np.where(intra >= 'ЛР', 1, 0)
    df['intra_prog_ДМ'] = np.where((intra > 'ЛР') | (intra < 'ЛР'), 1, 0)
    df = df.drop(columns=['intra_prog'])

    df['mutations'] = df['mutations'].map({'есть': 1}).fillna(0)

    df['ecog'] = df['ecog'] / 100

    df['extra_m'] = df['extra_m'].map({'есть': 1}).fillna(0)

    def _parse_volume(value):
        # Accept '1,5'-style strings as well as values that are already numeric;
        # missing values pass through and are handled by the fill step below.
        if pd.isna(value):
            return value
        return float(str(value).replace(',', '.'))

    df['max_volume'] = df['max_volume'].map(_parse_volume)

    df = df.fillna('unkn')
    for col in categorical_cols:
        df[col] = df[col].astype('category').cat.codes

    return df

In [11]:
# Each name is rebound to its processed frame. Re-running this cell without
# reloading the data fails: the processed frames no longer contain the
# 'intra_prog' column that preprocess() reads.
x_train = preprocess(x_train)
x_test = preprocess(x_test)
ap = preprocess(ap)

In [12]:
def make_grad_fn(loss_fun = lambda y_pred, y_ref: T.mean((y_pred - y_ref) ** 2),
                 hess_diag=True):
    """Compile a Theano function that returns the gradient and Hessian of a loss.

    Parameters
    ----------
    loss_fun : callable(y_pred, y_ref) -> symbolic scalar
        Loss expressed with Theano tensor operations. Defaults to the mean
        squared difference.
    hess_diag : bool
        If True, the second derivative is computed separately for every
        (prediction, reference) pair by mapping over the two vectors.
        If False, ``theano.gradient.hessian`` of the whole loss is returned.

    Returns
    -------
    Compiled function ``f(y_pred, y_ref) -> [grad, hess]`` taking two double
    vectors; derivatives are taken with respect to ``y_pred``.

    NOTE(review): with ``hess_diag=True`` the gradient is taken from the loss
    over the whole vector, while each second derivative is taken from
    ``loss_fun`` applied to a single pair. For the default mean-based loss the
    gradient therefore appears to carry a 1/n factor that the per-element
    second derivative does not - confirm this is intended.
    """

    # Symbolic inputs of the compiled function.
    y_pred = T.dvector('predicted')
    y_ref = T.dvector('reference')
    
    loss = loss_fun(y_pred, y_ref)
    grad = T.grad(loss, y_pred)
    if hess_diag:
        # Called by theano.map with one element of each vector at a time.
        def second_derivative_fun(y_pred, y_ref):
            loss_i = loss_fun(y_pred, y_ref)
            grad_i = T.grad(loss_i, y_pred)
            hess_i = T.grad(grad_i, y_pred)
            return hess_i
        
        # theano.map returns (outputs, updates); only the outputs are needed.
        hess = theano.map(second_derivative_fun, [y_pred, y_ref])[0]
    else:        
        hess = theano.gradient.hessian(loss, y_pred)
    
    return theano.function([y_pred, y_ref], [grad, hess])

In [13]:
# Compiled once with the default loss and hess_diag=True; CustomObjective calls
# this module-level function on every calc_ders_range invocation.
compute = make_grad_fn()

In [14]:
class CustomObjective(object):
    """CatBoost user-defined objective built on the module-level ``compute``.

    Targets carrying a fractional part (see ``has_dirt``) are flagged as
    ``is_alive``. For those rows the reference value is the larger of the
    rounded target and the current prediction, so a prediction at or above the
    rounded target contributes a zero gradient. All other rows use the rounded
    target as the reference.

    Parameters
    ----------
    debug_log_path : str or None
        When given, NaN diagnostics are appended to this file on every call.
        Disabled by default because the objective is evaluated on every
        boosting iteration.
    """

    def __init__(self, debug_log_path=None):
        self.debug_log_path = debug_log_path

    def calc_ders_range(self, approxes, targets, weights):
        """Return one ``(der1, der2)`` tuple per object, as CatBoost expects.

        CatBoost's documented objective examples return the derivatives of the
        function being *maximised* (e.g. ``der1 = target - approx``,
        ``der2 = -1`` for squared error). ``compute`` differentiates a loss to
        be minimised, so both derivatives are negated here.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        approx_arr = np.asarray(approxes, dtype=float)
        target_arr = np.asarray(targets, dtype=float)

        if np.isnan(approx_arr).any():
            # Neutral step: zero gradient with well-defined (negative) curvature.
            return [(0.0, -1.0)] * len(approx_arr)

        is_alive = has_dirt(target_arr)
        y_clear = target_arr.round()
        y_ref = np.where(is_alive, np.maximum(y_clear, approx_arr), y_clear)

        grad, hess = compute(approx_arr, y_ref)
        der1 = -np.asarray(grad, dtype=float)
        der2 = -np.asarray(hess, dtype=float)

        if weights is not None:
            # Per-object weights scale both derivatives.
            weight_arr = np.asarray(weights, dtype=float)
            der1 = der1 * weight_arr
            der2 = der2 * weight_arr

        if self.debug_log_path is not None:
            with open(self.debug_log_path, 'a') as f:
                f.write('NaNs in approxes: ' + str(np.any(np.isnan(approx_arr))) + '\n')
                f.write('NaNs in targets: ' + str(np.any(np.isnan(target_arr))) + '\n')
                f.write('NaNs in grads: ' + str(np.any(np.isnan(der1))) + '\n')
                f.write('NaNs in hesses: ' + str(np.any(np.isnan(der2))) + '\n')
                f.write('\n')

        return list(zip(der1.tolist(), der2.tolist()))

## CatBoost

In [15]:
# Model features for the training set: every date column, the relapse /
# metastasis columns and the follow-up columns are left out.
ctrain = x_train.drop(columns=[
    'birth_date', 'diag_date', 'first_site', 'bmd_date',
    'rad_date', 'surg_date', 'gamma_date', 'last_exam',
    'local_relapse', 'distant_m',
    'death', 'time',
])

In [16]:
# Same feature selection for the test set, which has no follow-up columns.
ctest = x_test.drop(columns=[
    'birth_date', 'diag_date', 'first_site', 'bmd_date',
    'rad_date', 'surg_date', 'gamma_date',
    'local_relapse', 'distant_m',
])

In [17]:
# Same feature selection for the alive-patient table as for the training set.
cap = ap.drop(columns=[
    'birth_date', 'diag_date', 'first_site', 'bmd_date',
    'rad_date', 'surg_date', 'gamma_date', 'last_exam',
    'local_relapse', 'distant_m',
    'death', 'time',
])

In [18]:
# Row order is ctrain first, then cap; y_all must be concatenated in the same
# order so that features and targets stay aligned by position.
train_all = pd.concat([ctrain, cap])

In [19]:
# First column of y_train followed by the offset alive-patient targets,
# in the same order as the rows of train_all.
y_all = np.concatenate([y_train.values[:,0], y_ap])

In [20]:
# Fixed seed so that the hold-out split, and every score computed on it, is
# reproducible across kernel restarts.
x_tr, x_t, y_tr, y_t = train_test_split(train_all, y_all, random_state=42)

In [None]:
# Regressor trained with the custom objective; RMSE on the eval set selects the
# best iteration, and the 'Iter' overfitting detector is configured with od_wait=100.
cbr = CatBoostRegressor(
    loss_function=CustomObjective(),
    eval_metric='RMSE',
    iterations=1000,
    learning_rate=0.01,
    use_best_model=True,
    od_type='Iter',
    od_wait=100,
)

In [None]:
# The hold-out part of the split is the evaluation set. Its targets come from
# y_all, so the alive-patient rows still carry the 0.25 offset when RMSE is computed.
cbr.fit(x_tr, y_tr, eval_set=(x_t, y_t))

0:	learn: 1150.0974773	test: 1100.6557448	best: 1100.6557448 (0)	total: 476ms	remaining: 7m 55s
1:	learn: 1149.5713093	test: 1100.1934604	best: 1100.1934604 (1)	total: 929ms	remaining: 7m 43s
2:	learn: 1149.1565743	test: 1099.7440793	best: 1099.7440793 (2)	total: 1.33s	remaining: 7m 21s
3:	learn: 1148.6570229	test: 1099.2499144	best: 1099.2499144 (3)	total: 1.88s	remaining: 7m 46s
4:	learn: 1148.2488745	test: 1098.8164073	best: 1098.8164073 (4)	total: 2.32s	remaining: 7m 42s
5:	learn: 1147.7339028	test: 1098.2995901	best: 1098.2995901 (5)	total: 2.73s	remaining: 7m 32s
6:	learn: 1147.2247733	test: 1097.7928034	best: 1097.7928034 (6)	total: 3.08s	remaining: 7m 17s
7:	learn: 1146.7754244	test: 1097.3354773	best: 1097.3354773 (7)	total: 3.43s	remaining: 7m 5s
8:	learn: 1145.7532084	test: 1096.8685907	best: 1096.8685907 (8)	total: 3.83s	remaining: 7m 2s
9:	learn: 1145.3232781	test: 1096.4096037	best: 1096.4096037 (9)	total: 4.24s	remaining: 7m
10:	learn: 1144.9534000	test: 1096.0214436	bes

In [None]:
# Short 7-iteration run with the same custom objective, reporting R2.
fcbr = CatBoostRegressor(
    loss_function=CustomObjective(),
    eval_metric='R2',
    iterations=7,
)

In [21]:
# Fitted on the original training rows only: no alive-patient rows and no
# evaluation set. y_train is passed as the DataFrame read from the CSV.
fcbr.fit(ctrain, y_train)

0:	learn: 0.0000014	total: 455ms	remaining: 2.73s
1:	learn: 0.0002588	total: 835ms	remaining: 2.09s
2:	learn: 0.0004555	total: 1.18s	remaining: 1.57s
3:	learn: 0.0019089	total: 1.51s	remaining: 1.13s
4:	learn: 0.0009011	total: 1.93s	remaining: 773ms
5:	learn: 0.0023002	total: 2.27s	remaining: 378ms
6:	learn: 0.0030000	total: 2.6s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1179bf7b8>