In [1]:
# install libraries 
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 66kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
# load titanic dataset 
from catboost.datasets import titanic
import numpy as np

# Fetch the train and test frames returned by catboost.datasets.titanic()
train_df, test_df = titanic()

# Preview the first five training rows
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Count missing values per column, then show only the columns that have any
null_value_stats = train_df.isna().sum()
null_value_stats.loc[null_value_stats.ne(0)]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [4]:
# Replace every missing value with the sentinel -999, in place, in both frames
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

In [5]:
# Separate the target column from the feature columns
y = train_df['Survived']
X = train_df.drop(columns='Survived')

In [6]:
# Show the dtype of every feature column
print(X.dtypes)

# Treat every non-float column as categorical and collect its positional index.
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24; the comparison result is the same.
categorical_features_indices = np.where(X.dtypes != float)[0]

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [7]:
# create training and validation dataset 
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

# The unlabelled test frame is used as-is for prediction
X_test = test_df

In [8]:
# load the libraries 
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

In [9]:
# Create the CatBoost classifier
model = CatBoostClassifier(**{
    'custom_loss': ['Accuracy'],
    'random_seed': 42,
    'logging_level': 'Silent',
})

In [10]:
# Train on the training split, using the validation split as the eval set
model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=categorical_features_indices,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [11]:
# Reuse the fitted model's parameters for cross-validation, forcing Logloss as the loss
cv_params = {**model.get_params(), 'loss_function': 'Logloss'}

# Cross-validate on the full labelled data set
cv_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(cv_pool, cv_params, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [12]:
# Report the best mean CV accuracy, its std at that step, and the step itself
best_step = np.argmax(cv_data['test-Accuracy-mean'])
best_mean = np.max(cv_data['test-Accuracy-mean'])
best_std = cv_data['test-Accuracy-std'][best_step]
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    best_mean, best_std, best_step
))

Best validation accuracy score: 0.83±0.02 on step 543


In [13]:
# Same best mean CV accuracy, printed without rounding
print('Precise validation accuracy score: {}'.format(
    np.max(cv_data['test-Accuracy-mean'])
))

Precise validation accuracy score: 0.8260381593714928


In [14]:
# Predict class labels and class probabilities for the test set
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)

# Show the first ten entries of each
for preview in (predictions, predictions_probs):
    print(preview[:10])

[0 0 0 0 1 0 1 0 1 0]
[[0.85473931 0.14526069]
 [0.76313031 0.23686969]
 [0.88972889 0.11027111]
 [0.87876173 0.12123827]
 [0.3611047  0.6388953 ]
 [0.90513381 0.09486619]
 [0.33434185 0.66565815]
 [0.78468564 0.21531436]
 [0.39429048 0.60570952]
 [0.94047549 0.05952451]]


In [15]:
# Train a small model without passing random_seed, then inspect the seed it ended up with
model_without_seed = CatBoostClassifier(
    iterations=10,
    logging_level='Silent',
).fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

Random seed assigned for this model: 0


In [16]:
# Shared training parameters reused by the following experiments
params = dict(
    iterations=500,
    learning_rate=0.1,
    eval_metric='Accuracy',
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)

# Wrap the training and validation splits in CatBoost pools
train_pool = Pool(
    X_train, y_train,
    cat_features=categorical_features_indices,
)
validate_pool = Pool(
    X_validation, y_validation,
    cat_features=categorical_features_indices,
)

In [17]:
# Train once with use_best_model=False ("simple") ...
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

# ... and once with use_best_model=True ("best"), everything else unchanged
best_model_params = {**params, 'use_best_model': True}
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

# Compare both models on the validation split
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

Simple model validation accuracy: 0.8027

Best model validation accuracy: 0.8251


In [18]:
# use the early stopping method 
%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 4.9 s, sys: 585 ms, total: 5.49 s
Wall time: 3.05 s


In [19]:
# define the early stopping parameters 
%%time
earlystop_params = params.copy()
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

CPU times: user 731 ms, sys: 88.3 ms, total: 819 ms
Wall time: 486 ms


In [20]:
# Tree count and validation accuracy of the simple model
simple_accuracy = accuracy_score(y_validation, model.predict(X_validation))
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(simple_accuracy))
print('')

# Tree count and validation accuracy of the early-stopped model
earlystop_accuracy = accuracy_score(y_validation, earlystop_model.predict(X_validation))
print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(earlystop_accuracy))

Simple model tree count: 500
Simple model validation accuracy: 0.8027

Early-stopped model tree count: 82
Early-stopped model validation accuracy: 0.8072


In [21]:
# Train a short 10-iteration model whose raw predictions serve as a baseline
current_params = {**params, 'iterations': 10}
model = CatBoostClassifier(**current_params)
model.fit(X_train, y_train, categorical_features_indices)

# Raw (untransformed) scores of that model on the training rows
baseline = model.predict(X_train, prediction_type='RawFormulaVal')

# Fit again, this time passing those scores as the baseline
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline);

In [22]:
# First run: 5 verbose iterations with snapshot saving enabled
params_with_snapshot = {
    **params,
    'iterations': 5,
    'learning_rate': 0.5,
    'logging_level': 'Verbose',
}
model = CatBoostClassifier(**params_with_snapshot).fit(
    train_pool,
    eval_set=validate_pool,
    save_snapshot=True,
)

# Second run: raise the iteration budget to 10 and fit again with snapshots enabled
params_with_snapshot.update(iterations=10, learning_rate=0.1)
model = CatBoostClassifier(**params_with_snapshot).fit(
    train_pool,
    eval_set=validate_pool,
    save_snapshot=True,
)

0:	learn: 0.8053892	test: 0.7937220	best: 0.7937220 (0)	total: 2.88ms	remaining: 11.5ms
1:	learn: 0.8008982	test: 0.7982063	best: 0.7982063 (1)	total: 5.17ms	remaining: 7.75ms
2:	learn: 0.8008982	test: 0.7937220	best: 0.7982063 (1)	total: 7.47ms	remaining: 4.98ms
3:	learn: 0.8113772	test: 0.7892377	best: 0.7982063 (1)	total: 9.76ms	remaining: 2.44ms
4:	learn: 0.8173653	test: 0.8026906	best: 0.8026906 (4)	total: 12.2ms	remaining: 0us

bestTest = 0.802690583
bestIteration = 4

5:	learn: 0.8173653	test: 0.8026906	best: 0.8026906 (4)	total: 15ms	remaining: 11.2ms
6:	learn: 0.8248503	test: 0.8026906	best: 0.8026906 (4)	total: 22.1ms	remaining: 14.8ms
7:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 26ms	remaining: 9.19ms
8:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 27.1ms	remaining: 3.73ms
9:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 29.6ms	remaining: 0us

bestTest = 0.802690583
bestIteration = 4



### 3.5 User Defined Objective Function
It is possible to define your own objective function. As an example, let's implement Logloss as a custom objective.

In [23]:
# define the logloss objective function 
class LoglossObjective(object):
    """User-defined Logloss objective, passed to CatBoost as ``loss_function``.

    For each object it returns the first and second derivatives of the
    log-likelihood ``t * a - log(1 + exp(a))`` with respect to the raw
    prediction ``a`` (``t`` is the binary target).
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Compute per-object derivatives of the objective.

        Args:
            approxes: indexable sequence of raw (pre-sigmoid) predictions.
            targets: indexable sequence of targets, same length as
                ``approxes``; any value ``> 0.0`` is treated as the positive class.
            weights: optional indexable sequence of per-object weights, same
                length as ``approxes``, or ``None`` for unweighted data.

        Returns:
            A list of ``(der1, der2)`` tuples, one per object.

        Raises:
            AssertionError: if the input lengths do not match.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            approx = approxes[index]
            # Numerically stable sigmoid: only ever exponentiate a non-positive
            # value, so a large |approx| cannot overflow to inf and produce NaN.
            if approx >= 0:
                p = 1.0 / (1.0 + np.exp(-approx))
            else:
                e = np.exp(approx)
                p = e / (1.0 + e)

            # d/da and d2/da2 of t*a - log(1 + exp(a))
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

In [24]:
# Classifier driven by the user-defined objective, evaluated with the built-in Logloss
model = CatBoostClassifier(
    loss_function=LoglossObjective(),
    eval_metric="Logloss",
    iterations=10,
    random_seed=42,
)

# Train on the training pool
model.fit(train_pool)

# Raw (untransformed) scores for the test set
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

0:	learn: 0.6827074	total: 11.6ms	remaining: 104ms
1:	learn: 0.6723302	total: 22.6ms	remaining: 90.3ms
2:	learn: 0.6619449	total: 32.2ms	remaining: 75.1ms
3:	learn: 0.6521466	total: 42.7ms	remaining: 64ms
4:	learn: 0.6435227	total: 52.9ms	remaining: 52.9ms
5:	learn: 0.6353848	total: 63.4ms	remaining: 42.3ms
6:	learn: 0.6277210	total: 74.8ms	remaining: 32ms
7:	learn: 0.6210282	total: 86.2ms	remaining: 21.6ms
8:	learn: 0.6141958	total: 96.6ms	remaining: 10.7ms
9:	learn: 0.6073236	total: 108ms	remaining: 0us


In [25]:
# define the log loss metrics 
class LoglossMetric(object):
    """User-defined Logloss metric, passed to CatBoost as ``eval_metric``."""

    def get_final_error(self, error, weight):
        """Turn the accumulated ``(error, weight)`` sums into the metric value.

        The tiny constant in the denominator avoids division by zero when the
        total weight is 0.
        """
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Return False: smaller values of this metric are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted Logloss over all objects.

        Args:
            approxes: a sequence containing exactly one indexable sequence of
                raw (pre-sigmoid) predictions.
            target: indexable sequence of targets, same length as ``approxes[0]``.
            weight: optional indexable sequence of per-object weights, or
                ``None`` to weight every object as 1.0.

        Returns:
            ``(error_sum, weight_sum)``: the weighted sum of per-object losses
            and the sum of weights.

        Raises:
            AssertionError: if ``approxes`` does not hold exactly one sequence
                or its length differs from ``target``.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # log(1 + exp(a)) computed as logaddexp(0, a) so that a large
            # raw score does not overflow exp() to inf.
            error_sum += -w * (target[i] * approx[i] - np.logaddexp(0.0, approx[i]))

        return error_sum, weight_sum

In [26]:
# Classifier trained with the built-in Logloss and evaluated with the user-defined metric
model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric=LoglossMetric(),
    iterations=10,
    random_seed=42,
)

# Train on the training pool
model.fit(train_pool)

# Raw (untransformed) scores for the test set
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.5521578	total: 5.24ms	remaining: 47.2ms
1:	learn: 0.4885686	total: 9.68ms	remaining: 38.7ms
2:	learn: 0.4607664	total: 14.3ms	remaining: 33.4ms
3:	learn: 0.4418819	total: 18.8ms	remaining: 28.2ms
4:	learn: 0.4278162	total: 23.3ms	remaining: 23.3ms
5:	learn: 0.4151036	total: 27.7ms	remaining: 18.5ms
6:	learn: 0.4099336	total: 32.4ms	remaining: 13.9ms
7:	learn: 0.4095363	total: 36.1ms	remaining: 9.03ms
8:	learn: 0.4032867	total: 40.6ms	remaining: 4.51ms
9:	learn: 0.3929586	total: 45.3ms	remaining: 0us
