# <span style='color:#A80808'>Objective</span>

This notebook provides a baseline CatBoost model. A [quick model comparison](https://www.kaggle.com/sytuannguyen/model-selection) showed that CatBoost is a good choice for this problem. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier as cbc

# <span style='color:#A80808'>Data</span>

The data used to train the model was prepared in this [notebook](https://www.kaggle.com/sytuannguyen/spaceship-titanic-feature-engineering).

In [None]:
# Read the feature-engineered training table produced by the upstream notebook.
train = pd.read_csv(
    filepath_or_buffer='../input/spaceship-titanic-feature-engineering/train.csv'
)
# Remove the 'Transported' label from the feature frame (pop mutates `train`)
# and keep it separately as int64 values.
train_targets = train.pop(item='Transported').astype(dtype='int64')
# Preview the first three feature rows.
train.head(n=3)

# <span style='color:#A80808'>CatBoost model</span>

In [None]:
# Exhaustive hyperparameter template for the CatBoost classifier.
# Every option below starts out as None (presumably meaning "use the library
# default" -- confirm against the CatBoost docs); the handful of values this
# baseline actually sets are applied afterwards via `params.update(...)`.
# Key order matches the original listing. `cat_features` is deliberately not
# included (it was commented out in the original template).
params = dict.fromkeys([
    'iterations', 'learning_rate', 'depth', 'l2_leaf_reg', 'model_size_reg',
    'rsm', 'loss_function', 'border_count', 'feature_border_type',
    'per_float_feature_quantization', 'input_borders', 'output_borders',
    'fold_permutation_block', 'od_pval', 'od_wait', 'od_type', 'nan_mode',
    'counter_calc_method', 'leaf_estimation_iterations',
    'leaf_estimation_method', 'thread_count', 'random_seed', 'use_best_model',
    'verbose', 'logging_level', 'metric_period', 'ctr_leaf_count_limit',
    'store_all_simple_ctr', 'max_ctr_complexity', 'has_time',
    'allow_const_label', 'classes_count', 'class_weights', 'one_hot_max_size',
    'random_strength', 'name', 'ignored_features', 'train_dir', 'custom_loss',
    'custom_metric', 'eval_metric', 'bagging_temperature', 'save_snapshot',
    'snapshot_file', 'snapshot_interval', 'fold_len_multiplier',
    'used_ram_limit', 'gpu_ram_part', 'allow_writing_files',
    'final_ctr_computation_mode', 'approx_on_full_history', 'boosting_type',
    'simple_ctr', 'combinations_ctr', 'per_feature_ctr', 'task_type',
    'device_config', 'devices', 'bootstrap_type', 'subsample', 'sampling_unit',
    'dev_score_calc_obj_block_size', 'max_depth', 'n_estimators',
    'num_boost_round', 'num_trees', 'colsample_bylevel', 'random_state',
    'reg_lambda', 'objective', 'eta', 'max_bin', 'scale_pos_weight',
    'gpu_cat_features_storage', 'data_partition', 'early_stopping_rounds',
    'grow_policy', 'min_data_in_leaf', 'min_child_samples', 'max_leaves',
    'num_leaves', 'score_function', 'leaf_estimation_backtracking',
    'ctr_history_unit', 'monotone_constraints', 'feature_weights',
    'penalties_coefficient', 'first_feature_use_penalties',
    'model_shrink_rate', 'model_shrink_mode', 'langevin',
    'diffusion_temperature', 'posterior_sampling', 'boost_from_average',
    'text_features', 'tokenizers', 'dictionaries', 'feature_calcers',
    'text_processing',
])

# The only values this baseline overrides.
# NOTE(review): early_stopping_rounds (1000) is larger than iterations (100),
# so early stopping presumably can never trigger with these settings -- confirm
# whether that is intended.
params.update(
    iterations=100,
    verbose=1000,
    eval_metric='Accuracy',
    early_stopping_rounds=1000,
)

# <span style='color:#A80808'>Cross-validation</span>

In [None]:
# Stratified 10-fold cross-validation. One CatBoost classifier is trained per
# fold; each fitted model is kept in `models` (for the later test-set vote) and
# its validation accuracy is kept in `scores`.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores=[]
models=[]
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train_targets)):
    # The splitter yields positional indices, so features AND targets are both
    # sliced with .iloc. (Label-based `train_targets[idx]` only gives the same
    # rows while the Series index happens to be the default 0..n-1.)
    X_train = train.iloc[train_idx]
    X_val = train.iloc[val_idx]
    y_train = train_targets.iloc[train_idx]
    y_val = train_targets.iloc[val_idx]
    
    model = cbc(**params)
    # NOTE(review): the validation fold is passed as eval_set and is also the
    # fold scored below; if CatBoost picks its best iteration on eval_set, the
    # reported fold accuracy may be slightly optimistic -- confirm.
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    models.append(model)
    
    y_pred = model.predict(X_val)
    
    # accuracy_score's documented argument order is (y_true, y_pred).
    score = accuracy_score(y_val, y_pred)

    print(f'Fold {fold}, accuracy score: {score}')
    print('_'*60)
    scores.append(score)

print(f'Average accuracy score: {np.mean(scores)}')

# <span style='color:#A80808'>Prediction</span>

In [None]:
# Read the feature-engineered test table (same upstream dataset as the training data).
test = pd.read_csv(
    filepath_or_buffer='../input/spaceship-titanic-feature-engineering/test.csv'
)

In [None]:
# One prediction array per cross-validation model, in the order the models were trained.
preds = [fold_model.predict(test) for fold_model in models]

# <span style='color:#A80808'>Submission</span>

In [None]:
# Start from the competition's sample submission file.
submission = pd.read_csv(
    filepath_or_buffer='../input/spaceship-titanic/sample_submission.csv'
)

In [None]:
# Majority vote across the fold models: stack the per-model predictions,
# take the most frequent label per test row, flatten, and store it as bool.
# NOTE(review): with an even number of voting models a row can tie; confirm
# how stats.mode resolves ties if that matters for the submission.
submission['Transported'] = (
    stats.mode(np.array(preds), axis=0)[0]
    .reshape(-1)
    .astype(bool)
)
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)

# Preview the first rows of the submission.
submission.head()