In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 

# **Spaceship Titanic**

Welcome to the year 2912, where your data science skills are needed to solve a cosmic mystery. We've received a transmission from four lightyears away and things aren't looking good.

The Spaceship Titanic was an interstellar passenger liner launched a month ago. With almost 13,000 passengers on board, the vessel set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars.

While rounding Alpha Centauri en route to its first destination—the torrid 55 Cancri E—the unwary Spaceship Titanic collided with a spacetime anomaly hidden within a dust cloud. Sadly, it met a similar fate as its namesake from 1000 years before. Though the ship stayed intact, almost half of the passengers were transported to an alternate dimension!

<div style="width:100%;text-align: center;"> <img align=middle src="https://wallup.net/wp-content/uploads/2016/01/54205-spaceship-futuristic.jpg" alt="Spaceship" style="height:555px;margin-top:3rem;"> </div>

To help rescue crews and retrieve the lost passengers, you are challenged to predict which passengers were transported by the anomaly using records recovered from the spaceship’s damaged computer system.

Help save them and change history!

**train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

<ul>
    <li> PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.</li>
    <li>HomePlanet - The planet the passenger departed from, typically their planet of permanent residence. </li>
    <li>CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.</li>
    <li>Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.</li>
    <li>Destination - The planet the passenger will be debarking to.</li>
    <li>Age - The age of the passenger.</li>
    <li>VIP - Whether the passenger has paid for special VIP service during the voyage.</li>
    <li>RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.</li>
    <li>Name - The first and last names of the passenger.</li>
    <li>Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.</li>
</ul>

**test.csv** - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

**sample_submission.csv** - A submission file in the correct format.

- PassengerId - Id for each passenger in the test set.
- Transported - The target. For each passenger, predict either True or False.

### Reduce mem usage

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes and report the saving.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range contains the column's minimum and maximum (bounds inclusive).
    Every other numeric column is cast to float32 when its values lie strictly
    inside the float32 range, otherwise to float64. Object columns and other
    non-numeric dtypes are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned, so the caller's object
        is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        # Strings, datetimes, categoricals etc. have nothing to downcast.
        if col_type == object or not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for int_type in signed_int_types:
                limits = np.iinfo(int_type)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            # NOTE(review): bool and unsigned-integer columns also land here and
            # become float32, as before; kept for backward compatibility because
            # later cells may rely on a numeric 0/1 column.
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

### Data

In [None]:
!pip install gokinjo

In [None]:
import catboost as cb
import time
import optuna
import itertools
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from gokinjo import knn_kfold_extract
from gokinjo import knn_extract

import warnings
warnings.filterwarnings('ignore')

In [None]:
#!pip install catboost optuna tqdm eli5 gokinjo

In [None]:
# Show at most 100 rows when displaying a DataFrame
pd.set_option('display.max_rows', 100)
 
# Remove the limit on the number of displayed columns
pd.set_option('display.max_columns', None)
 
# Remove the limit on the number of characters shown per cell
pd.set_option('display.max_colwidth', None)

In [None]:
# Read the competition CSVs and downcast their numeric columns via reduce_mem_usage.
train = reduce_mem_usage(pd.read_csv('../input/spaceship-titanic/train.csv'))
test = reduce_mem_usage(pd.read_csv('../input/spaceship-titanic/test.csv'))
# Keep the label column and the test passenger ids aside for modelling / submission.
target = train.Transported
test_passengers = test.PassengerId
# Passenger ids of train followed by test.
ids = pd.concat([train.PassengerId, test.PassengerId])
# Cell output: the (rows, columns) shape of each frame.
train.shape, test.shape

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

### PassengerId, Cabin (collect information from Data):

In [None]:
def cab_pass(data):
    """Derive cabin, group and name features from the raw passenger columns.

    Works on a copy, so the frame passed in is not modified.

    New columns:
      * ``CabinDeck``, ``CabinNum``, ``CabinSide`` - the three parts of
        ``Cabin`` (form ``deck/num/side``, side is P for Port or S for
        Starboard), kept as strings; missing when ``Cabin`` is missing.
      * ``PassengerGroup``, ``PassengerNumber`` - the integer parts of
        ``PassengerId`` (form ``gggg_pp``: travel group and number within it).
      * ``FirstName``, ``FamilyName`` - the first two space-separated tokens
        of ``Name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Cabin``, ``PassengerId`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the new columns appended and ``Name`` and
        ``Cabin`` dropped.
    """
    result = data.copy()

    # Cabin -> deck / num / side. reindex guarantees all three columns exist
    # even if every cabin is missing or has fewer than three parts.
    cabin_parts = (
        result['Cabin'].astype(object).str.split('/', expand=True)
        .reindex(columns=[0, 1, 2])
    )
    result['CabinDeck'] = cabin_parts[0]
    result['CabinNum'] = cabin_parts[1]
    result['CabinSide'] = cabin_parts[2]

    # PassengerId -> travel group and position inside the group.
    id_parts = result['PassengerId'].str.split('_', expand=True)
    result['PassengerGroup'] = id_parts[0].astype(int)
    result['PassengerNumber'] = id_parts[1].astype(int)

    # Name -> first and family name (missing names stay missing).
    name_parts = (
        result['Name'].astype(object).str.split(' ', expand=True)
        .reindex(columns=[0, 1])
    )
    result['FirstName'] = name_parts[0]
    result['FamilyName'] = name_parts[1]

    # Treat a literal 'nan' string anywhere in the frame as a missing value,
    # so that isna() reports it.
    result = result.replace('nan', np.nan)

    # drop old columns
    return result.drop(['Name', 'Cabin'], axis=1)

In [None]:
trn = cab_pass(train)
tst = cab_pass(test)

In [None]:
trn.CabinDeck.nunique() # check

In [None]:
# Filling NaNs Based on Feature Engineering...
def fill_age(df, age_limit = 13):
    """Set every amenity bill to 0 for passengers younger than ``age_limit``.

    Rows whose ``Age`` is missing or not below the limit keep their current
    values (including missing ones). ``df`` is modified in place and returned.
    """
    is_under_limit = df['Age'] < age_limit
    for amenity in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        df[amenity] = np.where(is_under_limit, 0, df[amenity])

    return df

In [None]:
trn = fill_age(trn)
tst = fill_age(tst)

In [None]:
def age_groups(df, age_limit = 13):
    """Add ``AgeGroup``: 0 where ``Age`` is below ``age_limit``, 1 otherwise.

    A missing ``Age`` compares as "not below" and therefore gets 1.
    ``df`` is modified in place and returned.
    """
    below_limit = df['Age'] < age_limit
    df['AgeGroup'] = (~below_limit).astype(int)
    return df

In [None]:
trn = age_groups(trn)
tst = age_groups(tst)

In [None]:
def fill_missing(df):
    """Impute missing values column by column.

    Numeric columns are filled with the column mean; all other columns are
    filled with their most frequent value. A non-numeric column with no
    observed value at all has no mode and is left unchanged instead of
    raising. ``df`` is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numeric_cols = df.select_dtypes(include = np.number).columns
    other_cols = df.select_dtypes(exclude = np.number).columns

    for col in numeric_cols:
        df[col] = df[col].fillna(value = df[col].mean())

    for col in other_cols:
        modes = df[col].mode()
        if modes.empty:
            # Entirely missing column: nothing sensible to impute with.
            continue
        df[col] = df[col].fillna(value = modes.iloc[0])

    return df

In [None]:
trn = fill_missing(trn)
tst = fill_missing(tst)

### Features

Calculate all money spent

In [None]:
def total_billed(df):
    """Add ``TotalBilled``: the sum of the five amenity bills of each passenger.

    The columns are added left to right, so a missing value in any of them
    makes the total missing. ``df`` is modified in place and returned.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    running_total = df[spend_columns[0]]
    for column in spend_columns[1:]:
        running_total = running_total + df[column]
    df['TotalBilled'] = running_total
    return df

In [None]:
trn = total_billed(trn)
tst = total_billed(tst)

In [None]:
# Share of each amenity in the passenger's total bill.
# Passengers with TotalBilled == 0 produce 0/0 = NaN; that NaN is replaced by 0
# on the new share column only, so no other column of the frame is altered.
for frame in (trn, tst):
    for share_col, amount_col in (
        ('perc_spa', 'Spa'),
        ('perc_room', 'RoomService'),
        ('perc_food', 'FoodCourt'),
        ('perc_shopping', 'ShoppingMall'),
        ('perc_vrdeck', 'VRDeck'),
    ):
        frame[share_col] = (frame[amount_col] / frame['TotalBilled']).fillna(0)

Add a binary feature flagging passengers who spent nothing: `no_money_spent = 1` if `TotalBilled` is 0, otherwise 0.

In [None]:
# 1 when the passenger's total bill is exactly zero, 0 otherwise.
for frame in (trn, tst):
    frame['no_money_spent'] = (frame['TotalBilled'] == 0).astype(int)

This is interesting! It appears that Cabin_number is grouped into chunks of 300 cabins. This means we can compress this feature into a categorical one, which indicates which chunk each passenger is in.

I saw this in one of the public notebooks; in my opinion its usefulness is very debatable!

In [None]:
# trn['CabinNum'] = trn.CabinNum.apply(lambda x: int(x))
# tst['CabinNum'] = tst.CabinNum.apply(lambda x: int(x))
# # New features - training set
# trn['Cabin_region1']=(trn['CabinNum']<300).astype(int)   # one-hot encoding
# trn['Cabin_region2']=((trn['CabinNum']>=300) & (trn['CabinNum']<600)).astype(int)
# trn['Cabin_region3']=((trn['CabinNum']>=600) & (trn['CabinNum']<900)).astype(int)
# trn['Cabin_region4']=((trn['CabinNum']>=900) & (trn['CabinNum']<1200)).astype(int)
# trn['Cabin_region5']=((trn['CabinNum']>=1200) & (trn['CabinNum']<1500)).astype(int)
# trn['Cabin_region6']=((trn['CabinNum']>=1500) & (trn['CabinNum']<1800)).astype(int)
# trn['Cabin_region7']=(trn['CabinNum']>=1800).astype(int)

# # New features - test set
# tst['Cabin_region1']=(tst['CabinNum']<300).astype(int)   # one-hot encoding
# tst['Cabin_region2']=((tst['CabinNum']>=300) & (tst['CabinNum']<600)).astype(int)
# tst['Cabin_region3']=((tst['CabinNum']>=600) & (tst['CabinNum']<900)).astype(int)
# tst['Cabin_region4']=((tst['CabinNum']>=900) & (tst['CabinNum']<1200)).astype(int)
# tst['Cabin_region5']=((tst['CabinNum']>=1200) & (tst['CabinNum']<1500)).astype(int)
# tst['Cabin_region6']=((tst['CabinNum']>=1500) & (tst['CabinNum']<1800)).astype(int)
# tst['Cabin_region7']=(tst['CabinNum']>=1800).astype(int)

# # # Plot distribution of new features
# # plt.figure(figsize=(10,4))
# # train['Cabin_regions_plot']=(train['Cabin_region1']+2*train['Cabin_region2']+3*train['Cabin_region3']+4*train['Cabin_region4']+5*train['Cabin_region5']+6*train['Cabin_region6']+7*train['Cabin_region7']).astype(int)
# # sns.countplot(data=train, x='Cabin_regions_plot', hue='Transported')
# # plt.title('Cabin regions')
# # train.drop('Cabin_regions_plot', axis=1, inplace=True)

In [None]:
# cat_feats = train.select_dtypes(exclude=np.number).columns.to_list()

In [None]:
# cat_feats

### Advanced feature eng

In [None]:
%%time
Deck = trn.groupby('CabinDeck').aggregate({'TotalBilled': 'sum', 'Transported': 'sum', 'CryoSleep': 'sum', 'PassengerId': 'size'}).reset_index()
Deck['AvgSpended'] = Deck['TotalBilled'] / Deck['PassengerId']
Deck['TransportedPercentage'] = Deck['Transported'] / Deck['PassengerId']
Deck['CryoSleepPercentage'] = Deck['CryoSleep'] / Deck['PassengerId']
Deck = Deck.sort_values('AvgSpended', ascending = False)
Deck.head(10)

In [None]:
%%time
# Attach the per-deck statistics to the working frames. They were previously
# stored in train_rel / test_rel, which the following cell overwrites, so the
# deck features never reached trn / tst.
# Existing copies are dropped first so that re-running the cell does not create
# suffixed duplicate columns.
# NOTE: TransportedPercentage is derived from the training labels; it is listed
# in `remove` further down and should stay out of the model inputs.
deck_stats = Deck[['CabinDeck', 'TransportedPercentage', 'AvgSpended']]
deck_stat_cols = ['TransportedPercentage', 'AvgSpended']
trn = trn.drop(columns = deck_stat_cols, errors = 'ignore').merge(deck_stats, how = 'left', on = ['CabinDeck'])
tst = tst.drop(columns = deck_stat_cols, errors = 'ignore').merge(deck_stats, how = 'left', on = ['CabinDeck'])

In [None]:
%%time
train_rel = trn.groupby('FamilyName')['PassengerId'].count().reset_index()
test_rel = tst.groupby('FamilyName')['PassengerId'].count().reset_index()

In [None]:
%%time
train_rel = train_rel.rename(columns = {'PassengerId': 'NumRelatives'})
test_rel = test_rel.rename(columns = {'PassengerId': 'NumRelatives'})

In [None]:
%%time
trn = trn.merge(train_rel[['FamilyName', 'NumRelatives']], how = 'left', on = ['FamilyName'])
tst = tst.merge(test_rel[['FamilyName', 'NumRelatives']], how = 'left', on = ['FamilyName'])

In [None]:
%%time
train_rel = trn.groupby('PassengerGroup')['PassengerId'].count().reset_index()
test_rel = tst.groupby('PassengerGroup')['PassengerId'].count().reset_index()

In [None]:
%%time
train_rel = train_rel.rename(columns = {'PassengerId': 'GroupSize'})
test_rel = test_rel.rename(columns = {'PassengerId': 'GroupSize'})

In [None]:
%%time
trn = trn.merge(train_rel[['PassengerGroup', 'GroupSize']], how = 'left', on = ['PassengerGroup'])
tst = tst.merge(test_rel[['PassengerGroup', 'GroupSize']], how = 'left', on = ['PassengerGroup'])

In [None]:
%%time
# A list of the original variables from the dataset
numerical_features = [
                      'Age', 
                      'RoomService', 
                      'FoodCourt', 
                      'ShoppingMall', 
                      'Spa', 
                      'VRDeck', 
                      'TotalBilled',
                      'no_money_spent'
                     ]

categorical_features = [
                        #'Name',
                        'FirstName',
                        'FamilyName',
                        'CabinNum',
                        #'TravelGroup',
                       ]


categorical_features_onehot = [
                               'HomePlanet',
                               'CryoSleep',
                               #'Cabin',
                               'CabinDeck',
                               'CabinSide',
                               'Destination',
                               'VIP',
                               #'AgeGroup'
                               ]

# target_feature = 'Transported'
# The label is already held in `target`; remove it from the feature frame.
# errors='ignore' keeps the cell re-runnable once the column is gone.
trn = trn.drop(columns=['Transported'], errors='ignore')

In [None]:
# trn.loc[:, categorical_features_onehot]
trn.select_dtypes(exclude=np.number).columns.to_list()

In [None]:
# Turn the VIP / CryoSleep flags into their string form ('True' / 'False')
# in both frames.
for frame in (trn, tst):
    for flag_col in ('VIP', 'CryoSleep'):
        frame[flag_col] = frame[flag_col].map(str)

In [None]:
def enc(train, test, categorical_features_onehot, categorical_features):
    """Encode the categorical columns of the train and test frames consistently.

    Both frames are stacked (without ``PassengerId``) before encoding, so the
    one-hot columns are identical for train and test even when a category is
    present in only one of them; such a category becomes an all-zero column
    in the other split rather than NaN.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Input frames; neither is modified. Both must contain ``PassengerId``.
    categorical_features_onehot : list of str
        Columns replaced by one-hot indicator columns (``<column>_<value>``).
    categorical_features : list of str
        Columns replaced by integer codes from a ``LabelEncoder`` fitted on
        train and test together. A column named ``'Transported'`` is skipped.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames, each carrying the index of its input.
    """
    n_train = train.shape[0]
    onehot_cols = list(categorical_features_onehot)

    # ignore_index gives the stacked frame a unique index for the column-wise
    # concat below; the original indexes are restored before returning.
    combined = pd.concat(
        [train.drop(['PassengerId'], axis=1), test.drop(['PassengerId'], axis=1)],
        ignore_index=True,
    )

    # dummies
    dummies = pd.get_dummies(combined.loc[:, onehot_cols])
    combined = pd.concat([combined.drop(onehot_cols, axis=1), dummies], axis=1)

    # label
    for col in categorical_features:
        if col != 'Transported':
            combined[col] = LabelEncoder().fit_transform(combined[col])

    encoded_train = combined.iloc[:n_train].copy()
    encoded_test = combined.iloc[n_train:].copy()
    encoded_train.index = train.index
    encoded_test.index = test.index

    return encoded_train, encoded_test
        
train_enc, test_enc = enc(trn, tst, categorical_features_onehot, categorical_features)

In [None]:
%%time
# Convert X and y to Numpy arrays as library requirements
X_array = train_enc[numerical_features].to_numpy()
y_array = target.to_numpy()
X_test_array = test_enc[numerical_features].to_numpy()

In [None]:
K = 2

In [None]:
%%time
# It Takes almost  35min 21s for K = 2 and 50_000 rows...
# It Takes almost  17min 36s for K = 1 and 50_000 rows...
KNN_trn_features = knn_kfold_extract(X_array, y_array, k = K, normalize = 'standard')

In [None]:
%%time
KNN_trn_features

In [None]:
KNN_trn_features.shape

In [None]:
%%time
knn_cols = ['KNN_K1_01',
            'KNN_K1_02',
            'KNN_K2_01',
            'KNN_K2_02',]

In [None]:
# Wrap the KNN features in a frame that shares the training index, so they can
# be concatenated column-wise with the encoded training data.
KNN_feat = pd.DataFrame(KNN_trn_features, columns = knn_cols).set_index(train.index)

In [None]:
%%time
# Drop any KNN columns left from a previous run so re-executing the cell does
# not duplicate them, then show the frame that was just extended.
train_enc = pd.concat([train_enc.drop(columns = knn_cols, errors = 'ignore'), KNN_feat], axis = 1)
train_enc.head()

In [None]:
%%time
KNN_tst_features = knn_extract(X_array, y_array, X_test_array, k = K, normalize = 'standard')
KNN_feat = pd.DataFrame(KNN_tst_features, columns = knn_cols).set_index(test.index)

In [None]:
test_enc = pd.concat([test_enc, KNN_feat], axis = 1)
test_enc.head()

In [None]:
%%time
remove = ['PassengerId', 
          #'Route', 
          #'FirstName_Enc', 
          #'CabinNum_Enc', 
          #'Transported',
          #'Cabin',
          'TransportedPercentage',
          #'IsKid', 
          #'IsAdult', 
          #'IsOlder'
          #'RoomService',
          #'FoodCourt',
          #'ShoppingMall',
          #'Spa',
          #'VRDeck',
          'KNN_K2_02',
          'KNN_K2_01',
         ]
#features = [feat for feat in train.columns if feat not in remove]
# features

In [None]:
features = [#'Age',
            'RoomService',
            'FoodCourt',
            'ShoppingMall',
            'Spa',
            'VRDeck',
            'TotalBilled',
            #'AvgSpended',
            #'NumRelatives',
            #'GroupSize',
            'FamilyName',
            #'TravelGroup',
            'HomePlanet_Earth',
            'HomePlanet_Europa',
            'HomePlanet_Mars',
            'CryoSleep_False',
            'CryoSleep_True',
            'CabinDeck_A',
            'CabinDeck_B',
            'CabinDeck_C',
            'CabinDeck_D',
            'CabinDeck_E',
            'CabinDeck_F',
            'CabinDeck_G',
            'CabinDeck_T',
            'CabinSide_P',
            'CabinSide_S',
            'Destination_55 Cancri e',
            'Destination_PSO J318.5-22',
            'Destination_TRAPPIST-1e',
            'VIP_False',
            'VIP_True',
            #'AgeGroup_0',
            #'AgeGroup_1',
            #'KNN_K1_01',
            #'KNN_K1_02'
]

In [None]:
train_enc.info()

### CATBOOST OPTUNA

In [None]:
def features_choice(tr, ts, features):
    """Return ``(tr[features], ts[features])``: both frames restricted to ``features``."""
    return tuple(frame[features] for frame in (tr, ts))
train_enc, test_enc = features_choice(train_enc, test_enc, features)

In [None]:
def catboost_cross_validation(params, X, y, cv, cat_features=None):
    """Fit one CatBoost classifier per fold and collect out-of-fold predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``cb.CatBoostClassifier``.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series or array-like
        Binary target aligned with the rows of ``X`` by position.
    cv : splitter
        Object with a ``split(X, y)`` method yielding positional
        ``(train_idx, valid_idx)`` arrays, e.g. ``StratifiedKFold``.
    cat_features : list, optional
        Categorical feature names or indices for CatBoost. When omitted, the
        notebook-level ``cat_feats`` list is used, as before.

    Returns
    -------
    (list, numpy.ndarray)
        The fitted model of every fold, and the out-of-fold positive-class
        probability for every row of ``X``.
    """
    if cat_features is None:
        cat_features = cat_feats  # notebook-level default

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f'{time.ctime()}, Cross-validation, {X.shape[0]} rows, {X.shape[1]} cols')

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positions, so select by position: label-based
        # selection only works while the index happens to be 0..n-1.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        if hasattr(y, 'iloc'):
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        else:
            y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(eval_metric="AUC",
                                      task_type="CPU",
                                      **params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            cat_features=cat_features,
            early_stopping_rounds=75,
            verbose=0)

        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold + 1}, Valid score = {score}")
        folds_scores.append(score)
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("=" * 65)
    return estimators, oof_preds

In [None]:
def objective_cb(trial, X, y):
    """Optuna objective: validation ROC AUC of a CatBoost classifier.

    Samples a hyper-parameter set from ``trial``, fits a classifier on 70% of
    the data with early stopping on the remaining 30%, and returns the ROC AUC
    on that hold-out. The split is seeded and stratified so that every trial
    is scored on the same rows and trial scores are comparable.

    Parameters
    ----------
    trial : optuna.trial.Trial
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Binary target aligned with ``X``.

    Returns
    -------
    float
        ROC AUC on the hold-out split (to be maximised).
    """
    split_seed = 42  # fixed so that trials differ only in their hyper-parameters

    param_grid = {
        'depth': trial.suggest_int('depth', 1, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 2000, 10000, step=1000),
        # Upper bound 960 is the largest value reachable from 10 in steps of 50
        # below 1000, so the range is consistent with the step.
        'border_count': trial.suggest_int('border_count', 10, 960, step=50),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1e0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),
        # NOTE(review): the model seed is searched like a hyper-parameter; kept
        # because study.best_params is passed on to the cross-validation step.
        "random_state": trial.suggest_categorical("random_state", [0, 7, 9, 999, 2022, 129324]),
    }

    # Conditional Hyper-Parameters
    if param_grid["bootstrap_type"] == "Bayesian":
        param_grid["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param_grid["bootstrap_type"] == "Bernoulli":
        param_grid["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=split_seed, stratify=y
    )
    model = cb.CatBoostClassifier(
        eval_metric="AUC",
        task_type="CPU",
        verbose=0,
        **param_grid
        )

    # Single eval set: multiple eval sets are not supported on GPU.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], cat_features=cat_feats,
              early_stopping_rounds=100, verbose=0)

    preds = model.predict_proba(test_x)[:, 1]

    score = roc_auc_score(test_y, preds)

    return score

In [None]:
cat_feats = train_enc.select_dtypes(exclude=np.number).columns.to_list() # empty now

In [None]:
# from catboost.utils import get_gpu_device_count
# print('I see %i GPU devices' % get_gpu_device_count())

In [None]:
study_cb = optuna.create_study(direction="maximize", study_name="CatBoost Classifier")
func = lambda trial: objective_cb(trial, train_enc, target)
study_cb.optimize(func, n_trials=100, show_progress_bar=True)

In [None]:
print("Number of completed trials: {}".format(len(study_cb.trials)))
print("Best trial:")
trial = study_cb.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
study_cb.best_params

### CB KFold with best params

In [None]:
cv=StratifiedKFold(n_splits=7, shuffle=True) #random_state=9, 
cb_estimators, cb_oof_preds = catboost_cross_validation(
    params=study_cb.best_params, X=train_enc, #'target drop
    y=target, cv=cv
)

In [None]:
cb_cv = pd.DataFrame()
for n, est in enumerate(cb_estimators):
    pred = est.predict_proba(test_enc)[:, 1]
    cb_cv[f'pred_{n}'] = pred

In [None]:
amean = cb_cv.mean(axis=1)

In [None]:
pred_amean = pd.DataFrame(amean, columns=['Transported'], index=None)

In [None]:
pred_amean.insert(0, 'PassengerId', test_passengers)

In [None]:
pred_amean.Transported = pred_amean.Transported.apply(lambda x: 'True' if x > 0.5 else 'False')

In [None]:
pred_amean.to_csv('ansamble_best_options.csv', index=False, encoding='utf-8')

### Tune preds

In [None]:
preds = amean

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10,4))
sns.histplot(preds, binwidth=0.01, kde=True)
plt.title('Predicted probabilities')
plt.xlabel('Probability')

In [None]:
# Proportion (in test set) we get from rounding
print(np.round(100*np.round(preds).sum()/len(preds),2))

In [None]:
# Proportion of predicted positive (transported) classes
def preds_prop(preds_arr, thresh):
    """Return the fraction of entries of ``preds_arr`` that are >= ``thresh``."""
    at_or_above = preds_arr >= thresh
    n_positive = at_or_above.astype(int).sum()
    return n_positive / len(at_or_above)

# Plot proportions across a range of thresholds
def plot_preds_prop(preds_arr, target_prop=0.519):
    """Plot the predicted-positive share against the threshold and pick one.

    Thresholds 0.000, 0.001, ..., 0.999 are evaluated with ``preds_prop``. The
    returned threshold is the one whose predicted-positive share is closest to
    ``target_prop``.

    Parameters
    ----------
    preds_arr : array-like
        Predicted probabilities.
    target_prop : float, default 0.519
        Desired share of positive predictions; experiment with this value.

    Returns
    -------
    float
        The selected threshold (also printed).
    """
    # Array of thresholds
    T_array=np.arange(0,1,0.001)

    # Calculate proportions
    prop=np.array([preds_prop(preds_arr, T) for T in T_array], dtype=float)

    # Plot proportions
    plt.figure(figsize=(10,4))
    plt.plot(T_array, prop)
    plt.axhline(y=target_prop, color='r', linestyle='--')
    # Label sits just below the target line (y=0.45 for the default target).
    plt.text(-0.02,target_prop-0.069,f'Target proportion: {target_prop}', fontsize=14)
    plt.title('Predicted target distribution vs threshold')
    plt.xlabel('Threshold')
    plt.ylabel('Proportion')

    # Find optimal threshold (the one that leads to the proportion being closest to target_prop)
    T_opt=T_array[np.abs(prop-target_prop).argmin()]
    print('Optimal threshold:', T_opt)
    return T_opt
    
T_opt=plot_preds_prop(preds)

In [None]:
# Classify test set using optimal threshold
preds_tuned=(preds>=T_opt).astype(int)

In [None]:
# Sample submission (to get right format)
sub=pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

# Add predictions as booleans (0 -> False, 1 -> True). Casting the prediction
# column directly avoids a replace() over every column of the frame, and the
# positional assignment fails loudly if the lengths do not match.
sub['Transported']=np.asarray(preds_tuned).astype(bool)

# Prediction distribution
plt.figure(figsize=(4,4))
sub['Transported'].value_counts().plot.pie(colors=['orange', 'green'], explode=[0.05,0.05], autopct='%1.1f%%', shadow=True, textprops={'fontsize':12}).set_title("Prediction distribution")

In [None]:
# Output to csv
sub.to_csv('submission.csv', index=False)

### Permutation importance

In [None]:
from sklearn.inspection import permutation_importance

importance = permutation_importance(
    cb_estimators[5], train_enc, target, scoring="roc_auc", n_jobs=8, random_state=9999999
)
importance_scores = pd.DataFrame({
    "features": train_enc.columns,
    "importance-mean": importance.importances_mean,
    "importance-std": importance.importances_std,
})
importance_scores = importance_scores.sort_values(
    by="importance-mean", ascending=False
)
importance_scores = importance_scores.reset_index(drop=True)
# decrease_scores = importance_scores[importance_scores["importance-mean"]<=0]
# decrease_scores = decrease_scores.reset_index(drop=True)
# decrease_scores

importance_scores

### SHAP

In [None]:
import shap
shap.initjs()

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(train_enc, target, train_size=0.80,
                                                             shuffle=True)

# tr = tr.reset_index(drop=True)
# tr_target = tr_target.reset_index(drop=True)
# x_validation = x_validation.reset_index(drop=True)
# y_validation = y_validation.reset_index(drop=True)

In [None]:
%%time
#model, X = cb.CatBoostRegressor, train_cb
# model, X = best_model_lgbm, X_lgbm
# model, X = best_model_cb, X_cb

# Build a SHAP tree explainer for the fold model stored at index 5 of cb_estimators.
explainer = shap.TreeExplainer(cb_estimators[5]) # best 1 model
# NOTE(review): a later cell indexes the result of shap_values(train_enc) as
# shap_values[7, :], i.e. as a 2-D (rows, features) array. If that holds here,
# the [1] below selects the SHAP values of row 1, not those of class 1 - confirm.
shap_values = explainer.shap_values(train_enc)[1]

In [None]:
# Explain the first training row. The SHAP values are computed for exactly that
# row, so the contributions shown always belong to the feature values displayed.
first_row_shap = explainer.shap_values(train_enc.iloc[[0]])
first_row_base = explainer.expected_value
if isinstance(first_row_shap, list):
    # Per-class output: keep the positive class.
    first_row_shap = first_row_shap[1]
if isinstance(first_row_base, list):
    first_row_base = first_row_base[1]
shap.force_plot(first_row_base, first_row_shap[0], train_enc.iloc[0,:])

In [None]:
shap_values = explainer.shap_values(train_enc)
shap.summary_plot(shap_values, train_enc, max_display=200)

In [None]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[7, :], train_enc.iloc[7, :], link='logit')

In [None]:
shap.summary_plot(shap_values, train_enc, plot_type="bar", max_display=train_enc.shape[1])

In [None]:
# Base value of the explainer; for a per-class list keep the positive class.
expected_value = explainer.expected_value
if isinstance(expected_value, list):
    expected_value = expected_value[1]
print(f"Explainer expected value: {expected_value}")

# Explain the first 20 training rows. The subset has its own name so that the
# notebook-level `features` list of model inputs is not overwritten.
select = range(20)
explained_rows = train_enc.iloc[select]
features_display = train_enc.loc[explained_rows.index]
shap_values = explainer.shap_values(explained_rows)
# NOTE(review): this holds the result of calling the explainer on the rows, not
# interaction values; it is not used by the decision plot.
shap_interaction_values = explainer(explained_rows)

if isinstance(shap_interaction_values, list):
    shap_interaction_values = shap_interaction_values[1]

In [None]:
shap.decision_plot(
    expected_value, shap_values, features_display, link='logit'
)