In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):  # `_` receives the unused sub-directory list
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import catboost as cb
import numpy as np
import time

import optuna
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score

### Data

**train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

<ul>
    <li> PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.</li>
    <li>HomePlanet - The planet the passenger departed from, typically their planet of permanent residence. </li>
    <li>CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.</li>
    <li>Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.</li>
    <li>Destination - The planet the passenger will be debarking to.</li>
    <li>Age - The age of the passenger.</li>
    <li>VIP - Whether the passenger has paid for special VIP service during the voyage.</li>
    <li>RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.</li>
    <li>Name - The first and last names of the passenger.</li>
    <li>Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.</li>
</ul>

**test.csv** - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

**sample_submission.csv** - A submission file in the correct format.
PassengerId - Id for each passenger in the test set.
Transported - The target. For each passenger, predict either True or False.

In [None]:
train = pd.read_csv('../input/spaceship-titanic/train.csv')

In [None]:
test = pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
train.shape, test.shape

In [None]:
target = train.Transported
test_passengers = test.PassengerId

In [None]:
# Modelling frames: the training features without the target and the name
# column, and the test frame without the name column.
train_cb = train.drop(columns=['Transported', 'Name'])
test = test.drop(columns=['Name'])

In [None]:
train[train.Cabin.isna()]

### PassengerId, Cabin (information from Data)

Split into Deck, Num, Side:

In [None]:
def cab_pass(data):
    """Split ``Cabin`` and ``PassengerId`` into separate feature columns.

    ``Cabin`` has the form ``deck/num/side`` (side is ``P`` for Port or ``S``
    for Starboard) and ``PassengerId`` has the form ``gggg_pp`` (travel group
    and the passenger's number within that group).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``Cabin`` and ``PassengerId`` columns.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` without ``PassengerId`` and ``Cabin`` and with five
        new columns appended: ``deck``, ``num``, ``side`` (strings, ``np.nan``
        where the cabin is missing or has fewer parts) and ``PassengerGroup``,
        ``PassengerNumber`` (integers).
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    data = data.copy()

    # A missing cabin stays NaN through the split and through every `.str[i]`.
    cabin_parts = data['Cabin'].astype(object).str.split('/')
    data['deck'] = cabin_parts.str[0]
    data['num'] = cabin_parts.str[1]
    data['side'] = cabin_parts.str[2]

    id_parts = data['PassengerId'].str.split('_')
    data['PassengerGroup'] = id_parts.str[0].astype(int)
    data['PassengerNumber'] = id_parts.str[1].astype(int)

    # Kept for backward compatibility: literal 'nan' strings in any column
    # are turned into real missing values.
    data = data.apply(lambda col: col.replace('nan', np.nan))

    # Drop the raw columns that have now been decomposed.
    return data.drop(['PassengerId', 'Cabin'], axis=1)

### num fillna, to int

Convert `num` to a numeric column, leaving missing cabin numbers as `np.nan`.

In [None]:
# Derive the cabin / passenger-id features for the training frame.
train_cb = cab_pass(train_cb)
# Convert the cabin number to a numeric column. `pd.to_numeric` keeps missing
# values as NaN by itself, so no placeholder value (which would also wipe out
# a genuine cabin number equal to the placeholder) is needed.
train_cb['num'] = pd.to_numeric(train_cb['num'])
train_cb.head(1)

In [None]:
# Derive the cabin / passenger-id features for the test frame.
test = cab_pass(test)
# Convert the cabin number to a numeric column. `pd.to_numeric` keeps missing
# values as NaN by itself, so no placeholder value (which would also wipe out
# a genuine cabin number equal to the placeholder) is needed.
test['num'] = pd.to_numeric(test['num'])
test.head(1)

### DATAWIG

Let's try to fill missing values with a deep learning library:

In [None]:
!pip install pandas==1.3.5 scikit-learn==1.0.2 mxnet==1.7.0.post1

In [None]:
!pip install datawig

In [None]:
import datawig

For most use cases, the **SimpleImputer** class is the best starting point. For convenience there is the function SimpleImputer.complete that takes a DataFrame and fits an imputation model for each column with missing values, with all other columns as inputs:

In [None]:
# Encode the two boolean flags numerically: True -> 1, False -> 0 and
# anything else (i.e. a missing value) -> NaN.
for flag_col in ('CryoSleep', 'VIP'):
    train_cb[flag_col] = train_cb[flag_col].apply(
        lambda flag: 1 if flag == True else (0 if flag == False else np.nan)
    )

In [None]:
# Encode the two boolean flags numerically: True -> 1, False -> 0 and
# anything else (i.e. a missing value) -> NaN.
for flag_col in ('CryoSleep', 'VIP'):
    test[flag_col] = test[flag_col].apply(
        lambda flag: 1 if flag == True else (0 if flag == False else np.nan)
    )

In [None]:
train_cb.isna().sum()

In [None]:
def missing_values(data, data_test, msn_list, input_columns=None):
    """Fill missing values column by column with ``datawig.SimpleImputer``.

    For every column of ``msn_list`` that has gaps in either frame, an imputer
    is fitted on ``data`` and its predictions are written into the missing
    cells of ``data`` and ``data_test``. Columns are processed in the order
    given, so a later imputer sees the values already filled in for earlier
    columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; the imputers are fitted on it. Modified in place.
    data_test : pandas.DataFrame
        Test frame, filled with the imputers fitted on ``data``. Modified in
        place.
    msn_list : iterable of str
        Names of the columns to impute.
    input_columns : list of str, optional
        Columns offered to each imputer as predictors. Defaults to the
        notebook's feature list. The column currently being imputed is always
        removed from the predictors.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``data`` and ``data_test`` objects, with the gaps filled.
    """
    if input_columns is None:
        input_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
                         'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
                         'VRDeck', 'deck', 'num', 'side']

    for column in msn_list:
        train_missing = data[column].isna()
        test_missing = data_test[column].isna()
        if not (train_missing.any() or test_missing.any()):
            # Nothing to fill in either frame: skip the (expensive) fit.
            continue

        imputer = datawig.SimpleImputer(
            # The target must not be its own predictor: it is NaN in exactly
            # the rows that need filling, so it carries no signal there.
            input_columns=[name for name in input_columns if name != column],
            output_column=column,  # the column we'd like to impute values for
            output_path='imputer_model',  # stores model data and metrics
        )

        # Fit an imputer model on the train data.
        imputer.fit(train_df=data)

        for frame, missing in ((data, train_missing), (data_test, test_missing)):
            if not missing.any():
                continue
            predicted = imputer.predict(frame[missing])
            # One `.loc` assignment writes into the frame itself (chained
            # indexing may write to a temporary copy). The values are aligned
            # on the row index, as the original assignment did.
            frame.loc[missing, column] = predicted[f'{column}_imputed']

    return data, data_test

In [None]:
%%time
train_cb, test = missing_values(train_cb, test, ['HomePlanet','CryoSleep','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa',
          'VRDeck','deck','num','side'])

### Age

In [None]:
# String flag marking passengers aged 18 or older; a missing age compares
# false and therefore ends up as 'False'.
train_cb['Adult'] = np.where(train_cb['Age'] >= 18, 'True', 'False')
test['Adult'] = np.where(test['Age'] >= 18, 'True', 'False')

In [None]:
def age_l(age_years):
    """Map an age in years to an integer age-group label.

    Labels: 1 for ages below 18, 2 for 18-34, 3 for 35-49, 4 for 50-98.
    Everything else (99 and above, or NaN, which fails every comparison)
    gets label 0.
    """
    # Upper bounds are exclusive and checked in ascending order.
    for label, upper_bound in enumerate((18, 35, 50, 99), start=1):
        if age_years < upper_bound:
            return label
    return 0

In [None]:
# Integer age-group label for every passenger (see `age_l`).
train_cb['age_label'] = train_cb['Age'].map(age_l)
test['age_label'] = test['Age'].map(age_l)

### Dummies

In [None]:
train_dummies = train_cb.copy()
test_dummies = test.copy()

In [None]:
# One-hot encode every non-numeric column. The column list is taken from the
# training frame and used for both frames, so the columns that are encoded and
# the columns that are dropped can never get out of sync.
cat_cols = train_dummies.select_dtypes(exclude=np.number).columns.tolist()

dum = train_dummies[cat_cols]
dum_test = test_dummies[cat_cols]

dummies_tr = pd.get_dummies(dum)
dummies_te = pd.get_dummies(dum_test)

train_dummies = pd.concat([train_dummies.drop(columns=cat_cols), dummies_tr], axis=1)
test_dummies = pd.concat([test_dummies.drop(columns=cat_cols), dummies_te], axis=1)

# Each frame is encoded on its own, so a category present in only one of them
# would give train and test different columns. Force the test frame onto the
# training layout: missing indicator columns become all-zero, extra ones are
# dropped.
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)

##### Total bills, bills to age, mean total bills, no money spent, is adult

In [None]:
# Total amount billed across the five amenities.
train_dummies['Total_bill'] = (
    train_dummies['RoomService']
    + train_dummies['FoodCourt']
    + train_dummies['ShoppingMall']
    + train_dummies['Spa']
    + train_dummies['VRDeck']
)

In [None]:
# Total amount billed across the five amenities.
test_dummies['Total_bill'] = (
    test_dummies['RoomService']
    + test_dummies['FoodCourt']
    + test_dummies['ShoppingMall']
    + test_dummies['Spa']
    + test_dummies['VRDeck']
)

In [None]:
# 1 when the passenger's total bill is exactly zero, otherwise 0.
train_dummies['no_money_spent'] = (train_dummies['Total_bill'] == 0).astype(int)
test_dummies['no_money_spent'] = (test_dummies['Total_bill'] == 0).astype(int)

In [None]:
# Remove the `age_label` and `Total_bill` helper columns from both frames
# before modelling.
train_dummies = train_dummies.drop(columns=['age_label', 'Total_bill'])
test_dummies = test_dummies.drop(columns=['age_label', 'Total_bill'])

### CATBOOST OPTUNA

In [None]:
def _take_rows(obj, positions):
    """Select rows by position from a pandas object or a NumPy array."""
    return obj.iloc[positions] if hasattr(obj, 'iloc') else obj[positions]


def catboost_cross_validation(params, X, y, cv):
    """Train one CatBoost classifier per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``cb.CatBoostClassifier``.
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix. Rows are selected by position, so any index works.
    y : pandas.Series or numpy.ndarray
        Target values, in the same row order as ``X``.
    cv : object with a ``split(X, y)`` method
        Cross-validation splitter yielding positional
        ``(train_idx, valid_idx)`` pairs.

    Returns
    -------
    estimators : list
        The fitted models, one per fold, in fold order.
    oof_preds : numpy.ndarray
        For every row, the positive-class probability predicted by the model
        of the fold in which that row was in the validation part.
    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f'{time.ctime()}, Cross-validation, {X.shape[0]} rows, {X.shape[1]} cols')

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # `cv.split` yields positions, not index labels: select positionally so
        # the function is also correct for frames without a default RangeIndex.
        x_train, x_valid = _take_rows(X, train_idx), _take_rows(X, valid_idx)
        y_train, y_valid = _take_rows(y, train_idx), _take_rows(y, valid_idx)

        model = cb.CatBoostClassifier(eval_metric="Accuracy", **params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            early_stopping_rounds=200,
            verbose=0)

        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        # The fold score is ROC AUC on the validation probabilities.
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold + 1}, Valid score = {score}")
        folds_scores.append(score)
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("=" * 65)
    return estimators, oof_preds

In [None]:
def objective_cb(trial, X, y, random_state=42):
    """Optuna objective: fit one CatBoost classifier and return its hold-out ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values, in the same row order as ``X``.
    random_state : int, optional
        Seed of the train/hold-out split. Keeping it fixed means every trial
        is scored on the same hold-out rows, so trial scores are comparable.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the 30 %
        hold-out split.
    """
    param_grid = {
        'depth': trial.suggest_int('depth', 1, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 2000, 10000, step=1000),
        # 960 = 10 + 19 * 50 is the largest value reachable from 10 in steps of
        # 50 without exceeding the former bound of 1000, which was not on the grid.
        'border_count': trial.suggest_int('border_count', 10, 960, step=50),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1e0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),
    }

    # Conditional hyper-parameters: only sampled for the bootstrap type that uses them.
    if param_grid["bootstrap_type"] == "Bayesian":
        param_grid["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param_grid["bootstrap_type"] == "Bernoulli":
        param_grid["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Seeded, stratified split: same hold-out rows and class balance for every trial.
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y)

    model = cb.CatBoostClassifier(
        eval_metric="Accuracy",
        verbose=0,
        **param_grid
    )

    # Only the hold-out set is passed as eval set (multiple eval sets are not supported on GPU).
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=0)

    preds = model.predict_proba(test_x)[:, 1]

    score = roc_auc_score(test_y, preds)

    return score

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is
# reproducible from run to run (TPE is Optuna's default sampler).
study_cb = optuna.create_study(
    direction="maximize",
    study_name="CatBoost Classifier",
    sampler=optuna.samplers.TPESampler(seed=42),
)


def func(trial):
    """Objective bound to the notebook's training frame and target."""
    return objective_cb(trial, train_dummies, target)


study_cb.optimize(func, n_trials=100, show_progress_bar=True)

In [None]:
# Report how many trials the study holds and the best one found.
print(f"Number of completed trials: {len(study_cb.trials)}")
print("Best trial:")
trial = study_cb.best_trial

print(f"\tBest Score: {trial.value}")
print("\tBest Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
optuna.visualization.plot_param_importances(study_cb)

In [None]:
optuna.visualization.plot_optimization_history(study_cb)

In [None]:
optuna.visualization.plot_slice(study_cb, params=['depth', 'learning_rate', 'bootstrap_type', 'one_hot_max_size'])

In [None]:
study_cb.best_params

In [None]:
# Refit a single classifier on the full training frame, using the best
# hyper-parameters found by the Optuna study.
model_best = cb.CatBoostClassifier(loss_function="Logloss",
                                   eval_metric="Accuracy",
                                   **study_cb.best_params)

# NOTE(review): no eval_set is passed here, so early_stopping_rounds presumably
# has nothing to monitor — confirm against the CatBoost documentation.
model_best.fit(train_dummies, target, early_stopping_rounds=100, verbose=0)

### Predictions from the model with the best params

In [None]:
pred_best_model = model_best.predict_proba(test_dummies)[:, 1]

In [None]:
pred_best_model = pd.DataFrame(pred_best_model, columns=['Transported'], index=None)

In [None]:
pred_best_model.insert(0, 'PassengerId', test_passengers)

In [None]:
pred_best_model.Transported = pred_best_model.Transported.apply(lambda x: 'True' if x > 0.5 else 'False')

In [None]:
!pip install pandas==1.3.5 # or will be bug with class import in pandas only in Kaggle notebook.
# Need to reinstall after downgrading for datawig

In [None]:
# pred_best_model.to_csv('best_options.csv', index=False, encoding='utf-8')

### CB KFold with best params

In [None]:
# Stratified, shuffled 10-fold cross-validation (fixed seed) with the
# hyper-parameters of the best Optuna trial.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=9)
cb_estimators, cb_oof_preds = catboost_cross_validation(
    params=study_cb.best_params,
    X=train_dummies,
    y=target,
    cv=cv,
)

### Test predictions from all fold models

In [None]:
# One column of positive-class probabilities on the test frame per fold
# model, named pred_0, pred_1, ... in fold order.
cb_cv = pd.DataFrame({
    f'pred_{n}': est.predict_proba(test_dummies)[:, 1]
    for n, est in enumerate(cb_estimators)
})

In [None]:
amean = cb_cv.mean(axis=1)

In [None]:
pred_amean = pd.DataFrame(amean, columns=['Transported'], index=None)

In [None]:
pred_amean.insert(0, 'PassengerId', test_passengers)

In [None]:
pred_amean.Transported = pred_amean.Transported.apply(lambda x: 'True' if x > 0.48 else 'False')

In [None]:
pred_amean.to_csv('ansamble_best_options.csv', index=False, encoding='utf-8')