In [None]:
!pip install feature_engine

In [None]:
import warnings

import catboost as cb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from feature_engine.encoding import DecisionTreeEncoder, RareLabelEncoder
from optuna.samplers import TPESampler
from optuna.visualization.matplotlib import plot_param_importances
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn_pandas import DataFrameMapper

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/seaborn/numpy.
warnings.filterwarnings("ignore")

In [None]:
# Directory holding the three input CSV files; change it here to run elsewhere.
DATA_DIR = "../input/spaceship-titanic"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of missing values per column.
train.isna().sum()

In [None]:
# missingno bar chart: how complete each column is.
msno.bar(train)

In [None]:
# missingno nullity matrix: where in the rows the gaps occur.
msno.matrix(train)

In [None]:
# missingno nullity correlation: do columns tend to be missing together?
msno.heatmap(train)

In [None]:
# Peek at the first rows of the training frame.
train.head()

In [None]:
# Distinct values per column, highest cardinality first.
train.nunique().sort_values(ascending=False)

In [None]:
# Count of fully duplicated rows.
train.duplicated().sum()

In [None]:
# One panel per categorical column, bars split by the Transported target.
count_cols = ["Destination", "VIP", "HomePlanet", "CryoSleep", "Transported"]

fig, ax = plt.subplots(2, 3, figsize=(10, 10), sharey=True)

# Columns are passed by keyword: in recent seaborn releases the first positional
# argument of countplot is `data`, not the vector to plot.
for axis, col in zip(ax.flat, count_cols):
    sns.countplot(data=train, x=col, hue="Transported", ax=axis, palette="cubehelix")

# Only five of the six panels are used; hide the last one.
ax[1][2].axis("off")
plt.tight_layout()

In [None]:
def make_distplot(df, col, ax):
    """Draw the distribution of ``df[col]`` on ``ax`` with mean/median markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col``.
    col : str
        Name of the column to plot.
    ax : matplotlib.axes.Axes
        Axes to draw on.

    The mean is marked with a dashed red line and the median with a dashed
    green line.
    """
    # histplot(kde=True, stat="density") is the replacement seaborn documents
    # for the deprecated distplot; cut=3 keeps distplot's KDE extent.
    sns.histplot(df[col], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
    ax.axvline(df[col].mean(), linestyle="--", color="red")
    ax.axvline(df[col].median(), linestyle="--", color="green")

In [None]:
# Distribution of six numeric columns, one panel each, filled row by row.
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

fig, ax = plt.subplots(2, 3, figsize=(10, 10))
for panel, column in zip(ax.flat, numeric_cols):
    make_distplot(train, column, panel)

In [None]:
# Pairwise correlations between the numeric columns only.
corr = train.select_dtypes(include=np.number).corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric.
# The builtin `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots(figsize=(10, 10))
cmap = sns.color_palette("icefire", as_cmap=True)

# vmin/vmax pin the colour scale to the full [-1, 1] correlation range.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    square=True,
    linewidth=2,
    cbar_kws={"shrink": 0.7},
    ax=ax,
)

In [None]:
def preprocess(df):
    """Return a cleaned copy of ``df`` with a ``TotalSpend`` column added.

    Steps:
      * drop the ``Name`` column when it is present;
      * cast ``VIP`` and ``CryoSleep`` to ``object`` dtype;
      * add ``TotalSpend`` as the sum of the five spending columns.

    ``TotalSpend`` is NaN for a row as soon as one of its spending values is
    missing, because the columns are added with plain ``+``.

    The input frame is never modified.
    """
    # errors="ignore" replaces the former bare `except: pass`. drop() always
    # returns a new frame, so the assignments below cannot leak into the
    # caller's object even when there is no Name column.
    df = df.drop(columns="Name", errors="ignore")
    df["VIP"] = df["VIP"].astype("object")
    df["CryoSleep"] = df["CryoSleep"].astype("object")
    df["TotalSpend"] = (
        df["RoomService"] + df["FoodCourt"] + df["ShoppingMall"] + df["Spa"] + df["VRDeck"]
    )
    return df

In [None]:
def add_pct_features(df):
    """Add each spending category's share of ``TotalSpend`` as a ``Pct*`` column.

    For every column in RoomService, FoodCourt, ShoppingMall, Spa and VRDeck a
    ``Pct<column>`` column holding ``column / TotalSpend`` is appended.

    Rows whose ``TotalSpend`` is 0 or missing get NaN shares (the share is
    undefined there).

    Returns a new frame; the input is left untouched.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    # Mask zero totals so the division yields NaN and never +/-inf.
    total = df["TotalSpend"].replace(0, np.nan)
    out = df.copy()
    for col in spend_cols:
        out[f"Pct{col}"] = df[col] / total
    return out

In [None]:
# Drop Name, cast VIP/CryoSleep to object and add TotalSpend on the training frame.
train = preprocess(train)

In [None]:
# Same preprocessing for the test frame.
test = preprocess(test)

In [None]:
class AddGroupTransformer(TransformerMixin):
    """Replace ``PassengerId`` with a ``PassengerGroup`` column.

    ``PassengerGroup`` is the part of ``PassengerId`` before the first
    underscore. The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``PassengerGroup`` appended and ``PassengerId`` removed.

        The input frame is not modified. Missing ids yield a missing group
        instead of raising.
        """
        group = X["PassengerId"].str.split("_", n=1).str[0]
        return X.assign(PassengerGroup=group).drop(columns="PassengerId")

In [None]:
class AddCabinTransformer(TransformerMixin):
    """Split ``Cabin`` ("deck/num/side") into ``deck``, ``num`` and ``side`` columns.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the three cabin parts appended and ``Cabin`` removed.

        The input frame is not modified. All three parts stay strings; a
        missing ``Cabin`` gives missing parts.
        """
        # n=2 caps the split at three pieces and reindex guarantees exactly
        # three columns, even when every Cabin value in X is missing.
        parts = X["Cabin"].str.split("/", n=2, expand=True).reindex(columns=range(3))
        return X.drop(columns="Cabin").assign(deck=parts[0], num=parts[1], side=parts[2])

In [None]:
class ImputeNaN(TransformerMixin):
    """Fill missing values with statistics learned in ``fit``.

    * object/bool columns are filled with their most frequent value;
    * numeric columns are filled with their mean, unless ``num_impute`` is
      ``"median"``.

    NOTE(review): with ``num_impute="median"`` (the default) numeric columns
    are left untouched. The median fill was commented out in the original
    code and that behaviour is preserved here; confirm whether it is intended.

    Parameters
    ----------
    num_impute : str, default "median"
        Numeric strategy; any value other than ``"median"`` means mean imputation.
    """

    def __init__(self, num_impute="median"):
        self.num_impute = num_impute

    def _learn_fill_values(self, X):
        """Return ``{column: fill value}`` computed from ``X``."""
        self.cat_cols = list(X.select_dtypes(include=["object", "bool"]).columns)
        self.num_cols = list(X.select_dtypes(include=np.number).columns)

        fill_values = {}
        for cat_col in self.cat_cols:
            modes = X[cat_col].mode()
            # An all-NaN column has no mode; leave it unfilled rather than crash.
            if not modes.empty:
                fill_values[cat_col] = modes.iloc[0]
        if self.num_impute != "median":
            for num_col in self.num_cols:
                mean = X[num_col].mean()
                if pd.notna(mean):
                    fill_values[num_col] = mean
        return fill_values

    def fit(self, X, y=None):
        """Learn the fill values from ``X`` so other frames reuse the same statistics."""
        self.fill_values_ = self._learn_fill_values(X)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled.

        When the transformer has not been fitted, the statistics are taken
        from ``X`` itself.
        """
        fill_values = getattr(self, "fill_values_", None)
        if fill_values is None:
            fill_values = self._learn_fill_values(X)
        usable = {col: value for col, value in fill_values.items() if col in X.columns}
        # A single DataFrame.fillna replaces the chained
        # `X[col].fillna(..., inplace=True)` calls, which do not write back to
        # X under pandas Copy-on-Write.
        return X.fillna(value=usable)

In [None]:
# The target is kept as its own Series; the features are every other column.
y = train["Transported"]
X = train.drop(columns="Transported")

In [None]:
# Feature steps, applied in order: passenger group, cabin parts, imputation.
features_pipeline = Pipeline(
    steps=[
        ("add_group", AddGroupTransformer()),
        ("add_cabin", AddCabinTransformer()),
        ("impute", ImputeNaN()),
    ]
)

In [None]:
# Encode the target as integers; the fitted encoder is reused later to decode
# the predictions for the submission.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
def objective(trial, X, y, n_splits=5):
    """Optuna objective: mean K-fold log-loss of a CatBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X : pandas.DataFrame
        Preprocessed features (before ``features_pipeline``).
    y : array-like indexable by integer position arrays
        Encoded target aligned with ``X``.
    n_splits : int, default 5
        Number of shuffled cross-validation folds.

    Returns
    -------
    float
        Average log-loss over the validation folds (lower is better).

    NOTE(review): each validation fold is also used as ``eval_set`` for early
    stopping / ``use_best_model``, so the reported loss is optimistic.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3),
        "depth": trial.suggest_int("depth", 9, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 2, 15),
        "iterations": trial.suggest_int("iterations", 1000, 10000, step=1000),
        "use_best_model": True,
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "random_seed": 42,
    }

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_losses = np.empty(n_splits)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # Explicit copies: the feature steps below add columns to these frames.
        X_train, X_valid = X.iloc[train_idx].copy(), X.iloc[valid_idx].copy()
        y_train, y_valid = y[train_idx], y[valid_idx]

        # Spend shares are computed first, on the raw spending columns, in the
        # same order as when the final model is fitted. Computing them after
        # the -999 fill produced ratios against the sentinel.
        X_train = add_pct_features(X_train)
        X_valid = add_pct_features(X_valid)

        X_train = features_pipeline.fit_transform(X_train)
        X_valid = features_pipeline.transform(X_valid)

        # Whatever is still missing gets an out-of-range sentinel.
        X_train = X_train.fillna(-999)
        X_valid = X_valid.fillna(-999)

        categorical_cols = list(X_train.select_dtypes(include="object").columns)
        rare_encoder = RareLabelEncoder(
            tol=0.07, n_categories=2, variables=categorical_cols, replace_with="Rare"
        )
        X_train = rare_encoder.fit_transform(X_train)
        X_valid = rare_encoder.transform(X_valid)

        # CatBoost takes the categorical features as positional column indices.
        cat_indices = X_train.columns.get_indexer(
            X_train.select_dtypes(include="object").columns
        )

        model = cb.CatBoostClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=0,
            early_stopping_rounds=100,
            cat_features=cat_indices,
        )

        preds = model.predict_proba(X_valid)
        fold_losses[fold] = log_loss(y_valid, preds)

    return np.mean(fold_losses)

In [None]:
# Minimise the cross-validated log-loss returned by objective; the TPE sampler
# is seeded so the sequence of trials is repeatable.
study = optuna.create_study(direction="minimize", study_name="Catboost Classifier", sampler=TPESampler(seed=42))
# Bind the training data so Optuna can call the objective with only the trial.
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=70)

In [None]:
# Which hyperparameters influenced the objective most across the finished trials.
# plot_param_importances is imported with the other dependencies at the top of
# the notebook.
plot_param_importances(study)

In [None]:
# Add the Pct* spend-share columns to the full training and test frames before
# the feature pipeline runs.
X = add_pct_features(X)
test = add_pct_features(test)

In [None]:
# Fit the feature steps on the full training frame, reuse them on test, and
# replace whatever is still missing with the -999 sentinel.
X_trans = features_pipeline.fit_transform(X).fillna(-999)
test_trans = features_pipeline.transform(test).fillna(-999)

# Categories rarer than `tol` are grouped under a single 'Rare' label.
object_columns = X_trans.select_dtypes(include="object").columns
rare_encoder = RareLabelEncoder(
    tol=0.07,
    n_categories=2,
    variables=list(object_columns),
    replace_with='Rare',
)

In [None]:
# Learn the frequent labels on the training frame, then apply the same
# grouping to the test frame.
X_rare = rare_encoder.fit_transform(X_trans)
test_rare = rare_encoder.transform(test_trans)

In [None]:
# Refit on the full training set with the best trial's hyperparameters.
# study.best_params holds only the *suggested* values, so the seed that was
# fixed during tuning is passed again explicitly.
model = cb.CatBoostClassifier(**study.best_params, random_seed=42)
# CatBoost takes the categorical features as positional column indices.
cat_indices = X_rare.columns.get_indexer(X_rare.select_dtypes(include='object').columns)
# NOTE(review): no eval_set is passed here, so early_stopping_rounds presumably
# has nothing to monitor and all `iterations` are trained. Confirm, then drop
# the argument or add a hold-out set.
model.fit(X_rare, y, cat_features=cat_indices, verbose=0, early_stopping_rounds=100)

In [None]:
# Feature importances of the fitted model (prettified output).
model.get_feature_importance(type=cb.EFstrType.FeatureImportance, prettified=True, thread_count=-1, verbose=False)

In [None]:
# Class predictions for the prepared test frame.
preds = model.predict(test_rare)

In [None]:
# Decode the encoded class predictions back to the original Transported labels.
# (The former second line assigned the column to itself and did nothing.)
submission["Transported"] = label_encoder.inverse_transform(preds)

In [None]:
# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)