# Introduction 
This is a notebook for developing model pipeline for the <a href="https://www.kaggle.com/competitions/spaceship-titanic/data?select=test.csv">Spaceship Titanic</a> competition.

## Library Importing

In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

from xgboost import XGBClassifier

import scipy.stats as stats

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline

import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import KFold

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

## Training Dataset Pipeline

In [87]:
# remove unnecessary columns
class FeatureSelection(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X[["deck", "num", "side"]] = X["Cabin"].str.split("/", expand=True)
        # handle HomePlanet null values
        X = X.drop(columns=["Cabin", "Name", "PassengerId"])

        return X


class NullHandler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # handle HomePlanet null values
        modeVal = X["HomePlanet"].value_counts().idxmax()
        X["HomePlanet"] = X["HomePlanet"].fillna(modeVal)
        # CryoSleep
        modeVal_cs = X["CryoSleep"].value_counts().idxmax()
        X["CryoSleep"] = X["CryoSleep"].fillna(modeVal_cs)
        # Destination
        modeVal_dest = X["Destination"].value_counts().idxmax()
        X["Destination"] = X["Destination"].fillna(modeVal_dest)
        # Age
        median = X["Age"].median()
        X["Age"] = X["Age"].fillna(median)

        modeVal_vip = X["VIP"].value_counts().idxmax()
        X["VIP"] = X["VIP"].fillna(modeVal_vip)

        median_rs = X["RoomService"].median()
        X["RoomService"] = X["RoomService"].fillna(median_rs)

        X["FoodCourt"] = X["FoodCourt"].fillna(X["FoodCourt"].median())
        X["ShoppingMall"] = X["ShoppingMall"].fillna(X["ShoppingMall"].median())
        X["Spa"] = X["Spa"].fillna(X["Spa"].median())
        X["VRDeck"] = X["VRDeck"].fillna(X["VRDeck"].median())

        print("Here")
        X["deck"] = X["deck"].fillna(X["deck"].mode())
        X["side"] = X["side"].fillna(X["side"].mode())
        median_value = X["num"].fillna(-1).astype(int).median()
        X["num"] = X["num"].fillna(-1).astype(int).replace(-1, median_value)
        print("null handling done !")

        return X


class EncodeCatVar(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.get_dummies(
            X, columns=["Destination", "HomePlanet", "side", "deck"], dtype="int"
        )

        X["CryoSleep"] = X["CryoSleep"].replace({True: 1, False: 0})
        X["VIP"] = X["VIP"].replace({True: 1, False: 0})

        return X

In [88]:
best_feat = []


# function to get the best features from chi square test
def best_features(train_x, train_y):
    # get the top 10 features by their chi square test
    best_features = SelectKBest(score_func=chi2, k=10)
    fit = best_features.fit(train_x, train_y)
    feature_scores = pd.DataFrame(fit.scores_)
    df_columns = pd.DataFrame(train_x.columns)
    feature_scores = pd.concat([df_columns, feature_scores], axis=1)
    feature_scores.columns = ["Specs", "Score"]
    print(feature_scores.nlargest(20, "Score"))

    yessir = feature_scores.nlargest(20, "Score")
    opt_cols = yessir["Specs"].tolist()

    return opt_cols

In [89]:
class ModelTraining(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.X_train, self.y_train = X.drop(columns=["Transported"]), X["Transported"]
        self.y_train = self.y_train.replace({True: 1, False: 0})

        self.opt_cols = best_features(self.X_train, self.y_train)
        self.X_train_best = self.X_train.copy()
        self.X_train_best = self.X_train_best[self.opt_cols]

        self.xgb_model = XGBClassifier()
        # Fit the model
        self.xgb_model.fit(self.X_train_best, self.y_train)

        return self

    def transform(self, X):
        return self.xgb_model, self.X_train_best, self.y_train, self.opt_cols


class HyperParamFineTuning(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.model, self.X_train_best, self.y_train, self.opt_cols = (
            ModelTraining().fit_transform(X)
        )
        return self

    def transform(self, X):
        param_grid = {
            "max_depth": [3, 5, 7],
            "learning_rate": [0.1, 0.01, 0.001],
            "subsample": [0.5, 0.7, 1],
        }

        grid_search_xgb = GridSearchCV(self.model, param_grid, cv=5, scoring="roc_auc")

        # Fit the GridSearchCV object to the training data
        grid_search_xgb.fit(self.X_train_best, self.y_train)

        # Print the best set of hyperparameters and the corresponding score
        print("Best set of hyperparameters: ", grid_search_xgb.best_params_)
        print("Best score: ", grid_search_xgb.best_score_)

        param_dist = {
            "max_depth": stats.randint(3, 10),
            "learning_rate": stats.uniform(0.01, 0.1),
            "subsample": stats.uniform(0.5, 0.5),
            "n_estimators": stats.randint(50, 200),
        }

        # Create the XGBoost model object
        xgb_model = XGBClassifier()

        # Create the RandomizedSearchCV object
        random_search = RandomizedSearchCV(
            xgb_model,
            param_distributions=param_dist,
            n_iter=10,
            cv=5,
            scoring="accuracy",
        )

        # Fit the RandomizedSearchCV object to the training data
        random_search.fit(self.X_train_best, self.y_train)

        # Print the best set of hyperparameters and the corresponding score
        print("Best set of hyperparameters: ", random_search.best_params_)
        print("Best score: ", random_search.best_score_)

        opt_search_param = (
            random_search.best_params
            if random_search.best_score_ > grid_search_xgb.best_score_
            else grid_search_xgb.best_params_
        )

        xgb_model = XGBClassifier(**opt_search_param)

        # Fit the model
        xgb_model.fit(self.X_train_best, self.y_train)

        return xgb_model

In [90]:
class BestFeatureSelector(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[best_feat]

In [91]:
train = pd.read_csv("data/train.csv")

In [92]:
model_pipe = Pipeline(
    [
        ("feature_selection", FeatureSelection()),
        ("NullHandler", NullHandler()),
        ("encoder", EncodeCatVar()),
        ("hyperparam_finetune", HyperParamFineTuning()),
    ]
)

model = model_pipe.fit_transform(train)

Here
null handling done !
                      Specs         Score
6                       Spa  1.726733e+06
7                    VRDeck  1.573321e+06
3               RoomService  1.002141e+06
4                 FoodCourt  1.025621e+05
8                       num  7.413619e+03
5              ShoppingMall  1.616605e+03
0                 CryoSleep  1.197498e+03
1                       Age  3.421734e+02
13        HomePlanet_Europa  2.053869e+02
18                   deck_B  1.657797e+02
12         HomePlanet_Earth  1.108991e+02
19                   deck_C  9.301393e+01
9   Destination_55 Cancri e  8.147903e+01
21                   deck_E  7.502059e+01
15                   side_P  4.613276e+01
22                   deck_F  4.542584e+01
16                   side_S  4.534145e+01
11  Destination_TRAPPIST-1e  2.408388e+01
2                       VIP  1.179281e+01
20                   deck_D  9.522286e+00
Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.7}
Best 

In [93]:
model.feature_names_in_

array(['Spa', 'VRDeck', 'RoomService', 'FoodCourt', 'num', 'ShoppingMall',
       'CryoSleep', 'Age', 'HomePlanet_Europa', 'deck_B',
       'HomePlanet_Earth', 'deck_C', 'Destination_55 Cancri e', 'deck_E',
       'side_P', 'deck_F', 'side_S', 'Destination_TRAPPIST-1e', 'VIP',
       'deck_D'], dtype='<U23')

In [94]:
test_pipe = Pipeline(
    [
        ("feature_selection", FeatureSelection()),
        ("NullHandler", NullHandler()),
        ("encoder", EncodeCatVar()),
    ]
)

In [95]:
# saving the pred results
test = pd.read_csv("data/Test.csv")
test_df = test_pipe.fit_transform(test)

Here
null handling done !


In [96]:
test_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,num,Destination_55 Cancri e,...,side_P,side_S,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,3,0,...,0,1,0,0,0,0,0,0,1,0
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,4,0,...,0,1,0,0,0,0,0,1,0,0
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0,1,...,0,1,0,0,1,0,0,0,0,0
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,1,0,...,0,1,0,0,1,0,0,0,0,0
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,5,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,34.0,0,0.0,0.0,0.0,0.0,0.0,1496,0,...,0,1,0,0,0,0,0,0,1,0
4273,0,42.0,0,0.0,847.0,17.0,10.0,144.0,416,0,...,0,0,0,0,0,0,0,0,0,0
4274,1,26.0,0,0.0,0.0,0.0,0.0,0.0,296,1,...,1,0,0,0,0,1,0,0,0,0
4275,0,26.0,0,0.0,2680.0,0.0,0.0,523.0,297,0,...,1,0,0,0,0,1,0,0,0,0


In [97]:
y_pred = model.predict(test_df[model.feature_names_in_])
pred_df = pd.DataFrame({"PassengerId": test["PassengerId"], "Transported": y_pred})
pred_df = pred_df.replace({1: True, 0: False})
pred_df.to_csv("submissions/submission6.csv", index=False)
joblib.dump(model, "models/model6.joblib")

['models/model6.joblib']