# (not yet) Kerasing Spacetanic

<div class="alert alert-success" role="alert">
  <strong>SCORE: </strong>0.8009
</div>

## Inputs and Set-up

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
# from tensorflow import keras
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,\
    StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer

sns.set_theme(style="ticks")

In [None]:
# Collect the path of every file found (recursively) under the data directory.
file_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('data')  # /kaggle/input
    for filename in filenames
]
file_paths

In [None]:
# NOTE(review): the two files are picked by their position in `file_paths`, which
# follows the order in which os.walk listed them — confirm that index 0 really is
# the test set and index 1 the training set in the environment this runs in.
test_data = pd.read_csv(file_paths[0])
train_data = pd.read_csv(file_paths[1])
# Separate the label from the features: cast it to int, then remove it from the frame.
train_target = train_data["Transported"].astype(int)
train_data.drop(columns="Transported", inplace=True)

In [None]:
train_data.info()

There are missing values, hence some imputation is in order. Since new `object` features will be introduced, imputation should follow feature creation.

## Categorical Feature Review and some Feature Engineering

In [None]:
train_data.loc[:,train_data.dtypes==object].nunique()

For `object` columns with a handful of unique entries, a one-hot/ordinal encoder will do straight away. `object` columns with numerous unique entries ought to be investigated for new feature creation.

### PassengerId, Cabin, Name?

In [None]:
train_data[["PassengerId", "Cabin", "Name"]].sample(5)

Let's define a custom data preprocessor that splits the input feature on the specified separator string (e.g., "_" or " ") and returns `n` new features, where `n` is the number of elements obtained with the split. It should handle missing values, but raise an error if the number of elements varies between entries of the split input feature. Optionally, it should be able to convert the new features to a numerical type.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted, check_array

In [None]:
train_data.values[:,[1]]

In [None]:
class Splitter(BaseEstimator, TransformerMixin):
    """Split one string column into several columns on a separator.

    Every non-missing entry must split into the same number of pieces; that
    number is learned in `fit` and enforced in `transform`. Missing entries
    (those equal to the string "nan" once the input is converted to `str`)
    produce a row of NaNs.

    Parameters
    ----------
    sep : str, default="_"
        Separator handed to `pandas.Series.str.split`. pandas treats
        multi-character patterns as regular expressions by default, which is
        how a pattern such as "/|_" splits on either character.

    Attributes
    ----------
    n_features_out_ : int
        Number of pieces each non-missing entry splits into (set by `fit`).
    feature_names_in_ : str
        Name of the input column when `fit` received an object with a
        `columns` attribute, otherwise an empty string.
    """

    def __init__(self, sep="_"):
        self.sep = sep

    def fit(self, X, y=None):
        """Learn how many pieces the single input column splits into.

        Raises
        ------
        AssertionError
            If X does not have exactly one column, if it has no non-missing
            entry, or if the entries split into differing numbers of pieces.
        """
        assert X.shape[1] == 1, \
            "\nX must be a 2-D array with one column."
        nan_mask, _, nonnan_entry_length = self._split(X)
        # Without a single non-missing entry there is nothing to learn from.
        assert not nan_mask.all(), \
            "\nX must contain at least one non-missing entry."
        self.n_features_out_ = nonnan_entry_length.unique()[0]
        if hasattr(X, "columns"):
            self.feature_names_in_ = X.columns.values[0]
        else:
            self.feature_names_in_ = ""
        return self

    def transform(self, X, convert_to_float=True):
        """Split every entry of X into `n_features_out_` columns.

        Parameters
        ----------
        X : 2-D array-like with one column
        convert_to_float : bool, default=True
            Try to cast the whole output to float. If any piece is not numeric
            the cast raises ValueError internally and the object-dtype array is
            returned unchanged.

        Returns
        -------
        numpy.ndarray of shape (len(X), n_features_out_)
            Rows for missing entries are filled with NaN.

        Raises
        ------
        AssertionError
            If the entries split into a different number of pieces than seen
            in `fit`, or into differing numbers of pieces among themselves.
        """
        check_is_fitted(self)
        nan_mask, X_split, nonnan_entry_length = self._split(X)

        # Start from an all-NaN object array; only non-missing rows are overwritten.
        out = np.tile(np.array(np.nan, dtype=object), (len(X), self.n_features_out_))
        if not nan_mask.all():
            assert nonnan_entry_length.unique()[0] == self.n_features_out_, \
                f"\nSplit with '{self.sep}' yields different n of features than seen in `fit`."
            out[~nan_mask] = np.vstack(X_split[~nan_mask])

        if convert_to_float:
            try:
                out = out.astype(float)
            except ValueError:  # encounter a string
                pass

        return out

    def inverse_transform(self, X):
        """Placeholder: no inverse is implemented; returns None after the fitted check."""
        check_is_fitted(self)
        pass

    def get_feature_names_out(self, names=None):
        """Return output names `<input>_S<i>`, or `S<i>` when the input had no name.

        `names` is accepted for call compatibility and is not used.
        """
        check_is_fitted(self)
        name = self.feature_names_in_
        # `fit` stores "" when X carried no column names; only that case drops the prefix.
        prefix = "" if (isinstance(name, str) and name == "") else f"{name}_"
        return [f"{prefix}S{i}" for i in range(self.n_features_out_)]

    def _split(self, X):
        """Convert X to strings and split each entry on `self.sep`.

        Returns
        -------
        nan_mask : boolean array, True where the entry equals "nan"
        X_split : object array holding the list of pieces for every entry
        nonnan_entry_length : pandas.Series with the piece count of every
            non-missing entry (empty when all entries are missing)

        Raises
        ------
        AssertionError
            If non-missing entries split into differing numbers of pieces.
        """
        # NOTE(review): newer scikit-learn releases rename `force_all_finite`
        # to `ensure_all_finite` — confirm against the installed version.
        X = check_array(X, dtype=str, force_all_finite="allow-nan")
        nan_mask = (X.ravel() == "nan")
        X_split = pd.Series(X.ravel()).str.split(self.sep).values
        nonnan_entry_length = pd.Series(X_split[~nan_mask]).apply(len)

        # `<= 1` rather than `== 1` so that an all-missing input is not
        # misreported as having varying lengths; `fit` rejects it explicitly.
        assert nonnan_entry_length.nunique() <= 1, \
            f"\nSplit with '{self.sep}' yields varying n of features per entry."

        return nan_mask, X_split, nonnan_entry_length

In [None]:
# Three probes for Splitter: the first is used for fitting and must transform
# cleanly; the other two are expected to trip the transformer's assertions.
test_arrays = [
    np.array([["3/3/4"], ["A/77/3"], [np.nan]]),  # to be used in `fit`
    np.array([["3/34"], ["A/222"]]),  # fewer features
    np.array([["3/3/4"], ["A/222_3"]]),  # varying number of features
]

test = Splitter("/")
test.fit(pd.DataFrame(test_arrays[0], columns=["feature"]))
print(test.n_features_out_)
print(test.feature_names_in_)
print(test.get_feature_names_out())

for i, t in enumerate(test_arrays):
    try:
        print("Test", i)
        # `transform` only accepts `convert_to_float`; passing an unknown keyword
        # would raise TypeError, which the handler below does not catch.
        print(test.transform(t))
    except AssertionError:
        print("Fail")

### HomePlanet, CryoSleep, Destination, VIP

In [None]:
train_data[["HomePlanet", "CryoSleep", "Destination", "VIP"]].sample(5)

"HomePlanet" and "Destination" ought to be one-hot-encoded, whereas "CryoSleep" and "VIP" columns can be converted straight to 1s and 0s (whilst retaining `nan`s).

In [None]:
Floater = FunctionTransformer(lambda x: x.astype(float), feature_names_out="one-to-one")

In [None]:
t = Floater.transform(train_data[["CryoSleep"]])
t[-10:]

## Numerical Feature Review and some more Feature Engineering

In [None]:
num_cols = train_data.loc[:,train_data.dtypes!=object].columns.values
print(train_data[num_cols].nunique(), "\n")
train_data[num_cols].describe()

In [None]:
df = MinMaxScaler().fit_transform(train_data[num_cols])
df = pd.DataFrame(df, columns=num_cols)
df.plot(
    kind="hist", subplots=True, layout=(-1,2), figsize=(18,6),
    sharex=False, sharey=True, bins=50
)

It appears that the distributions of all numerical columns except `Age` are significantly skewed. Because of that, we will feed the log of the original values to the model (while retaining `np.nan`s and zeros). It looks like most of the values are zero, so it would be useful to check whether the non-zero subsets are multi-modal.

In [None]:
# Keep only strictly positive values of the numerical columns; everything else
# (zeros, negatives and missing values) becomes NaN.
numeric_part = train_data[num_cols]
df = numeric_part.where(numeric_part > 0)

In [None]:
df = MinMaxScaler().fit_transform(df)
df = pd.DataFrame(df, columns=num_cols)
df.plot(
    kind="hist", subplots=True, layout=(-1,2), figsize=(18,6),
    sharex=False, sharey=True, bins=50,
)

The log transformer below will not only find the log of values greater than zero, but also retain the original missing values. This will enable further imputation.

In [None]:
def find_log(x):
    """Natural log of the strictly positive entries of `x`.

    Entries that are not greater than zero map to 0.0, and entries that are
    NaN in `x` stay NaN in the result.
    """
    is_positive = x > 0
    # `where=` keeps np.log away from non-positive entries; the slots it skips
    # are replaced by 0.0 on the next line.
    logged = np.log(x, where=is_positive)
    filled = np.where(is_positive, logged, 0.0)
    return np.where(np.isnan(x), np.nan, filled)

Loggaformer = FunctionTransformer(find_log, feature_names_out="one-to-one")
# Loggaformer = FunctionTransformer(lambda x: np.where(x > 1e-6, np.log(x), 0.0))

## Preprocessing

The custom `Splitter` preprocessor can plausibly output columns containing `object`s rather than numerical data, which can cause all new features to be upcast to `object`. That is suboptimal. After a few attempts at post-processing within the `Splitter` guts, I decided it would be easier to use two `ColumnTransformer`s instead. <br><br>Since I intend to chain two `ColumnTransformer`s, I don't see how it can work without the upstream transformer outputting a `pd.DataFrame`. Hence, I am changing the `sklearn` config to output dataframes rather than `numpy` arrays.

In [None]:
from sklearn import set_config
set_config(transform_output="pandas", display='diagram') 

In [None]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (dense output, unknown categories ignored at transform time).
OneHotPipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Skewed numerical columns: impute first, then apply the log transformer
# defined above, then rescale to [-1, 1].
LogPipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", Loggaformer),
    ("scale", MinMaxScaler(feature_range=(-1,1)))
])

# Columns that only need a cast to float: impute, cast, rescale to [-1, 1].
FloatPipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", Floater),
    ("scale", MinMaxScaler(feature_range=(-1,1)))
])

# Split a composite string column on "/" or "_" into its parts; no imputation
# or encoding happens at this stage.
SplitPipeline = Pipeline([
    ("split", Splitter(sep="/|_")),
    # *OneHotPipeline.steps
])

In [None]:
PreProc = ColumnTransformer([
    ("Id", SplitPipeline, ["PassengerId"]),
    ("Cabin", SplitPipeline, ["Cabin"]),
    ("OneHot", OneHotPipeline, ["HomePlanet", "Destination"]),
    ("Float", FloatPipeline, ["CryoSleep", "VIP", "Age"]),
    ("Log", LogPipeline, num_cols[num_cols != "Age"]),   
])

In [None]:
# df = pd.DataFrame(
#     PreProc.fit_transform(train_data),
#     columns=PreProc.get_feature_names_out()
# )
df = PreProc.fit_transform(train_data)
df.info()

In [None]:
# Second-stage transformer, applied to the output of `PreProc`.
# The two listed columns are the first and third pieces of the split "Cabin"
# column (Splitter names its outputs "<input>_S<i>"; the "Cabin__" prefix is
# presumably added by the upstream ColumnTransformer entry named "Cabin" —
# verify against `df.info()`). They are imputed and one-hot encoded, while all
# remaining columns go through FloatPipeline.
PostProc = ColumnTransformer([
    ("CabinImpute", OneHotPipeline, ["Cabin__Cabin_S0", "Cabin__Cabin_S2"]),
], remainder=FloatPipeline)

In [None]:
df1 = PostProc.fit_transform(df)
df1.info()

In [None]:
df1.hist(layout=(-1,3), figsize=(12,27))

## Sklearn Model

Things to try before moving on to Keras:
* stratified sampling
* more complex imputation
* XGBoost?
* Demonstrate expected model performance with nested cross-validation
* Examine correlation of engineered features with the  target
* spell check

In [None]:
from scipy.stats import geom, uniform, randint

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, RandomizedSearchCV


In [None]:
# End-to-end pipeline: two chained preprocessing stages followed by the model.
# The estimator is seeded (matching the seeded KFold used for evaluation) so
# that repeated runs give the same scores once `subsample` / `max_features`
# are tuned below 1.
mainline = Pipeline([
    ("US", PreProc),  # upstream
    ("DS", PostProc),  # downstream
    ("Model", GradientBoostingClassifier(random_state=0))
])

In [None]:
# mainline.fit(train_data, train_target)

In [None]:
CV = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
scores = cross_val_score(mainline, train_data, train_target, scoring="accuracy", cv=CV, n_jobs=-1)
pd.Series(scores).describe()

In [None]:
pd.Series(geom(1/100, 100).rvs(1000000)).value_counts().sort_index()

In [None]:
mainline["Model"].get_params()

In [None]:
# Sampling distributions for the randomized hyper-parameter search.
param_distributions = {
    "Model__n_estimators": geom(1/100, 100),  # geometric with p=1/100, shifted by loc=100
    "Model__learning_rate": uniform(0,1),
    "Model__max_depth": randint(1, 11),
    "Model__min_samples_split": randint(2, 11),
    "Model__min_samples_leaf": randint(1, 11),
    "Model__max_features": uniform(0.5,.5),  # loc=0.5, scale=0.5 -> [0.5, 1.0]
    "Model__subsample": uniform(.5, .5),
}

# `random_state` fixes the sequence of sampled candidates so that
# `best_params_` and `cv_results_` can be reproduced on a re-run.
model_random_search = RandomizedSearchCV(
    mainline,
    param_distributions=param_distributions,
    n_iter=1000,
    cv=CV,
    verbose=1,
    n_jobs=-2,
    random_state=0
)

In [None]:
model_random_search.fit(train_data, train_target)


In [None]:
model_random_search.best_params_

In [None]:
mainline.set_params(**model_random_search.best_params_)
scores = cross_val_score(mainline, train_data, train_target, scoring="accuracy", cv=CV, n_jobs=-1)
pd.Series(scores).describe()

In [None]:
column_results = [f"param_{name}" for name in param_distributions.keys()]
column_results += ["mean_test_score", "std_test_score", "rank_test_score"]

cv_results = pd.DataFrame(model_random_search.cv_results_)
cv_results = cv_results[column_results].sort_values(
    "mean_test_score", ascending=False
)


def shorten_param(param_name):
    """Return the text after the last "__" in `param_name`.

    Names without "__" are returned unchanged.
    """
    # rsplit yields a single-element list when "__" is absent, so the last
    # element is the whole name in that case.
    return param_name.rsplit("__", 1)[-1]


cv_results = cv_results.rename(shorten_param, axis=1)
cv_results.to_csv("cv_results.csv")

Now run a grid search around the best parameters found by the randomized search.

In [None]:
# Exhaustive grid: 7 parameters with 3 values each, i.e. 3**7 = 2187
# combinations, each evaluated with the `CV` splitter.
param_grid  = {
    "Model__n_estimators": [140,150,160],
    "Model__learning_rate": [0.116, 0.117, 0.118],
    "Model__max_depth": [3,4,5],
    "Model__min_samples_split": [5,6,7],
    "Model__min_samples_leaf": [2,3,4],
    "Model__max_features": [27/27, 26/27, 25/27],
    "Model__subsample": [.65, .7, .75],
}

# NOTE(review): the variable name is spelled "gird"; later cells refer to it
# by this spelling, so it is kept as-is.
model_gird_search = GridSearchCV(
    mainline,
    param_grid =param_grid ,
    cv=CV,
    verbose=1,
    n_jobs=-2
)

In [None]:
model_gird_search.fit(train_data, train_target)

In [None]:
model_gird_search.best_params_

In [None]:
mainline.set_params(**model_gird_search.best_params_)
scores = cross_val_score(mainline, train_data, train_target, scoring="accuracy", cv=CV, n_jobs=-1)
pd.Series(scores).describe()

In [None]:
grid_cv_results = pd.DataFrame(model_gird_search.cv_results_)
grid_cv_results = grid_cv_results[column_results].sort_values(
    "mean_test_score", ascending=False
)

grid_cv_results = grid_cv_results.rename(shorten_param, axis=1)
grid_cv_results.to_csv("grid_cv_results.csv")

In [None]:
mainline.set_params(**{"Model__n_iter_no_change":75, "Model__n_estimators":500})
scores = cross_val_score(mainline, train_data, train_target, scoring="accuracy", cv=CV, n_jobs=-1)
pd.Series(scores).describe()

In [None]:
# Refit on the full training set, then pair each test PassengerId with its
# predicted label cast to bool.
mainline.fit(train_data, train_target)
predicted_labels = pd.DataFrame(mainline.predict(test_data), columns=["Transported"])
submission = test_data[["PassengerId"]].join(predicted_labels.astype(bool))

In [None]:
submission.to_csv("grad_boost_submit.csv", index=False)

In [None]:
df1.join(train_target).corr().iloc[:,-1].sort_values()

---
## Keras Model

Using functional API to build an unnecessarily big model. Adding dropout and Ridge regularization to tackle overfit.

In [None]:
# NOTE(review): `keras` is used throughout this section, but the only keras
# import visible in this notebook (in the first import cell) is commented out —
# confirm it is imported before running these cells.
# NOTE(review): the input width is taken from `train_data`, the raw frame loaded
# at the top, not from the preprocessed frame `df1` — confirm which one is meant.
passengers = keras.Input(shape=(train_data.shape[1],), name="passenger")

# Hidden block 1: 2048 ReLU units with L2 weight penalty, followed by dropout.
mid_layers = keras.layers.Dense(2048,
                                kernel_regularizer= keras.regularizers.l2(0.003),
                                activation="relu")(passengers)
mid_layers = keras.layers.Dropout(0.3)(mid_layers)

# Hidden block 2: 2048 ReLU units; regularizer and dropout are disabled here.
mid_layers = keras.layers.Dense(2048,
#                                 kernel_regularizer= keras.regularizers.l2(0.003),
                                activation="relu")(mid_layers)
# mid_layers = keras.layers.Dropout(0.3)(mid_layers)

# Hidden block 3: same configuration as block 1.
mid_layers = keras.layers.Dense(2048,
                                kernel_regularizer= keras.regularizers.l2(0.003),
                                activation="relu")(mid_layers)
mid_layers = keras.layers.Dropout(0.3)(mid_layers)

# mid_layers = keras.layers.Dense(1024,
#                                 kernel_regularizer= keras.regularizers.l2(0.001),
#                                 activation="relu")(mid_layers)
# mid_layers = keras.layers.Dropout(0.2)(mid_layers)

# Output: a single sigmoid unit.
vitals = keras.layers.Dense(1, activation="sigmoid", name="vitals")(mid_layers)

Earlier, it was noted that the validation loss starts to "oscillate" during later epochs when the learning rate is high. Thus, a callback is defined to halve the rate every 10 epochs.

In [None]:
def scheduler(epoch, lr):
    """Halve the learning rate at every 10th epoch after epoch 0.

    Returns `lr / 2` when `epoch` is a positive multiple of 10, otherwise `lr`
    unchanged.
    """
    is_decay_epoch = epoch > 0 and epoch % 10 == 0
    return lr / 2 if is_decay_epoch else lr
callback = keras.callbacks.LearningRateScheduler(scheduler, 1)

In [None]:
# Number of training epochs; also used by the plotting cells for the x ticks.
epochs_num = 60
model = keras.Model(inputs=passengers, outputs=vitals)

model.compile(
    optimizer=keras.optimizers.RMSprop(1e-3),
    loss="BinaryCrossentropy", metrics=["accuracy"])

# A quarter of the rows is held out for validation; the learning-rate
# scheduler callback defined above is applied during training.
# NOTE(review): `train_data` is the raw frame loaded at the top (it still has
# `object` columns), not the preprocessed `df1` — confirm this is the intended input.
history = model.fit(
    train_data, train_target, batch_size=4300, epochs=epochs_num,
    validation_split=.25, shuffle=True, callbacks = [callback])

In [None]:
for _ in range(5):
    sample = train_data.sample(4000)
    # train_target[sample.index]
    model.evaluate(sample, train_target[sample.index])

In [None]:
# Training vs. validation loss, one point per epoch.
loss = history.history["loss"].copy()
val_loss = history.history["val_loss"].copy()
epoch_axis = range(1, len(loss)+1)
# history["loss"] is the loss on the training split, so it is labelled "Train".
g_loss = sns.lineplot(x=epoch_axis, y=loss, label="Train")
sns.scatterplot(x=epoch_axis, y=val_loss, ax=g_loss, label="Validation")
g_loss.set_ylim(0,1.5)
g_loss.set_ylabel("Logistic Loss")
# Keras records one history entry per epoch, not per batch.
g_loss.set_xlabel("Epoch")
g_loss.set_xticks(range(0, epochs_num+10, 10))
g_loss.grid(True, axis="x")

In [None]:
# Training vs. validation accuracy, one point per epoch.
acc = history.history["accuracy"].copy()
val_acc = history.history["val_accuracy"].copy()
epoch_axis = range(1, len(acc)+1)
# history["accuracy"] is measured on the training split, so it is labelled "Train".
g_acc = sns.lineplot(x=epoch_axis, y=acc, label="Train")
sns.scatterplot(x=epoch_axis, y=val_acc, ax=g_acc, label="Validation")
g_acc.set_ylim(0.5,1)
g_acc.set_ylabel("Accuracy")
# Keras records one history entry per epoch, not per batch.
g_acc.set_xlabel("Epoch")
g_acc.set_xticks(range(0, epochs_num+10, 10))
g_acc.grid(True, axis="x")

In [None]:
keras.utils.plot_model(model, show_shapes=True)

## Submission

In [None]:
predict = model.predict(test_data)
predict = pd.Series(predict.reshape(-1,))

In [None]:
# Threshold the predicted scores at 0.5 to obtain boolean labels.
predict = predict >= .5
predict.value_counts()

In [None]:
submit = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")
submit["Transported"] = predict

In [None]:
submit.to_csv("submission.csv", index=False)
submit

In [None]:
model.save("version10.keras")