# Spaceship Titanic
The goal of this notebook is to predict which passengers of the Spaceship Titanic were transported to an alternate dimension.  It is a submission for the corresponding [Kaggle competition](https://www.kaggle.com/competitions/spaceship-titanic/overview).  After some data cleaning and exploratory analysis, we apply and evaluate several machine learning algorithms to predict the labels for the competition data.

In [None]:
# import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone

%matplotlib inline

sns.set_style('dark')

# Read data and some EDA

In [None]:
# read data in a Pandas DataFrame
def get_data(path):
    """
    Read a Spaceship Titanic CSV file and return it with engineered features

    Arguments:
    path: Location of the CSV file (anything accepted by pd.read_csv)

    Returns:
    A DataFrame indexed by PassengerId.  The Cabin and Name columns are
    removed and the following columns are appended, in this order:
    TotalSpending: Sum of the five amenity columns; missing amounts are skipped
    Deck: First character of Cabin
    Side: Last character of Cabin
    AgeGroup: 'Under18' or 'Adult'; missing when Age is missing
    PeopleInGroup: Number of passengers sharing the first four characters of PassengerId
    WithFamily: True when the group has more than one member and at least two
        of them share a last name; missing when any member of the group has no name
    """
    df = pd.read_csv(path)

    # Calculate total spending
    amenity_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalSpending'] = df[amenity_columns].sum(axis='columns')

    # Extract useful features from cabin number
    df['Deck'] = df['Cabin'].str[0]
    df['Side'] = df['Cabin'].str[-1]

    # Extract age group; start from an all-missing object column so that rows
    # without an age stay missing and no dtype change is needed on assignment
    age_group = pd.Series(np.nan, index=df.index, dtype=object)
    age_group[df['Age'] < 18] = 'Under18'
    age_group[df['Age'] >= 18] = 'Adult'
    df['AgeGroup'] = age_group

    # Find whether they travel with family.  The helper series are kept out of
    # the frame, so there is nothing to drop afterwards.
    group_id = df['PassengerId'].str[:4]
    # .str[-1] yields a missing value (instead of raising) for an empty split
    last_name = df['Name'].str.split().str[-1]
    people_in_group = df['PassengerId'].groupby(group_id).transform('count')
    last_names_in_group = last_name.groupby(group_id).transform('nunique')
    # 'count' ignores missing names, so a shortfall means someone is unnamed
    group_has_unnamed = last_name.groupby(group_id).transform('count') < people_in_group

    with_family = (people_in_group > 1) & (last_names_in_group < people_in_group) & ~group_has_unnamed
    if group_has_unnamed.any():
        # A boolean column cannot hold NaN: convert explicitly before marking
        # the undecidable groups as missing
        with_family = with_family.astype(object)
        with_family[group_has_unnamed] = np.nan

    df['PeopleInGroup'] = people_in_group
    df['WithFamily'] = with_family

    # Drop Cabin and Name columns and set Passenger id as the index
    return df.drop(columns=['Cabin', 'Name']).set_index('PassengerId')

# Load the labelled training file through get_data (defined in this cell) and
# display the resulting frame
path = '/kaggle/input/spaceship-titanic/train.csv'
titanic_df = get_data(path)
titanic_df

In [None]:
# Column dtypes and non-null counts; this is where the missing values show up
titanic_df.info()

In [None]:
# Summary statistics of the numerical columns
titanic_df.describe()

In [None]:
# Numerical features; this list is reused when the model inputs are selected
numerical_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpending']
# pairplot is a figure-level function that builds its own figure, so a preceding
# plt.figure(figsize=...) call only leaves an empty figure behind.  The overall
# size is controlled through `height` (inches per facet) instead: 15 inches in
# total, split over one facet per numerical column.
sns.pairplot(data=titanic_df[numerical_columns + ['Transported']], hue='Transported',
             height=15 / len(numerical_columns));

In [None]:
# Categorical features; this list is reused by the plots and the model inputs
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'PeopleInGroup', 'WithFamily', 'AgeGroup']
# Print how often each value occurs, one feature after the other
for col in categorical_columns:
    value_frequencies = titanic_df[col].value_counts()
    print(value_frequencies)

In [None]:
# One bar chart per categorical feature on a two-column grid.  The number of
# rows follows from the number of features instead of being hard-coded.
n_grid_cols = 2
n_grid_rows = -(-len(categorical_columns) // n_grid_cols)  # ceiling division
fig, axs = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 10 * n_grid_rows),
                        sharey=True, squeeze=False)
all_axes = axs.ravel()

for ax, col in zip(all_axes, categorical_columns):
    # barplot shows the mean of y per category, i.e. the transported fraction
    sns.barplot(data=titanic_df, x=col, y='Transported', ax=ax)
    ax.set_xlabel(col, size='large')
    # The y axis is shared, so only the left-most chart of each row is labelled
    if ax.get_subplotspec().is_first_col():
        ax.set_ylabel('Transported fraction', size='large')
    else:
        ax.set_ylabel('')
    ax.set_title('Fraction of transported people per {} value'.format(col), size='xx-large')

# Hide grid cells that have no feature to show (odd number of features)
for unused_ax in all_axes[len(categorical_columns):]:
    unused_ax.set_visible(False)
plt.show()

#### Observations: 
1. There are some missing values that need to be taken care of.
2. Columns HomePlanet and Destination are categorical.
3. All amenities columns are heavily skewed to the right.
4. Most passengers that are transported seem to have low RoomService, Spa and VRDeck values.
5. Among the categorical features, the most important one seems to be CryoSleep: about 80% of those in cryo sleep seem to have been transported, while only about 35% of those not in cryo sleep have.
6. There are at least two distinct age groups.  We can extract a categorical variable.

## Data Cleaning - Feature Engineering

Before applying any transformation, we separate the data into a training and a testing set.

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs: every numerical and categorical feature; target: Transported
feature_columns = numerical_columns + categorical_columns
X = titanic_df[feature_columns]
y = titanic_df['Transported']

# Hold out part of the labelled data for the final evaluation of each model;
# the fixed seed keeps the split identical between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

Steps for data cleaning:
1. Impute missing values of numerical features with the median of each one
2. Apply log1p to the five amenities features and to TotalSpending
3. Standard scale all numerical features
4. One-hot encode categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline  import make_pipeline

# Every numerical column except Age holds a spending amount
amenities = [col for col in numerical_columns if col != 'Age']

# Spending amounts: median imputation, log1p to reduce the right skew, scaling
amenities_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    FunctionTransformer(func=np.log1p, inverse_func=np.expm1),
    StandardScaler()
)

# Age: median imputation and scaling only
age_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

# scikit-learn 1.2 renamed the `sparse` argument of OneHotEncoder to
# `sparse_output` and later releases removed the old name.  Try the new name
# first and fall back to the old one on releases that do not know it yet.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

col_transformer = make_column_transformer(
    (amenities_transformer, amenities),
    (age_transformer, ['Age']),
    (one_hot_encoder, categorical_columns)
)

col_transformer

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Preprocessing followed by the classifier.  The forest is seeded (with the same
# seed as the train/test split) so that repeated runs of the notebook grow the
# same trees and report the same scores.
rand_forest_pipe = make_pipeline(col_transformer, RandomForestClassifier(random_state=123))

rand_forest_pipe

In [None]:
# Find optimal parameters for the model
from sklearn.model_selection import GridSearchCV

# Keys follow the '<pipeline step>__<argument>' convention of scikit-learn
parameter_grid = dict([
    ('randomforestclassifier__n_estimators', [5, 10, 20, 50, 100, 200]),
    ('randomforestclassifier__criterion', ['gini', 'entropy']),
    ('randomforestclassifier__bootstrap', [True, False]),
])

# Cross-validated search over every combination, ranked by accuracy
gridRandForest = GridSearchCV(
    estimator=rand_forest_pipe,
    param_grid=parameter_grid,
    scoring='accuracy',
    verbose=3,
    n_jobs=4,
)

gridRandForest.fit(X_train, y_train)

In [None]:
# Parameter combination of the random forest grid that reached the best
# cross-validated accuracy
display(gridRandForest.best_params_)

In [None]:
# Evaluate the tuned random forest on the held-out split
rf_test_accuracy = gridRandForest.score(X_test, y_test)
display('Accuracy score: {:.4f}'.format(rf_test_accuracy))

In [None]:
from sklearn.metrics import confusion_matrix

#Create and display a confusion matrix (rows: true labels, columns: predictions)
cf_matrix = confusion_matrix(y_test, gridRandForest.predict(X_test))
# fmt='d' prints the integer counts in full; the default '.2g' format would show
# any count of three or more digits in scientific notation
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion matrix for the Random Forest Model')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_xticklabels(['False', 'True'])
ax.set_yticklabels(['False', 'True']);

In [None]:
# Prepare data for submission: the unlabelled competition file goes through the
# same get_data feature engineering as the training file

path = '/kaggle/input/spaceship-titanic/test.csv'
sub_df = get_data(path)
sub_df.head()

In [None]:
# Refit the winning configuration on all labelled data before predicting the
# competition set.  The refit runs on a clone so that gridRandForest keeps the
# model trained on X_train only; refitting best_estimator_ in place would make
# any later gridRandForest.score(X_test, ...) call report a leaked score.
best_model = clone(gridRandForest.best_estimator_).fit(X, y)
best_model

In [None]:
# Predicted label for every competition passenger, indexed by PassengerId
predicted_labels = best_model.predict(sub_df)
predictions_df = pd.Series(predicted_labels, index=sub_df.index, name='Transported').to_frame()
predictions_df.head()

In [None]:
# Save the random forest predictions; the PassengerId index is written as the first column
predictions_df.to_csv('./predictionsRFC.csv')

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Preprocessing followed by the classifier.  The estimator is seeded (with the
# same seed as the train/test split) because the 'sag', 'saga' and 'liblinear'
# solvers tried below shuffle the data and would otherwise give different
# results on every run.
log_reg_pipe = make_pipeline(col_transformer, LogisticRegression(random_state=123))

log_reg_pipe

In [None]:
# One sub-grid per penalty, because each penalty supports only some solvers.
# `warm_start` is deliberately not searched: it only reuses the coefficients of
# a previous fit() call on the same object, and the grid search fits a fresh
# clone for every candidate, so both of its values train the same model and
# merely double the number of fits.
# NOTE(review): scikit-learn 1.2 deprecated the string 'none' in favour of
# None for `penalty` - confirm against the installed version.
parameter_grid = [{
    'logisticregression__penalty': ['l1'],
    'logisticregression__solver': ['liblinear', 'saga']
}, {
    'logisticregression__penalty': ['l2'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}, {
    'logisticregression__penalty': ['none'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga']
}, {
    'logisticregression__penalty': ['elasticnet'],
    'logisticregression__solver': ['saga'],
    'logisticregression__l1_ratio': np.arange(0.1, 1, 0.1)
}]

# Cross-validated search over every combination, ranked by accuracy
gridLogReg = GridSearchCV(log_reg_pipe, parameter_grid, scoring='accuracy', verbose=3, n_jobs=4)

gridLogReg.fit(X_train, y_train)

In [None]:
# Parameter combination of the logistic regression grid that reached the best
# cross-validated accuracy
gridLogReg.best_params_

In [None]:
#Evaluate model using test data
# The model was trained on the boolean labels in y_train, so it is scored
# against the boolean y_test as well (as in the random forest section) rather
# than against an integer copy of it.
display('Accuracy score: {:.4f}'.format(gridLogReg.score(X_test, y_test)))

In [None]:
#Create and display a confusion matrix (rows: true labels, columns: predictions)
cf_matrix = confusion_matrix(y_test, gridLogReg.predict(X_test))
# fmt='d' prints the integer counts in full; the default '.2g' format would show
# any count of three or more digits in scientific notation
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
# The title names the model evaluated in this section
ax.set_title('Confusion matrix for the Logistic Regression Model')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_xticklabels(['False', 'True'])
ax.set_yticklabels(['False', 'True']);

In [None]:
# Refit the winning configuration on all labelled data before predicting the
# competition set.  The refit runs on a clone so that gridLogReg keeps the model
# trained on X_train only; refitting best_estimator_ in place would make any
# later gridLogReg.score(X_test, ...) call report a leaked score.
best_model = clone(gridLogReg.best_estimator_).fit(X, y)
best_model

In [None]:
# Predicted label for every competition passenger, indexed by PassengerId
predicted_labels = best_model.predict(sub_df)
predictions_df = pd.Series(predicted_labels, index=sub_df.index, name='Transported').to_frame()
predictions_df.head()

In [None]:
# Save the logistic regression predictions; the PassengerId index is written as the first column
predictions_df.to_csv('./predictionsLogReg.csv')

# Gradient Boosted Decision Tree (XGBoost)
Let's check whether this model performs better.

In [None]:
from xgboost import XGBClassifier

# Preprocessing followed by the gradient-boosted classifier.  The classifier is
# limited to one job; the grid search below runs several fits in parallel.
xgb_pipe = make_pipeline(col_transformer, XGBClassifier(use_label_encoder=False, verbosity=0, n_jobs=1))

xgb_pipe

In [None]:
# The learning rates are listed explicitly: np.arange with a floating-point
# step can gain or lose its last element through rounding and yields values
# such as 0.06999999999999999, which then show up in best_params_.
parameter_grid = {
    'xgbclassifier__n_estimators': [5, 10, 20, 50, 100, 200, 300, 400],
    'xgbclassifier__learning_rate': [0.04, 0.05, 0.06, 0.07, 0.08],
    'xgbclassifier__booster': ['gbtree']
}

# Cross-validated search over every combination, ranked by accuracy
gridXGB = GridSearchCV(xgb_pipe, parameter_grid, scoring='accuracy', verbose=3, n_jobs=4)

gridXGB.fit(X_train, y_train.astype(int)) # labels are recommended to be integers

In [None]:
# Parameter combination of the XGBoost grid that reached the best
# cross-validated accuracy
gridXGB.best_params_

In [None]:
# Evaluate the tuned XGBoost model on the held-out split; the labels are cast
# to integers because the model was trained on integer labels
xgb_test_accuracy = gridXGB.score(X_test, y_test.astype(int))
display('Accuracy score: {:.4f}'.format(xgb_test_accuracy))

In [None]:
#Create and display a confusion matrix (rows: true labels, columns: predictions)
# The model was trained on integer labels and predicts integers, so the true
# labels are cast to integers too instead of mixing booleans with integers
cf_matrix = confusion_matrix(y_test.astype(int), gridXGB.predict(X_test))
# fmt='d' prints the integer counts in full; the default '.2g' format would show
# any count of three or more digits in scientific notation
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion matrix for the Gradient-Boosted Decision Tree Model')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_xticklabels(['False', 'True'])
ax.set_yticklabels(['False', 'True']);

In [None]:
# Refit the winning configuration on all labelled data (integer labels, as in
# the grid search) before predicting the competition set.  The refit runs on a
# clone so that gridXGB keeps the model trained on X_train only; refitting
# best_estimator_ in place would make any later gridXGB.score(X_test, ...) call
# report a leaked score.
best_model = clone(gridXGB.best_estimator_).fit(X, y.astype(int))
best_model

In [None]:
# Predicted label for every competition passenger, indexed by PassengerId.
# The predictions are cast to booleans before being stored.
predicted_labels = pd.Series(best_model.predict(sub_df), index=sub_df.index, name='Transported')
predictions_df = predicted_labels.astype(bool).to_frame()
predictions_df.head()

In [None]:
# Save the XGBoost predictions; the PassengerId index is written as the first column
predictions_df.to_csv('./predictionsXGB.csv')

# k-Nearest Neighbors
Predict labels using a kNN classifier

In [None]:
from sklearn.neighbors  import KNeighborsClassifier

# Preprocessing followed by the classifier; the scaled features produced by
# col_transformer are what the neighbor distances are computed on
knn_pipe = make_pipeline(col_transformer, KNeighborsClassifier())
knn_pipe

In [None]:
# Keys follow the '<pipeline step>__<argument>' convention of scikit-learn
parameter_grid = dict([
    ('kneighborsclassifier__n_neighbors', [*range(15, 30)]),
    ('kneighborsclassifier__weights', ['uniform', 'distance']),
    ('kneighborsclassifier__algorithm', ['ball_tree', 'kd_tree', 'brute']),
])

# Cross-validated search over every combination, ranked by accuracy
gridKNN = GridSearchCV(
    estimator=knn_pipe,
    param_grid=parameter_grid,
    scoring='accuracy',
    verbose=3,
    n_jobs=4,
)

gridKNN.fit(X_train, y_train)

In [None]:
# Parameter combination of the kNN grid that reached the best cross-validated accuracy
gridKNN.best_params_

In [None]:
#Evaluate model using test data
# The model was trained on the boolean labels in y_train, so it is scored
# against the boolean y_test as well (as in the random forest section) rather
# than against an integer copy of it.
display('Accuracy score: {:.4f}'.format(gridKNN.score(X_test, y_test)))

In [None]:
#Create and display a confusion matrix (rows: true labels, columns: predictions)
cf_matrix = confusion_matrix(y_test, gridKNN.predict(X_test))
# fmt='d' prints the integer counts in full; the default '.2g' format would show
# any count of three or more digits in scientific notation
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion matrix for the k-Nearest Neighbors Model')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_xticklabels(['False', 'True'])
ax.set_yticklabels(['False', 'True']);

In [None]:
# Refit the winning configuration on all labelled data before predicting the
# competition set.  The refit runs on a clone so that gridKNN keeps the model
# trained on X_train only; refitting best_estimator_ in place would make any
# later gridKNN.score(X_test, ...) call report a leaked score.
best_model = clone(gridKNN.best_estimator_).fit(X, y)
best_model

In [None]:
# Predicted label for every competition passenger, indexed by PassengerId.
# The predictions are cast to booleans before being stored.
predicted_labels = pd.Series(best_model.predict(sub_df), index=sub_df.index, name='Transported')
predictions_df = predicted_labels.astype(bool).to_frame()
predictions_df.head()

In [None]:
# Save the kNN predictions; the PassengerId index is written as the first column
predictions_df.to_csv('./predictionsKNN.csv')

# Artificial neural network

In [None]:
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.losses import mean_squared_error

def getDNN_model(features = 4, hidden_layer_units = 25, hidden_layers = 1, optimizer = 'adam'):
    """
    Build and compile a fully connected network for binary classification

    The network consists of a first ReLU layer with `features + 1` units,
    `hidden_layers` further ReLU layers of `hidden_layer_units` units each and
    a single sigmoid output unit.  It is compiled with the binary
    cross-entropy loss.

    Arguments:
    features: The number of features in the input data
    hidden_layer_units: The number of units in each hidden layer of the network
    hidden_layers: The number of hidden layers in the network
    optimizer: The optimizer used by the network

    Returns:
    The compiled Sequential model
    """
    # Layer fed by the input features
    layers = [Dense(1 + features, input_shape=(features,), activation='relu', kernel_initializer='normal')]
    # Configurable stack of equally sized hidden layers
    layers.extend(
        Dense(hidden_layer_units, activation='relu', kernel_initializer='normal')
        for _ in range(hidden_layers)
    )
    # One sigmoid unit for the output
    layers.append(Dense(1, activation='sigmoid', kernel_initializer='normal'))

    model = Sequential(layers)
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    return model

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier

dnn_pipe = make_pipeline(col_transformer, KerasClassifier(build_fn=getDNN_model, verbose=0))

# Width of the matrix that the preprocessing step produces for X_train; the
# network needs it to size its input.
# NOTE(review): this assumes every cross-validation fold one-hot encodes to the
# same number of columns as the whole of X_train - confirm for rare categories.
n_encoded_features = col_transformer.fit_transform(X_train).shape[-1]

# Keys follow the '<pipeline step>__<argument>' convention of scikit-learn
parameter_grid = dict([
    ('kerasclassifier__optimizer', ['RMSprop', 'Adam', 'Adamax', 'Nadam']),
    ('kerasclassifier__hidden_layers', [*range(1, 4)]),
    ('kerasclassifier__hidden_layer_units', [20, 50, 100]),
    ('kerasclassifier__epochs', [10, 20, 30]),
    ('kerasclassifier__features', [n_encoded_features]),
])

# Cross-validated search over every combination, ranked by accuracy
gridDNN = GridSearchCV(
    estimator=dnn_pipe,
    param_grid=parameter_grid,
    scoring='accuracy',
    verbose=3,
    n_jobs=4,
)

gridDNN.fit(X_train, y_train)

In [None]:
# Parameter combination of the neural network grid that reached the best
# cross-validated accuracy
gridDNN.best_params_

In [None]:
# Evaluate the tuned neural network on the held-out split
dnn_test_accuracy = gridDNN.score(X_test, y_test)
display('Accuracy score: {:.4f}'.format(dnn_test_accuracy))

In [None]:
#Create and display a confusion matrix (rows: true labels, columns: predictions)
cf_matrix = confusion_matrix(y_test, gridDNN.predict(X_test))
# fmt='d' prints the integer counts in full; the default '.2g' format would show
# any count of three or more digits in scientific notation
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion matrix for the Deep Neural Network Model')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_xticklabels(['False', 'True'])
ax.set_yticklabels(['False', 'True']);

In [None]:
# Refit the winning configuration on all labelled data before predicting the
# competition set.  The refit runs on a clone so that gridDNN keeps the model
# trained on X_train only; refitting best_estimator_ in place would make any
# later gridDNN.score(X_test, ...) call report a leaked score.
best_model = clone(gridDNN.best_estimator_).fit(X, y)
best_model

In [None]:
# Predicted label for every competition passenger, indexed by PassengerId.
# .T[0] takes the first row of the transposed prediction array.
# NOTE(review): presumably predict returns one column per sample batch, i.e. an
# (n_samples, 1) array - confirm against the Keras wrapper in use.
predicted_labels = best_model.predict(sub_df).T[0]
predictions_df = pd.Series(predicted_labels, index=sub_df.index, name='Transported').to_frame()
predictions_df.head()

In [None]:
# Save the neural network predictions; the PassengerId index is written as the first column
predictions_df.to_csv('./predictionsDNN.csv')

# Stack all models

In [None]:
from sklearn.ensemble import StackingClassifier

def _tuned_params(grid):
    """Return grid.best_params_ with everything up to the last '__' removed from each key."""
    return {key.split('__')[-1]: value for key, value in grid.best_params_.items()}

# Best parameters of every grid search, keyed by plain estimator argument name
randForestParams = _tuned_params(gridRandForest)
kNNParams = _tuned_params(gridKNN)
LRParams = _tuned_params(gridLogReg)
XGBParams = _tuned_params(gridXGB)
DNNParams = _tuned_params(gridDNN)

# Fresh, unfitted estimators configured with the tuned parameters
rf = RandomForestClassifier()
rf.set_params(**randForestParams)
knn = KNeighborsClassifier()
knn.set_params(**kNNParams)
lr = LogisticRegression()
lr.set_params(**LRParams)
xgb = XGBClassifier(use_label_encoder=False, verbosity=0)
xgb.set_params(**XGBParams)
dnn = KerasClassifier(build_fn=getDNN_model, verbose=0)
dnn.set_params(**DNNParams)
# Mark the Keras wrapper as a classifier for the stacking estimator
dnn._estimator_type = 'classifier'

base_estimators = [
    ('rf', rf),
    ('knn', knn),
    ('lr', lr),
    ('xgb', xgb),
    ('dnn', dnn)
]
stack_pipe = make_pipeline(col_transformer, StackingClassifier(estimators=base_estimators))

stack_pipe.fit(X_train, y_train.astype(int))

In [None]:
# Evaluate the stacked model on the held-out split; the labels are cast to
# integers because the stack was trained on integer labels
stack_test_accuracy = stack_pipe.score(X_test, y_test.astype(int))
display('Accuracy score: {:.4f}'.format(stack_test_accuracy))

In [None]:
#Create and display a confusion matrix (rows: true labels, columns: predictions)
# The stack was trained on integer labels and predicts integers, so the true
# labels are cast to integers too instead of mixing booleans with integers
cf_matrix = confusion_matrix(y_test.astype(int), stack_pipe.predict(X_test))
# fmt='d' prints the integer counts in full; the default '.2g' format would show
# any count of three or more digits in scientific notation
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion matrix for the Stacked Model')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_xticklabels(['False', 'True'])
ax.set_yticklabels(['False', 'True']);

In [None]:
# Refit the stack on all labelled data, then label every competition passenger.
# The integer predictions are converted to booleans before being stored.
stack_pipe.fit(X, y.astype(int))
stacked_labels = stack_pipe.predict(sub_df).astype(bool)
predictions_df = pd.Series(stacked_labels, index=sub_df.index, name='Transported').to_frame()
predictions_df.head()

In [None]:
# Save the stacked-model predictions; the PassengerId index is written as the first column
predictions_df.to_csv('./predictionsStack.csv')