This is my notebook for the Spaceship Titanic competition. I will start with some exploratory data analysis (EDA), then clean the data, do more EDA, and finally make predictions using XGBoost with hyperparameters tuned by Optuna.

First, let's start with importing packages.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import optuna

import warnings
warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Loading Data

Now that our packages have been imported, let's load up the data. The data comes in two separate datasets: train and test, and I will combine them into one for visualizations.

In [None]:
# Render floats without decimal places in DataFrame output.
pd.set_option('display.float_format', '{:.0f}'.format)

# Extra read_csv keyword arguments: CryoSleep and VIP are read as int64
# because they are used as model features later on.
# NOTE(review): an int64 column cannot hold missing values -- confirm both
# columns are complete and integer-parseable in the raw CSV files.
new_dtypes = {'dtype': {'CryoSleep': 'int64', 'VIP': 'int64'}}

train, test = (
    pd.read_csv(csv_path, **new_dtypes)
    for csv_path in ('../input/spaceship-titanic/train.csv',
                     '../input/spaceship-titanic/test.csv')
)

# Stack train on top of test (each frame keeps its own row labels) so the
# exploration below can look at both at once.
all_data = pd.concat([train, test])
all_data.head()

In [None]:
# Report the (rows, columns) shape of each frame.
for label, frame in (('Dimensions of Training data:', train),
                     ('Dimensions of Testing Data:', test),
                     ('Dimensions of All Data:', all_data)):
    print(label, frame.shape)

In [None]:
all_data.info()

It turns out that there are missing values everywhere except for PassengerId, CryoSleep, and VIP. I'll get to that later. 

Also, it is stated that some columns pack several factors into a single entry, so we need to split them into separate columns. Cabin, PassengerId, and Name are the columns that contain this type of data.

In [None]:
#Split columns
# Each spec is (source column, separator, max number of splits, new columns).
# Capping the number of splits guarantees the result never has more pieces
# than there are target columns (e.g. a name made of three words still yields
# exactly FirstName and LastName instead of raising on assignment).
_split_specs = [
    ('Cabin', '/', 2, ['Deck', 'Number', 'Side']),
    ('PassengerId', '_', 1, ['Group', 'GroupNum']),
    ('Name', ' ', 1, ['FirstName', 'LastName']),
]

# Apply the same splits to all three frames so their columns stay in sync.
for _frame in (train, test, all_data):
    for _source, _sep, _max_splits, _targets in _split_specs:
        _frame[_targets] = _frame[_source].str.split(_sep, n=_max_splits, expand=True)

In [None]:
all_data.describe()

In [None]:
all_data.info()

In [None]:
all_data.shape

## EDA

In [None]:
# For every object-dtype column, print how many distinct values it has,
# followed by the distinct values themselves.
object_cols = all_data.select_dtypes('object').columns
for col in object_cols:
    values = all_data[col]
    print(f'No. of unique values for {col}: {values.nunique()}')
    print(values.unique())

In [None]:
all_data.HomePlanet.value_counts()

Most of the people in the spaceship were from Earth.

In [None]:
all_data.CryoSleep.value_counts()

In [None]:
all_data.Destination.value_counts()

Most of the people in the spaceship were trying to get to TRAPPIST-1e.

In [None]:
all_data.Transported.value_counts()

About half of the members have been transported into another dimension.

In [None]:
# Global plot styling: seaborn dark grid, bold axis labels and titles,
# and an 8x5 default figure size.
sns.set_style('darkgrid')
plt.rc('axes', labelsize=14, labelweight='bold', titlesize=16, titleweight='bold')
plt.rc('figure', figsize=(8,5))

# One histogram (50 bins plus a KDE curve) per numeric column on a 2x3 grid.
num_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
fig, axes = plt.subplots(2, 3, figsize=(15,8))
for ax, col in zip(axes.ravel(), num_cols):
    sns.histplot(all_data[col], bins=50, kde=True, color='blue', ax=ax)
fig.tight_layout()

From these histograms, two statements could be made:
- Most of the members in the spaceship are young adults (18-25).
- Only a small number of members spent money on the ship.

We can determine which groups of people were the most likely to be transported with countplots. Every factor will be taken into consideration.

In [None]:
# Bar chart of the Transported counts for each CryoSleep value.
pd.crosstab(all_data.CryoSleep, all_data.Transported).plot(kind='bar');

In [None]:
# Bar chart of the Transported counts for each VIP value.
pd.crosstab(all_data.VIP, all_data.Transported).plot(kind='bar');

In [None]:
# Bar chart of the Transported counts for each Destination.
pd.crosstab(all_data.Destination, all_data.Transported).plot(kind='bar');

In [None]:
# Bar chart of the Transported counts for each HomePlanet.
pd.crosstab(all_data.HomePlanet, all_data.Transported).plot(kind='bar');

In [None]:
# Count plot of HomePlanet, coloured by Destination, with one panel per
# Transported value.
sns.catplot(x='HomePlanet', hue='Destination', col='Transported', data=all_data, kind='count');

From these plots, it turns out that most of the members who were in CryoSleep ended up being transported. Any member who was not from Earth had a higher chance, and if any of the members were going to TRAPPIST-1e, there is a slight chance they would be transported as well. VIP and Destination did not seem to have an effect on the chances of being transported.

## Imputing Missing Values

In [None]:
def get_null_info(df):
    """Summarize the missing values in each column of ``df``.

    Returns a DataFrame indexed by column name holding the number of nulls
    ('Values') and the fraction of rows that are null ('Probablity'), sorted
    by null count in descending order and limited to the first 20 rows.

    Side effect: switches the global pandas float display format to four
    decimal places.
    """
    pd.set_option('display.float_format', '{:.4f}'.format)
    null_counts = df.isnull().sum()
    summary = pd.DataFrame(
        {'Values': null_counts, 'Probablity': null_counts / len(df)},
        index=df.columns,
    )
    ranked = summary.sort_values(by='Values', ascending=False)
    return ranked.head(20)

get_null_info(train)

In [None]:
get_null_info(test)

As stated before, there is a lot of missing data in both the training and testing sets. The missing values will be imputed using the function defined below, which relies on the fillna method.

In [None]:
#Impute missing values
def impute(df):
    """Fill every missing value in ``df`` in place.

    float64 columns are filled with a statistic of the column itself: the
    median for 'Age', the most frequent value for 'Transported', and the mean
    for any other float column. Columns of every other dtype are filled with
    the placeholder string 'Unknown'.

    Args:
        df: DataFrame to clean; its columns are overwritten with the filled
            versions.

    Returns:
        None. The frame passed in is modified.
    """
    # Determine the float columns once instead of on every loop iteration.
    float_cols = set(df.select_dtypes('float64').columns)

    for col in df.columns:
        if col not in float_cols:
            fill_value = 'Unknown'
        elif col == 'Age':
            fill_value = df[col].median()
        elif col == 'Transported':
            fill_value = df[col].mode()[0]
        else:
            fill_value = df[col].mean()

        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the in-place form operates on an intermediate object and
        # leaves the frame untouched when pandas Copy-on-Write is enabled.
        df[col] = df[col].fillna(fill_value)

# Fill the gaps in each of the three frames; impute() returns nothing and
# works on the frame it is given.
impute(train)
impute(test)
impute(all_data)

In [None]:
get_null_info(train)

In [None]:
get_null_info(test)

There are no missing values left.

Additionally, we can also create a feature which totals up the amount of money each individual spent.

In [None]:
#Create the total bill feature
def _total_bill(df):
    """Return the element-wise sum of the five spending columns of ``df``."""
    return df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']


# Add the feature to each of the three frames.
for _frame in (train, test, all_data):
    _frame['Total_Bill'] = _total_bill(_frame)

This is how the data looks now.

In [None]:
all_data.head()

## EDA Part 2

With our newly transformed dataset, we can now do even more data exploration. For instance, we can see how many members are in each group.

In [None]:
# Distribution of group sizes: the inner value_counts gives the number of rows
# per Group value, the outer one counts how many groups have each size.
all_data.Group.value_counts().value_counts()

In [None]:
#Count how many rows share each Group value, largest counts first
group_sizes = all_data.Group.value_counts().sort_values(ascending=False)
group_sizes

In [None]:
def new_index(x):
    """Turn a group size into a row label such as '1 Member' or '3 Members'."""
    suffix = ' Member' if x == 1 else ' Members'
    return str(x) + suffix


# Number of groups of each size, ordered by size, with readable row labels.
group_sizes.value_counts().sort_index().rename(new_index)

From this analysis, the largest groups had 8 members, and most of the members on the ship weren't grouped with anyone.

In [None]:
# Number of rows for each Deck value.
deck_counts = all_data.Deck.value_counts()
deck_counts

In [None]:
all_data.Side.value_counts()

In [None]:
# Count plot of Deck, coloured by Side, with one panel per Transported value.
# Uses the training frame only.
sns.catplot(x='Deck', hue='Side', col='Transported', data=train, kind='count');

From this plot, members on Side S were more likely to be transported. Also notice how the data is skewed in each plot: Side P members were lucky enough not to be transported, while Side S members weren't.

In [None]:
group_sizes.describe()

With regard to the group sizes, the data is heavily skewed to the right, judging by the quantiles.

## Model Building and Prediction

Since we are finished analyzing the data, we can now do some data modeling. As stated before, XGBoost and Optuna will be used.

In [None]:
# Columns fed to the model.
feats = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Deck', 'Side',
]

# Feature matrices for fitting and prediction, plus the training target.
X_train, X_test = train[feats], test[feats]
y = train['Transported']

For the one-hot encoding code without the pipeline:

In [None]:
# def encode_data(df):
#     ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
#     cat_cols = list(df.select_dtypes('object', 'category'))
#     cat_cols_encoded = pd.DataFrame(ohe.fit_transform(df[cat_cols]))
#     cat_cols_encoded.index = df.index
#     df.drop(cat_cols, axis=1, inplace=True)
#     return pd.concat([df,cat_cols_encoded], axis=1)

In [None]:
# NOTE(review): as the first pipeline step, this encoder is applied to every
# column of X_train, including the numeric ones (Age and the spending
# amounts), so each distinct number becomes its own category. Consider
# one-hot encoding only the text columns, e.g. with
# sklearn.compose.ColumnTransformer.
ohe = OneHotEncoder(handle_unknown='ignore')

def objective(trial):
    """Score one Optuna trial.

    Samples a set of XGBoost hyperparameters from ``trial``, builds a
    one-hot-encoder + XGBClassifier pipeline with them, and returns the mean
    accuracy of a 10-fold cross-validation on ``X_train`` / ``y``.
    """
    # `step` is passed by keyword: Optuna deprecated passing it positionally.
    param_grid = dict(n_estimators=trial.suggest_int('n_estimators', 10, 1000, step=10),
                      learning_rate=trial.suggest_float('learning_rate', .0001, .5),
                      max_depth=trial.suggest_int('max_depth', 2, 7),
                      min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
                      colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1),
                      subsample=trial.suggest_float("subsample", 0.2, 1),
                      random_state=42)

    pipe = Pipeline([('ohe', ohe), ('xgbc', XGBClassifier(**param_grid))])
    scores = cross_val_score(pipe, X_train, y, cv=10, scoring='accuracy')
    return scores.mean()


# Seed the sampler so the search proposes the same trials on every run.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=20)

# best_trial.params only contains the suggested values, so the fixed
# random_state used during tuning is added back; otherwise a model built from
# xgb_params would not match the configuration that was scored.
xgb_params = {**study.best_trial.params, 'random_state': 42}
xgb_params

In [None]:
study.best_trial

In [None]:
# Report the best objective value found by the study as a percentage.
print('Best score: {:.2f}%'.format(study.best_value*100))

After tuning the hyperparameters, the highest score achieved was 75.71%.

Now that the work has been done, it's time to submit the results.

In [None]:
# Build the pipeline with the parameters selected by the study, fit it on the
# full training set, and predict the test set.
final_model = XGBClassifier(**xgb_params)
pipe = Pipeline([('ohe', ohe), ('xgbc', final_model)])
pipe.fit(X_train, y)
y_pred = pipe.predict(X_test)

In [None]:
# Pair each test PassengerId with its prediction and write the result to
# results.csv without the row index.
# NOTE(review): confirm y_pred holds the label values the competition expects
# (e.g. True/False rather than 0/1) -- this may depend on the XGBoost version.
submission = {'PassengerId': test['PassengerId'], 'Transported': y_pred}
output = pd.DataFrame(submission)
output.to_csv('results.csv', index=False)
output