In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Table of Contents
* [Load the training dataset](#load-data)
* [Exploratory Data Analysis](#eda)
* [Feature Engineering and missing values](#feature-engineering)
* [Encoding Approaches](#encoding)

<a id="load-data"></a>
## Load the training dataset

In [None]:
# Read the competition training CSV into a DataFrame and display it.
TRAIN_CSV_PATH = '/kaggle/input/spaceship-titanic/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df

<a id="eda"></a>
## Exploratory Data Analysis

In [None]:
# Count the missing entries in each column.
train_df.isna().sum()

Every column other than PassengerId and Transported has around 200 missing values. We'll have to come up with a strategy for filling in these missing values later.

In [None]:
# Summary statistics for the training data, using describe() defaults.
train_df.describe()

In [None]:
# Horizontal box plots of the five amenity spending columns, drawn on one shared axis.
amenity_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_df[amenity_cols].boxplot(figsize=(14, 7), vert=False);

In [None]:
# Show the passengers whose FoodCourt spending exceeds 6500.
train_df.loc[train_df['FoodCourt'].gt(6500.0)]

### Percentages of passengers who were Transported, VIP, and in Cryo-sleep

In [None]:
# One pie chart per flag column: (column in train_df, chart title).
pie_specs = [
    ('Transported', 'Transported'),
    ('VIP', 'VIP'),
    ('CryoSleep', 'Cryo Sleep'),
]
flag_counts = [train_df[column].value_counts() for column, _ in pie_specs]

fig, axes = plt.subplots(nrows=1, ncols=3)  # a single row of three panels
fig.set_size_inches(12, 4)  # wide figure so the pies sit side by side

for ax, counts, (_, title) in zip(axes, flag_counts, pie_specs):
    ax.pie(counts, labels=counts.index, startangle=90, autopct='%.1f%%')
    ax.set_title(title)

### What are the most common Home and Destination planets?

In [None]:
def plot_home_dest(df):
    """Draw side-by-side pie charts of the HomePlanet and Destination value counts in ``df``."""
    # Count first so a missing column fails before any figure is created.
    charts = [
        ('Home Planet', df['HomePlanet'].value_counts()),
        ('Destination', df['Destination'].value_counts()),
    ]

    fig, axes = plt.subplots(nrows=1, ncols=2)
    fig.set_size_inches(9, 4)

    for ax, (title, counts) in zip(axes, charts):
        ax.pie(counts, labels=counts.index, startangle=90, autopct='%.1f%%')
        ax.set_title(title)

plot_home_dest(train_df)

### What is the distribution of passengers by age?

In [None]:
# Histogram of passenger ages (pandas default binning).
train_df['Age'].plot(kind='hist');

### What is the distribution of passenger spending?

In [None]:
def plot_passenger_spending(df):
    """Plot a 20-bin histogram for each of the five amenity spending columns of ``df``.

    The panels are laid out on a 2x3 grid. Only five cells hold data; the
    unused top-right cell is hidden rather than drawn as an empty frame.
    Missing values are dropped before plotting so the histogram range is
    always computed from real numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns RoomService, FoodCourt, ShoppingMall, Spa and VRDeck.
    """
    # Grid position -> (column name, panel title).
    panels = {
        (0, 0): ('RoomService', 'Room Service'),
        (0, 1): ('FoodCourt', 'Food Court'),
        (1, 0): ('ShoppingMall', 'Shopping Mall'),
        (1, 1): ('Spa', 'Spa'),
        (1, 2): ('VRDeck', 'VR Deck'),
    }

    fig, axes = plt.subplots(2, 3)  # create a grid with 2 rows, 3 columns
    fig.set_size_inches(12, 8)

    for position, (column, title) in panels.items():
        axes[position].hist(df[column].dropna(), bins=20)
        axes[position].set_title(title)

    # Five plots on six cells: hide the leftover axes.
    axes[0, 2].set_visible(False)

plot_passenger_spending(train_df)

## Distributions for the transported

Here we'll look at the passengers that were transported and see if any of the distributions we've looked at so far are significantly different from the whole population of passengers.

In [None]:
# Keep only the rows whose Transported flag is True, then display them.
was_transported = train_df['Transported']
transported_df = train_df.loc[was_transported]
transported_df

In [None]:
# VIP and CryoSleep value counts among the transported passengers only.
vip_vals = transported_df['VIP'].value_counts()
cryo_vals = transported_df['CryoSleep'].value_counts()

fig, (vip_ax, cryo_ax) = plt.subplots(nrows=1, ncols=2)  # one row, two panels
fig.set_size_inches(9, 4)

for ax, counts, title in ((vip_ax, vip_vals, 'VIP'), (cryo_ax, cryo_vals, 'Cryo Sleep')):
    ax.pie(counts, labels=counts.index, startangle=90, autopct='%.1f%%')
    ax.set_title(title)

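Very few of the transported passengers were VIPs (note that this chart shows the VIP share among transported passengers, not the share of VIPs who were transported). Most of the transported passengers were in Cryo-sleep. Later we'll look closer at the passengers who were in Cryo-sleep to see what proportion of that group were transported.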

In [None]:
# Home planet / destination pies restricted to the transported passengers.
plot_home_dest(transported_df)

A larger proportion of transported passengers were from Europa. A slightly larger proportion were on their way to 55 Cancri e.

In [None]:
# Age histogram for the transported passengers only.
transported_df['Age'].plot(kind='hist');

The shape of the distribution of ages of transported passengers looks similar to the whole population, but it looks like very young people (less than 20 years old) were selected to be transported in much higher numbers than the other age groups.

In [None]:
# Amenity spending histograms restricted to the transported passengers.
plot_passenger_spending(transported_df)

These distributions look the same as the overall distributions. This would indicate that where passengers spent money (and time) aboard ship probably did not have much bearing on whether or not they were among the transported.

### Were the people in Cryo-sleep spending any money?

In [None]:
# Passengers whose CryoSleep flag equals True; rows with a missing flag do not match.
in_cryo = train_df['CryoSleep'].eq(True)
sleepers_df = train_df.loc[in_cryo]
sleepers_df.describe()

In [None]:
# Count the missing entries per column among the cryo-sleepers.
sleepers_df.isna().sum()

We don't really need to plot these, since we can see the maximum spent by these passengers across all services is $0. We can fill in missing values for these five columns for all of these passengers. Let's see what proportion of cryo-sleepers were transported.

In [None]:
# Pie chart of Transported value counts among the cryo-sleepers.
sleepers_df['Transported'].value_counts().plot(kind='pie');

In [None]:
# Home planet / destination pies restricted to the cryo-sleepers.
plot_home_dest(sleepers_df)

In [None]:
# Rows where CryoSleep is missing.
train_df.loc[train_df['CryoSleep'].isna()]

In [None]:
# Rows where RoomService is missing.
train_df.loc[train_df['RoomService'].isna()]

In [None]:
# Rows where FoodCourt is missing.
train_df.loc[train_df['FoodCourt'].isna()]

In [None]:
# Rows where ShoppingMall is missing.
train_df.loc[train_df['ShoppingMall'].isna()]

In [None]:
# Rows where Spa is missing.
train_df.loc[train_df['Spa'].isna()]

In [None]:
# Rows where VRDeck is missing.
train_df.loc[train_df['VRDeck'].isna()]

<a id="feature-engineering"></a>
## Feature Engineering and missing values

Fill in missing Cabin values before splitting the column into three new columns: deck, cabin number, and side of the ship.

In [None]:
# Replace missing cabins with a '/'-separated placeholder of the form 'U/-1/U'.
has_cabin = train_df['Cabin'].notna()
train_df['Cabin'] = train_df['Cabin'].where(has_cabin, 'U/-1/U')

In [None]:
# Break 'deck/num/side' cabin strings into three columns; the middle part becomes an integer.
cabin_parts = train_df['Cabin'].str.split('/', expand=True)
train_df[['Deck', 'CabinNum', 'ShipSide']] = cabin_parts
train_df['CabinNum'] = train_df['CabinNum'].astype(int)

In [None]:
# Number of passengers that share each cabin string.
cabin_occ = train_df.groupby(['Cabin'])['Cabin'].transform('count')

# Passengers with no known cabin all carry the same placeholder 'U/-1/U' (or NaN),
# so a raw count would report them as one enormous shared cabin.
# Mark their occupancy as unknown (-1) instead.
unknown_cabin = train_df['Cabin'].isna() | train_df['Cabin'].eq('U/-1/U')
train_df['CabinOcc'] = cabin_occ.mask(unknown_cabin, -1).astype(int)

In [None]:
# Number of passengers on each deck.
train_df['Deck'].value_counts()

In [None]:
# Non-null ShipSide count for every (Deck, HomePlanet) combination.
train_df.groupby(['Deck', 'HomePlanet'])[['ShipSide']].count()

In [None]:
# Non-null Deck count for every (HomePlanet, ShipSide) combination.
train_df.groupby(['HomePlanet', 'ShipSide'])[['Deck']].count()

In [None]:
# Non-null ShipSide count for every (Deck, Destination) combination.
train_df.groupby(['Deck', 'Destination'])[['ShipSide']].count()

In [None]:
# Decks treated as exclusive to one home planet: A, B, C and T to Europa, G to Earth.
# A missing HomePlanet on one of these decks is filled with that planet.
exclusive_decks = {
    'Europa': ['A', 'B', 'C', 'T'],
    'Earth': ['G'],
}
for planet, decks in exclusive_decks.items():
    fill_rows = train_df['HomePlanet'].isna() & train_df['Deck'].isin(decks)
    train_df.loc[fill_rows, 'HomePlanet'] = planet

In [None]:
# Rows whose HomePlanet is still missing.
train_df[train_df['HomePlanet'].isnull()]

Split the passenger ID column into group and number.

In [None]:
# PassengerId is split on '_' into two parts, both stored as integers.
id_parts = train_df['PassengerId'].str.split('_', expand=True).astype(int)
train_df[['PassGroup', 'PassNum']] = id_parts

In [None]:
# Size of each passenger's travel group: how many rows share the same PassGroup.
group_occ = train_df['PassGroup'].map(train_df['PassGroup'].value_counts())
train_df['GroupOcc'] = group_occ

In [None]:
def plot_deck_and_side(df):
    """Draw two pie charts for ``df``: value counts of Deck (sorted by deck label) and of ShipSide."""
    # Count first so a missing column fails before any figure is created.
    charts = [
        ('Deck', df['Deck'].value_counts().sort_index()),
        ('Side', df['ShipSide'].value_counts()),
    ]

    fig, axes = plt.subplots(nrows=1, ncols=2)
    fig.set_size_inches(9, 4)

    for ax, (title, counts) in zip(axes, charts):
        ax.pie(counts, labels=counts.index, startangle=90, autopct='%.1f%%')
        ax.set_title(title)

plot_deck_and_side(train_df)

In [None]:
# Re-select the transported rows so the subset includes the Deck and ShipSide columns.
transported_df = train_df.loc[train_df['Transported']]
plot_deck_and_side(df=transported_df)

### Missing values remaining

In [None]:
# Missing entries per column after the steps above.
train_df.isna().sum()

We'll fill in the remaining missing values with our best guesses so far. We can refine this approach later and test if the refinements improve our model.

In [None]:
# Median age over all passengers, computed from the data instead of a hard-coded constant.
overall_age_median = train_df['Age'].median()

# First choice: the median age of passengers from the same home planet.
planet_age_median = train_df.groupby('HomePlanet')['Age'].transform('median')
train_df['Age'] = train_df['Age'].fillna(planet_age_median)

# Passengers with no HomePlanet get no group median; fall back to the overall median.
train_df['Age'] = train_df['Age'].fillna(overall_age_median)

In [None]:
# Missing names get a two-word placeholder so the first/last split below still applies.
train_df['Name'] = train_df['Name'].fillna('Name Unknown')
# n=1 splits on the first space only, so a name with extra spaces cannot produce a
# third column and break the two-column assignment.
train_df[['FName', 'LName']] = train_df['Name'].str.split(' ', n=1, expand=True)

train_df['HomePlanet'] = train_df['HomePlanet'].fillna('Unknown')
# NOTE(review): presumably the most common destination - confirm against the EDA pie chart.
train_df['Destination'] = train_df['Destination'].fillna('TRAPPIST-1e')

# CryoSleep is deliberately not filled here; it is imputed from TotalSpend further down.
# astype(bool) fixes the dtype explicitly instead of relying on pandas' implicit
# downcasting of object columns in fillna.
train_df['VIP'] = train_df['VIP'].fillna(False).astype(bool)

# Treat a missing amenity bill as no spending.
for amenity in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train_df[amenity] = train_df[amenity].fillna(0.0)

Add a column for the total amount spent by each passenger on ship amenities.

In [None]:
# Aggregate spending: food-related first, then the grand total over all amenities.
train_df['FoodSpend'] = train_df['RoomService'].add(train_df['FoodCourt'])
train_df['TotalSpend'] = (
    train_df['FoodSpend']
    .add(train_df['ShoppingMall'])
    .add(train_df['Spa'])
    .add(train_df['VRDeck'])
)

# Each amenity's share of the passenger's total spend.
total_spend = train_df['TotalSpend']
for source in ['RoomService', 'FoodCourt', 'FoodSpend', 'ShoppingMall', 'Spa', 'VRDeck']:
    train_df['Pct' + source] = train_df[source].div(total_spend)

# A zero total gives 0/0 = NaN; record those shares as 0.
pct_cols = ['PctRoomService', 'PctFoodCourt', 'PctShoppingMall', 'PctSpa', 'PctVRDeck', 'PctFoodSpend']
train_df[pct_cols] = train_df[pct_cols].fillna(0.0)

Since CryoSleep is an important feature, we don't want to just set all the missing values to the most common value (False). Instead, if a passenger has spent any money, assume they are not in CryoSleep. Otherwise, assume they are.

In [None]:
# A missing CryoSleep flag becomes True when the passenger spent nothing, False otherwise.
# astype(bool) fixes the dtype explicitly: if the column stays object,
# select_dtypes(exclude=['object']) in the encoding cells drops this feature.
spent_nothing = train_df['TotalSpend'] == 0.0
train_df['CryoSleep'] = train_df['CryoSleep'].fillna(spent_nothing).astype(bool)

<a id="encoding"></a>
## Encoding approaches

Here I'll try replacing some of the categorical features with a few different encoding approaches to see which one performs better with this data.

1. Drop categorical columns
2. Ordinal encoding
3. One-hot encoding
4. Target encoding

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, random_state=0):
    """Fit a depth-5 decision tree on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, X_valid : feature tables for fitting and for evaluation.
    y_train, y_valid : matching target values.
    random_state : seed passed to the classifier. The tree permutes features at
        each split, so without a fixed seed two runs on the same data can give
        different scores, which makes the encoding comparison unreliable.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` computed on the validation split.
    """
    model = DecisionTreeClassifier(max_depth=5, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return accuracy_score(y_valid, preds), confusion_matrix(y_valid, preds)

def show_confusion_matrix(cm):
    """Render ``cm`` as an annotated seaborn heatmap with Predicted/Actual axis labels."""
    class_names = ['False', 'True']
    ax = sns.heatmap(cm, annot=True, fmt='g')
    ax.set(xlabel='Predicted', ylabel='Actual')
    ax.set_xticklabels(class_names)
    ax.set_yticklabels(class_names)

In [None]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is Transported.
X = train_df.drop(columns='Transported')
y = train_df['Transported']

# 70/30 train/validation split with a fixed seed.
split_parts = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts
X_train.head()

### Score from Approach 1 (Drop Categorical Variables)

In [None]:
# Approach 1: keep only the non-object columns of each split.
drop_X_train, drop_X_valid = (
    part.select_dtypes(exclude=['object']) for part in (X_train, X_valid)
)
drop_X_train.head()

In [None]:
# Evaluate Approach 1 and show its confusion matrix.
score, cm = score_dataset(
    X_train=drop_X_train,
    X_valid=drop_X_valid,
    y_train=y_train,
    y_valid=y_valid,
)
print("Accuracy from Approach 1 (Drop categorical variables):", score)
show_confusion_matrix(cm)

### Score from Approach 2 (Ordinal Encoding)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# List of categorical variables to encode.
object_cols = ['HomePlanet', 'Destination', 'Deck', 'ShipSide']

# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data.
# A category that appears only in the validation split is encoded as -1 instead
# of raising, so the cell does not depend on which rows the split put where.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

# Drop any remaining categorical columns (passenger id, cabin)
label_X_train = label_X_train.select_dtypes(exclude=['object'])
label_X_valid = label_X_valid.select_dtypes(exclude=['object'])
label_X_train.head()

In [None]:
# Evaluate Approach 2 and show its confusion matrix.
score, cm = score_dataset(
    X_train=label_X_train,
    X_valid=label_X_valid,
    y_train=y_train,
    y_valid=y_valid,
)
print("Accuracy from Approach 2 (Ordinal Encoding):", score)
show_confusion_matrix(cm)

### Score from Approach 3 (One-Hot Encoding)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# List of categorical variables to encode.
# NOTE(review): 'LName' is encoded here but not in the other approaches, so this
# approach sees an extra feature - confirm that this is intended.
object_cols = ['HomePlanet', 'Destination', 'Deck', 'ShipSide', 'LName']

# Apply one-hot encoder to each column with categorical data.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; try the new keyword first and fall back for older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns a plain array, so the original index is restored here.
# The 'OH_' prefix turns the integer column labels into strings: scikit-learn
# rejects a feature table whose column names mix strings and integers.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]), index=X_train.index
).add_prefix('OH_')
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]), index=X_valid.index
).add_prefix('OH_')

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.select_dtypes(exclude=['object'])
num_X_valid = X_valid.select_dtypes(exclude=['object'])

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_train.head()

In [None]:
# Evaluate Approach 3 and show its confusion matrix.
score, cm = score_dataset(
    X_train=OH_X_train,
    X_valid=OH_X_valid,
    y_train=y_train,
    y_valid=y_valid,
)
print("Accuracy from Approach 3 (One-Hot Encoding):", score)
show_confusion_matrix(cm)

### Score from Approach 4 (Target Encoding)

In [None]:
from category_encoders.target_encoder import TargetEncoder

# Categorical columns that receive a target encoding.
object_cols = ['HomePlanet', 'Destination', 'Deck', 'ShipSide']

# Fit the encoder on the training split (it needs the target), then reuse it on validation.
target_encoder = TargetEncoder()
encoded_train = target_encoder.fit_transform(X_train[object_cols], y_train)
encoded_valid = target_encoder.transform(X_valid[object_cols])

# Write the encoded columns into copies so X_train / X_valid stay untouched.
target_enc_X_train = X_train.copy()
target_enc_X_valid = X_valid.copy()
target_enc_X_train[object_cols] = encoded_train
target_enc_X_valid[object_cols] = encoded_valid

# Discard whatever object columns are left (passenger id, cabin, ...).
target_enc_X_train = target_enc_X_train.select_dtypes(exclude=['object'])
target_enc_X_valid = target_enc_X_valid.select_dtypes(exclude=['object'])
target_enc_X_train.head()

In [None]:
# Evaluate Approach 4 and show its confusion matrix.
score, cm = score_dataset(target_enc_X_train, target_enc_X_valid, y_train, y_valid)
# Target encoding is Approach 4; the label previously said "Approach 3".
print("Accuracy from Approach 4 (Target Encoding):", score)
show_confusion_matrix(cm)