# Spaceship Titanic with Random Forest

https://www.kaggle.com/competitions/spaceship-titanic/overview

In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## The Dataset

In [48]:
# Read the competition's training split from disk into a DataFrame.
train_csv_path = './input/train.csv'
dataset_df = pd.read_csv(train_csv_path)
shape_message = "Full train dataset shape is {}".format(dataset_df.shape)
print(shape_message)
# Preview the first five rows as the cell output.
dataset_df.head(5)

Full train dataset shape is (8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Observations:
- We have `'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'` as numerical features.
- `'CryoSleep', 'VIP'` are binary features.
- `'HomePlanet', 'Destination'` are obvious categorical features.
- `'PassengerId', 'Cabin'` seem to be structured in some way.
- `'Name'` could potentially be split into `'Firstname', 'Surname'`.

In [49]:
# Count missing values per column and list the most-affected columns first.
missing_per_column = dataset_df.isna().sum()
missing_per_column.sort_values(ascending=False)

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
PassengerId       0
Transported       0
dtype: int64

Null values in the `'Age'` feature may have to be dealt with differently. Will leave them alone for now.

In [50]:
# Fill missing values with a default that matches each column's type.
bool_cols = ['VIP', 'CryoSleep']
spend_cols = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
category_cols = ['HomePlanet', 'Destination']

# Boolean flags get False rather than 0: filling them with 0 leaves a mixed
# bool/int object column (a stray `0` among True/False values).
fill_values = {col: False for col in bool_cols}
# Spending amounts: a missing amount is treated as no spending.
fill_values.update({col: 0 for col in spend_cols})
# Categorical columns get an explicit 'Unknown' level.
fill_values.update({col: 'Unknown' for col in category_cols})

# 'Age', 'Name' and 'Cabin' are deliberately left untouched here.
dataset_df = dataset_df.fillna(value=fill_values)
# After filling, the flags hold only True/False, so give them a real bool dtype.
dataset_df = dataset_df.astype({col: bool for col in bool_cols})
dataset_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [51]:
# Re-count missing values per column to confirm which columns still have gaps.
remaining_missing = dataset_df.isna().sum()
remaining_missing.sort_values(ascending=False)

Name            200
Cabin           199
Age             179
PassengerId       0
HomePlanet        0
CryoSleep         0
Destination       0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

### Cabin Analysis
We will now split the `'Cabin'` column into three columns: `'Deck', 'Cabin_num', 'Side'`

In [52]:
# Break each "deck/num/side" cabin string into its three parts, store them as
# separate columns, then discard the raw 'Cabin' column.
cabin_parts = dataset_df["Cabin"].str.split("/", expand=True)
dataset_df[["Deck", "Cabin_num", "Side"]] = cabin_parts
dataset_df = dataset_df.drop(columns='Cabin')
dataset_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Cabin_num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


I think we can treat all three as categorical features. We'll simply drop the rows where these values are NaN from the dataset.

In [53]:
# For each cabin-derived column print a heading, its describe() summary and
# its distinct values; sections are separated by a blank gap.
cabin_headings = {"Deck": "Deck:", "Cabin_num": "Cabin_num", "Side": "Side"}
for position, (column, heading) in enumerate(cabin_headings.items()):
    if position > 0:
        print("\n")
    print(heading)
    # describe() already ignores missing values, so dropping them first
    # does not change the summary.
    print(dataset_df[column].dropna().describe())
    print(dataset_df[column].unique())

Deck:
count     8494
unique       8
top          F
freq      2794
Name: Deck, dtype: object
['B' 'F' 'A' 'G' nan 'E' 'D' 'C' 'T']


Cabin_num
count     8494
unique    1817
top         82
freq        28
Name: Cabin_num, dtype: object
['0' '1' '2' ... '1892' '1893' '1894']


Side
count     8494
unique       2
top          S
freq      4288
Name: Side, dtype: object
['P' 'S' nan]


That's a lot of people without names.

In [54]:
# Remove rows that lack any of the cabin-derived values, then re-count the
# remaining missing values per column.
cabin_columns = ['Deck', 'Cabin_num', 'Side']
dataset_df = dataset_df.dropna(subset=cabin_columns)
dataset_df.isna().sum().sort_values(ascending=False)

Name            198
Age             175
PassengerId       0
HomePlanet        0
CryoSleep         0
Destination       0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
Deck              0
Cabin_num         0
Side              0
dtype: int64

### Name Analysis

In [55]:
# Split 'Name' into first name and surname at the first space.
# n=1 caps the split at two pieces: without it, a name containing more than
# one space would produce extra columns and the two-column assignment would
# raise. Rows with a missing 'Name' get missing values in both new columns.
name_parts = dataset_df["Name"].str.split(" ", n=1, expand=True)
dataset_df[["F_name", "S_name"]] = name_parts
dataset_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Cabin_num,Side,F_name,S_name
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,Maham,Ofracculy
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,Juanna,Vines
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,Altark,Susent
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,Solam,Susent
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,Willy,Santantines


We can see some passengers share surnames.

In [56]:
# Print a describe() summary and the distinct values for both name columns,
# with a blank gap between the two sections.
for position, column in enumerate(("F_name", "S_name")):
    if position > 0:
        print("\n")
    print(column)
    # describe() skips missing values on its own, so dropna() is output-neutral.
    print(dataset_df[column].dropna().describe())
    print(dataset_df[column].unique())

# Cell output: how many passengers carry each surname, most common first.
dataset_df["S_name"].value_counts()

F_name
count       8296
unique      2693
top       Dandra
freq          11
Name: F_name, dtype: object
['Maham' 'Juanna' 'Altark' ... 'Ants' 'Gian' 'Chain']


S_name
count          8296
unique         2208
top       Casonston
freq             17
Name: S_name, dtype: object
['Ofracculy' 'Vines' 'Susent' ... 'Fort' 'Workmanson' 'Sionerorly']


S_name
Casonston     17
Oneiles       16
Domington     15
Litthews      14
Browlerson    14
              ..
Dun            1
Witalnerod     1
Deryplinet     1
Rosargas       1
Sionerorly     1
Name: count, Length: 2208, dtype: int64

It seems people with shared surnames often have the same prefix in their passenger id.

In [57]:
# Inspect every passenger with one frequently shared surname to see how they
# are distributed across passenger ids. The frame is named after the surname
# it actually filters on.
browlerson_df = dataset_df.loc[dataset_df['S_name'] == "Browlerson"]
browlerson_df.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Cabin_num,Side,F_name,S_name
1118,1186_01,Earth,False,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,Card Browlerson,True,G,183,S,Card,Browlerson
1119,1186_02,Earth,False,TRAPPIST-1e,8.0,0,0.0,0.0,0.0,0.0,0.0,Elany Browlerson,False,G,183,S,Elany,Browlerson
1120,1186_03,Earth,True,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Harrie Browlerson,False,G,183,S,Harrie,Browlerson
1121,1186_04,Earth,False,TRAPPIST-1e,48.0,False,0.0,33.0,654.0,0.0,113.0,Oraryn Browlerson,True,F,227,S,Oraryn,Browlerson
1122,1186_05,Earth,False,TRAPPIST-1e,12.0,False,0.0,0.0,0.0,0.0,0.0,Eriney Browlerson,False,G,183,S,Eriney,Browlerson
1857,1980_01,Earth,True,TRAPPIST-1e,3.0,False,0.0,0.0,0.0,0.0,0.0,Scotte Browlerson,True,G,312,S,Scotte,Browlerson
1859,1980_03,Earth,True,TRAPPIST-1e,44.0,False,0.0,0.0,0.0,0.0,0.0,Penne Browlerson,True,G,312,S,Penne,Browlerson
2661,2853_01,Earth,False,TRAPPIST-1e,20.0,False,56.0,757.0,111.0,0.0,0.0,Lonnez Browlerson,True,F,544,S,Lonnez,Browlerson
4588,4890_01,Earth,False,55 Cancri e,10.0,False,0.0,0.0,0.0,0.0,0.0,Rone Browlerson,True,G,795,S,Rone,Browlerson
4589,4890_02,Earth,False,TRAPPIST-1e,23.0,False,0.0,0.0,874.0,0.0,0.0,Cline Browlerson,True,F,932,S,Cline,Browlerson


### PassengerId Analysis

In [58]:
# Split 'PassengerId' at the underscore into a prefix and a suffix column,
# then remove the original id column.
pid_parts = dataset_df["PassengerId"].str.split("_", expand=True)
dataset_df[["PID_pre", "PID_suf"]] = pid_parts
dataset_df = dataset_df.drop(columns='PassengerId')
dataset_df.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Cabin_num,Side,F_name,S_name,PID_pre,PID_suf
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,Maham,Ofracculy,1,1
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,Juanna,Vines,2,1
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,Altark,Susent,3,1
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,Solam,Susent,3,2
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,Willy,Santantines,4,1


In [59]:
# Print a describe() summary and the distinct values for both passenger-id
# parts, with a blank gap between the two sections.
for position, column in enumerate(("PID_pre", "PID_suf")):
    if position > 0:
        print("\n")
    print(column)
    # describe() skips missing values on its own, so dropna() is output-neutral.
    print(dataset_df[column].dropna().describe())
    print(dataset_df[column].unique())

PID_pre
count     8494
unique    6118
top       5133
freq         8
Name: PID_pre, dtype: object
['0001' '0002' '0003' ... '9278' '9279' '9280']


PID_suf
count     8494
unique       8
top         01
freq      6083
Name: PID_suf, dtype: object
['01' '02' '03' '04' '05' '06' '07' '08']


We can assume that passengers without a `'Name'` share the most frequent `'S_name'` among passengers with the same `'PID_pre'`, which can be treated as their `'Group'`. Thankfully, everyone has a passenger id.

In [60]:
# Show every passenger whose id prefix is "5133".
# NOTE(review): the variable name is kept as-is for any later cells that use
# it; the frame holds the rows of this id prefix, not a surname filter.
in_group_5133 = dataset_df['PID_pre'].eq("5133")
casonston_df = dataset_df[in_group_5133]
casonston_df.head(50)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Cabin_num,Side,F_name,S_name,PID_pre,PID_suf
4808,Earth,False,55 Cancri e,36.0,False,0.0,0.0,846.0,0.0,0.0,Benry Litthews,True,G,829,P,Benry,Litthews,5133,1
4809,Earth,True,PSO J318.5-22,24.0,False,0.0,0.0,0.0,0.0,0.0,Hene Litthews,False,G,829,P,Hene,Litthews,5133,2
4810,Earth,True,PSO J318.5-22,,False,0.0,0.0,0.0,0.0,0.0,Lina Litthews,True,G,829,P,Lina,Litthews,5133,3
4811,Earth,False,55 Cancri e,19.0,False,107.0,0.0,505.0,48.0,0.0,Done Litthews,False,F,1046,P,Done,Litthews,5133,4
4812,Earth,False,TRAPPIST-1e,26.0,False,0.0,910.0,13.0,10.0,15.0,Lynnon Pugherman,False,F,1046,P,Lynnon,Pugherman,5133,5
4813,Earth,True,55 Cancri e,19.0,False,0.0,0.0,0.0,0.0,0.0,Shanya Josey,True,G,829,P,Shanya,Josey,5133,6
4814,Earth,False,TRAPPIST-1e,26.0,False,1528.0,1.0,0.0,0.0,0.0,,False,F,1046,P,,,5133,7
4815,Earth,False,TRAPPIST-1e,35.0,False,0.0,116.0,51.0,0.0,701.0,Lawren Josey,False,F,1046,P,Lawren,Josey,5133,8


In [61]:
# Derive group features from the passenger-id parts:
#   Group      - copy of the id prefix
#   Group_id   - copy of the id suffix
#   Group_size - number of rows in dataset_df sharing the same prefix
# Only 'PID_pre' is removed afterwards; 'PID_suf' stays in the frame.
dataset_df = dataset_df.assign(
    Group=dataset_df["PID_pre"],
    Group_id=dataset_df["PID_suf"],
    Group_size=dataset_df.groupby("PID_pre")["PID_pre"].transform("count"),
).drop(columns="PID_pre")

dataset_df.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,Transported,Deck,Cabin_num,Side,F_name,S_name,PID_suf,Group,Group_id,Group_size
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,...,False,B,0,P,Maham,Ofracculy,1,1,1,1
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,...,True,F,0,S,Juanna,Vines,1,2,1,1
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,...,False,A,0,S,Altark,Susent,1,3,1,2
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,...,False,A,0,S,Solam,Susent,2,3,2,2
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,...,True,F,1,S,Willy,Santantines,1,4,1,1


In [63]:
# Passengers with no recorded name who belong to a group of more than one row.
name_missing = dataset_df['Name'].isna()
in_shared_group = dataset_df['Group_size'] > 1
dataset_df[name_missing & in_shared_group]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,Transported,Deck,Cabin_num,Side,F_name,S_name,PID_suf,Group,Group_id,Group_size
58,Mars,True,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,...,True,F,14,S,,,01,0064,01,2
77,Mars,False,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,...,True,F,16,P,,,03,0082,03,3
101,Earth,False,TRAPPIST-1e,31.0,False,562.0,0.0,326.0,0.0,0.0,...,False,G,19,S,,,02,0108,02,3
297,Europa,False,TRAPPIST-1e,32.0,False,247.0,4651.0,0.0,46.0,202.0,...,True,C,14,S,,,02,0330,02,3
305,Mars,False,55 Cancri e,0.0,False,0.0,0.0,0.0,0.0,0.0,...,True,F,63,S,,,03,0337,03,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8510,Earth,False,TRAPPIST-1e,9.0,False,0.0,0.0,0.0,0.0,0.0,...,True,G,1476,P,,,04,9081,04,7
8516,Europa,False,55 Cancri e,70.0,False,0.0,2113.0,0.0,1972.0,51.0,...,False,A,95,P,,,01,9085,01,3
8613,Unknown,False,55 Cancri e,53.0,False,0.0,4017.0,0.0,13.0,3147.0,...,False,E,603,S,,,01,9194,01,2
8629,Europa,True,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,...,True,B,300,P,,,02,9205,02,3
