In [1]:
import pandas as pd

In [3]:
# Read the training split from the repo-relative CSV into a DataFrame
train_data = pd.read_csv(filepath_or_buffer="data/train.csv")

In [14]:
# Count the missing entries in every column of the training frame
missing_values = train_data.isna().sum(axis=0)
print(missing_values)


PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
['Europa' 'Earth' 'Mars' nan]
Europa survival rate:  0.6588456123885327


In [42]:
# Share of cryosleep passengers whose Transported flag is set
# The explicit `== True` comparison is kept on purpose: rows where CryoSleep
# is missing compare as False and are therefore left out of the selection.
in_cryosleep = train_data["CryoSleep"] == True
cryo_transported = train_data.loc[in_cryosleep, "Transported"]
print("Cryo survival rate: ", sum(cryo_transported) / len(cryo_transported))

Cryo survival rate:  0.8175831412578202


In [24]:
# Rate by home planet
# Walk over every distinct HomePlanet value found in the data instead of
# repeating one hand-written stanza per planet. `unique()` lists values in
# order of first appearance and includes the missing-value marker, so
# passengers without a recorded home planet still get their own "nan" line.
for planet in train_data["HomePlanet"].unique():
    if pd.isna(planet):
        # NaN never compares equal to itself, so select missing rows explicitly
        planet_mask = train_data["HomePlanet"].isnull()
        planet_label = "nan"
    else:
        planet_mask = train_data["HomePlanet"] == planet
        planet_label = planet
    homeplanet_transported = train_data.loc[planet_mask, "Transported"]
    print(f"{planet_label} survival rate: ", sum(homeplanet_transported)/len(homeplanet_transported))

Europa survival rate:  0.65884561238855
Earth survival rate:  0.42394611038678837
Mars survival rate:  0.5230244457077885
nan survival rate:  0.5124378109452736


In [None]:
# Potential Feature Engineering -  Cabin
# Break the "deck/num/side" Cabin string into three separate columns.
#
# The cell overwrites `train_data`, so it is guarded to be safe to re-run:
# once "Cabin" has been replaced there is nothing left to split, and without
# the guard a second execution would fail with a KeyError on the missing column.
if "Cabin" in train_data.columns:
    # Rows with a missing Cabin stay missing in all three new columns
    cabin_split = train_data["Cabin"].str.split("/", expand=True)

    # Rename the columns
    cabin_split.columns = ["CabinDeck", "CabinNum", "CabinSide"]

    # Replace the "Cabin" column with the new columns
    train_data = pd.concat([train_data.drop(columns="Cabin"), cabin_split], axis=1)

In [50]:
# Show the distinct values of each column derived from Cabin
for cabin_label, cabin_column in (
    ("Cabin sides: ", "CabinSide"),
    ("Cabin decks: ", "CabinDeck"),
    ("Cabin nums: ", "CabinNum"),
):
    print(cabin_label, train_data[cabin_column].unique())

Cabin sides:  ['P' 'S' nan]
Cabin decks:  ['B' 'F' 'A' 'G' nan 'E' 'D' 'C' 'T']
Cabin nums:  ['0' '1' '2' ... '1892' '1893' '1894']


In [62]:
# Explore Cabin
# Transported rate for each of the two cabin sides
on_side_p = train_data["CabinSide"] == "P"
cabinside_P_transported = train_data.loc[on_side_p, "Transported"]
print("P side transported rate: ", sum(cabinside_P_transported) / len(cabinside_P_transported))

on_side_s = train_data["CabinSide"] == "S"
cabinside_S_transported = train_data.loc[on_side_s, "Transported"]
print("S side transported rate: ", sum(cabinside_S_transported) / len(cabinside_S_transported))

# Transported rate per deck; missing deck labels are dropped up front,
# so the loop body never sees NaN
for deck in train_data["CabinDeck"].dropna().unique():
    deck_transported = train_data.loc[train_data["CabinDeck"] == deck, "Transported"]
    print(f"Deck {deck} transported rate: ", sum(deck_transported) / len(deck_transported))


P side transported rate:  0.4512601046124584
S side transported rate:  0.5550373134328358
Deck B transported rate:  0.7342747111681643
Deck F transported rate:  0.43987115246957764
Deck A transported rate:  0.49609375
Deck G transported rate:  0.5162172723720203
Deck E transported rate:  0.3573059360730594
Deck D transported rate:  0.4330543933054393
Deck C transported rate:  0.6800535475234271
Deck T transported rate:  0.2


In [64]:
# List the distinct Destination values (the missing-value marker included)
destination_values = train_data["Destination"].unique()
print(destination_values)

['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]


In [67]:
# Transported rate for every known destination
# Missing destinations are removed before iterating, so no NaN check is
# needed inside the loop.
for destination in train_data["Destination"].dropna().unique():
    going_there = train_data["Destination"] == destination
    destination_transported = train_data.loc[going_there, "Transported"]
    print(f"Destination {destination} transported rate: ", sum(destination_transported) / len(destination_transported))

Destination TRAPPIST-1e transported rate:  0.47117497886728654
Destination PSO J318.5-22 transported rate:  0.5037688442211056
Destination 55 Cancri e transported rate:  0.61


In [101]:
# Explore age
# Transported rate among passengers whose Age is at or below a threshold.
# The filter is inclusive (<=), and the printed label says so.
age_to_check = 15
age_transported = train_data.loc[train_data["Age"] <= age_to_check, "Transported"]
# .mean() on the boolean flags equals sum/len, but yields NaN instead of
# raising ZeroDivisionError when no passenger matches the threshold.
print(f"Age <= {age_to_check} transported rate: ", age_transported.mean())

# Count how many rows carry one specific Age value. The value is named once
# so the printed label and the filter cannot drift apart.
age_value_to_count = 1000
print(f"Number of {age_value_to_count}'s", len(train_data[train_data["Age"] == age_value_to_count]))

Age less than 15 transported rate:  0.6483870967741936
Number of 0's 0


In [99]:
# Explore VIP
# Share of VIP passengers whose Transported flag is set. The explicit
# `== True` comparison leaves out rows where VIP is missing.
is_vip = train_data["VIP"] == True
VIP_transported = train_data.loc[is_vip, "Transported"]
print("VIP survival rate: ", sum(VIP_transported) / len(VIP_transported))

VIP survival rate:  0.38190954773869346


In [None]:
# Explore Room Service