# Spaceship Titanic

This notebook performs exploratory data analysis (EDA) and preprocessing for the Spaceship Titanic competition on Kaggle.

In [46]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

## 1- Import data

In [3]:
# Read the raw competition CSVs; the first row of each file holds the column names.
train_data, test_data = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        "./../../01_data/11_raw/train.csv",
        "./../../01_data/11_raw/test.csv",
    )
)


In [7]:
# Separate the target from the features. `Index.difference` returns the
# remaining column names sorted, so the feature columns end up in
# alphabetical order; the test frame is re-ordered to match.
y_train = train_data["Transported"]
train_data = train_data.loc[:, train_data.columns.difference(["Transported"])]
test_data = test_data.loc[:, train_data.columns]

In [10]:
# Preview the first five rows of the feature frame.
train_data.head()

Unnamed: 0,Age,Cabin,CryoSleep,Destination,FoodCourt,HomePlanet,Name,PassengerId,RoomService,ShoppingMall,Spa,VIP,VRDeck
0,39.0,B/0/P,False,TRAPPIST-1e,0.0,Europa,Maham Ofracculy,0001_01,0.0,0.0,0.0,False,0.0
1,24.0,F/0/S,False,TRAPPIST-1e,9.0,Earth,Juanna Vines,0002_01,109.0,25.0,549.0,False,44.0
2,58.0,A/0/S,False,TRAPPIST-1e,3576.0,Europa,Altark Susent,0003_01,43.0,0.0,6715.0,True,49.0
3,33.0,A/0/S,False,TRAPPIST-1e,1283.0,Europa,Solam Susent,0003_02,0.0,371.0,3329.0,False,193.0
4,16.0,F/1/S,False,TRAPPIST-1e,70.0,Earth,Willy Santantines,0004_01,303.0,151.0,565.0,False,2.0


In [12]:
# Number of missing values per feature column (shown as the cell's result
# rather than through print, matching the later missing-value check).
train_data.isna().sum()

Age             179
Cabin           199
CryoSleep       217
Destination     182
FoodCourt       183
HomePlanet      201
Name            200
PassengerId       0
RoomService     181
ShoppingMall    208
Spa             183
VIP             203
VRDeck          188
dtype: int64


## 2- Clean data

In [103]:
# Work on deep copies so the raw frames stay untouched by the cleaning steps.
train_df = train_data.copy(deep=True)
test_df = test_data.copy(deep=True)

In [102]:
# Shared imputer used by `cleaner`: NaN entries are replaced by the most
# frequent value of their column.
simple_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

def cleaner(df, fit=True):
    """Return a cleaned copy of a raw passenger frame, indexed by PassengerId.

    Steps:
      * index the frame by ``PassengerId``;
      * shorten ``Destination`` to its first four characters;
      * derive ``PassengerGroup`` (integer before the ``_`` in ``PassengerId``)
        and ``Deck`` / ``Side`` (first and last ``/``-separated parts of
        ``Cabin``);
      * drop the name, spending, id and cabin-number columns;
      * impute missing values with the module-level ``simple_imputer``
        (configured in this notebook with ``strategy="most_frequent"``);
      * cast ``Age`` to float and ``CryoSleep`` / ``VIP`` to bool.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least PassengerId, Cabin, Destination,
        HomePlanet, Age, CryoSleep, VIP and the columns that get dropped.
        It is not modified.
    fit : bool, default True
        When True the imputer is (re)fitted on ``df`` before transforming,
        which is the historical behaviour. Pass False to reuse the statistics
        of a previous fit (e.g. fit on train, then transform test).

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with no missing values in the imputed columns.
    """
    imputed_cols = ["Age", "CryoSleep", "VIP", "Destination", "HomePlanet", "Deck", "Side"]
    dropped_cols = ["Name", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
                    "PassengerId", "PassengerNum", "Cabin", "Num"]

    df = df.copy()
    df.index = df["PassengerId"]

    df["Destination"] = df["Destination"].str[:4]
    df[["PassengerGroup", "PassengerNum"]] = (
        df["PassengerId"].str.split("_", expand=True).astype(np.int32)
    )
    df[["Deck", "Num", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df = df.drop(columns=dropped_cols)

    # Impute BEFORE casting the flags: astype(bool) maps NaN to True, which
    # would silently mark every passenger with a missing CryoSleep/VIP value
    # as True instead of treating the value as missing.
    impute = simple_imputer.fit_transform if fit else simple_imputer.transform
    df[imputed_cols] = impute(df[imputed_cols])

    # The imputer works on a mixed-type (object) array, so restore real dtypes.
    df["Age"] = df["Age"].astype(float)
    df[["CryoSleep", "VIP"]] = df[["CryoSleep", "VIP"]].astype(bool)
    return df

In [104]:
# Clean directly from the raw frames (cleaner works on its own copy), so this
# cell gives the same result every time it is re-run instead of failing on an
# already-cleaned frame that no longer has a PassengerId column.
train_df, test_df = cleaner(train_data), cleaner(test_data)

In [105]:
# Peek at the first three cleaned rows.
train_df.iloc[:3]

Unnamed: 0_level_0,Age,CryoSleep,Destination,HomePlanet,VIP,PassengerGroup,Deck,Side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0001_01,39.0,False,TRAP,Europa,False,1,B,P
0002_01,24.0,False,TRAP,Earth,False,2,F,S
0003_01,58.0,False,TRAP,Europa,True,3,A,S


In [106]:
# Fit one encoder on the categorical columns of train and test stacked
# together, so both frames are encoded with the same set of categories.
# drop="first" omits the first category of each feature.
encoder = OneHotEncoder(drop="first").fit(
    pd.concat(
        [frame[["Destination", "HomePlanet", "Deck", "Side"]] for frame in (train_df, test_df)]
    )
)

def encode(df):
    """Replace the categorical columns of ``df`` with their one-hot encoding.

    Uses the module-level, already fitted ``encoder`` to transform the
    Destination, HomePlanet, Deck and Side columns, and appends the resulting
    indicator columns (named by ``encoder.get_feature_names_out()``) to the
    remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four categorical columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index, the categorical columns removed and
        the encoded columns added on the right.
    """
    categorical_cols = ["Destination", "HomePlanet", "Deck", "Side"]

    encoded_cols = pd.DataFrame(
        encoder.transform(df[categorical_cols]).toarray(),
        index=df.index,
        columns=encoder.get_feature_names_out(),
    )

    # Non-inplace drop: the caller's frame must stay intact, otherwise any
    # other reference to it silently loses its categorical columns.
    return pd.concat([df.drop(columns=categorical_cols), encoded_cols], axis=1)

In [107]:
# One-hot encode both cleaned frames with the shared encoder.
train_df, test_df = (encode(frame) for frame in (train_df, test_df))

In [108]:
# Sanity check: count the remaining missing values per column after cleaning.
train_df.isnull().sum()

Age                  0
CryoSleep            0
VIP                  0
PassengerGroup       0
Destination_PSO      0
Destination_TRAP     0
HomePlanet_Europa    0
HomePlanet_Mars      0
Deck_B               0
Deck_C               0
Deck_D               0
Deck_E               0
Deck_F               0
Deck_G               0
Deck_T               0
Side_S               0
dtype: int64