# Scratch notebook
https://www.kaggle.com/hgyoon/spaceship-titanic-eda-scratch-note-korean/edit

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problems
- A spaceship named Titanic ran into a dust cloud.
- Almost half of the passengers were transported to another dimension.
- We need to create a model that predicts whether a passenger was transported, based on the passenger's data.

# data
## input
- PassengerId(nominal): Each Id takes the form `gggg_pp` where \
`gggg` indicates the group the passenger is travelling with \
`pp` is their number within the group

- HomePlanet(nominal): The planet the passenger departed from\    
- CryoSleep(binary): whether the passenger elected to be put into suspended animation for the duration of the voyage\     
- Cabin(nominal): the cabin number where the passenger is staying\    
- Destination(categorical): the planet the passenger will be debarking to\
- Age(discrete): the age of the passenger\
- VIP (Binary): Whether the passenger is VIP or not\
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck (continuous): amount the passenger billed at each amenities\
- Name: the first and last name of the passenger 

## Target
- Transported: whether the passenger was transported or not

# Import library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8' and
# later releases dropped the old name, so use whichever one this install provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
sns.set(font_scale=2.5)

import missingno as msno
import warnings
warnings.filterwarnings('ignore')

from typing import List

In [None]:
df_train = pd.read_csv('../input/spaceship-titanic/train.csv')
df_test = pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
print(f"train data_len: {df_train.shape[0]}")
print(f"test data_len: {df_test.shape[0]}")

In [None]:
df_train.head()

In [None]:
df_test.head()

# Making meta data

In [None]:
df_train.columns

In [None]:
def get_meta_data(df: pd.DataFrame):
    """Build a metadata table describing every column of ``df``.

    Parameters:
    df(pd.DataFrame): frame whose columns should be described

    Returns:
    pd.DataFrame indexed by ``varname`` with the columns
    ``role`` ('target' for ``Transported``, otherwise 'input'),
    ``level`` (measurement level; ``None`` for columns not listed below),
    ``keep`` (False for the identifier-like columns ``PassengerId`` and ``Name``)
    and ``dtype`` (the pandas dtype of the column).
    """
    # Explicit column -> level lookup. A lookup (instead of an if/elif chain)
    # guarantees that an unlisted column gets ``None`` rather than raising or
    # silently inheriting the level of the previously visited column.
    level_by_column = {}
    level_by_column.update(dict.fromkeys(
        ["PassengerId", "HomePlanet", "Cabin", "Destination", "Name"], 'nominal'))
    level_by_column.update(dict.fromkeys(
        ["CryoSleep", "VIP", "Transported"], 'binary'))
    level_by_column.update(dict.fromkeys(
        ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"], 'continuous'))
    level_by_column.update(dict.fromkeys(["Age"], 'discrete'))

    # Columns that are excluded from the feature set.
    not_kept = {'PassengerId', 'Name'}

    records = [
        {
            'varname': col,
            'role': 'target' if col == 'Transported' else 'input',
            'level': level_by_column.get(col),
            'keep': col not in not_kept,
            'dtype': df[col].dtype,
        }
        for col in df.columns
    ]

    meta = pd.DataFrame(records, columns=['varname', 'role', 'level', 'keep', 'dtype'])
    return meta.set_index('varname')

In [None]:
meta = get_meta_data(df_train)

In [None]:
meta

# Checking null value

In [None]:
def checking_percent_of_null(df: pd.DataFrame, stage: str = 'train'):
    """Print the percentage of missing values for every column of ``df``.

    Parameters:
    df(pd.DataFrame): frame to inspect
    stage(str): label printed in the header line (e.g. 'train' or 'test')
    """
    print(f"Percent of NaN value for {stage}")
    # Use the frame that was passed in; reading a global frame here would make
    # every call report the same numbers regardless of the argument.
    null_percent = df.isnull().mean() * 100
    for col, percent in null_percent.items():
        print(f"column; {col:>12}\t Percent of NaN value: {percent:.2f}")

## trainset

In [None]:
checking_percent_of_null(df_train, "train")

### Visualizing

In [None]:
msno.matrix(df=df_train.iloc[:,:], figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [None]:
msno.bar(df=df_train.iloc[:,:], figsize=(8,8), color=(0.8, 0.5, 0.2))

## testset

In [None]:
checking_percent_of_null(df_test, "test")

### visualizing

In [None]:
msno.matrix(df=df_test.iloc[:,:], figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [None]:
msno.bar(df=df_test.iloc[:,:], figsize=(8,8), color=(0.8, 0.5, 0.2))

- We can see every input feature has some null value

# Checking target label distribution

In [None]:
def checking_binary_target_distribution(df: pd.DataFrame):
    """Show the distribution of the ``Transported`` column of ``df``.

    Draws a pie chart (left) and a count plot (right) side by side.

    Parameters:
    df(pd.DataFrame): frame containing a ``Transported`` column; the pie chart
        passes two ``explode`` offsets, so two distinct values are expected
    """
    f, ax = plt.subplots(1, 2, figsize=(18, 8))
    # Both panels are drawn from the argument so that they describe the same data.
    df['Transported'].value_counts().plot.pie(explode=[0, 0.1], autopct="%1.1f%%", ax=ax[0], shadow=True)
    ax[0].set_title('Pie plot - Transported')
    ax[0].set_ylabel('')
    # Keyword ``x=`` works on old and new seaborn; the positional form is rejected by seaborn >= 0.12.
    sns.countplot(x='Transported', data=df, ax=ax[1])
    ax[1].set_title('Count plot - Transported')
    plt.show()

In [None]:
checking_binary_target_distribution(df_train)

The number of each target label seems to be balanced.

# EDA

## PassengerId (categorical):
- Split string by '_' to get group information (`PassengerGroup`)
- Make `GroupSize` feature

In [None]:
# The group id is the part of PassengerId in front of the first underscore.
for frame in (df_train, df_test):
    frame["PassengerGroup"] = frame["PassengerId"].map(lambda pid: pid.partition("_")[0])

In [None]:
# GroupSize = number of passengers sharing the same PassengerGroup within each frame.
# Mapping the per-group counts back onto the rows avoids building a frame from
# value_counts() with a forced column name (which yields an all-NaN column when the
# counts Series carries its own name, as it does in pandas >= 2.0), and makes the
# cell safe to re-run (a repeated merge would create GroupSize_x / GroupSize_y).
df_train["GroupSize"] = df_train["PassengerGroup"].map(df_train["PassengerGroup"].value_counts())
df_test["GroupSize"] = df_test["PassengerGroup"].map(df_test["PassengerGroup"].value_counts())

In [None]:
def add_row_to_metadata(meta: pd.DataFrame, df, level: List[str], role = 'input', keep=True):
    """
    Append metadata rows for one Series or for every column of a DataFrame.

    Parameters:
    meta(pd.DataFrame): meta data indexed by variable name with the columns
        'role', 'level', 'keep' and 'dtype'
    df(pd.Series or pd.DataFrame): data whose name / column names are added to meta
    level(str or List[str]): measurement level; a single string for a Series,
        one entry per column for a DataFrame (a single string is applied to
        every column). Columns without a matching level entry are not added.
    role(str): value stored in the 'role' column of the new rows
    keep(bool): value stored in the 'keep' column of the new rows

    Returns:
    A new meta frame with the rows appended. Names already present in
    ``meta.index`` are reported with a message and skipped. Any other type
    of ``df`` returns ``meta`` unchanged.

    Raises:
    ValueError: if a level is not one of the allowed levels
    """
    level_group = ["ordinal", "nominal", "continuous", "binary", "discrete"]

    if isinstance(df, pd.Series):
        candidates = [(df.name, df.dtype)]
        levels = [level]
    elif isinstance(df, pd.DataFrame):
        candidates = [(c, df[c].dtype) for c in df.columns]
        # A bare string would otherwise be zipped character by character.
        levels = [level] * len(candidates) if isinstance(level, str) else list(level)
    else:
        return meta

    pairs = list(zip(candidates, levels))

    # Validate everything up front so an invalid level never leaves a half-built result.
    for _, lvl in pairs:
        if lvl not in level_group:
            # Raising a plain string is itself a TypeError; raise a real exception.
            raise ValueError(f"level should be in [{', '.join(level_group)}]")

    new_names = []
    new_rows = []
    for (name, dtype), lvl in pairs:
        if name in meta.index or name in new_names:
            print(f"{name} already exists")
            continue
        new_names.append(name)
        new_rows.append([role, lvl, keep, dtype])

    if not new_rows:
        return meta

    # Build all new rows at once and reuse the index name so it survives the concat.
    addition = pd.DataFrame(
        new_rows,
        columns=['role', 'level', 'keep', 'dtype'],
        index=pd.Index(new_names, name=meta.index.name),
    )
    return pd.concat([meta, addition], axis=0)

In [None]:
meta = add_row_to_metadata(meta, df_train["GroupSize"], level="ordinal")

In [None]:
# Report the range of group sizes; the second line is the minimum and is labelled as such.
print(f"Maximum size of Group: {df_train['GroupSize'].max()}")
print(f"Minimum size of Group: {df_train['GroupSize'].min()}")

max group number is eight

In [None]:
# Three views of GroupSize: raw counts, counts split by target, and mean target per size.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# seaborn >= 0.12 only accepts ``data`` positionally, so the column is passed as ``x=``.
sns.countplot(x="GroupSize", data=df_train, ax=ax[0])
ax[0].set_title("(1) # of passenger group", y=1.02)
sns.countplot(x="GroupSize", hue="Transported", data=df_train, ax=ax[1])
ax[1].set_title("(2) Transported countplot depending on GroupSize", y=1.02)
df_train[["GroupSize", "Transported"]].groupby(['GroupSize'], as_index=True).mean().sort_values(by="Transported", ascending=False).plot.bar(ax=ax[2])
# The grouping variable of this panel is GroupSize.
ax[2].set_title('(3) Transported rate depending on GroupSize', y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

In [None]:
len(df_train[df_train["GroupSize"] == 1]) / len(df_train) * 100 

1. More than half of the passengers(55.27%) traveled alone. \
2. Except GroupSize == 8, the percentage of transported for the passengers traveling with others is over 0.5\
=> we can make new feature `IsAlone`

In [None]:
# A passenger is alone exactly when their group has a single member.
df_train["IsAlone"] = df_train["GroupSize"].eq(1)

In [None]:
# Count of transported / not transported passengers, split by whether they travel alone.
f, ax = plt.subplots(1, 1, figsize=(12, 10))
ax.set_title("Transported countplot depending on IsAlone", y=1.02)
# Keyword ``x=`` is required by seaborn >= 0.12 and accepted by older versions.
sns.countplot(x="IsAlone", hue="Transported", data=df_train, ax=ax)
plt.show()

In [None]:
meta = add_row_to_metadata(meta, df_train["IsAlone"], level="binary")
meta.loc["GroupSize", "keep"] = False

## HomePlanet

In [None]:
df_train["HomePlanet"].value_counts()

In [None]:
df_test["HomePlanet"].value_counts()

there are three homeplanets 

In [None]:
# Left: passengers per home planet. Right: the same counts split by target.
y_position = 1.02
f, ax = plt.subplots(1, 2, figsize=(23, 8))
df_train["HomePlanet"].value_counts().plot.bar(color=['#CD7F32','#FFDF00','#D3D3D3'], ax=ax[0])
ax[0].set_title("Number of passenger by HomePlanet", y=y_position)
ax[0].set_ylabel("Count")
# Keyword ``x=`` is required by seaborn >= 0.12 and accepted by older versions.
sns.countplot(x="HomePlanet", hue="Transported", data=df_train, ax=ax[1])
ax[1].set_title("HomePlanet Transported vs not", y=y_position)
plt.subplots_adjust(wspace=0.3, hspace=0.5)
plt.show()

- It seems that passengers departing from europa are more likely to be transported
- Passengers from Earth appear less transported.


## CryoSleep

In [None]:
# Left: passengers per CryoSleep value. Middle: split by target. Right: CryoSleep per home planet.
y_position = 1.02
f, ax = plt.subplots(1, 3, figsize=(30, 8))
df_train["CryoSleep"].value_counts().plot.bar(color=['#CD7F32','#FFDF00'], ax=ax[0])
ax[0].set_title("Number of passenger by CryoSleep", y=y_position)
ax[0].set_ylabel("Count")
# Keyword ``x=`` is required by seaborn >= 0.12 and accepted by older versions.
sns.countplot(x="CryoSleep", hue="Transported", data=df_train, ax=ax[1])
ax[1].set_title("CryoSleep Transported vs not", y=y_position)
sns.countplot(x="HomePlanet", hue="CryoSleep", data=df_train, ax=ax[2])
ax[2].set_title("CryoSleep by HomePlanet", y=y_position)
# One layout adjustment after all panels are drawn is sufficient.
plt.subplots_adjust(wspace=0.3, hspace=0.5)
plt.show()

In [None]:
# Mean Transported per home planet, one line per CryoSleep value.
# ``factorplot`` was renamed to ``catplot`` (with ``size`` -> ``height``) in seaborn 0.9
# and the old name was later removed; kind='point' reproduces factorplot's default plot.
sns.catplot(x='HomePlanet', y='Transported', hue='CryoSleep', data=df_train,
            kind='point', height=6, aspect=1.5)
plt.title("HomePlanet & Cryosleep vs Transported")
plt.show()

1. CryoSleep == True => high transported rate
2. Earth => low transported rate
3. CryoSleep == True & HomePlanet == Europa => transported rate is almost 1

## Age

In [None]:
def distplot_binary(df: pd.DataFrame, feature: str, hue: str):
    """Overlay the distribution of ``feature`` for the rows where ``hue`` is 0 and where it is 1.

    Parameters:
    df(pd.DataFrame): source frame
    feature(str): column whose distribution is drawn
    hue(str): column compared against 0 and 1 to form the two groups
    """
    classes = (0, 1)
    _, axis = plt.subplots(1, 1, figsize=(9, 5))
    for value in classes:
        sns.distplot(df.loc[df[hue] == value, feature], ax=axis)
    plt.legend([f'{hue} == {value}' for value in classes])
    plt.title(f"{feature} vs {hue}")
    plt.show()

In [None]:
distplot_binary(df_train, "Age", "Transported")

- It seems that passengers under 10-years-old are more likely to be transported

### Age: discrete to ordinal

Convert Age to ordinal data.\
Make each category have a similar number of samples by using qcut.

In [None]:
pd.qcut(df_train['Age'], 4).unique()

In [None]:
# Bin Age into ordinal bands with right-closed intervals:
#   (-inf, 19] -> 0, (19, 27] -> 1, (27, 38] -> 2, (38, 79] -> 3, (79, inf) -> 4
# (edges presumably taken from the pd.qcut output of the previous cell).
# Right-closed bins leave no gap at Age == 79, which `< 79` / `> 79` would
# silently drop into band 0.
age_band_edges = [-np.inf, 19, 27, 38, 79, np.inf]
df_train['AgeBand'] = (
    pd.cut(df_train['Age'], bins=age_band_edges, labels=False)
    # NOTE(review): rows with a missing Age stay in band 0, the value the
    # column was initialised with before; consider a dedicated band.
    .fillna(0)
    .astype(int)
)
df_train.head(2)

In [None]:
def factorplot(df: pd.DataFrame, feature: str, target: str, hue: str = None, col: str = None):
    """Draw a point plot of the mean of ``target`` for each value of ``feature``.

    Parameters:
    df(pd.DataFrame): source frame
    feature(str): column placed on the x axis
    target(str): column whose mean is drawn on the y axis
    hue(str, optional): column used to split each point into coloured lines
    col(str, optional): column used to split the figure into facet columns
    """
    # Forward only the groupings that were actually requested.
    grouping_kwargs = {}
    if hue:
        grouping_kwargs['hue'] = hue
    if col:
        grouping_kwargs['col'] = col

    # ``sns.factorplot`` / ``size`` were renamed to ``catplot`` / ``height`` in
    # seaborn 0.9 and the old names were later removed; kind='point' is the
    # plot type factorplot drew by default.
    sns.catplot(x=feature, y=target, data=df, kind='point',
                height=6, aspect=1.5, **grouping_kwargs)

    # Name the grouping variable in the title: hue when given, otherwise col
    # (so a col-only call no longer prints "None").
    grouping = hue or col
    if grouping:
        plt.title(f"{feature} & {grouping} vs {target}")
    else:
        plt.title(f"{feature} vs {target}")

In [None]:
factorplot(df_train, "AgeBand", "Transported")

- Passengers in AgeBand == 0 (19 and under) seem more likely to be transported
- Apart from AgeBand == 0, the transported rate increases gradually with the age band

### Add feature: IsChild

It might be better to add `IsChild` feature instead of `AgeBand`

In [None]:
# Children are passengers younger than 10; a missing Age compares as False.
df_train["IsChild"] = df_train["Age"].lt(10)
df_train.head()

In [None]:
factorplot(df_train, "IsChild", "Transported")

In [None]:
meta = add_row_to_metadata(meta, df_train["IsChild"], level="binary")
meta.loc["Age", "keep"] = False 
meta

## Amenities

In [None]:
# One distribution panel per amenity, with the column's skewness shown in the legend.
amenity_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, ax = plt.subplots(1, 5, figsize=(40, 8))
for i, amen in enumerate(amenity_columns):
    bills = df_train[amen]
    skew_label = f'Skewness: {bills.skew():.2f}'
    g = sns.distplot(bills, color='b', label=skew_label, ax=ax[i])
    g = g.legend(loc='best')

We can see every amenity value is right-skewed.\
So apply a log transform.

In [None]:
# Log-scale positive bills; values for which ``x > 0`` is false (zero, missing) become 0.
log_bill = lambda x: np.log(x) if x > 0 else 0

# Per amenity: overlay the log-bill distribution of non-transported and transported passengers.
fig, ax = plt.subplots(1, 5, figsize=(60, 10))
for i, amen in enumerate(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']):
    for flag in (0, 1):
        subset = df_train[df_train["Transported"] == flag]
        g = sns.distplot(subset[amen].map(log_bill), ax=ax[i])
    g = g.legend(["Transported == 0", "Transported == 1"])

## Amenity and HomePlanet vs Transported

In [None]:
# Distinct home planets without the missing value. ``unique()`` returns values in
# order of first appearance, so slicing off the last element does not reliably
# remove NaN (it can drop a real planet and keep NaN instead).
homeplanets = df_train["HomePlanet"].dropna().unique()

In [None]:
# Grid of log-bill distributions: one row per home planet, one column per amenity,
# each panel overlaying non-transported and transported passengers.
amenity_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Log-scale positive bills; values for which ``x > 0`` is false (zero, missing) become 0.
log_bill = lambda x: np.log(x) if x > 0 else 0

fig, ax = plt.subplots(3, 5, figsize=(50, 30))
for i, p in enumerate(homeplanets):
    from_planet = df_train["HomePlanet"] == p
    for j, amen in enumerate(amenity_columns):
        for flag in (0, 1):
            subset = df_train[(df_train["Transported"] == flag) & from_planet]
            g = sns.distplot(subset[amen].map(log_bill), ax=ax[i][j])
        g = g.legend(["Transported == 0", "Transported == 1"])
        # ``g`` is the legend object here, so the planet name becomes the legend title.
        g.set_title(p)


Like Age, it might be better to add features indicating whether a passenger paid for each amenity (is_amenity)

In [None]:
#fig, ax = plt.subplots(1, 5, figsize=(40, 8))

# For every amenity, add a boolean Is<Amenity> column (did the passenger spend anything?)
# and plot the transported rate per home planet split by that flag.
for amen in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    # ``NaN != 0`` evaluates to True, which would flag passengers with an unknown
    # bill as spenders; treat a missing bill as "no spending" instead.
    df_train[f"Is{amen}"] = df_train[amen].fillna(0) != 0
    # ``factorplot`` / ``size`` were renamed to ``catplot`` / ``height`` in seaborn 0.9
    # and later removed; kind="point" is the plot type factorplot drew by default.
    g = sns.catplot(x="HomePlanet", y="Transported", hue=f"Is{amen}", data=df_train,
                    kind="point", height=6, aspect=1.5)

In [None]:
# Register each Is<Amenity> flag as a binary input and mark the raw bill column as not kept.
amenity_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for amen in amenity_columns:
    flag_column = df_train[f"Is{amen}"]
    meta = add_row_to_metadata(meta, flag_column, level="binary")
    meta.loc[amen, "keep"] = False

We can see that a passenger who doesn't spend money on an amenity has a higher probability of being transported,\
and the exact numbers differ a little for each amenity and home planet (although the shapes are similar).\
So I'll use all of the is_amenity features rather than summing them.

## VIP

In [None]:
# Left: passengers per VIP value. Middle: split by target. Right: VIP per home planet.
y_position = 1.02
f, ax = plt.subplots(1, 3, figsize=(30, 8))
df_train["VIP"].value_counts().plot.bar(color=['#CD7F32','#FFDF00'], ax=ax[0])
ax[0].set_title("Number of VIP passengers", y=y_position)
ax[0].set_ylabel("Count")
# Keyword ``x=`` is required by seaborn >= 0.12 and accepted by older versions.
sns.countplot(x="VIP", hue="Transported", data=df_train, ax=ax[1])
ax[1].set_title("VIP Transported vs not", y=y_position)
sns.countplot(x="HomePlanet", hue="VIP", data=df_train, ax=ax[2])
ax[2].set_title("VIP by homeplanet", y=y_position)
# One layout adjustment after all panels are drawn is sufficient.
plt.subplots_adjust(wspace=0.3, hspace=0.5)
plt.show()

### VIP, HomePlanet, Transported

In [None]:
# Transported rate per home planet, one line per VIP value.
# The plot only needs HomePlanet, Transported and VIP, so no helper column is
# created here (deriving one from a loop variable left over from an earlier cell
# would depend on hidden kernel state).
# ``factorplot`` / ``size`` were renamed to ``catplot`` / ``height`` in seaborn 0.9
# and later removed; kind="point" is the plot type factorplot drew by default.
g = sns.catplot(x="HomePlanet", y="Transported", hue="VIP", data=df_train,
                kind="point", height=6, aspect=1.5)

In [None]:
df_train[(df_train["HomePlanet"]=="Earth")].VIP.sum()

there's no VIP passenger who departed from Earth\     

## Destination

In [None]:
df_train['Destination'].value_counts()

There are three destinations

### Destination vs transported

In [None]:
# Left: passengers per destination. Right: the same counts split by target.
y_position = 1.02
f, ax = plt.subplots(1, 2, figsize=(23, 8))
df_train["Destination"].value_counts().plot.bar(color=['#CD7F32','#FFDF00','#D3D3D3'], ax=ax[0])
ax[0].set_title("Number of passenger by Destination", y=y_position)
ax[0].set_ylabel("Count")
# Keyword ``x=`` is required by seaborn >= 0.12 and accepted by older versions.
sns.countplot(x="Destination", hue="Transported", data=df_train, ax=ax[1])
ax[1].set_title("Destination Transported vs not", y=y_position)
plt.subplots_adjust(wspace=0.3, hspace=0.5)
plt.show()

### Homeplanet vs Destination

In [None]:
# Left: passengers per home planet. Right: destinations chosen by passengers of each home planet.
y_position = 1.02
f, ax = plt.subplots(1, 2, figsize=(23, 8))
df_train["HomePlanet"].value_counts().plot.bar(color=['#CD7F32','#FFDF00','#D3D3D3'], ax=ax[0])
# This panel counts HomePlanet values, so it is titled accordingly.
ax[0].set_title("Number of passenger by HomePlanet", y=y_position)
ax[0].set_ylabel("Count")
# Keyword ``x=`` is required by seaborn >= 0.12 and accepted by older versions.
sns.countplot(x="HomePlanet", hue="Destination", data=df_train, ax=ax[1])
ax[1].set_title("Destination by HomePlanet", y=y_position)
plt.subplots_adjust(wspace=0.3, hspace=0.5)
plt.show()

Most of the passengers whose destination is PSO are from Earth

### Homeplanet & destination vs transported

The `Homeplanet` to the `destination` might contain travel distance information.

In [None]:
# One panel per home planet: destination counts split by target.
f, ax = plt.subplots(1, 3, figsize=(40, 12))
for i, p in enumerate(homeplanets):
    # Keyword ``x=`` is required by seaborn >= 0.12 and accepted by older versions.
    g = sns.countplot(x="Destination", hue="Transported", data=df_train[df_train["HomePlanet"]==p], ax=ax[i])
    g.set_title(p)
# The layout only needs adjusting once, after every panel has been drawn.
plt.subplots_adjust(wspace=0.3, hspace=0.5)
f.suptitle("HomePlanet Destination vs Transported", fontsize=40)
plt.show()

We can check the transported rate is higher for the passengers from Europa traveling to TRAPPIST-1e, 55 Cancri e Destination

## Cabin

In [None]:
# Cabin is a "/"-separated string; its three parts are stored as deck, num and side.
cabin_parts = df_train["Cabin"].str.split("/")
for position, part_name in enumerate(["deck", "num", "side"]):
    df_train[part_name] = cabin_parts.str[position]