# Predicting Survival on the Titanic

## Prepare dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Download the dataset as CSV from OpenML (requires network access) and
# preview the first rows.

data = pd.read_csv(
    filepath_or_buffer='https://www.openml.org/data/get_csv/16826755/phpMYEkMl',
)

data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [3]:
# Missing entries appear as the literal string '?' in the raw file (see the
# preview above); turn them into NaN so pandas treats them as missing.

data = data.replace(to_replace='?', value=np.nan)

In [4]:
# Keep only the first cabin listed for each passenger: some entries hold
# several space-separated cabins (e.g. 'C22 C26' -> 'C22').

def get_first_cabin(row):
    """Return the first whitespace-separated token of a cabin value.

    Parameters
    ----------
    row : str or float
        A single value of the ``cabin`` column (despite the name, this is
        not a DataFrame row). Missing cabins arrive as NaN.

    Returns
    -------
    str or float
        The first token of ``row``, or ``np.nan`` when ``row`` is not a
        string or contains no token at all.
    """
    try:
        return row.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: value is not a string (e.g. NaN for a missing cabin).
        # IndexError: empty or whitespace-only string, so split() returned [].
        # Anything else is unexpected and is allowed to propagate.
        return np.nan
    
# Run get_first_cabin on every value of the cabin column.
data['cabin'] = data['cabin'].map(get_first_cabin)

In [5]:
# Keep only the columns used downstream: the predictors plus `survived`
# (presumably the prediction target, given the notebook title).

usecols = [
    "pclass",
    "sibsp",
    "parch",
    "sex",
    "embarked",
    "cabin",
    "survived",
]

data = data[usecols]

In [6]:
# Reduce each cabin value to its first character, i.e. the letter prefix
# (e.g. 'C22' -> 'C'); missing values stay NaN.

data["cabin"] = data["cabin"].str.get(0)

In [7]:
# Relative frequency of each cabin letter (normalize=True returns
# proportions rather than raw counts).
data["cabin"].value_counts(normalize=True)

cabin
C    0.318644
B    0.220339
D    0.155932
E    0.138983
A    0.074576
F    0.071186
G    0.016949
T    0.003390
Name: proportion, dtype: float64

In [8]:
# Relabel the 'T' cabin letter as 'G'. 'T' is by far the rarest letter in the
# frequencies above (NOTE(review): presumably merged to avoid a near-empty
# category — confirm), then re-check the proportions.
data.loc[data["cabin"].eq("T"), "cabin"] = "G"

data["cabin"].value_counts(normalize=True)

cabin
C    0.318644
B    0.220339
D    0.155932
E    0.138983
A    0.074576
F    0.071186
G    0.020339
Name: proportion, dtype: float64

In [9]:
# Impute missing categorical values with fixed labels.
# NOTE(review): "M" presumably stands for "missing" and "S" is presumably the
# most frequent embarkation value — confirm.
data = data.fillna(value={"cabin": "M", "embarked": "S"})

In [10]:
# Store these three numeric columns explicitly as integers.
data = data.astype({'pclass': int, 'sibsp': int, 'parch': int})

In [11]:
# Preview the cleaned frame before encoding the categorical columns.
data.head()

Unnamed: 0,pclass,sibsp,parch,sex,embarked,cabin,survived
0,1,0,0,female,S,B,1
1,1,1,2,male,S,C,1
2,1,1,2,female,S,C,0
3,1,1,2,male,S,C,0
4,1,1,2,female,S,C,0


In [12]:
# Keep an independent, not-yet-encoded copy of the cleaned data; it is
# ordinal-encoded separately further down, after `data` is one-hot encoded.
backup = data.copy(deep=True)

In [13]:
from feature_engine.encoding import OneHotEncoder

# One-hot encode the string-valued columns (sex, embarked, cabin).
# drop_last=True leaves out one dummy column per variable (k categories ->
# k-1 columns), as the result below shows; numeric columns pass through.
one_hot_encoder = OneHotEncoder(drop_last=True)
data = one_hot_encoder.fit_transform(data)

data

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Unnamed: 0,pclass,sibsp,parch,survived,sex_female,embarked_S,embarked_C,cabin_B,cabin_C,cabin_E,cabin_D,cabin_A,cabin_M,cabin_G
0,1,0,0,1,1,1,0,1,0,0,0,0,0,0
1,1,1,2,1,0,1,0,0,1,0,0,0,0,0
2,1,1,2,0,1,1,0,0,1,0,0,0,0,0
3,1,1,2,0,0,1,0,0,1,0,0,0,0,0
4,1,1,2,0,1,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,1,0,0,1,0,1,0,0,0,0,0,1,0
1305,3,1,0,0,1,0,1,0,0,0,0,0,1,0
1306,3,0,0,0,0,0,1,0,0,0,0,0,1,0
1307,3,0,0,0,0,0,1,0,0,0,0,0,1,0


In [14]:
# Write the one-hot encoded data to disk, without the index column.

data.to_csv(path_or_buf='../titanic.csv', index=False)

In [15]:
from feature_engine.encoding import OrdinalEncoder

# Encode the string-valued columns of the un-encoded copy as integer codes.
# With encoding_method="arbitrary" the codes carry no ordering information;
# the output below is consistent with numbering by first appearance.
ordinal_encoder = OrdinalEncoder(encoding_method="arbitrary")
backup = ordinal_encoder.fit_transform(backup)

backup

  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
  return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


Unnamed: 0,pclass,sibsp,parch,sex,embarked,cabin,survived
0,1,0,0,0,0,0,1
1,1,1,2,1,0,1,1
2,1,1,2,0,0,1,0
3,1,1,2,1,0,1,0
4,1,1,2,0,0,1,0
...,...,...,...,...,...,...,...
1304,3,1,0,0,1,5,0
1305,3,1,0,0,1,5,0
1306,3,0,0,1,1,5,0
1307,3,0,0,1,1,5,0


In [16]:
# Write the ordinal-encoded data to disk, without the index column
# (NOTE(review): file name suggests it is meant for tree-based models).

backup.to_csv(path_or_buf='../titanic_trees.csv', index=False)