# Predicting Survival on the Titanic

## Prepare dataset

In [22]:
import pandas as pd
import numpy as np

In [23]:
# Load the Titanic passenger data from a local CSV file.
# Original source (OpenML): https://www.openml.org/data/get_csv/16826755/phpMYEkMl
data = pd.read_csv(filepath_or_buffer='TitanicData.csv')
data.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


## Replace question marks with NaN

In [24]:
# Show the rows whose age is recorded as the '?' placeholder string.
data.loc[data['age'] == '?']

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
15,1,0,"Baumann, Mr. John D",male,?,0,0,PC 17318,25.925,?,S,?,?,"New York, NY"
37,1,1,"Bradley, Mr. George ('George Arthur Brayton')",male,?,0,0,111427,26.55,?,S,9,?,"Los Angeles, CA"
40,1,0,"Brewe, Dr. Arthur Jackson",male,?,0,0,112379,39.6,?,C,?,?,"Philadelphia, PA"
46,1,0,"Cairns, Mr. Alexander",male,?,0,0,113798,31,?,S,?,?,?
59,1,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,?,0,0,17770,27.7208,?,C,5,?,"New York, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1293,3,0,"Williams, Mr. Howard Hugh 'Harry'",male,?,0,0,A/5 2466,8.05,?,S,?,?,?
1297,3,0,"Wiseman, Mr. Phillippe",male,?,0,0,A/4. 34244,7.25,?,S,?,?,?
1302,3,0,"Yousif, Mr. Wazli",male,?,0,0,2647,7.225,?,C,?,?,?
1303,3,0,"Yousseff, Mr. Gerious",male,?,0,0,2627,14.4583,?,C,?,?,?


In [25]:
# Turn every '?' placeholder in the frame into a proper missing value (NaN).
data = data.replace(to_replace='?', value=np.nan)


In [26]:
# Check that no row still has '?' as its age after the replacement.
data.loc[data['age'] == '?']

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [27]:
# Show the column labels of the frame.
data.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [28]:
# Inspect the distinct values found in the cabin column.
data['cabin'].unique()

array(['B5', 'C22 C26', 'E12', 'D7', 'A36', 'C101', nan, 'C62 C64', 'B35',
       'A23', 'B58 B60', 'D15', 'C6', 'D35', 'C148', 'C97', 'B49', 'C99',
       'C52', 'T', 'A31', 'C7', 'C103', 'D22', 'E33', 'A21', 'B10', 'B4',
       'E40', 'B38', 'E24', 'B51 B53 B55', 'B96 B98', 'C46', 'E31', 'E8',
       'B61', 'B77', 'A9', 'C89', 'A14', 'E58', 'E49', 'E52', 'E45',
       'B22', 'B26', 'C85', 'E17', 'B71', 'B20', 'A34', 'C86', 'A16',
       'A20', 'A18', 'C54', 'C45', 'D20', 'A29', 'C95', 'E25', 'C111',
       'C23 C25 C27', 'E36', 'D34', 'D40', 'B39', 'B41', 'B102', 'C123',
       'E63', 'C130', 'B86', 'C92', 'A5', 'C51', 'B42', 'C91', 'C125',
       'D10 D12', 'B82 B84', 'E50', 'D33', 'C83', 'B94', 'D49', 'D45',
       'B69', 'B11', 'E46', 'C39', 'B18', 'D11', 'C93', 'B28', 'C49',
       'B52 B54 B56', 'E60', 'C132', 'B37', 'D21', 'D19', 'C124', 'D17',
       'B101', 'D28', 'D6', 'D9', 'B80', 'C106', 'B79', 'C47', 'D30',
       'C90', 'E38', 'C78', 'C30', 'C118', 'D36', 'D48', 'D47', '

## Extract the first letter from the variable cabin.

In [32]:
# Extract the first letter from the variable
# cabin.

def get_first_cabin(row):
    """Return the first character of the first cabin listed in ``row``.

    Parameters
    ----------
    row : object
        A single value from the ``cabin`` column, e.g. ``'C22 C26'``.
        Missing entries are expected to arrive as NaN (a float).

    Returns
    -------
    str or float
        The leading character of the first whitespace-separated token
        (``'C'`` for ``'C22 C26'``), or ``np.nan`` when ``row`` is not a
        string or contains no token at all.
    """
    # Non-string values (NaN, None, numbers) carry no cabin label. Checking
    # the type explicitly replaces a bare ``except`` that would also have
    # hidden unrelated errors and interrupts.
    if not isinstance(row, str):
        return np.nan

    tokens = row.split()
    # An empty or whitespace-only string produces no tokens.
    if not tokens:
        return np.nan

    return tokens[0][0]


In [33]:
# Derive the cabin1 column by applying get_first_cabin to each cabin value.
data['cabin1'] = data['cabin'].map(get_first_cabin)

In [35]:
# Inspect the distinct first letters stored in the cabin1 column.
data['cabin1'].unique()

array(['B', 'C', 'E', 'D', 'A', nan, 'T', 'F', 'G'], dtype=object)

In [36]:
# Display the frame, which now includes the cabin1 column.
data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cabin1
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",B
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",C
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",C
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,,135,"Montreal, PQ / Chesterville, ON",C
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,,328,,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C,,304,,
1307,3,0,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C,,,,


## Save data.

In [37]:
# Write the prepared frame to a CSV file, leaving out the row index.
data.to_csv(path_or_buf='titanic.csv', index=False)