In [1]:
# https://www.kaggle.com/netssfy/learning-curve
# https://medium.com/i-like-big-data-and-i-cannot-lie/how-i-scored-in-the-top-9-of-kaggles-titanic-machine-learning-challenge-243b5f45c8e9
import pandas as pd
import numpy as np

pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)

In [2]:
fp_tr = "../local-data/input/train.csv"
fp_te = "../local-data/input/test.csv"

train = pd.read_csv(fp_tr)
test = pd.read_csv(fp_te)

dat =  pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
TestPassengerID = test['PassengerId']
dat.shape

(1309, 12)

In [3]:
df = dat.copy()

In [4]:
# missing = df.copy().isna().sum()
# (missing.reset_index(name='cnt')
#  .sort_values(by='cnt', ascending=True))

#### Cabin (needed for hasCabin feature)

In [5]:
# df['Cabin'].value_counts(normalize=True).head()

In [6]:
# modify Cabin
df['Cabin'] = df['Cabin'].fillna('U')
df['Cabin'] = df.Cabin.str.extract('([A-Za-z])', expand=False)

df['Cabin'].value_counts(normalize=True)
pd.crosstab(df.Pclass, # ROWS
              df.Cabin, # COLS
              values=df.PassengerId, 
              aggfunc=pd.Series.nunique,
              margins = 'True',
#               normalize='index',
              dropna=False).round(2)

Cabin,A,B,C,D,E,F,G,T,U,All
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,22.0,65.0,94.0,40.0,34.0,,,1.0,67.0,323
2,,,,6.0,4.0,13.0,,,254.0,277
3,,,,,3.0,8.0,5.0,,693.0,709
All,22.0,65.0,94.0,46.0,41.0,21.0,5.0,1.0,1014.0,1309


In [7]:
# tmp = df[(df.Cabin=='U')]
# print("Cabin Unknown")
# pd.crosstab(tmp.Pclass, # ROWS
#               tmp.Sex, # COLS
#               values=tmp.PassengerId, 
#               aggfunc=pd.Series.nunique,
# #               margins = 'True',
# #               normalize='index',
# #               normalize='columns',
#               dropna=False).round(2)

#### Title

In [8]:
# obtain Title from name (Mr, Mrs, Miss etc)
df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
df['Title'].value_counts(normalize=True)

Mr          0.578304
Miss        0.198625
Mrs         0.150497
Master      0.046600
Dr          0.006112
Rev         0.006112
Col         0.003056
Major       0.001528
Mlle        0.001528
Ms          0.001528
Dona        0.000764
Don         0.000764
Jonkheer    0.000764
Countess    0.000764
Capt        0.000764
Mme         0.000764
Sir         0.000764
Lady        0.000764
Name: Title, dtype: float64

In [9]:
# consolidate titles
df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Dona'],'Royalty')
df['Title'] = df['Title'].replace(['Mme'], 'Mrs')
df['Title'] = df['Title'].replace(['Mlle','Ms'], 'Miss')
df['Title'] = df['Title'].replace(['Capt', 'Col', 'Major','Rev'], 'Officer')
df['Title'] = df['Title'].replace(['Jonkheer', 'Don','Sir'], 'Royalty')
# dataset.loc[(dataset.Sex == 'male')   & (dataset.Title == 'Dr'),'Title'] = 'Mr'
# dataset.loc[(dataset.Sex == 'female') & (dataset.Title == 'Dr'),'Title'] = 'Mrs'
df['Title'].value_counts(normalize=True)

Mr         0.578304
Miss       0.201681
Mrs        0.151261
Master     0.046600
Officer    0.011459
Dr         0.006112
Royalty    0.004584
Name: Title, dtype: float64

In [10]:
# missing = df.copy().isna().sum()
# (missing.reset_index(name='cnt')
#  .sort_values(by='cnt', ascending=True))

#### Impute Missing Age

In [11]:
df['Age'] = df['Age'].fillna(0)
df.Age = df.Age.astype(float)
df['Age'].value_counts(normalize=True).head()

0.0     0.200917
24.0    0.035905
22.0    0.032850
21.0    0.031322
30.0    0.030558
Name: Age, dtype: float64

In [12]:
df[df.Age<0] # sanity check

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title


In [13]:
gpd = df[df.Age>0]
gpd = (gpd.groupby(['Title', 'Sex'])['Age']
 .mean()
 .to_frame()
 .reset_index()
#  .rename(columns={0:'cnt'})
 .sort_values(by=['Title', 'Sex'], ascending=False)
).round(0).set_index(['Title', 'Sex'])
gpd

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Title,Sex,Unnamed: 2_level_1
Royalty,male,42.0
Royalty,female,40.0
Officer,male,48.0
Mrs,female,37.0
Mr,male,32.0
Miss,female,22.0
Master,male,5.0
Dr,male,43.0
Dr,female,49.0


In [14]:
d = dict(zip(gpd.index, gpd.Age))
d

{('Royalty', 'male'): 42.0,
 ('Royalty', 'female'): 40.0,
 ('Officer', 'male'): 48.0,
 ('Mrs', 'female'): 37.0,
 ('Mr', 'male'): 32.0,
 ('Miss', 'female'): 22.0,
 ('Master', 'male'): 5.0,
 ('Dr', 'male'): 43.0,
 ('Dr', 'female'): 49.0}

In [15]:
conditions  = conditions2 = [(df.Title == k[0]) & (df.Sex == k[1]) & (df.Age == 0) for k in d.keys()]
choices     = [v for v in d.values()]

df['Age2'] = np.select(conditions, choices, default = df['Age']).round(0)
df.sample(10)
# df[df.Title=='Master']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age2
412,413,1.0,1,"Minahan, Miss. Daisy E",female,33.0,1,0,19928,90.0,C,Q,Miss,33.0
37,38,0.0,3,"Cann, Mr. Ernest Charles",male,21.0,0,0,A./5. 2152,8.05,U,S,Mr,21.0
753,754,0.0,3,"Jonkoff, Mr. Lalio",male,23.0,0,0,349204,7.8958,U,S,Mr,23.0
135,136,0.0,2,"Richard, Mr. Emile",male,23.0,0,0,SC/PARIS 2133,15.0458,U,C,Mr,23.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S,Mrs,35.0
1163,1164,,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26.0,1,0,13508,136.7792,C,C,Mrs,26.0
1061,1062,,3,"Lithman, Mr. Simon",male,0.0,0,0,S.O./P.P. 251,7.55,U,S,Mr,32.0
349,350,0.0,3,"Dimic, Mr. Jovan",male,42.0,0,0,315088,8.6625,U,S,Mr,42.0
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,U,C,Mrs,14.0
718,719,0.0,3,"McEvoy, Mr. Michael",male,0.0,0,0,36568,15.5,U,Q,Mr,32.0


#### Other New Features

In [16]:
# Create family size from 'sibsq + parch + 1'
df['FamilySize'] = (df['SibSp'] + df['Parch'] + 1).astype(int)

# reduce levels in FamilySize
conditions = \
[
    (df.FamilySize > 1) & (df.FamilySize < 4),
     df.FamilySize > 4
]
outputs = ['2-3', '4+']
df.FamilySize = np.select(conditions, outputs, df.FamilySize)

df['FamilySize'].value_counts(normalize=True)
# pd.crosstab(df.FamilySize, # ROWS
#               df.Survived, # COLS
#               values=df.PassengerId, 
#               aggfunc=pd.Series.nunique,
#               normalize='index',
#               dropna=False).round(2)

1      0.603514
2-3    0.300993
4+     0.062643
4      0.032850
Name: FamilySize, dtype: float64

In [17]:
# fill the missing values of Embarked feature with the most common occurance

# get most frequent port
most_freq_port = df.Embarked.dropna().mode()[0]

# fill missing
df['Embarked'] = df['Embarked'].fillna(most_freq_port)

# inspect
df['Embarked'].value_counts(normalize=True)
# pd.crosstab(df.Embarked, # ROWS
#               df.Survived, # COLS
#               values=df.PassengerId, 
#               aggfunc=pd.Series.nunique,
#               normalize='index',
#               dropna=False).round(2)


S    0.699771
C    0.206264
Q    0.093965
Name: Embarked, dtype: float64

#### Interim Inspect

In [18]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Age2',
       'FamilySize'],
      dtype='object')

In [19]:
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age2,FamilySize
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,U,S,Mr,22.0,2-3


#### Binary Features

In [20]:
# isChild
conditions = [df.Title == 'Master']
outputs = [1]
df['isChild'] = np.select(conditions, outputs, 0)

df['isChild'].value_counts(normalize=True).head()
pd.crosstab(df.isChild, # ROWS
              df.FamilySize, # COLS
              values=df.PassengerId, 
              aggfunc=pd.Series.nunique,
              normalize='index',
              dropna=False).round(2)

FamilySize,1,2-3,4,4+
isChild,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.63,0.29,0.03,0.05
1,0.02,0.51,0.08,0.39


In [21]:
# hasCabin
conditions = [df.Cabin == 'U']
outputs = [0]
df['hasCabin'] = np.select(conditions, outputs, 1)

df['hasCabin'].value_counts(normalize=True).head()

# inspect
df['hasCabin'].value_counts(normalize=True)
# pd.crosstab(df.hasCabin, # ROWS
#               df.Survived, # COLS
#               values=df.PassengerId, 
#               aggfunc=pd.Series.nunique,
#               normalize='index',
#               dropna=False).round(2)

0    0.774637
1    0.225363
Name: hasCabin, dtype: float64

In [22]:
# isChild

conditions = [df.Title == 'Master']
outputs = [1]
df['isChild'] = np.select(conditions, outputs, 0)

# inspect
df['isChild'].value_counts(normalize=True)
# pd.crosstab(df.isChild, # ROWS
#               df.Survived, # COLS
#               values=df.PassengerId, 
#               aggfunc=pd.Series.nunique,
#               normalize='index',
#               dropna=False).round(2)

0    0.9534
1    0.0466
Name: isChild, dtype: float64

In [23]:
# last sanity check missing values
missing = df.copy().isna().sum()
(missing.reset_index(name='cnt')
 .sort_values(by='cnt', ascending=True))

Unnamed: 0,index,cnt
0,PassengerId,0
14,FamilySize,0
13,Age2,0
12,Title,0
11,Embarked,0
10,Cabin,0
15,isChild,0
8,Ticket,0
6,SibSp,0
5,Age,0


In [24]:
# Map categorical to numeric
df['Sex'] = df['Sex'].astype('category')
df['FamilySize'] = df['FamilySize'].astype('category')
cat_columns = df.select_dtypes(['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
df.head(1)
df.shape

(1309, 17)

In [25]:
df['Age2'] = df['Age2'].astype(int)
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age2,FamilySize,isChild,hasCabin
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,U,S,Mr,22,1,0,0


In [26]:
# split train / test
ls = TestPassengerID.values.tolist()
df_tr = df[~ df.PassengerId.isin(ls)] 
df_te = df[df.PassengerId.isin(ls)] 

In [27]:
cols_tr = ['Survived']
cols_te = ['PassengerId', 'Pclass', 'Sex', 'Age2', 'FamilySize', 'hasCabin', 'isChild']
cols_tr = cols_tr + cols_te

In [28]:
df_tr.loc[:, cols_tr].to_csv("../local-data/output/titanic_train_clean.csv", index=False)
df_te.loc[:, cols_te].to_csv("../local-data/output/titanic_test_clean.csv", index=False)