In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas
%matplotlib inline

In [2]:
# Load the training set from the CSV file in the working directory
train_dataframe = pandas.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Preview the first ten rows of the training set
train_dataframe.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Title extraction from name

In [4]:
# Augment the frame with the honorific found in each name,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
# The pattern is a raw string so that `\.` is a regex escape and not an invalid
# Python string escape. The vectorised `.str.extract` returns the first match per
# row (like `re.search`) and yields NaN for a name without a recognisable title
# instead of raising on `None.group(1)`.
train_dataframe['Title'] = train_dataframe['Name'].str.extract(r' ([A-Z][a-z]+)\.', expand=False)

In [5]:
# Count how many passengers carry each extracted title
titles = train_dataframe['Title'].value_counts().reset_index()
titles.columns = ['title', 'count']
# Sanity check: titled passengers versus total passengers
print('%d over %d' % (titles['count'].sum(), len(train_dataframe)))
titles.head(len(titles))

891 over 891


Unnamed: 0,title,count
0,Mr,517
1,Miss,182
2,Mrs,125
3,Master,40
4,Dr,7
5,Rev,6
6,Major,2
7,Mlle,2
8,Col,2
9,Jonkheer,1


In [6]:
# Reduce the raw titles to 6 classes
Title_Dictionary = {
    "Capt":     "Officer",
    "Col":      "Officer",
    "Major":    "Officer",
    "Dr":       "Officer",
    "Rev":      "Officer",
    "Jonkheer": "Royalty",
    "Don":      "Royalty",
    "Sir":      "Royalty",
    "Countess": "Royalty",
    "Dona":     "Royalty",
    "Lady":     "Royalty",
    "Mme":      "Mrs",
    "Ms":       "Mrs",
    "Mrs":      "Mrs",
    "Mlle":     "Miss",
    "Miss":     "Miss",
    "Mr":       "Mr",
    "Master":   "Master",
}
# Values that are not keys of the dictionary are kept unchanged instead of being
# turned into NaN. This covers the already-reduced "Officer" / "Royalty" classes,
# so re-running this cell on the overwritten column does not erase them.
train_dataframe['Title'] = train_dataframe['Title'].map(
    lambda title: Title_Dictionary.get(title, title)
)
# Recompute the per-class counts after the reduction
titles = pandas.DataFrame(train_dataframe['Title'].value_counts().reset_index())
titles.columns = ['title', 'count']
print('%d over %d' % (titles['count'].sum(), len(train_dataframe.index)))
titles.head(len(titles.index))

891 over 891


Unnamed: 0,title,count
0,Mr,517
1,Miss,184
2,Mrs,127
3,Master,40
4,Officer,18
5,Royalty,5


# Null value statistics

In [7]:
def null_stats(dataframe):
    """Report the fraction of missing values for each feature that has any.

    The first column of ``dataframe`` is skipped; every remaining column is
    inspected.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per inspected column whose null fraction is non-zero, in the
        original column order, with columns ``feature`` (the column name) and
        ``nulls`` (fraction of null entries in that column).
    """
    # Work on the argument itself (not on a notebook-level frame) so the
    # function reports on whatever frame the caller passes in.
    features = dataframe.iloc[:, 1:]
    feature_names = features.columns
    # The mean of the boolean null mask is the fraction of nulls per column.
    null_values = features.isnull().mean().to_numpy(dtype=float)
    has_nulls = null_values != 0
    return pandas.DataFrame(data={'feature': feature_names[has_nulls],
                                  'nulls': null_values[has_nulls]})

In [8]:
# Show the features of the training set that contain missing values
null_stats(dataframe=train_dataframe).head(n=5)

Unnamed: 0,feature,nulls
0,Age,0.198653
1,Cabin,0.771044
2,Embarked,0.002245


# Age statistics by Sex, Pclass and Title

In [9]:
# Median age for every (Sex, Pclass, Title) combination present in the data
group = train_dataframe.groupby(by=['Sex', 'Pclass', 'Title'])
print(group.Age.median())

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        40.0
                Officer    49.0
                Royalty    40.5
        2       Miss       24.0
                Mrs        31.5
        3       Miss       18.0
                Mrs        31.0
male    1       Master      4.0
                Mr         40.0
                Officer    51.0
                Royalty    40.0
        2       Master      1.0
                Mr         31.0
                Officer    46.5
        3       Master      4.0
                Mr         26.0
Name: Age, dtype: float64


In [10]:
# Fill each missing age with the median age of the passenger's
# (Sex, Pclass, Title) group; known ages are left untouched
train_dataframe['Age'] = train_dataframe['Age'].fillna(group['Age'].transform('median'))
print('Null on Age: %d' % train_dataframe['Age'].isnull().sum())

Null on Age: 0
