In [27]:
import pandas as pd
import pprint
import matplotlib.pyplot as plt
import numpy as np

In [11]:
%matplotlib inline

# Load the data

In [85]:
# Intended type of each column in the CSV files.
# NOTE(review): this mapping is not passed to read_csv below, so pandas infers
# the column types on its own.
dtypes = dict(
    PassengerId=int,
    Survived=int,
    Pclass=int,
    Name=str,
    Sex=str,
    Age=str,
    SibSp=int,
    Parch=int,
    Ticket=int,
    Fare=float,
    Cabin=str,
    Embarked=str,
)

# Dataset name -> path of its CSV file
files = dict(
    train='../data/train.csv',
    test='../data/test.csv',
)

# Read every CSV into a DataFrame keyed by dataset name; the first column of
# each file becomes the index.
d = {name: pd.read_csv(path, index_col=0) for name, path in files.items()}

# Using gender as a model

For a first model, predict that females survive and males do not.

In [None]:
# Boolean masks selecting the female and the male passengers of the training set
women_only = d['train']['Sex'].eq('female')
men_only = d['train']['Sex'].eq('male')

In [51]:
# Split the training set by sex using the boolean masks
women_onboard = d['train'].loc[women_only]
men_onboard = d['train'].loc[men_only]

# 'Survived' is 1 for survivors and 0 otherwise, so the column sum divided by
# the number of entries is the proportion that survived.
proportion_women_survived = women_onboard['Survived'].sum() / women_onboard['Survived'].count()
proportion_men_survived = men_onboard['Survived'].sum() / men_onboard['Survived'].count()

print('Proportion of women survived: {}'.format(proportion_women_survived))
print('Proportion of men survived: {}'.format(proportion_men_survived))


Proportion of women survived: 0.7420382165605095
Proportion of men survived: 0.18890814558058924


Looks like a strong trend.  Use this as the model for test predictions.

In [86]:
# Gender-only survival model for a single passenger record.
def model_sex(ds):
    """
    Predict survival from the 'Sex' field of one record.

    :ds: a single record that can be indexed with 'Sex'

    Returns a one-element pd.Series labelled 'Survived' (1 when 'Sex' is
    'female', 0 for anything else) so that the result of .apply is a dataframe.
    """
    survived = 1 if ds['Sex'] == 'female' else 0
    return pd.Series({'Survived': survived})

# Run the model once per passenger of the test set: axis='columns' hands each
# row to model_sex as a Series, and model_sex reads that row's 'Sex' value.
# The one-element Series it returns are assembled into a DataFrame.
p_genderbasedmodel = d['test'].apply(model_sex, axis='columns')

Using the gender model, make a submission file for the Kaggle competition.

In [84]:
# Write the predictions, with the frame's index as the first CSV column
p_genderbasedmodel.to_csv('genderbasedmodel.csv', index=True)

# Using a bit more than gender...

Make a table that shows the chance of survival for every combination of Sex [male, female], Class [1, 2, 3], and Fare bin [<10, 10-20, 20-30, >=30].

In [126]:
classes = [1, 2, 3]
# Half-open fare intervals [low, high)
fare_ranges = [[-np.inf, 10], [10, 20], [20, 30], [30, np.inf]]
genders = ['male', 'female']

# survival_table[gender, class, fare bin] holds the mean of 'Survived' over the
# training passengers that fall into that combination.
survival_table = np.zeros((len(genders), len(classes), len(fare_ranges)))
for i, pclass in enumerate(classes):
    for j, (fare_low, fare_high) in enumerate(fare_ranges):
        for k, gender in enumerate(genders):
            in_group = (
                d['train']['Sex'].eq(gender)
                & d['train']['Pclass'].eq(pclass)
                & d['train']['Fare'].ge(fare_low)
                & d['train']['Fare'].lt(fare_high)
            )
            stats = d['train'].loc[in_group]
            # A combination with no passengers has no mean and is stored as NaN
            survival_table[k, i, j] = stats['Survived'].mean(skipna=True)

print(survival_table)


[[[ 0.                 nan  0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]

 [[        nan         nan  0.83333333  0.97727273]
  [        nan  0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]]


In [127]:
# NaN is not equal to itself, so `survival_table != survival_table` would also
# locate the NaN cells; np.isnan states the intent more directly.
empty_cells = np.isnan(survival_table)

# Replace the NaN cells with a survival rate of 0
survival_table[empty_cells] = 0
print(survival_table)

[[[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]

 [[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]]


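Using this, create a binary survival table: a category is predicted to survive if its survival rate is >= 0.5, and to die otherwise.  
Note that every male category falls below 0.5, so the model predicts that all men die.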

In [128]:
# Turn the survival rates into binary predictions: 1 where the rate is at
# least 0.5, 0 where it is below. Both masks are taken before writing so the
# second one is not affected by the first assignment.
predicted_survive = survival_table >= 0.5
predicted_die = survival_table < 0.5
survival_table[predicted_survive] = 1
survival_table[predicted_die] = 0
print(survival_table)

[[[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  1.  1.]
  [ 0.  1.  1.  1.]
  [ 1.  1.  0.  0.]]]


Apply the survival table above to the data.

Process the data into fare bins first

In [165]:
# Work on an independent copy so the bin columns added below do not touch d['test']
d['genderclass'] = d['test'].copy(deep=True)
def fare_bin(ds, ranges=None):
    """
    Return the index of the fare bin that a record falls into.

    :inputs:
    :ds: a single record that can be indexed with 'Fare' and 'Pclass'
    :ranges: sequence of [low, high) fare intervals; when omitted, the
             notebook-level `fare_ranges` is used

    Returns the integer position of the first interval with
    low <= Fare < high.  When Fare is missing, Pclass is used as a surrogate:
    first class maps to the top bin, second class to the one below, and so on.

    Raises ValueError if Fare is present but lies outside every interval.
    """
    if ranges is None:
        # Fall back to the bins defined earlier in the notebook
        ranges = fare_ranges

    fare = ds['Fare']
    if np.isnan(fare):
        # Use Pclass as a surrogate for fare group.  Take first class as the
        # top Fare bin and work backward.  Cast to int because Pclass may come
        # back as a float from an all-numeric record, and clamp at 0 so that a
        # class number larger than the number of bins cannot yield a negative
        # (wrap-around) index.
        return max(int(len(ranges) - ds['Pclass']), 0)

    for i, (low, high) in enumerate(ranges):
        if low <= fare < high:
            return i
    raise ValueError('Fare outside all fare ranges')
    
# Compute the fare bin of every record (one fare_bin call per row)
d['genderclass']['Fare_bin'] = d['genderclass'].apply(fare_bin, axis='columns')

Add in bins for gender (the male/female string labels are a pain to index with)

In [180]:
def gender_bin(ds):
    """
    Map the 'Sex' field of one record to an integer bin.

    Returns 0 for 'male' and 1 for 'female'.
    Raises ValueError for any other value.
    """
    bin_by_sex = {'male': 0, 'female': 1}
    sex_bin = bin_by_sex.get(ds['Sex'])
    if sex_bin is None:
        raise ValueError('Invalid sex')
    return sex_bin

# Compute the gender bin of every record (one gender_bin call per row)
d['genderclass']['Sex_bin'] = d['genderclass'].apply(gender_bin, axis='columns')

Convert Pclass (1, 2, 3) into a 0-indexed bin so that it can be used directly as a table index

In [192]:
def class_bin(ds):
    """Return the 0-indexed class bin of one record, i.e. its 'Pclass' minus one."""
    pclass = ds['Pclass']
    return pclass - 1
# Compute the class bin of every record (one class_bin call per row)
d['genderclass']['Pclass_bin'] = d['genderclass'].apply(class_bin, axis='columns')

Predict survival given the survival table and a data record

In [209]:
def model_genderclass(ds, survival_table, index_labels):
    """
    Look up the predicted survival of one record in a survival table.

    :inputs:
    :ds: a single record (pandas Series) holding the bin columns
    :survival_table: an N-dimensional survival table, indexable one
                     dimension at a time
    :index_labels: an N-element list of the record's column labels, one per
                   table dimension and in the same order, ie:
                     ['Fare', 'Sex']
                   means survival = survival_table[ds['Fare']][ds['Sex']]

    Returns a one-element pandas Series labelled 'Survived' holding the table
    entry converted to int, so that when used with apply the result becomes a
    dataframe.
    """
    keys = [ds[label] for label in index_labels]

    # Descend one table dimension per key, starting from the first
    entry = survival_table[keys[0]]
    for key in keys[1:]:
        entry = entry[key]

    return pd.Series({'Survived': int(entry)})
    

In [210]:
# Predict survival for every record; the label order matches the table's
# (gender, class, fare bin) dimensions.
p_genderclassbasedmodel = d['genderclass'].apply(
    model_genderclass,
    axis='columns',
    survival_table=survival_table,
    index_labels=['Sex_bin', 'Pclass_bin', 'Fare_bin'],
)

In [213]:
# Write the predictions, with the frame's index as the first CSV column
p_genderclassbasedmodel.to_csv('genderclassbasedmodel.csv', index=True)