In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.tools as tls
import seaborn as sns
import pdb
from scipy.stats import ttest_ind
%matplotlib inline
pp = pprint.PrettyPrinter(indent=4)

Data Dictionary
Variable Definition Key survival Survival 0 = No, 1 = Yes pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd sex Sex
Age Age in years
sibsp # of siblings / spouses aboard the Titanic
parch # of parents / children aboard the Titanic
ticket Ticket number
fare Passenger fare
cabin Cabin number
embarked Port of Embarkation

In [96]:
titanic = pd.read_csv("titanic_data.csv")
titanic.info()

# Summary Stats
titanic.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Cleaning the data
* fill NaN age values with the mean
* replace passenger class and survived values with text
* add up siblings/spouses and parents/children to get a rough estimate of family size
* drop unnecessary fields
* drop all rows and columns where all values are NaN

In [97]:
# Use a t-test to see whether to ignore NaN values in age column or fill with the mean
with_age = titanic[titanic['Age'].notnull()]
without_age = titanic[titanic['Age'].isnull()]
ttest_ind(with_age['Survived'], without_age['Survived'])
# value of 2.76 indicates a significant difference and suggests filling NaNs with the age mean

def modify_columns(df):
    
    # fill NaN age values with mean
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    # Passenger class
    pclass = {1: 'Upper', 2: 'Middle', 3: 'Lower'}
    df['Pclass'] = df['Pclass'].map(pclass)
    
    # Survived/Perished
    #survived = {1: 'Survived', 0: 'Perished'}
    #df = df[pd.notnull(df['Survived'])]
    #df['Survived'] = df['Survived'].map(survived)
    
    # family size = sum of SibSp and Parch
    df['family_size'] = df['SibSp'] + df['Parch']
    
    # drop name because it has no use for this analysis
    # drop ticket number because it serves no use - passenger id will suffice
    # drop fare because class will provide more useful information
    # drop port of embarkation because it has no use in my analysis
    df = df.drop(['Cabin','Name','Ticket', 'Fare', 'Embarked'], axis=1)
    
    # Remove rows where there are NaN values in all columns
    df = df.dropna( how = "all" )
    # Remove columns where all the values are NaN
    df = df.dropna( axis = 1, how = "all" )
    
    return df

titanic_df = modify_columns(titanic)
titanic_df.info()
titanic_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
family_size    891 non-null int64
dtypes: float64(1), int64(5), object(2)
memory usage: 62.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,family_size
0,1,0,Lower,male,22.0,1,0,1
1,2,1,Upper,female,38.0,1,0,1
2,3,1,Lower,female,26.0,0,0,0
3,4,1,Upper,female,35.0,1,0,1
4,5,0,Lower,male,35.0,0,0,0


In [94]:
# A brief look at some individual variables

# how many passengers survived? #=> 342
print('Passenger survival:')
print(titanic['Survived'].value_counts())

# box plot of ages
sns.set_style("whitegrid")
ax = sns.boxplot(x=titanic["Age"])

# age of survivors/perishers
age_of_survivors = titanic.groupby('Age')['Survived'].value_counts()
age_of_survivors["Counts"] = titanic.groupby('Age')['Survived'].value_counts()

# Survived by Sex
survival_by_sex = pd.crosstab( titanic_df.Sex, titanic.Survived )
#print('Survival by sex:')
#print(survival_by_sex)

# Survived by Class
#survival_by_class = pd.crosstab( titanic_df.Pclass, titanic.Survived )
#print('Survival by Class:')
#print(survival_by_class)

%pylab inline
sns.factorplot('Pclass','Survived', data = titanic_df, kind = 'bar')
sns.factorplot('Sex', 'Survived', data = titanic_df, kind = 'bar')
sns.factorplot('Pclass', 'Survived', hue = 'Sex', data = titanic_df, kind = 'bar')
sns.factorplot('Age',  'Survived',  data = titanic_df, hue_order = [1,0], orient="h", palette="Set3", kind="violin")

Passenger survival:
Perished    549
Survived    342
Name: Survived, dtype: int64
Populating the interactive namespace from numpy and matplotlib


ValueError: Neither the `x` nor `y` variable appears to be numeric.