In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pdb
from scipy.stats import ttest_ind
%matplotlib inline

Data Dictionary

Variable	Definition	Key
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

In [84]:
# Load the passenger records from a CSV file (path is relative to the working directory),
# then print a per-column summary (non-null count and dtype) to spot columns with missing values.
titanic = pd.read_csv("titanic_data.csv")
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [85]:
# How should the missing ages be handled? Split the passengers by whether 'Age' is
# recorded, then t-test the survival outcomes of the two groups against each other.
with_age = titanic.loc[titanic['Age'].notnull()]
without_age = titanic.loc[titanic['Age'].isnull()]
ttest_ind(with_age['Survived'], without_age['Survived'])
# t is about 2.76 (p is about 0.006): survival differs significantly between the two groups,
# so instead of ignoring the rows without an age, the NaNs are filled with the mean age.

Ttest_indResult(statistic=2.7606993230995345, pvalue=0.0058865348400755626)

In [86]:
def modify_columns(df):
    """Return a new frame with the columns reshaped for the survival analysis.

    The passenger frame passed in is NOT modified: the result is built with
    ``assign``/``drop``, both of which return new frames. This keeps the cell
    idempotent -- calling the function again on the same raw frame gives the
    same result instead of mapping already-mapped class labels to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. Must contain the columns 'Pclass', 'SibSp',
        'Parch', 'Cabin', 'Name', 'Ticket', 'Fare' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where 'Pclass' holds the labels 'Upper'/'Middle'/'Lower',
        a 'family_size' column has been appended, and the columns not used in
        the analysis have been removed.
    """
    # Passenger class: numeric ticket class -> readable label
    # (any value other than 1, 2 or 3 becomes NaN)
    pclass = {1: 'Upper', 2: 'Middle', 3: 'Lower'}

    # drop cabin, name and port of embarkation because they have no use for this analysis
    # drop ticket number because it serves no use - passenger id will suffice
    # drop fare because class will provide more useful information
    # drop SibSp and Parch because family_size replaces them
    unused_columns = ['Cabin', 'Name', 'Ticket', 'Fare', 'SibSp', 'Parch', 'Embarked']

    return df.assign(
        Pclass=df['Pclass'].map(pclass),
        # family size = sum of SibSp and Parch
        family_size=df['SibSp'] + df['Parch'],
    ).drop(unused_columns, axis=1)

# Build the analysis frame from the raw passenger data and preview its first rows.
titanic_df = modify_columns(titanic)
titanic_df.head()

   PassengerId  Survived Pclass     Sex   Age  family_size
0            1         0  Lower    male  22.0            1
1            2         1  Upper  female  38.0            1
2            3         1  Lower  female  26.0            0
3            4         1  Upper  female  35.0            1
4            5         0  Lower    male  35.0            0

In [87]:
# Fill the missing ages with the mean of the recorded ages (taken from the raw frame).
titanic_df['Age'] = titanic_df['Age'].fillna(titanic['Age'].mean())

# Histogram of all passenger ages in 5-year bins, with the mean age marked.
fig, ax = plt.subplots()
ax.hist(titanic_df['Age'], bins=range(0, 90, 5))
fig.suptitle('Passenger Ages')
ax.set_xlabel('age')
ax.set_ylabel('passengers')
ax.axvline(titanic_df['Age'].mean(), color='b', linestyle='dashed', linewidth=3, label='Mean Passenger Age')
ax.legend()  # without a legend the label on the mean line is never displayed
plt.show()

<matplotlib.figure.Figure at 0x7ffa5ea034d0>

In [88]:
def _plot_age_histogram(ages, title, ylabel, mean_label):
    """Draw a histogram of ``ages`` in 5-year bins with a dashed line at the mean.

    Parameters
    ----------
    ages : pandas.Series
        Ages to plot.
    title : str
        Figure title.
    ylabel : str
        Label for the y axis (what is being counted).
    mean_label : str
        Legend entry for the dashed mean line.

    Returns
    -------
    The matplotlib figure that was drawn (after it has been shown).
    """
    fig, ax = plt.subplots()
    ax.hist(ages, bins=range(0, 90, 5))
    fig.suptitle(title)
    ax.set_xlabel('ages')
    ax.set_ylabel(ylabel)
    ax.axvline(ages.mean(), color='b', linestyle='dashed', linewidth=3, label=mean_label)
    ax.legend()  # without a legend the label on the mean line is never displayed
    plt.show()
    return fig


# Passengers who survived
survivors = titanic_df[titanic_df['Survived'] == 1]
fig = _plot_age_histogram(survivors['Age'], 'Age of Survivors', 'survivors', 'Mean Survivor Age')

# Passengers who did not survive
deceased = titanic_df[titanic_df['Survived'] == 0]
fig = _plot_age_histogram(deceased['Age'], 'Age of Deceased', 'deceased', 'Mean Deceased Age')

<matplotlib.figure.Figure at 0x7ffa5e6aa690>

<matplotlib.figure.Figure at 0x7ffa5e972150>

In [82]:
# NOTE(review): this cell's execution count (82) is lower than that of the cells above it,
# so the notebook was not last run top to bottom. The first cell already runs
# `%matplotlib inline` and imports numpy and matplotlib explicitly, so this magic looks
# redundant -- confirm nothing relies on the names it injects before removing it.
%pylab inline
# Candidate survival plots (by class, by sex, by class and sex, by age), currently disabled.
# NOTE(review): `factorplot` may not exist in current seaborn releases (see `catplot`) --
# confirm against the installed version before re-enabling.
#sns.factorplot('Pclass','Survived', data = titanic_df, kind = 'bar')
#sns.factorplot('Sex', 'Survived', data = titanic_df, kind = 'bar')
#sns.factorplot('Pclass', 'Survived', hue = 'Sex', data = titanic_df, kind = 'bar')
#sns.factorplot('Age', 'Survived',  data = titanic_df, hue_order = [1,0], orient="h", palette="Set3", kind="violin")

Populating the interactive namespace from numpy and matplotlib
