## Here we have an untidy dataset of Titanic passengers, both survivors and non-survivors.
### We will look at the first 5 records


In [78]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series 

# Passenger records are fetched as CSV from the Rdatasets repository on GitHub.
titanic_url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/Titanic.csv'
data = pd.read_csv(titanic_url)

# Preview the first five rows of the loaded frame.
data.head(5)

Unnamed: 0.1,Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,2,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


 ## Let's see what the data looks like by checking the number of columns and rows and missing values

In [79]:
# Report the frame's dimensions, then the number of null values per column.
# Each message is formatted into one string and passed to print() as a single
# argument, so the cell runs (and prints the same text) on Python 2 and 3.
print('(Rows, Columns) {} \n'.format(data.shape))
print('Missing Values \n{}'.format(data.isnull().sum()))
# This tells us that there are 557 null values for Age and no other missing values

(Rows, Columns) (1313, 7) 

Missing Values 
Unnamed: 0      0
Name            0
PClass          0
Age           557
Sex             0
Survived        0
SexCode         0
dtype: int64


 ## So we have a lot of missing ages in this dataset
 ### Let's see what the data looks like if we filter out records with a missing age

In [80]:
# Build a boolean mask that is True where Age is present, then keep only
# those rows.
has_age = data['Age'].notnull()
newset = data[has_age]
newset
# 756 records have an Age value

Unnamed: 0.1,Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,1
1,2,"Allison, Miss Helen Loraine",1st,2.00,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,1
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
5,6,"Anderson, Mr Harry",1st,47.00,male,1,0
6,7,"Andrews, Miss Kornelia Theodosia",1st,63.00,female,1,1
7,8,"Andrews, Mr Thomas, jr",1st,39.00,male,0,0
8,9,"Appleton, Mrs Edward Dale (Charlotte Lamson)",1st,58.00,female,1,1
9,10,"Artagaveytia, Mr Ramon",1st,71.00,male,0,0


 ## Here, we'll try and pull some meaningful information from the records with Age listed
 ### This will help to ensure results aren't skewed by missing values
 ### First we will filter the DataFrame down to just the survivors

In [81]:
# Keep only the passengers who survived (Survived == 1).
Survivors = newset[newset.Survived == 1]

# Each report below is formatted into a single string and handed to print()
# as one argument, so the cell runs with identical output on Python 2 and 3.
# The printed Series starts with its index name (PClass / Sex), which
# completes the "... by" heading.

# Average age of survivors by class
# It is interesting that age increases in the higher classes
print('Average Age of Survivors by {} \n'.format(Survivors.groupby('PClass').Age.mean()))

# How many males vs. females survived?
# Summing Survived counts survivors because surviving rows are coded 1.
# We can see that many more females survived
print('Total Survivors by {} \n'.format(newset.groupby('Sex').Survived.sum()))

# How about survivors by class?
# As expected many more higher class survivors than lower class
# NOTE: these are raw counts among records with a known Age, not survival rates.
print('Total Survivors by {}'.format(newset.groupby('PClass').Survived.sum()))



Average Age of Survivors by PClass
1st    36.776403
2nd    24.225313
3rd    22.461538
Name: Age, dtype: float64 

Total Survivors by Sex
female    217
male       96
Name: Survived, dtype: int64 

Total Survivors by PClass
1st    139
2nd     96
3rd     78
Name: Survived, dtype: int64


## OK, let's clean it up
## We have 2 unnecessary columns: SexCode and the Unnamed column, which is just an index number. Since our records are already indexed by pandas, we don't need that column.

In [95]:
# Build a new dataset ('clean') from newset with the two unneeded columns
# removed; axis=1 tells drop() that the labels are column names.
unneeded_columns = ['SexCode', 'Unnamed: 0']
clean = newset.drop(unneeded_columns, axis=1)
clean.head()

Unnamed: 0,Name,PClass,Age,Sex,Survived
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0
4,"Allison, Master Hudson Trevor",1st,0.92,male,1
