In [1]:
# import the dependencies
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
# Load the Titanic training set into a DataFrame
train_csv_path = './resources/train.csv'
data = pd.read_csv(train_csv_path)

In [6]:
# Dataset dimensions as (rows, columns) - a quick gauge of how large the dataset is
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(891, 12)


In [7]:
# Preview the first five rows to see the available columns and some sample values
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The `Survived` column is the target variable: 0 = the passenger died, 1 = the passenger survived.
We will use the values in this column as the labels for the prediction.

The other variables are all factors/features to describe the passengers.

- PassengerId: ID # given to each traveler on the boat
- Pclass: passenger class that is broken down 1-3 i.e. 1st class - 3rd class
- Name: name of the passenger
- Sex: gender of the passenger
- Age: age of the passenger
- SibSp: number of siblings and/or spouses that are traveling with that passenger
- Parch: number of parents and/or children that are traveling with the passenger
- Ticket: the ticket number given to each passenger
- Fare: the price of the ticket
- Cabin: the number of the cabin that the passenger was staying in
- Embarked: port of embarkation, one of 3 possible locations to get onto the Titanic (S, C, Q = Southampton, Cherbourg, Queenstown)

In [9]:
# Summary statistics for the numeric columns; the count row reveals columns with missing values
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


The count row shows that the Age column has 177 missing values (only 714 of the 891 rows have an age).

A solution to fill in the null values is to put in the median age of a typical passenger. We could also impute with the mean age but the median is more robust to outliers.

In [10]:
# Impute missing ages with the median of the known ages
median_age = data["Age"].median()
data["Age"] = data["Age"].fillna(median_age)

In [11]:
# Re-run the summary statistics to confirm the Age column no longer has missing values
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292
