# Titanic Data Visualization

In [1]:
library(readr)
library(dplyr)
library(tibble)
library(ggplot2)
library(ggthemes)
library(gridExtra)
library(RColorBrewer)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine



In [3]:
# Load the Titanic training set.
# The column types are declared explicitly (they match the specification
# readr reports when it guesses them) so the schema does not depend on
# readr's type guessing and no "Parsed with column specification" message
# is emitted.
titanic <- read_csv(
  "../Data/titanic_train.csv",
  col_types = cols(
    PassengerId = col_integer(),
    Survived = col_integer(),
    Pclass = col_integer(),
    Name = col_character(),
    Sex = col_character(),
    Age = col_double(),
    SibSp = col_integer(),
    Parch = col_integer(),
    Ticket = col_character(),
    Fare = col_double(),
    Cabin = col_character(),
    Embarked = col_character()
  )
)

# Preview the first rows to confirm the load
head(titanic)

Parsed with column specification:
cols(
  PassengerId = col_integer(),
  Survived = col_integer(),
  Pclass = col_integer(),
  Name = col_character(),
  Sex = col_character(),
  Age = col_double(),
  SibSp = col_integer(),
  Parch = col_integer(),
  Ticket = col_character(),
  Fare = col_double(),
  Cabin = col_character(),
  Embarked = col_character()
)


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


<hr>

# Cleaning up Data

## Look at Missing Values (NA)
- See Which Columns have Missing Values
- Use select_if() to provide condition
- Use a scoped summarise function to get the count of missing values for each column (`summarise_each()` is deprecated in favour of `summarise_all()`)
- Inside `funs()`, `.` stands for each column to which the function is applied

In [4]:
# Count the missing values in every column that contains at least one NA.
# summarise_all() replaces the deprecated summarise_each(); passing a plain
# function avoids funs() and keeps the original column names.
titanic %>%
  select_if(function(x) any(is.na(x))) %>%
  summarise_all(function(x) sum(is.na(x)))

`summarise_each()` is deprecated.
Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
To map `funs` over all variables, use `summarise_all()`


Age,Cabin,Embarked
177,687,2


### Only 2 Missing Values in Embarked
- Look at the distribution
- Only 2 values are missing, and 72% of the values are "S"
- Replace NA with "S"


In [5]:
# Frequency of each Embarked value (table() leaves NA out by default)
table(titanic[["Embarked"]])


  C   Q   S 
168  77 644 

In [6]:
# Number of rows per Embarked value (NA forms its own group) and each
# group's share of all rows, rounded to a whole percent.
titanic %>%
  count(Embarked) %>%
  rename(Count = n) %>%
  mutate(Percentage = round(Count / sum(Count) * 100))

Embarked,Count,Percentage
C,168,19
Q,77,9
S,644,72
,2,0


In [7]:
# Show the rows whose Embarked value is missing
filter(titanic, is.na(Embarked))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28,
830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28,


In [8]:
# Preview the effect of filling missing Embarked values with "S".
# replace(x, list, values) returns x with the elements selected by `list`
# swapped for `values`; here the selector is the NA mask.
# The result is only displayed, so `titanic` itself is not overwritten yet.
titanic %>%
  mutate(Embarked = replace(Embarked, is.na(Embarked), "S")) %>%
  count(Embarked) %>%
  rename(Count = n) %>%
  mutate(Percentage = round(Count / sum(Count) * 100))

Embarked,Count,Percentage
C,168,19
Q,77,9
S,646,73


In [10]:
# Persist the fill: every missing Embarked value becomes "S"
titanic <- mutate(
  titanic,
  Embarked = replace(Embarked, is.na(Embarked), "S")
)

# Preview the updated data
head(titanic)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


## Add another Variable for Family Size
- FamilySize = SibSp + Parch + 1 (the passenger themselves)

In [11]:
# FamilySize is the passenger (1) plus their SibSp and Parch counts
titanic <- mutate(titanic, FamilySize = SibSp + Parch + 1)

## Convert "Pclass", "Survived", "Sex", and "Embarked" Variables into Factors

In [12]:
# Convert the four categorical columns to factors; all other columns are
# left as they are.
titanic <- mutate(
  titanic,
  Pclass = factor(Pclass),
  Survived = factor(Survived),
  Sex = factor(Sex),
  Embarked = factor(Embarked)
)

# Preview the converted data
head(titanic)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,2
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1


<hr>