# Titanic Bayes Analysis
### *2018-11-03* | *Steven Rankine* | *Data Analytics Boot Camp (Fall 2018)*
- - -
## Using the Titanic dataset and Bayes' theorem, find out which of the following is most likely to survive:

1. a boy in 2nd class
2. a woman in 3rd class
3. a man in 1st class

upload a link to your github repository
- - -

In [25]:
# Import libraries
import os
import pandas as pd
# NOTE(review): np is not referenced in any of the visible cells -- confirm it is still needed.
import numpy  as np
# Display the kernel's current working directory; relative file paths
# (such as the CSV path used when loading the data) are resolved against it.
os.getcwd()

'/home/nbuser/library/notebooks'

In [26]:
# Read the Titanic passenger records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="../datasets/kaggle/titanic.csv")
# Preview the first five rows as the cell output
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- - -
## Assumptions made
- Children are passengers aged 18 years or younger
- Men and women are passengers older than 18 years
- All features are independent 
- Data is normally distributed

Set up the boolean conditions used to select each passenger group and its survival outcome.

In [27]:
# Conditions: boolean row masks for each passenger group and its outcome.
# A comparison against a missing (NaN) Age evaluates to False, so a row
# without an age passes neither age test and lands in "everybody else".
is_male   = df['Sex'] == 'male'
is_female = df['Sex'] == 'female'
is_child  = df['Age'] <= 18          # 18 years or younger
is_adult  = df['Age'] > 18           # older than 18 years
lived     = df['Survived'] == 1
died      = df['Survived'] == 0

# Group 1: 2nd class boys
cond_1  = is_male & is_child & (df['Pclass'] == 2)
cond_1y = cond_1 & lived             # ... surviving
cond_1n = cond_1 & died              # ... not surviving

# Group 2: 3rd class women
cond_2  = is_female & is_adult & (df['Pclass'] == 3)
cond_2y = cond_2 & lived             # ... surviving
cond_2n = cond_2 & died              # ... not surviving

# Group 3: 1st class men
cond_3  = is_male & is_adult & (df['Pclass'] == 1)
cond_3y = cond_3 & lived             # ... surviving
cond_3n = cond_3 & died              # ... not surviving

# Group 4: everybody who is in none of the three groups above
cond_4  = ~(cond_1 | cond_2 | cond_3)
cond_4y = cond_4 & lived             # ... surviving
cond_4n = cond_4 & died              # ... not surviving

- - -
## Convert Data Into Frequencies/Counts

In [28]:
# Counts: number of rows selected by each mask. Every list is ordered as
# [2nd class boys, 3rd class women, 1st class men, everybody else].
n_all = [len(df[mask]) for mask in (cond_1,  cond_2,  cond_3,  cond_4)]   # group totals
n_no  = [len(df[mask]) for mask in (cond_1n, cond_2n, cond_3n, cond_4n)]  # non-survivors per group
n_yes = [len(df[mask]) for mask in (cond_1y, cond_2y, cond_3y, cond_4y)]  # survivors per group

- - -
## Calculate Class/Predictor Probabilities

In [29]:
# Notation: A = "the passenger survived", C = "the passenger belongs to the group".
# Every quantity below is a percentage (0-100). Each list is ordered as
# [2nd class boys, 3rd class women, 1st class men, everybody else].

def _pct(part, whole):
    """Return part/whole as a percentage, or NaN when whole is zero."""
    # Guard the division so an empty denominator gives NaN instead of
    # aborting the cell with ZeroDivisionError.
    return 100.0*part/whole if whole else float('nan')

n_total    = sum(n_all)   # all passengers
n_survived = sum(n_yes)   # all survivors

# Prior probability of survival, P(A)
PA  = _pct(n_survived, n_total)
# Class prior probability, P(C): share of each group in the whole population
PC  = [_pct(n, n_total) for n in n_all]
# Likelihood, P(C|A): share of each group among the survivors
PCA = [_pct(n, n_survived) for n in n_yes]
# Bayes' theorem: P(A|C) = P(C|A) * P(A) / P(C), the survival probability of
# each group. A group with no members (P(C) == 0) has no defined posterior.
PAC = [pca*PA/pc if pc else float('nan') for pca, pc in zip(PCA, PC)]

- - -
## Display Results

In [30]:
# Render the per-group counts and probabilities as a fixed-width text table,
# followed by the overall non-survival / survival totals and percentages.
row_fmt = '| %-18s | %3d | %3d | %5.1f%% | %5.1f%% | %5.1f%% |'
labels  = ('Boy in 2nd Class', 'Women in 3rd Class', 'Men in 1st Class', 'The rest')

table = [
    '|-----------------------------------------------------------|',
    '|                    |  Survival |        |        |        |',
    '| Class              | Frequency |  P(C)  | P(C|A) | P(A|C) |',
    '|                    |  No | Yes |        |        |        |',
    '|------------------- + --------- + ------ + ------ + -------|',
]
# One line per group: label, non-survivors, survivors, P(C), P(C|A), P(A|C)
table += [row_fmt % row for row in zip(labels, n_no, n_yes, PC, PCA, PAC)]
table += [
    '|------------------- + --------- + -------------------------|',
    '| Total              | %3d | %3d |'      % (sum(n_no), sum(n_yes)),
    '|------------------- + --------- |',
    # Overall share of non-survivors (~P(A)) and survivors (P(A)), rounded to whole percent
    '                     | %.f%% | %.f%% |'  % (100*sum(n_no)/sum(n_all), PA),
    '                     |~P(A)| P(A)|',
    '                     |-----------| \n',
]
print('\n'.join(table))


|-----------------------------------------------------------|
|                    |  Survival |        |        |        |
| Class              | Frequency |  P(C)  | P(C|A) | P(A|C) |
|                    |  No | Yes |        |        |        |
|------------------- + --------- + ------ + ------ + -------|
| Boy in 2nd Class   |   6 |   9 |   1.7% |   2.6% |  60.0% |
| Women in 3rd Class |  34 |  25 |   6.6% |   7.3% |  42.4% |
| Men in 1st Class   |  60 |  36 |  10.8% |  10.5% |  37.5% |
| The rest           | 449 | 272 |  80.9% |  79.5% |  37.7% |
|------------------- + --------- + -------------------------|
| Total              | 549 | 342 |
|------------------- + --------- |
                     | 62% | 38% |
                     |~P(A)| P(A)|
                     |-----------| 

