# Titanic Bayes - SOLUTION

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.naive_bayes import GaussianNB   #import Gaussian Bayes modeling function
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [5]:
# Relative path to the Titanic passenger workbook.
location = "../datasets/titanic.xls"

# Read the workbook into a DataFrame and preview its first five rows.
df = pd.read_excel(io=location)
df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Clean the data

In [6]:
# Count the missing entries in every column.
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [33]:
# Mean age within each (survived, sex, pclass) group.
df.groupby(by=['survived', 'sex', 'pclass'])['age'].mean()

survived  sex     pclass
0         female  1         35.200000
                  2         34.090909
                  3         23.418750
          male    1         43.658163
                  2         33.092593
                  3         26.679598
1         female  1         37.109375
                  2         26.711051
                  3         20.814815
          male    1         36.168240
                  2         17.449274
                  3         22.436441
Name: age, dtype: float64

In [7]:
# Fill missing ages with the mean age of passengers that share the same
# survival status, sex, and passenger class.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['age'] selection: that chained in-place form
# triggers a warning in recent pandas and leaves df unchanged when
# copy-on-write is enabled.
# NOTE(review): 'survived' is the prediction target used later, so grouping on
# it leaks the label into the 'age' feature -- consider grouping on
# ['sex', 'pclass'] only.
group_mean_age = df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean')
df['age'] = df['age'].fillna(group_mean_age)

In [34]:
# Show the rows whose embarkation port is missing.
# NOTE(review): the saved output is empty, presumably because this cell was
# last executed (In [34]) after the embarked fill (In [9]) -- confirm with
# Restart & Run All.
df[df['embarked'].isna()]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [8]:
# Only 2 embarkation values are missing, so they will be filled with the most
# common port; tally the ports to find it.
df['embarked'].value_counts(dropna=True)

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [9]:
# Fill the missing embarkation ports with 'S', the most common port.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['embarked'] selection: that chained in-place
# form triggers a warning in recent pandas and leaves df unchanged when
# copy-on-write is enabled.
df['embarked'] = df['embarked'].fillna('S')

In [10]:
# Re-count the missing entries per column after the fills above.
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64

In [11]:
# Build the modeling frame by leaving out the columns that are not used as features.
modeldf = df.drop(
    columns=['name', 'ticket', 'fare', 'cabin', 'boat', 'body', 'home.dest'],
)

In [12]:
#columns remaining in the modeling dataframe after the drop
modeldf.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'embarked'], dtype='object')

Create dummy variables for categorical values

In [13]:
# One-hot encode passenger class and embarkation port.
# get_dummies replaces the source columns with the generated indicator columns.
modeldf = pd.get_dummies(modeldf, columns=['pclass', 'embarked'])

In [14]:
# Encode sex as a binary indicator: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
modeldf['sex'] = modeldf['sex'].map(sex_codes)

In [15]:
# Combine sibsp and parch into a single family-member count,
# then discard the two source columns.
modeldf['family_num'] = modeldf['sibsp'].add(modeldf['parch'])
modeldf = modeldf.drop(columns=['sibsp', 'parch'])

In [16]:
# Flag solo travellers: 0 when the passenger has any family aboard, otherwise 1.
has_family = modeldf['family_num'] > 0
modeldf['TravelAlone'] = np.where(has_family, 0, 1)

In [36]:
# Preview the first five rows of the engineered modeling frame.
modeldf.head(n=5)

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num,TravelAlone
0,1,0,29.0,1,0,0,0,0,1,0,1
1,1,1,0.9167,1,0,0,0,0,1,3,0
2,0,0,2.0,1,0,0,0,0,1,3,0
3,0,1,30.0,1,0,0,0,0,1,3,0
4,0,0,25.0,1,0,0,0,0,1,3,0


In [37]:
#inspect the dtype of each column in the modeling dataframe
modeldf.dtypes

survived         int64
sex              int64
age            float64
pclass_1         uint8
pclass_2         uint8
pclass_3         uint8
embarked_C       uint8
embarked_Q       uint8
embarked_S       uint8
family_num       int64
TravelAlone      int32
dtype: object

### Gaussian Naive Bayes model

In [17]:
# Target vector: the 'survived' column of the modeling frame
# (a column selection, not an explicit copy).
y = modeldf.loc[:, 'survived']

In [18]:
# Feature matrix: every modeling column except the 'survived' target.
X = modeldf.drop(columns='survived')

In [19]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)

In [20]:
#instantiate the Gaussian Naive Bayes classifier with its default settings
gnb = GaussianNB()

In [21]:
#fit the Gaussian Naive Bayes model on the training data
#no model arguments were set when gnb was created, so the defaults are used
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
#accuracy of the Gaussian model on the training data
gnb.score(X_train, y_train)

0.766953199617956

In [23]:
#predict survival for the held-out test rows with the Gaussian model
y_predict = gnb.predict(X_test)

In [24]:
#accuracy of the Gaussian model on the held-out test data
gnb.score(X_test, y_test)

0.7633587786259542

In [25]:
# Tabulate actual vs. predicted outcomes for the Gaussian model:
# rows are the true classes, columns are the predicted classes.
gnb_confusion = confusion_matrix(y_test, y_predict)
pd.DataFrame(
    data=gnb_confusion,
    index=['True Not Survival', 'True Survival'],
    columns=['Predicted Not Survival', 'Predicted Survival'],
)

Unnamed: 0,Predicted Not Survival,Predicted Survival
True Not Survival,134,33
True Survival,29,66


In [26]:
# Per-class precision, recall and F1 for the Gaussian model; precision is
# higher for class 0 (did not survive) than for class 1 (survived).
gnb_report = classification_report(y_test, y_predict)
print(gnb_report)

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       167
           1       0.67      0.69      0.68        95

   micro avg       0.76      0.76      0.76       262
   macro avg       0.74      0.75      0.75       262
weighted avg       0.77      0.76      0.76       262



### Bernoulli Naive Bayes model

In [27]:
#instantiate the Bernoulli Naive Bayes classifier with its default settings
bnb = BernoulliNB()

In [28]:
#fit the Bernoulli Naive Bayes model on the same training data as the Gaussian model
#NOTE(review): with the default binarize=0.0 each feature is thresholded at 0,
#so the continuous 'age' column likely becomes 1 for nearly every passenger and
#adds little information -- consider binning age or dropping it for this model
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [29]:
#accuracy of the Bernoulli model on the training data
bnb.score(X_train, y_train)

0.7554918815663801

In [30]:
#predict survival for the test rows with the Bernoulli model
#this rebinds y_predict, replacing the Gaussian model's predictions
y_predict = bnb.predict(X_test)

In [31]:
# Tabulate actual vs. predicted outcomes for the Bernoulli model:
# rows are the true classes, columns are the predicted classes.
bnb_confusion = confusion_matrix(y_test, y_predict)
pd.DataFrame(
    data=bnb_confusion,
    index=['True Not Survival', 'True Survival'],
    columns=['Predicted Not Survival', 'Predicted Survival'],
)

Unnamed: 0,Predicted Not Survival,Predicted Survival
True Not Survival,130,37
True Survival,29,66


In [32]:
# Per-class precision, recall and F1 for the Bernoulli model.
bnb_report = classification_report(y_test, y_predict)
print(bnb_report)

              precision    recall  f1-score   support

           0       0.82      0.78      0.80       167
           1       0.64      0.69      0.67        95

   micro avg       0.75      0.75      0.75       262
   macro avg       0.73      0.74      0.73       262
weighted avg       0.75      0.75      0.75       262

