In [None]:
# Computational Media II
# Assignment #1
# What is the chance of survival ?
# Naive Bayes classification applied to the Titanic dataset
# Due date: Tuesday, Feb 21st, noon.

# Implementation approaches gleaned from: https://www.youtube.com/watch?v=PPeaRc-r1OI&ab_channel=codebasics 

This assignment asks you to reflect on the significance of input parameters for model behavior, exemplified on Naive Bayes Classification (via the scikit library) on the Titanic data set.


Your tasks are as follows:

1) Create at least two different Naive Bayes models.

Each model must have a different configuration of inputs (at least one column must be different). Describe your rationale for the choice of inputs. Describe how you wrangle your data to ensure clean inputs to the Naive Bayes classifier, and implement it in code.

2) Evaluate the behavior of your models on imagined passengers (as the example below shows). Compare the performance of your two classifiers and describe the results.

3) Search the Kaggle repository (https://www.kaggle.com/) for additional datasets that you could imagine applying Naive Bayes classification on.
Describe which aspects of the selected dataset are of interest to you. Describe which features would be inputs and which would be the target.

In [None]:
import os, sys
import numpy
import pandas

# Location of the Titanic training data (a CSV file hosted on GitHub)
source = 'https://raw.githubusercontent.com/realtechsupport/CriticalMachineLearning/main/various_datasets/titanic_train.csv'
# Read the comma-separated file into a pandas DataFrame
dataframe = pandas.read_csv(source, sep=',')
# Preview the first five rows
dataframe.head(n=5)

In [None]:
# Show dimensions of the loaded table as a (rows, columns) tuple
dataframe.shape

In [None]:
# CHANGE THIS SELECTION !!!

# Columns that are NOT used as model inputs.
# drop() is reassigned rather than applied in place, and errors='ignore'
# skips columns that are already gone, so this cell can be re-run
# without raising a KeyError.
dataframe = dataframe.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Cabin', 'Ticket', 'Embarked'],
    errors='ignore',
)
dataframe.head()

In [None]:
# NO CHANGE HERE

# Split the table into the target (the dependent variable, Survived)
# and the inputs (every remaining column, the independent variables)
target = dataframe['Survived']
inputs = dataframe.drop(columns='Survived')

In [None]:
# ADAPT this feature to suit your data choices

# Gender is categorical, so it is represented with a form of one-hot encoding:
# a categorical variable with N possible values needs N indicator columns.
# pandas "get_dummies" builds one indicator column per distinct value
# found in the Sex column.
dummies = pandas.get_dummies(inputs['Sex'])
dummies.head(n=3)

In [None]:
# ADJUST this change to suit your inputs

# Append the boolean representation of gender to the existing inputs
# and drop the original Sex column, which is no longer needed.
# Any indicator columns left over from an earlier run of this cell are
# dropped first (errors='ignore' skips labels that are absent), so re-running
# the cell neither duplicates the indicator columns nor fails on the
# already-removed Sex column.
inputs = pandas.concat(
    [inputs.drop(columns=['Sex', *dummies.columns], errors='ignore'), dummies],
    axis='columns',
)
inputs.head(3)

In [None]:
# Check the dataset for any missing data:
# list the input columns that contain at least one NaN
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [None]:
# Now find those AGE NaN values:
# show every input row that has a NaN in at least one column
inputs[inputs.isna().any(axis=1)]

In [None]:
# EXPERIMENT with different ways of addressing missing values
# average / removal / data from different source, etc
# Defend your approach

# That is a lot of missing values.
# Here is a trick - fill the gaps with made-up (imputed) numbers
# Take the average of the known ages as the guess for every missing age
inputs['Age'] = inputs['Age'].fillna(inputs['Age'].mean())
inputs.head(n=10)

# Can you imagine a better way to do this?

In [None]:
# CHANGE test percentage to see how training and test size choices impact the classifier

# OK, now we have a mathematically complete dataset, albeit one that makes some really strong assumptions...
# We can create training and testing sets on the data now.
# Check out the sklearn library
# https://scikit-learn.org/stable/

from sklearn.model_selection import train_test_split
test_percentage = 0.2
# Fixed seed: the same rows land in the training and test sets on every run,
# so scores and predictions further down are reproducible and comparable
# between models. Change the seed to see how much the results vary by split.
random_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=test_percentage, random_state=random_seed
)

In [None]:
# Check the training and test sets: number of rows in each, one per line
print(len(X_train), len(X_test), sep='\n')
# First five rows of the training inputs
X_train.iloc[:5]

In [None]:
# First five rows of the test inputs
X_test.iloc[:5]

In [None]:
# NO CHANGE required here

# Create the Naive Bayes Model, and use a Gaussian distribution
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

from sklearn.naive_bayes import GaussianNB
# Untrained classifier with default settings; it is fitted in a later cell
model = GaussianNB()

In [None]:
# NO CHANGE required here

# Train the model on the training data
# (X_train holds the input features, y_train the matching Survived labels)
model.fit(X_train, y_train)

In [None]:
# NO CHANGE required here

# How good is the performance on the test set?
# score() is evaluated on the held-out rows only (X_test / y_test)
model.score(X_test, y_test)

In [None]:
# NO CHANGE required here

# That means about 77% accuracy with an 80 / 20 train test data split
# (the exact score depends on which rows ended up in the test set)
# Now you can use the trained model to predict how one of the test cases would be evaluated
# In other words, whether they would survive, according to the model

# Let's try the first 5 test cases
model.predict(X_test.iloc[:5])


In [None]:
# NO CHANGE required here

# 1 means survive, 0 means die
# Ground truth for the same first five test cases, to compare with the predictions
y_test.iloc[:5]

In [None]:
# Ah ha, we have one false prediction (#100)
# If we retrain the model with more data, and use less for testing,
# we might get better results...

In [None]:
# NO CHANGE required here

# We can use label encoder to translate the 0/1 back to text
from sklearn import preprocessing
# LabelEncoder sorts its classes alphabetically, so "died" is encoded as 0
# and "survived" as 1 - matching the meaning of the Survived column
le = preprocessing.LabelEncoder().fit(["died", "survived"])
list(le.inverse_transform(y_test.iloc[:5]))

['survived', 'died', 'died', 'survived', 'died']

In [None]:
# NO CHANGE required here

# predict_proba() returns class probabilities instead of a live/die label:
# one row per passenger, one column per class
model.predict_proba(X_test.iloc[:5])

In [None]:
# The first number is the probability of dying, the second the probability of survival, based on the current model

In [None]:
# HERE is where you compare model_1 with model_2

# Write a function that takes in the features of an imagined passenger, and a model
# and returns the prediction
# Include a loop so you can process multiple passengers

# features of an imagined passenger
pclass = 2
age = 15
fare = 1000
female = 0
male = 1
imagined = [pclass, age, fare, female, male]
a_imagined = numpy.array(imagined)

# Name of the feature held at each position of `imagined`.
# NOTE: assumes the Sex column produced indicator columns called 'female' and
# 'male' - update this list whenever you change the model inputs.
imagined_columns = ['Pclass', 'Age', 'Fare', 'female', 'male']

# The model was fitted on a DataFrame, so predict on a DataFrame as well:
# label the values, then reorder them to the training column order.
# Selecting by X_train.columns raises a KeyError if a training feature is
# missing here, instead of silently feeding values into the wrong feature.
imagined_frame = pandas.DataFrame([a_imagined], columns=imagined_columns)[list(X_train.columns)]

prediction_imagined_passenger = model.predict(imagined_frame)
list(le.inverse_transform(prediction_imagined_passenger))