# Import our libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display # Allows the use of display() for DataFrames
%matplotlib inline


# Set a random seed

In [2]:
import random
random.seed(42)

# Load the dataset

In [3]:
full_data = pd.read_csv('titanic_data.csv')
# in_file = 'titanic_data.csv'
# full_data = pd.read_csv(in_file)

# Print the first few entries of the RMS Titanic data
display(full_data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**欄位敘述**

1. Survival - Survival (0 = No; 1 = Yes). Not included in test.csv file.
2. Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
3. Name - Name
4. Sex - Sex
5. Age - Age
6. Sibsp - Number of Siblings/Spouses Aboard
7. Parch - Number of Parents/Children Aboard
8. Ticket - Ticket Number
9. Fare - Passenger Fare
10. Cabin - Cabin
11. Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

# Data Preprocessing

In [4]:
# Store the 'Survived' feature in a new variable and remove it from the dataset
outcomes = full_data['Survived']
features_raw = full_data.drop('Survived', axis = 1)
display(features_raw.head())

# Removing the names
features_no_name = features_raw.drop(['Name'], axis=1)
# features_no_name = features_raw.drop(['Name', 'Ticket', 'Cabin'], axis=1)
display(features_no_name.head())

# One-hot encoding
features = pd.get_dummies(features_no_name)
display(features.head())
features = features.fillna(0.0)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,female,35.0,1,0,113803,53.1,C123,S
4,5,3,male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,3,35.0,0,0,8.05,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


# Import models from sklearn

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC


# Instantiate a number of our models

In [8]:
naive_bayes = MultinomialNB()
bag_mod = BaggingClassifier(n_estimators=200)
rf_mod = RandomForestClassifier(n_estimators=200)
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)
svm_mod = SVC()


# Fit each of the 4 models

In [9]:
naive_bayes.fit(X_train, y_train)
bag_mod.fit(X_train, y_train)
rf_mod.fit(X_train, y_train)
ada_mod.fit(X_train, y_train)
svm_mod.fit(X_train, y_train)


SVC()

# Make predictions using each of your models

In [12]:
preds_bag = naive_bayes.predict(X_test)
preds_rf = bag_mod.predict(X_test)
preds_ada = rf_mod.predict(X_test)
preds_nb = ada_mod.predict(X_test)
preds_svm = svm_mod.predict(X_test)


# Evaluation

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [13]:
def accuracy(actual, preds):

    return np.sum(preds == actual)/len(actual)

print(accuracy(y_test, preds_nb))
print(accuracy_score(y_test, preds_nb))

0.8212290502793296
0.8212290502793296


In [14]:
def precision(actual, preds):

    tp = len(np.intersect1d(np.where(preds==1), np.where(actual==1)))
    pred_pos = (preds==1).sum()
    return tp/pred_pos

print(precision(y_test, preds_nb))
print(precision_score(y_test, preds_nb))

0.828125
0.828125


In [15]:
def recall(actual, preds):

    tp = len(np.intersect1d(np.where(preds==1), np.where(actual==1)))
    act_pos = (actual==1).sum()
    return tp/act_pos

print(recall(y_test, preds_nb))
print(recall_score(y_test, preds_nb))

0.7162162162162162
0.7162162162162162


In [16]:
def f1(preds, actual):

    tp = len(np.intersect1d(np.where(preds==1), np.where(actual==1)))
    pred_pos = (preds==1).sum()
    prec = tp/(pred_pos)
    act_pos = (actual==1).sum()
    recall = tp/act_pos
    return 2*prec*recall / (prec+recall)

print(f1(y_test, preds_nb))
print(f1_score(y_test, preds_nb))

0.7681159420289855
0.7681159420289855


In [17]:
def print_metrics(y_true, preds):

   print('Accuracy score: ', format(accuracy_score(y_true, preds)))
   print('Precision score: ', format(precision_score(y_true, preds)))
   print('Recall score: ', format(recall_score(y_true, preds)))
   print('F1 score: ', format(f1_score(y_true, preds)))
   print('\n')

In [18]:
# Print scores
print_metrics(y_test, preds_bag)
print_metrics(y_test, preds_rf)
print_metrics(y_test, preds_ada)
print_metrics(y_test, preds_nb)
print_metrics(y_test, preds_svm)

Accuracy score:  0.6927374301675978
Precision score:  0.6461538461538462
Recall score:  0.5675675675675675
F1 score:  0.6043165467625901


Accuracy score:  0.8212290502793296
Precision score:  0.828125
Recall score:  0.7162162162162162
F1 score:  0.7681159420289855


Accuracy score:  0.8156424581005587
Precision score:  0.8360655737704918
Recall score:  0.6891891891891891
F1 score:  0.7555555555555555


Accuracy score:  0.8212290502793296
Precision score:  0.828125
Recall score:  0.7162162162162162
F1 score:  0.7681159420289855


Accuracy score:  0.5977653631284916
Precision score:  0.6666666666666666
Recall score:  0.05405405405405406
F1 score:  0.1


