# Kaggle Machine Learning Competition: Predicting Titanic Survivors

Kaggle URL: https://www.kaggle.com/c/titanic
<br>
URL: https://github.com/donnemartin/data-science-ipython-notebooks/blob/master/kaggle/titanic.ipynb


# Important note:
<br>
The first section of this notebook file contains a summary of our Titanic case study.
<br>
This section contains only the code needed to prepare the data for modeling.
<br>
Refer to the last week of the previous teaching block (TB1/Data Management module) to review the details of our Titanic case study.

# Preparing Data for Modeling
<br>
After running the following code cell your data will be prepared for modeling. 
<br>
You can find descriptions of these commands in the last week of the previous teaching block (Week11_CaseStudy_Titanic.ipynb file).
<br>
RUN THE FOLLOWING CELL.
<br>
THEN YOUR DATA IS READY FOR MODELING.
<br>
You can use these variables to train and test your models:
<br>
train_x
<br>
test_x
<br>
train_y
<br>
test_y

In [None]:
import pandas as pd
import numpy as np

# Load the Titanic training data and turn it into an all-numeric table.
# After this section `train_data` is a float array whose columns are, in order:
# Survived, Pclass, Fare, Sex_Val, Embarked_Val_0..2, AgeFill, FamilySize
# (assuming the standard Kaggle column layout and the ports C / Q / S).
df_train = pd.read_csv('TB2_Week04_titanic_train.csv')

# Sex: give every distinct label an integer code, in sorted label order.
sexes = sorted(df_train['Sex'].unique())
genders_mapping = {label: code for code, label in enumerate(sexes)}
df_train['Sex_Val'] = df_train['Sex'].map(genders_mapping).astype(int)

# Embarked: fill missing ports with 'S' *before* encoding, so no placeholder
# category (and no hard-coded "replace code 3 with 2" step) is needed.
df_train['Embarked'] = df_train['Embarked'].fillna('S')
embarked_locs = sorted(df_train['Embarked'].unique())
embarked_locs_mapping = {port: code for code, port in enumerate(embarked_locs)}
df_train['Embarked_Val'] = df_train['Embarked'].map(embarked_locs_mapping).astype(int)

# One-hot encode the port code. dtype=int keeps the dummy columns numeric;
# recent pandas versions otherwise produce booleans, which would make the
# exported array object-typed and unusable as scikit-learn input.
embarked_dummies = pd.get_dummies(df_train['Embarked_Val'],
                                  prefix='Embarked_Val',
                                  dtype=int)
df_train = pd.concat([df_train, embarked_dummies], axis=1)

# Age: fill missing ages with the median age of passengers sharing the same
# sex and passenger class. transform() returns a Series aligned with the
# original row index, so the result can be used directly by fillna().
group_median_age = df_train.groupby(['Sex_Val', 'Pclass'])['Age'].transform('median')
df_train['AgeFill'] = df_train['Age'].fillna(group_median_age)

# Family Size: siblings/spouses plus parents/children travelling along.
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch']

# Data Preparation: drop the raw text columns and the columns that have been
# replaced by engineered features, then export everything as floats.
df_train = df_train.drop(columns=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])
df_train = df_train.drop(columns=['Age', 'SibSp', 'Parch', 'PassengerId', 'Embarked_Val'])
train_data = df_train.to_numpy(dtype=float)

# Train and Test Sets
from sklearn.model_selection import train_test_split

# Column 0 holds the 'Survived' values (the target); every other column is a feature.
train_target, train_features = train_data[:, 0], train_data[:, 1:]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    train_features,
    train_target,
    test_size=0.20,
    random_state=0,
)

# Report the shapes of the full data, the training part and the test part.
for features, target in ((train_features, train_target),
                         (train_x, train_y),
                         (test_x, test_y)):
    print(features.shape, target.shape)

# K-Nearest Neighbor: Training and Evaluating the Model
Go to the following link and read more about this algorithm and its parameters; then build some models and evaluate them using different evaluation measures (like the example).
<br>
URL: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# Example: a 5-nearest-neighbour classifier using distance-weighted votes.

# fit() returns the fitted estimator, so construction and training can be chained.
clf = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(train_x, train_y)
predict_y = clf.predict(test_x)

# Evaluation measures: each entry pairs an output template with its scoring function.
for template, scorer in (
    ("Accuracy = %.2f", metrics.accuracy_score),
    ("Precision: %.2f", metrics.precision_score),
    ("Recall: %.2f", metrics.recall_score),
    ("F1 Measure: %.2f", metrics.f1_score),
):
    print(template % scorer(test_y, predict_y))

print("Confusion Matrix:")
# Last expression of the cell, so the notebook displays the matrix.
metrics.confusion_matrix(test_y, predict_y)

In [None]:
# Build your first KNN model and evaluate it


In [None]:
# Build your second KNN model and evaluate it


In [None]:
# Build your third KNN model and evaluate it


# Decision Tree: Training and Evaluating the Model
Go to the following link and read more about this algorithm and its parameters; then build some models and evaluate them using different evaluation measures (like the example).
<br>
URL: https://scikit-learn.org/stable/modules/tree.html

In [None]:
from sklearn import tree
from sklearn import metrics

In [None]:
# Example: a decision tree with default settings.

# random_state is fixed so that the tree, and therefore the metrics printed
# below, are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=0)

clf = clf.fit(train_x, train_y)
predict_y = clf.predict(test_x)

# Evaluation measures
print("Accuracy = %.2f" % metrics.accuracy_score(test_y, predict_y))
print("Precision: %.2f" % metrics.precision_score(test_y, predict_y))
print("Recall: %.2f" % metrics.recall_score(test_y, predict_y))
print("F1 Measure: %.2f" % metrics.f1_score(test_y, predict_y))
print("Confusion Matrix:")
# Last expression of the cell, so the notebook displays the matrix.
metrics.confusion_matrix(test_y, predict_y)

In [None]:
# Build your first DT model and evaluate it


In [None]:
# Build your second DT model and evaluate it


In [None]:
# Build your third DT model and evaluate it


# Random Forest: Training and Evaluating the Model
Go to the following link and read more about this algorithm and its parameters; then build some models and evaluate them using different evaluation measures (like the example).
<br>
URL: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
# Example: a random forest made of 100 trees.

# random_state is fixed so that the forest, and therefore the metrics printed
# below, are the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

clf = clf.fit(train_x, train_y)
predict_y = clf.predict(test_x)

# Evaluation measures
print("Accuracy = %.2f" % metrics.accuracy_score(test_y, predict_y))
print("Precision: %.2f" % metrics.precision_score(test_y, predict_y))
print("Recall: %.2f" % metrics.recall_score(test_y, predict_y))
print("F1 Measure: %.2f" % metrics.f1_score(test_y, predict_y))
print("Confusion Matrix:")
# Last expression of the cell, so the notebook displays the matrix.
metrics.confusion_matrix(test_y, predict_y)

In [None]:
# Build your first Random Forest model and evaluate it


In [None]:
# Build your second Random Forest model and evaluate it


In [None]:
# Build your third Random Forest model and evaluate it


# Naïve Bayes: Training and Evaluating the Model
Go to the following link and read more about this algorithm and its parameters; then build some models and evaluate them using different evaluation measures (like the example).
<br>
URL: https://scikit-learn.org/stable/modules/naive_bayes.html

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [None]:
# Example: Gaussian Naive Bayes with default settings.

# fit() returns the fitted estimator, so construction and training can be chained.
clf = GaussianNB().fit(train_x, train_y)
predict_y = clf.predict(test_x)

# Evaluation measures: each entry pairs an output template with its scoring function.
for template, scorer in (
    ("Accuracy = %.2f", metrics.accuracy_score),
    ("Precision: %.2f", metrics.precision_score),
    ("Recall: %.2f", metrics.recall_score),
    ("F1 Measure: %.2f", metrics.f1_score),
):
    print(template % scorer(test_y, predict_y))

print("Confusion Matrix:")
# Last expression of the cell, so the notebook displays the matrix.
metrics.confusion_matrix(test_y, predict_y)

In [None]:
# Build your MultinomialNB model and evaluate it


In [None]:
# Build your ComplementNB model and evaluate it


In [None]:
# Build your BernoulliNB model and evaluate it


In [None]:
# Build your CategoricalNB model and evaluate it


# Support Vector Machines (SVM): Training and Evaluating the Model
Go to the following link and read more about this algorithm and its parameters; then build some models and evaluate them using different evaluation measures (like the example).
<br>
URL: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
from sklearn.svm import SVC
from sklearn import metrics

In [None]:
# Example: a support vector classifier with a linear kernel.

# fit() returns the fitted estimator, so construction and training can be chained.
clf = SVC(kernel='linear').fit(train_x, train_y)
predict_y = clf.predict(test_x)

# Evaluation measures: each entry pairs an output template with its scoring function.
for template, scorer in (
    ("Accuracy = %.2f", metrics.accuracy_score),
    ("Precision: %.2f", metrics.precision_score),
    ("Recall: %.2f", metrics.recall_score),
    ("F1 Measure: %.2f", metrics.f1_score),
):
    print(template % scorer(test_y, predict_y))

print("Confusion Matrix:")
# Last expression of the cell, so the notebook displays the matrix.
metrics.confusion_matrix(test_y, predict_y)

In [None]:
# Build your first SVM model and evaluate it


In [None]:
# Build your second SVM model and evaluate it


In [None]:
# Build your third SVM model and evaluate it
