In [None]:
# Import basic libraries
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# 1. Acquire data
# ===============
# Read the training and test CSV files; both are decoded with the
# 'unicode_escape' codec.
trainData = pd.read_csv(
    '../input/data-science-day1-titanic/DSB_Day1_Titanic_train.csv',
    encoding='unicode_escape',
)
testData = pd.read_csv(
    '../input/titanic/test.csv',
    encoding='unicode_escape',
)

# Keep both frames in one list so later cells can loop over them together.
combine = [trainData, testData]

# (rows, columns) of the training frame.
print(trainData.shape)

In [None]:
# Show the first ten rows of the training frame.
trainData.head(n=10)

In [None]:
# Show the last ten rows of the training frame.
trainData.tail(n=10)

In [None]:
# Analyze by describing data: list the column (feature) names of the training frame.
print(trainData.columns.values)

In [None]:
# Print dtype and non-null counts for every column, first for the training
# frame and then for the test frame, separated by a 40-character rule.
trainData.info()
print(40 * '_')
testData.info()

In [None]:
# What is the distribution of the numerical features?
# describe() with default arguments summarises the numeric columns.
trainData.describe()

In [None]:
# What is the distribution of the categorical features?
# Restrict the summary to object-dtype columns.
trainData.describe(include=[object])

In [None]:
# Pclass: mean of Survived per passenger class, highest first.
# The author's notes report a survival rate above 0.5 for Pclass=1
# (classifying #3) and decide to keep this feature in the model.
(
    trainData[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Sex: mean of Survived per sex, highest first.
# The author's notes report a very high survival rate (74%) for Sex=female
# (classifying #1).
(
    trainData[["Sex", "Survived"]]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Correlating numerical features:
# one Age histogram (30 bins) per value of Survived, laid out as columns.
g = sns.FacetGrid(data=trainData, col='Survived')
g.map(plt.hist, 'Age', bins=30)

In [None]:
# Age histograms (20 bins) on a Pclass (rows) x Survived (columns) grid.
grid = sns.FacetGrid(
    data=trainData,
    row='Pclass',
    col='Survived',
    height=3,
    aspect=1.6,
)
grid.map(plt.hist, 'Age', bins=20, alpha=.5)
grid.add_legend();

In [None]:
# Frequency of each Age value; dropna=False also counts the missing entries.
trainData.Age.value_counts(dropna=False)

In [None]:
# Correlating categorical features
# One point plot of Survived vs. Pclass (hue = Sex) per port of embarkation.
#
# FacetGrid.map() calls the plotting function separately on each facet's
# subset of the data. Without an explicit `order` / `hue_order`, every facet
# derives its own category order from its own rows, so the same colour or
# x-position can mean different categories in different facets. Fix both
# orders once from the full training frame.
pclass_order = sorted(trainData['Pclass'].dropna().unique())
sex_order = list(trainData['Sex'].dropna().unique())

grid = sns.FacetGrid(trainData, row='Embarked', height=3, aspect=1.6)
grid.map(
    sns.pointplot, 'Pclass', 'Survived', 'Sex',
    palette='deep', order=pclass_order, hue_order=sex_order,
)
grid.add_legend()

In [None]:
# Correlating categorical and numerical features
# Mean Fare per Sex on an Embarked (rows) x Survived (columns) grid.
#
# FacetGrid.map() runs the plotter on each facet's subset independently, so
# without an explicit `order` the bars for a given Sex may land in different
# positions from facet to facet. Fix the order from the full training frame.
sex_order = list(trainData['Sex'].dropna().unique())

grid = sns.FacetGrid(trainData, row='Embarked', col='Survived', height=3, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=sex_order)
grid.add_legend()

In [None]:
# Wrangle data
# Correcting: remove the Ticket and Cabin columns from both frames and
# report the shapes before and after.
unwanted_features = ['Ticket', 'Cabin']

print("Before", trainData.shape, testData.shape, combine[0].shape, combine[1].shape)

trainData = trainData.drop(columns=unwanted_features)
testData = testData.drop(columns=unwanted_features)
# drop() returned new frames, so the shared list must be rebuilt.
combine = [trainData, testData]

print("After", trainData.shape, testData.shape, combine[0].shape, combine[1].shape)

In [None]:
# Creating a new feature extracted from an existing one:
# Title is the run of letters in Name that follows a space and is
# terminated by a literal dot.
# The pattern is a raw string so that `\.` reaches the regex engine
# unchanged; in a normal string literal `\.` is an invalid escape sequence
# that newer Python versions warn about.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against Sex for the training frame.
pd.crosstab(trainData['Title'], trainData['Sex'])

In [None]:
# Collapse the less common titles into 'Rare' and fold spelling variants
# into their common equivalent.
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
title_variants = (('Mlle', 'Miss'), ('Ms', 'Miss'), ('Mme', 'Mrs'))

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    # Apply the variant replacements one after another, in this order.
    for variant, common in title_variants:
        dataset['Title'] = dataset['Title'].replace(variant, common)

# Mean of Survived per consolidated title, highest first.
(
    trainData[['Title', 'Survived']]
    .groupby(['Title'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Encode the categorical titles as ordinals; any title that is not a key of
# the mapping becomes NaN under map() and is then filled with 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

trainData.head()

In [None]:
# Name is dropped from both frames. PassengerId is dropped from the
# training frame only; the test frame keeps it.
trainData = trainData.drop(columns=['Name', 'PassengerId'])
testData = testData.drop(columns=['Name'])
# Rebuild the shared list because drop() returned new frames.
combine = [trainData, testData]

(trainData.shape, testData.shape)

In [None]:
# Encode Sex as an integer column: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}

for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

trainData.head()

In [None]:
# Completing a numerical continuous feature
# Start from a zero-filled 2 x 3 array; the next cell indexes it as
# [Sex, Pclass - 1] to hold one guessed Age per combination.
guess_ages = np.zeros(shape=(2, 3))
guess_ages

In [None]:
# For each frame, iterate over Sex (0 or 1) and Pclass (1, 2, 3), compute a
# guessed Age for each of the six combinations, and use it to fill the
# missing ages of that combination.
for dataset in combine:
    # Pass 1: median of the known ages per (Sex, Pclass) group, rounded to
    # the nearest 0.5 via int(x / 0.5 + 0.5) * 0.5.
    for sex in range(2):
        for pclass in range(1, 4):
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)
            median_age = dataset.loc[in_group, 'Age'].dropna().median()
            guess_ages[sex, pclass - 1] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write the guess into every row of the group whose Age is null.
    for sex in range(2):
        for pclass in range(1, 4):
            needs_fill = (
                dataset['Age'].isnull()
                & (dataset['Sex'] == sex)
                & (dataset['Pclass'] == pclass)
            )
            dataset.loc[needs_fill, 'Age'] = guess_ages[sex, pclass - 1]

    # Truncate every age to an integer.
    dataset['Age'] = dataset['Age'].astype(int)

trainData.head()

In [None]:
# Cut Age into five equal-width bands and show the mean of Survived per band.
trainData['AgeBand'] = pd.cut(trainData['Age'], bins=5)
(
    trainData[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)

In [None]:
# Replace Age with an ordinal 0-4 using 16-year-wide bands.
# The assignments run in ascending order; the ordinals written (0-4) are all
# <= 16, so a row that has already been converted never matches a later,
# higher band.
for dataset in combine:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # This line used to only select the rows without assigning anything, so
    # passengers older than 64 kept their raw age instead of ordinal 4.
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
trainData.head()

In [None]:
# AgeBand exists only on the training frame; drop it and rebuild the
# shared list so it points at the new frame.
trainData = trainData.drop(columns=['AgeBand'])
combine = [trainData, testData]

trainData.head()

In [None]:
# Create a new feature by combining existing ones:
# FamilySize = SibSp + Parch + 1.
for dataset in combine:
    dataset['FamilySize'] = dataset.SibSp + dataset.Parch + 1

# Mean of Survived per family size, highest first.
(
    trainData[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# IsAlone is 1 when FamilySize equals 1, otherwise 0.
for dataset in combine:
    travels_alone = dataset['FamilySize'] == 1
    dataset['IsAlone'] = travels_alone.astype('int64')

# Mean of Survived for each IsAlone value.
trainData[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

In [None]:
# Keep IsAlone and drop the three columns it was derived from.
family_columns = ['Parch', 'SibSp', 'FamilySize']

trainData = trainData.drop(columns=family_columns)
testData = testData.drop(columns=family_columns)
# Rebuild the shared list because drop() returned new frames.
combine = [trainData, testData]

trainData.head()

In [None]:
# Artificial interaction feature: the product of the Age and Pclass columns.
for dataset in combine:
    dataset['Age*Class'] = dataset['Age'] * dataset['Pclass']

# Show the new column next to its two inputs.
trainData[['Age*Class', 'Age', 'Pclass']].head(10)

In [None]:
# Completing a categorical feature
# Embarked holds the port of embarkation (S, Q or C per the author's notes,
# which also state that two training values are missing).
# Take the most common value in the training frame as the fill value;
# mode() ignores missing entries by default.
freq_port = trainData['Embarked'].mode().iloc[0]
freq_port

In [None]:
# Fill missing Embarked values in both frames with the most frequent port.
for dataset in combine:
    dataset['Embarked'] = dataset.Embarked.fillna(value=freq_port)

# Mean of Survived per port, highest first.
(
    trainData[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Converting a categorical feature to numeric:
# overwrite Embarked in place with an integer code (S -> 0, C -> 1, Q -> 2).
port_codes = {'S': 0, 'C': 1, 'Q': 2}

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(port_codes).astype(int)

trainData.head()

In [None]:
# Complete the Fare feature in the test frame by filling missing values with
# the median of the observed fares (median() skips NaN by default).
#
# The filled column is assigned back instead of calling
# `testData['Fare'].fillna(..., inplace=True)`: that chained in-place call
# operates on an intermediate Series, triggers a FutureWarning in recent
# pandas and does not update the frame at all under Copy-on-Write.
# Assigning the column mutates the same DataFrame object, so the entry in
# `combine` sees the filled values too.
testData['Fare'] = testData['Fare'].fillna(testData['Fare'].median())
testData.head()

In [None]:
# Cut Fare into quartiles (four equal-frequency bands) and show the mean of
# Survived per band.
trainData['FareBand'] = pd.qcut(trainData['Fare'], q=4)
(
    trainData[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)

In [None]:
# Convert Fare to an ordinal 0-3 using the thresholds 7.91, 14.454 and 31.
# The assignments stay in ascending order, exactly as before.
inner_fare_bands = ((7.91, 14.454, 1), (14.454, 31, 2))

for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    for lower, upper, ordinal in inner_fare_bands:
        in_band = (dataset['Fare'] > lower) & (dataset['Fare'] <= upper)
        dataset.loc[in_band, 'Fare'] = ordinal
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

# FareBand exists only on the training frame; drop it and rebuild the list.
trainData = trainData.drop(columns=['FareBand'])
combine = [trainData, testData]

trainData.head(10)

In [None]:
# Show the first ten rows of the transformed test frame.
testData.head(n=10)

In [None]:
# Modeling for prediction
# This is a supervised classification problem. The author's shortlist of
# candidate model families:
#    Logistic Regression
#    KNN or k-Nearest Neighbors
#    Support Vector Machines
#    Naive Bayes classifier
#    Decision Tree
#    Random Forest
#    Perceptron
#    Artificial neural network
#    RVM or Relevance Vector Machine

# Features are every training column except the Survived target; the test
# features are every test column except PassengerId.
X_train = trainData.drop(columns="Survived")
Y_train = trainData["Survived"]
X_test = testData.drop(columns="PassengerId").copy()

(X_train.shape, Y_train.shape, X_test.shape)

In [None]:
# Check dtypes and non-null counts of the training feature matrix before fitting.
X_train.info()

In [None]:
# Logistic Regression
# Fit on the training set, predict the test set, and report accuracy on the
# training data itself as a percentage rounded to two decimals.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

In [None]:
# Support Vector Machines
# Fit, predict the test set, then report the training-set accuracy in percent.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc

In [None]:
# k-Nearest Neighbors with k = 3
# Fit, predict the test set, then report the training-set accuracy in percent.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

In [None]:
# Gaussian Naive Bayes
# Fit, predict the test set, then report the training-set accuracy in percent.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)

acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)
acc_gaussian

In [None]:
# Perceptron
# The estimator shuffles the training data between epochs, so an unseeded
# run gives a different score on every Restart & Run All. A fixed
# random_state makes the model comparison reproducible.
perceptron = Perceptron(random_state=0)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
# Accuracy is measured on the training data itself, in percent.
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

In [None]:
# Linear SVC
# The solver can shuffle the data with a pseudo-random generator; a fixed
# random_state keeps the reported score reproducible across runs.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
# Accuracy is measured on the training data itself, in percent.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

In [None]:
# Stochastic Gradient Descent
# SGD shuffles the training data each epoch, so an unseeded run gives a
# different score every time. A fixed random_state makes it reproducible.
sgd = SGDClassifier(random_state=0)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
# Accuracy is measured on the training data itself, in percent.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

In [None]:
# Decision Tree
# Features are randomly permuted at each split, so ties between equally good
# splits can resolve differently between runs. A fixed random_state makes
# the fitted tree, and therefore the score, reproducible.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy is measured on the training data itself, in percent.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

In [None]:
# Random Forest (100 trees)
# Bootstrap sampling and feature sub-sampling are random, so a fixed
# random_state is needed for a reproducible score. The earlier stand-alone
# `random_forest.score(...)` call whose result was discarded has been
# removed; the score is computed once, below.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy is measured on the training data itself, in percent.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

In [None]:
# Rank the models by their training-set accuracy, best first.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Decent', acc_sgd),
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)