## My heart will go on
In this part of the exercise we will work on the Titanic dataset provided by Kaggle. The dataset contains information about the passengers who boarded the Titanic on its final voyage. We will predict whether a given passenger survived the trip.

## Import Packages

In [204]:
import pandas as pd
import numpy as np
import math
from sklearn import datasets, linear_model

## Declare Variables

In [None]:
# Hyperparameters passed to `iteration(...)` by every experiment below.
# NOTE: `iter` shadows the Python builtin; the name is kept because later cells refer to it.
lr, iter = 0.001, 7500  # step size, number of update steps

## Prepare Data

In [205]:
# Training set: download, impute missing ages, encode the categorical columns.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
train["Age"] = train["Age"].fillna(train["Age"].median())
# Series.map builds a new numeric column in one step. Writing integers into a
# string column through .loc leaves an object-dtype column and relies on
# silent upcasting, which recent pandas versions deprecate.
# Missing ports stay NaN here; they are imputed in the next cell.
train["Embarked"] = train["Embarked"].map({"S": 0, "C": 1, "Q": 2})
train["Sex"] = train["Sex"].map({"male": 0, "female": 1})
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [206]:
# Fill the missing embarkation codes with the median of the encoded column.
embarked_median = train["Embarked"].median()
train["Embarked"] = train["Embarked"].fillna(embarked_median)

In [207]:
# Re-count missing values per column to verify the Embarked imputation above.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [208]:
# Test set: same preparation as the training set, plus the intercept column.
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
test["Constant"] = 1
# NOTE(review): the test ages are imputed with the test-set median, not the
# training median; kept as-is so the results do not change.
test["Age"] = test["Age"].fillna(test["Age"].median())
# Series.map builds a new numeric column in one step. Writing integers into a
# string column through .loc leaves an object-dtype column and relies on
# silent upcasting, which recent pandas versions deprecate.
test["Embarked"] = test["Embarked"].map({"S": 0, "C": 1, "Q": 2})
test["Sex"] = test["Sex"].map({"male": 0, "female": 1})
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Constant         0
dtype: int64

In [209]:
# Explore data before scaling Age
# Series.max() is vectorised; the stray empty string literal that followed the
# second f-string has been removed.
print(f"Max train age : {train['Age'].max()}")
print(f"Max test age : {test['Age'].max()}")

Max train age : 80.0
Max test age : 76.0


In [210]:
# Normalize data
# Min-max scaling. The bounds come from the training set only and are reused
# for the test set, so both frames share one scale.
SCALED_COLUMNS = ("Pclass", "Age", "Embarked")
train_data = pd.DataFrame()
test_data = pd.DataFrame()
for col in SCALED_COLUMNS:
    col_min = train[col].min()
    span = train[col].max() - col_min
    # astype(float) guarantees numeric columns even when the source column is
    # object dtype (integer codes written into a former string column).
    train_data[col] = ((train[col] - col_min) / span).astype(float)
    test_data[col] = ((test[col] - col_min) / span).astype(float)

train_data["Survived"] = train["Survived"]
# Sex is already a 0/1 code, so it is only cast, not rescaled.
train_data["Sex"] = train["Sex"].astype(int)
test_data["Sex"] = test["Sex"].astype(int)

# Intercept column and second-order terms, built identically for both frames.
# NOTE: Sex_2 equals Sex for a 0/1 code; it is kept because later cells list it
# as a feature.
for frame in (train_data, test_data):
    frame["Constant"] = 1
    for col in ("Pclass", "Sex", "Age", "Embarked"):
        frame[f"{col}_2"] = frame[col] ** 2

## Define Functions

In [211]:
# iteration to find theta
def iteration(iter, lr, feature, y, theta, data):
    """Fit logistic-regression weights with batch gradient ascent.

    Each step computes the sigmoid of ``theta . data`` and moves ``theta``
    along ``sum((y - sigmoid) * data)`` scaled by ``lr``. The fitted weights
    are printed, one line per feature, before being returned.

    Parameters
    ----------
    iter : int
        Number of update steps. The name shadows the builtin but is kept so
        existing calls keep working.
    lr : float
        Step size applied to every update.
    feature : sequence of str
        One name per row of ``data``; only used to label the printed weights.
    y : array-like
        0/1 labels, shaped ``(1, n_samples)`` as built by the calling cells
        (a flat ``(n_samples,)`` array broadcasts the same way).
    theta : array-like, shape (n_features,)
        Initial weights. The caller's array is not modified.
    data : array-like, shape (n_features, n_samples)
        Feature matrix with one row per feature.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_features,)`` holding the fitted weights.
    """
    # Cast once up front: frames mixing numeric and object columns yield
    # object arrays, which make every step slow and leak object dtype into theta.
    data = np.asarray(data, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)
    for _ in range(iter):
        lin = theta.dot(data)
        g = 1.0 / (1.0 + np.exp(-lin))
        gradient = ((y - g) * data).sum(axis=1)
        theta = theta + lr * gradient
    for i, weight in enumerate(theta):
        print(f"theta_{i}(feature : {feature[i]}) : {weight}")
    return theta

In [212]:
def prediction(feature, dataset, theta):
    """Label every row of ``dataset`` with the logistic model's 0/1 prediction.

    The result is written in place to ``dataset["Survived_predicted"]``;
    nothing is returned.

    Parameters
    ----------
    feature : list of str
        Columns of ``dataset`` to use, in the same order as the weights.
    dataset : pandas.DataFrame
        Frame holding the feature columns; it gains or overwrites the
        ``Survived_predicted`` column.
    theta : array-like
        One weight per feature. Any shape with ``len(feature)`` elements is
        accepted, e.g. ``(k,)`` or ``(k, 1)``.
    """
    data = np.asarray(dataset[feature], dtype=float).T  # (n_features, n_samples)
    # Flatten the weights so the result does not depend on the caller having
    # reshaped theta into a column vector beforehand.
    weights = np.asarray(theta, dtype=float).reshape(-1)
    probability = 1.0 / (1.0 + np.exp(-weights.dot(data)))
    # A probability of exactly 0.5 is labelled 0.
    dataset["Survived_predicted"] = np.where(probability <= 0.5, 0, 1)

In [213]:
def test_to_csv(filename, test, test_data):
    # Export to CSV
    filepath = "./" + filename +".csv"
    Question = test[["PassengerId", "Name", "Sex", "Age"]]
    Question.loc[Question["Sex"] == 0, "Sex"] = "male"
    Question.loc[Question["Sex"] == 1, "Sex"] = "female"
    Question["Survived"] = test_data["Survived_predicted"]
    Question.to_csv(filepath, index = False)

## Prediction with 4 features ["Pclass", "Sex", "Age", "Embarked"]

In [214]:
# Experiment 1 set-up: intercept plus the four base features.
filename = "Question1_Vectorization"
feature = ["Constant", "Pclass", "Sex", "Age", "Embarked"]
# Feature matrix laid out as (n_features, n_samples).
data = train_data[feature].to_numpy().T
# Labels as a single row, one entry per column of `data`.
y = train_data["Survived"].to_numpy().reshape((1, data.shape[1]))
# One zero-initialised integer weight per feature row.
theta = np.zeros(len(data), dtype=int)


In [215]:
# Fit the weights, then label the training rows with the fitted model.
theta = iteration(iter, lr, feature, y, theta, data)
# -1 lets NumPy infer the row count, so the column vector always matches the
# feature list instead of a hard-coded 5.
theta = theta.reshape(-1, 1)
prediction(feature, train_data, theta)

theta_0(feature : Constant) : -0.2012361307345755
theta_1(feature : Pclass) : -1.7764799868890386
theta_2(feature : Sex) : 2.475546797793185
theta_3(feature : Age) : -0.6440942872712742
theta_4(feature : Embarked) : 0.4777285173478564


In [216]:
# result from training data
train_data["result"] = train_data["Survived_predicted"] == train_data["Survived"]
# Per-outcome counts; the variable name is kept as-is for compatibility.
prediced_result = train_data["result"].value_counts()
# Count hits from the boolean column directly: indexing value_counts() by
# True/False raises KeyError when one of the two outcomes never occurs.
n_correct = int(train_data["result"].sum())
accuracy = n_correct * 100 / len(train_data)
print(f"Accuracy : {accuracy}%")

Accuracy : 78.56341189674524%


In [217]:
# predict test data
prediction(feature, test_data, theta)
# Build the label column in one step; writing strings into the integer
# prediction column through .loc relies on silent dtype upcasting.
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
test_data["Survived_predicted"].value_counts()

Dead     266
Alive    152
Name: Survived_predicted, dtype: int64

In [218]:
# Write the labelled test passengers to "<filename>.csv" in the working directory.
test_to_csv(filename=filename, test=test, test_data=test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


## (Optional) Try adding some higher order features to your training

In [230]:
# Experiment 2 set-up: base features plus their squared terms.
filename = "Question2_Vectorization"
base_features = ["Constant", "Pclass", "Sex", "Age", "Embarked"]
squared_features = ["Pclass_2", "Sex_2", "Age_2", "Embarked_2"]
feature = base_features + squared_features
# Rows are features, columns are passengers.
data = np.asarray(train_data[feature]).T
n_samples = data.shape[1]
y = np.asarray(train_data["Survived"]).reshape((1, n_samples))
# Integer zeros, one weight per feature row.
theta = np.zeros(data.shape[0], dtype=int)

In [231]:
# Fit the second-order model and label the training rows with it.
theta = iteration(iter, lr, feature, y, theta, data)
# Infer the row count with -1 rather than hard-coding the 9 features.
theta = theta.reshape((-1, 1))
prediction(feature, train_data, theta)

theta_0(feature : Constant) : 0.7857136307788849
theta_1(feature : Pclass) : -1.5047932434984554
theta_2(feature : Sex) : 1.2886062762134265
theta_3(feature : Age) : -3.385911086579276
theta_4(feature : Embarked) : 1.5579246619388971
theta_5(feature : Pclass_2) : -0.7638753199346533
theta_6(feature : Sex_2) : 1.2886062762134265
theta_7(feature : Age_2) : 1.1082708253532785
theta_8(feature : Embarked_2) : -1.008220086612889


In [232]:
# result from training data
train_data["result"] = train_data["Survived_predicted"] == train_data["Survived"]
# Per-outcome counts; the variable name is kept as-is for compatibility.
prediced_result = train_data["result"].value_counts()
# Summing the boolean column avoids the KeyError that value_counts()[True] or
# [False] raises when every prediction is right or every prediction is wrong.
n_correct = int(train_data["result"].sum())
accuracy = n_correct * 100 / len(train_data)
print(f"Accuracy : {accuracy}%")

Accuracy : 79.34904601571269%


In [233]:
# predict test data
prediction(feature, test_data, theta)
# Map the 0/1 codes to labels in one step instead of writing strings into the
# integer column through .loc, which relies on silent dtype upcasting.
outcome_labels = {0: "Dead", 1: "Alive"}
test_data["Survived_predicted"] = test_data["Survived_predicted"].map(outcome_labels)
test_data["Survived_predicted"].value_counts()

Dead     261
Alive    157
Name: Survived_predicted, dtype: int64

In [234]:
# Save the second-order model's test predictions as "<filename>.csv".
test_to_csv(filename=filename, test=test, test_data=test_data)

## (Optional) What happens if you reduce the amount of features to just Sex and Age?

In [248]:
# Experiment 3 set-up: intercept, Sex and Age only.
filename = "Question3_Vectorization"
feature = ["Constant", "Sex", "Age"]
selected = train_data[feature]
# Transposed so each row holds one feature across all passengers.
data = np.array(selected).T
y = np.array(train_data["Survived"]).reshape((1, -1))
# Start from integer zeros, one weight per feature.
theta = np.full(len(feature), 0)

In [249]:
# Fit the reduced model and label the training rows with it.
theta = iteration(iter, lr, feature, y, theta, data)
# Use -1 so the column vector follows the feature list, not a hard-coded 3.
theta = theta.reshape(-1, 1)
prediction(feature, train_data, theta)

theta_0(feature : Constant) : -1.3174995712370055
theta_1(feature : Sex) : 2.505313792611764
theta_2(feature : Age) : -0.3770187569810276


In [250]:
# result from training data
train_data["result"] = train_data["Survived_predicted"] == train_data["Survived"]
# Per-outcome counts; the variable name is kept as-is for compatibility.
prediced_result = train_data["result"].value_counts()
# The hit count comes straight from the boolean column, so a missing True or
# False bucket in value_counts() cannot raise KeyError.
n_correct = int(train_data["result"].sum())
accuracy = n_correct * 100 / len(train_data)
print(f"Accuracy : {accuracy}%")

Accuracy : 78.67564534231201%


In [251]:
# predict test data
prediction(feature, test_data, theta)
# Series.map replaces the two .loc writes that put strings into the integer
# prediction column and relied on silent dtype upcasting.
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
test_data["Survived_predicted"].value_counts()

Dead     266
Alive    152
Name: Survived_predicted, dtype: int64

In [252]:
# Save the reduced model's test predictions as "<filename>.csv".
test_to_csv(filename=filename, test=test, test_data=test_data)