## My heart will go on
In this part of the exercise we will work with the Titanic dataset provided by Kaggle. The dataset contains information about the passengers who boarded the Titanic on its final voyage. We will predict whether a given passenger survived the trip.

## Import Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model

## Declare Variables

In [None]:
# Hyperparameters shared by every training run in this notebook.
lr = 1e-3     # learning rate: step size applied to each gradient update
iter = 7_500  # number of gradient steps (note: this name shadows the builtin `iter`)

## Prepare Data

In [None]:
# Training set: download, impute Age, and recode the categorical columns.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)

# Missing ages are replaced by the median age of the training passengers.
train["Age"] = train["Age"].fillna(train["Age"].median())

# Port of embarkation -> integer code (S=0, C=1, Q=2).
# Rows with a missing port are left untouched here.
for code, port in enumerate(("S", "C", "Q")):
    train.loc[train["Embarked"] == port, "Embarked"] = code

# Sex -> integer code (male=0, female=1).
for code, sex in enumerate(("male", "female")):
    train.loc[train["Sex"] == sex, "Sex"] = code

# Displayed as the cell output: remaining missing values per column.
train.isna().sum()

In [None]:
# Fill the Embarked values that are still missing with the median of the
# column, which at this point holds the integer codes 0/1/2 assigned above.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].median())

In [None]:
# Re-display the per-column missing-value counts after the Embarked fill.
train.isna().sum()

In [None]:
# Test set: download, add a constant column, impute Age, and recode the
# categorical columns with the same codes as the training set.
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
test["Constant"] = 1

# Missing ages are replaced by the median age of the test passengers.
test["Age"] = test["Age"].fillna(test["Age"].median())

# Port of embarkation -> integer code (S=0, C=1, Q=2).
for code, port in enumerate(("S", "C", "Q")):
    test.loc[test["Embarked"] == port, "Embarked"] = code

# Sex -> integer code (male=0, female=1).
for code, sex in enumerate(("male", "female")):
    test.loc[test["Sex"] == sex, "Sex"] = code

# Displayed as the cell output: remaining missing values per column.
test.isna().sum()

In [None]:
# Explore data before scaling Age
# Compare the largest age in each frame; the test frame is later scaled with
# the training minimum/maximum, so the two ranges should be comparable.
print(f"Max train age : {max(train['Age'])}")
print(f"Max test age : {max(test['Age'])}")

In [None]:
# Normalize data
# Min-max scale the selected columns. The test frame is scaled with the
# TRAINING minimum and maximum so that both frames share one scale.
train_data = pd.DataFrame()
test_data = pd.DataFrame()
for i in train.columns:
    if i not in ("Pclass", "Age", "Embarked"):
        continue
    train_min, train_max = min(train[i]), max(train[i])
    train_data[i] = (train[i] - train_min)/(train_max - train_min)
    test_data[i] = (test[i] - train_min)/(train_max - train_min)

# The target column exists only in the training frame.
train_data["Survived"] = train["Survived"]

# Both frames then get the unscaled Sex code, a constant (bias) column and
# the squared version of each input feature.
squared_terms = (
    ("Pclass_2", "Pclass"),
    ("Sex_2", "Sex"),
    ("Age_2", "Age"),
    ("Embarked_2", "Embarked"),
)
for frame, raw in ((train_data, train), (test_data, test)):
    frame["Sex"] = raw["Sex"]
    frame["Constant"] = 1
    for squared, base in squared_terms:
        frame[squared] = frame[base]**2

## Define Functions

In [None]:
def iteration(iter, theta, data, labels=None, learning_rate=None, feature_names=None):
    """Fit logistic-regression weights by batch gradient ascent.

    Every round evaluates the sigmoid of ``data @ theta`` for all rows at once
    and moves ``theta`` by ``learning_rate * data.T @ (labels - sigmoid)``,
    i.e. one step along the gradient of the log-likelihood. The fitted weights
    are printed, one line per feature, before being returned.

    Parameters
    ----------
    iter : int
        Number of gradient steps. The name is kept for existing callers; it
        shadows the builtin only inside this function.
    theta : array-like, shape (n_features,)
        Starting weights.
    data : array-like, shape (n_samples, n_features)
        Feature matrix. Object-dtype input holding numbers is converted to float.
    labels : array-like, shape (n_samples,), optional
        0/1 targets. Defaults to the notebook-level variable ``y``.
    learning_rate : float, optional
        Step size. Defaults to the notebook-level variable ``lr``.
    feature_names : sequence of str, optional
        Names used when printing the weights. Defaults to the notebook-level
        variable ``feature``.

    Returns
    -------
    numpy.ndarray
        The fitted weights as a float array of shape (n_features,).
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    labels = y if labels is None else labels
    learning_rate = lr if learning_rate is None else learning_rate
    feature_names = feature if feature_names is None else feature_names

    # Columns recoded through .loc are object dtype; convert once so the
    # updates below run as vectorised float operations.
    data = np.asarray(data, dtype=float)
    labels = np.asarray(labels, dtype=float)
    theta = np.asarray(theta, dtype=float)

    for _ in range(iter):
        probability = 1 / (1 + np.exp(-(data @ theta)))
        theta = theta + learning_rate * (data.T @ (labels - probability))

    for i in range(len(theta)):
        print(f"theta_{i}(feature : {feature_names[i]}) : {theta[i]}")
    return theta

In [None]:
 def prediction(feature, dataset, theta):
     """Label every row of ``dataset`` with the fitted logistic model.

     The linear score ``dataset[feature] @ theta`` is passed through the
     sigmoid; rows whose probability is at most 0.5 get label 0, all other
     rows get label 1.

     Parameters
     ----------
     feature : list of str
         Column names to use, in the same order as the weights in ``theta``.
     dataset : pandas.DataFrame
         Frame holding the feature columns. It is modified in place: the
         labels are written to a ``"Survived_predicted"`` column.
     theta : array-like, shape (len(feature),)
         Model weights.

     Returns
     -------
     None
         Nothing is returned, so calling this as the last line of a cell
         produces no output.
     """
     # Columns recoded through .loc are object dtype; convert once so the
     # whole frame is scored in a single matrix-vector product.
     features = np.asarray(dataset[feature], dtype=float)
     scores = features @ np.asarray(theta, dtype=float)
     probability = 1 / (1 + np.exp(-scores))
     # Written as `<= 0.5 -> 0` so that a probability of exactly 0.5 maps to 0
     # and anything failing the comparison maps to 1.
     dataset["Survived_predicted"] = np.where(probability <= 0.5, 0, 1)

In [None]:
def test_to_csv(filename, test, test_data):
    # Export to CSV
    filepath = "./" + filename +".csv"
    Question = test[["PassengerId", "Name", "Sex", "Age"]]
    Question.loc[Question["Sex"] == 0, "Sex"] = "male"
    Question.loc[Question["Sex"] == 1, "Sex"] = "female"
    Question["Survived"] = test_data["Survived_predicted"]
    Question.to_csv(filepath, index = False)

## Prediction with 4 features ["Pclass", "Sex", "Age", "Embarked"]

In [None]:
# Declare variable
# Run configuration: the four input features plus the bias column "Constant".
filename = "Question1"
feature = ["Constant", "Pclass", "Sex", "Age", "Embarked"]
y = np.array(train_data["Survived"])
data = np.array(train_data[feature])
# One integer zero per feature column as the starting weights.
theta = np.zeros(data.shape[1], dtype=int)

In [None]:
# Fit the weights on the training matrix, then label every training row.
# theta is reassigned from itself, so re-running this cell continues from the
# previously fitted weights instead of restarting from zero.
theta = iteration(iter=iter, theta=theta, data=data)
prediction(feature=feature, dataset=train_data, theta=theta)

In [None]:
# result from training data
# Flag each training row whose predicted label equals the true label.
train_data["result"] = train_data["Survived_predicted"] == train_data["Survived"]
prediced_result = train_data["result"].value_counts()
# value_counts() omits an outcome that never occurs, so look both outcomes up
# with a default of 0 instead of indexing (which would raise KeyError).
n_correct = prediced_result.get(True, 0)
n_wrong = prediced_result.get(False, 0)
accuracy = n_correct * 100 / (n_correct + n_wrong)
print(f"Accuracy : {accuracy}%")

In [None]:
# predict test data
prediction(feature, test_data, theta)
# Relabel 0/1 as text in a single pass. Writing strings into the integer
# column through .loc relies on an implicit dtype change that pandas has
# deprecated; map() builds the text column directly.
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
# Displayed as the cell output: number of passengers per predicted outcome.
test_data["Survived_predicted"].value_counts()

In [None]:
# Export to .CSV
# Writes the test passengers and their predicted outcome to ./<filename>.csv.
test_to_csv(filename=filename, test=test, test_data=test_data)

## (Optional) Try adding some higher order features to your training

In [None]:
# Declare variable
# Run configuration: the four input features, their squares, and the bias
# column "Constant".
filename = "Question2"
feature = ["Constant", "Pclass", "Sex", "Age", "Embarked", "Pclass_2", "Sex_2", "Age_2", "Embarked_2"]
y = np.array(train_data["Survived"])
data = np.array(train_data[feature])
# One integer zero per feature column as the starting weights.
theta = np.zeros(data.shape[1], dtype=int)

In [None]:
# Fit the weights on the training matrix, then label every training row.
# theta is reassigned from itself, so re-running this cell continues from the
# previously fitted weights instead of restarting from zero.
theta = iteration(iter=iter, theta=theta, data=data)
prediction(feature=feature, dataset=train_data, theta=theta)

In [None]:
# result from training data
# Flag each training row whose predicted label equals the true label.
train_data["result"] = train_data["Survived_predicted"] == train_data["Survived"]
prediced_result = train_data["result"].value_counts()
# value_counts() omits an outcome that never occurs, so look both outcomes up
# with a default of 0 instead of indexing (which would raise KeyError).
n_correct = prediced_result.get(True, 0)
n_wrong = prediced_result.get(False, 0)
accuracy = n_correct * 100 / (n_correct + n_wrong)
print(f"Accuracy : {accuracy}%")

In [None]:
# predict test data
prediction(feature, test_data, theta)
# Relabel 0/1 as text in a single pass. Writing strings into the integer
# column through .loc relies on an implicit dtype change that pandas has
# deprecated; map() builds the text column directly.
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
# Displayed as the cell output: number of passengers per predicted outcome.
test_data["Survived_predicted"].value_counts()

In [None]:
# Export to .CSV
# Writes the test passengers and their predicted outcome to ./<filename>.csv.
test_to_csv(filename=filename, test=test, test_data=test_data)

## (Optional) What happens if you reduce the amount of features to just Sex and Age?

In [None]:
# Declare variable
# Run configuration: only Sex and Age, plus the bias column "Constant".
filename = "Question3"
feature = ["Constant", "Sex", "Age"]
y = np.array(train_data["Survived"])
data = np.array(train_data[feature])
# One integer zero per feature column as the starting weights.
theta = np.zeros(data.shape[1], dtype=int)

In [None]:
# Fit the weights on the training matrix, then label every training row.
# theta is reassigned from itself, so re-running this cell continues from the
# previously fitted weights instead of restarting from zero.
theta = iteration(iter=iter, theta=theta, data=data)
prediction(feature=feature, dataset=train_data, theta=theta)

In [None]:
# result from training data
# Flag each training row whose predicted label equals the true label.
train_data["result"] = train_data["Survived_predicted"] == train_data["Survived"]
prediced_result = train_data["result"].value_counts()
# value_counts() omits an outcome that never occurs, so look both outcomes up
# with a default of 0 instead of indexing (which would raise KeyError).
n_correct = prediced_result.get(True, 0)
n_wrong = prediced_result.get(False, 0)
accuracy = n_correct * 100 / (n_correct + n_wrong)
print(f"Accuracy : {accuracy}%")

In [None]:
# predict test data
prediction(feature, test_data, theta)
# Relabel 0/1 as text in a single pass. Writing strings into the integer
# column through .loc relies on an implicit dtype change that pandas has
# deprecated; map() builds the text column directly.
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
# Displayed as the cell output: number of passengers per predicted outcome.
test_data["Survived_predicted"].value_counts()

In [None]:
# Export to .CSV
# Writes the test passengers and their predicted outcome to ./<filename>.csv.
test_to_csv(filename=filename, test=test, test_data=test_data)