## My heart will go on
In this part of the exercise we will work on the Titanic dataset provided by Kaggle. The Titanic dataset contains information about the passengers who boarded the Titanic on its final voyage. We will predict whether a given passenger survived the trip.

In [249]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model

In [250]:
# Training set: download, impute missing ages, encode categorical columns as integers
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)

# Missing ages take the median age of this split
train["Age"] = train["Age"].fillna(train["Age"].median())

# Port of embarkation -> integer code (S=0, C=1, Q=2)
for code, port in enumerate(("S", "C", "Q")):
    train.loc[train["Embarked"] == port, "Embarked"] = code

# Sex -> integer code (male=0, female=1)
for code, label in enumerate(("male", "female")):
    train.loc[train["Sex"] == label, "Sex"] = code

# Remaining missing values per column
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [251]:
# Embarked holds nominal port codes, so fill the gaps with the most frequent code;
# a median would depend on the arbitrary S/C/Q -> 0/1/2 ordering.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode().iloc[0])

In [252]:
# Confirm that Embarked no longer has missing values (Cabin is left as is)
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [253]:
# Test set: download, add the intercept column, impute ages, encode categorical columns
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
test["Constant"] = 1

# Missing ages take the median age of this split
test["Age"] = test["Age"].fillna(test["Age"].median())

# Port of embarkation -> integer code (S=0, C=1, Q=2)
for code, port in enumerate(("S", "C", "Q")):
    test.loc[test["Embarked"] == port, "Embarked"] = code

# Sex -> integer code (male=0, female=1)
for code, label in enumerate(("male", "female")):
    test.loc[test["Sex"] == label, "Sex"] = code

# Remaining missing values per column
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Constant         0
dtype: int64

In [254]:
# Explore data before scaling Age: the largest age in each split
print(f"Max train age : {train['Age'].max()}")
print(f"Max test age : {test['Age'].max()}")

Max train age : 80.0
Max test age : 76.0


In [255]:
# Normalize data: min-max scale selected columns, then add the derived columns
train_data, test_data = pd.DataFrame(), pd.DataFrame()
scaled_columns = ("Pclass", "Age", "Embarked")
for column in train.columns:
    if column not in scaled_columns:
        continue
    # Both splits are scaled with the minimum and maximum of the training split
    lowest, highest = min(train[column]), max(train[column])
    span = highest - lowest
    train_data[column] = (train[column] - lowest) / span
    test_data[column] = (test[column] - lowest) / span

# The label only exists for the training split
train_data["Survived"] = train["Survived"]

# Sex is copied unscaled, Constant is the intercept column, and every base
# feature gets a squared companion named "<feature>_2"
for frame, source in ((train_data, train), (test_data, test)):
    frame["Sex"] = source["Sex"]
    frame["Constant"] = 1
    for base in ("Pclass", "Sex", "Age", "Embarked"):
        frame[base + "_2"] = frame[base] ** 2

Unnamed: 0,Pclass,Age,Embarked,Survived,Sex,Constant,Pclass_2,Sex_2,Age_2,Embarked_2
0,1.0,0.271174,0.0,0,0,1,1.00,0,0.073535,0.00
1,0.0,0.472229,0.5,1,1,1,0.00,1,0.223000,0.25
2,1.0,0.321438,0.0,1,1,1,1.00,1,0.103322,0.00
3,0.0,0.434531,0.0,1,1,1,0.00,1,0.188817,0.00
4,1.0,0.434531,0.0,0,0,1,1.00,0,0.188817,0.00
...,...,...,...,...,...,...,...,...,...,...
886,0.5,0.334004,0.0,0,0,1,0.25,0,0.111558,0.00
887,0.0,0.233476,0.0,1,1,1,0.00,1,0.054511,0.00
888,1.0,0.346569,0.0,0,1,1,1.00,1,0.120110,0.00
889,0.0,0.321438,0.5,1,0,1,0.00,0,0.103322,0.25


## Define helper functions

In [256]:
# iteration to find theta
def iteration(iter, theta, data, labels=None, learning_rate=None, feature_names=None):
    """Fit logistic-regression weights with batch gradient ascent.

    Every round applies the update
    ``theta <- theta + learning_rate * X.T @ (labels - sigmoid(X @ theta))``
    over the full data set, then the final weights are printed one per line.

    Parameters
    ----------
    iter : int
        Number of update rounds to run.
    theta : array-like of shape (n_features,)
        Starting weights.
    data : array-like of shape (n_samples, n_features)
        Feature matrix. It is cast to float, so an object-dtype array holding
        plain numbers is accepted.
    labels : array-like of shape (n_samples,), optional
        Target values. When omitted, the notebook-level variable ``y`` is used.
    learning_rate : float, optional
        Step size. When omitted, the notebook-level variable ``lr`` is used.
    feature_names : sequence of str, optional
        Names used in the printed summary. When omitted, the notebook-level
        variable ``feature`` is used.

    Returns
    -------
    numpy.ndarray
        Float array with the weights after the last round.
    """
    # Fall back to the notebook globals only when the caller did not pass a value
    targets = np.asarray(y if labels is None else labels, dtype=float)
    step = lr if learning_rate is None else learning_rate
    names = feature if feature_names is None else feature_names

    # A float cast lets the matrix products below run on numeric arrays even
    # when the frame behind `data` contained object-dtype columns
    X = np.asarray(data, dtype=float)
    weights = np.asarray(theta, dtype=float)

    for _ in range(iter):
        probabilities = 1.0 / (1.0 + np.exp(-(X @ weights)))
        weights = weights + step * (X.T @ (targets - probabilities))

    for index, value in enumerate(weights):
        print(f"theta_{index}(feature : {names[index]}) : {value}")
    return weights

In [257]:
 def prediction(feature, dataset, theta):
     """Store 0/1 logistic-regression predictions in ``dataset["Survived_predicted"]``.

     Parameters
     ----------
     feature : list of str
         Columns of ``dataset`` to use, in the same order as the weights.
     dataset : pandas.DataFrame
         Frame holding the feature columns. It is modified in place: the
         column ``Survived_predicted`` is created or overwritten.
     theta : array-like of shape (len(feature),)
         Model weights.

     Returns
     -------
     None
     """
     # A float cast lets the matrix product run even when some feature
     # columns are stored with object dtype
     design = np.asarray(dataset[feature], dtype=float)
     scores = design @ np.asarray(theta, dtype=float)
     probabilities = 1.0 / (1.0 + np.exp(-scores))
     # A probability of exactly 0.5 is classified as 0
     dataset["Survived_predicted"] = np.where(probabilities <= 0.5, 0, 1)

In [258]:
def test_to_csv(filename, test, test_data):
    # Export to CSV
    filepath = "./" + filename +".csv"
    Question = test[["PassengerId", "Name", "Sex", "Age"]]
    Question.loc[Question["Sex"] == 0, "Sex"] = "male"
    Question.loc[Question["Sex"] == 1, "Sex"] = "female"
    Question["Survived"] = test_data["Survived_predicted"]
    Question.to_csv(filepath, index = False)

## Prediction with 4 features ["Pclass", "Sex", "Age", "Embarked"]

In [259]:
# Settings for question 1: output name, step size, number of rounds, feature list
filename = "Question1"
lr = 0.001
iter = 5000
feature = ["Constant", "Pclass", "Sex", "Age", "Embarked"]

# Training matrix and labels as NumPy arrays
data = np.array(train_data[feature])
y = np.array(train_data["Survived"])

# One integer zero per feature column as the starting weights
theta = np.zeros(len(data[0]), dtype=int)

In [260]:
# Fit the weights, then store predictions for the training rows
theta = iteration(iter=iter, theta=theta, data=data)
prediction(feature=feature, dataset=train_data, theta=theta)

theta_0(feature : Constant) : 0.8609832070823091
theta_1(feature : Pclass) : -2.39271822258936
theta_2(feature : Sex) : 2.5754004481651633
theta_3(feature : Age) : -2.684422986503412
theta_4(feature : Embarked) : 0.6418055616478123


In [261]:
# result from training data: True where the prediction matches the label
train_data["result"] = train_data["Survived_predicted"].eq(train_data["Survived"])
train_data["result"].value_counts()

True     704
False    187
Name: result, dtype: int64

In [262]:
# predict test data
prediction(feature, test_data, theta)
# Build the label column in one step; writing strings into the integer column
# through a masked .loc assignment is a deprecated incompatible-dtype setitem
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
test_data["Survived_predicted"].value_counts()

Dead     259
Alive    159
Name: Survived_predicted, dtype: int64

In [263]:
# Write the passenger details and predictions to ./<filename>.csv
test_to_csv(filename=filename, test=test, test_data=test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


## (Optional) Try adding some higher order features to your training

In [264]:
# Settings for question 2: the base features plus their squared companions
filename = "Question2"
lr = 0.001
iter = 5000
feature = ["Constant", "Pclass", "Sex", "Age", "Embarked", "Pclass_2", "Sex_2", "Age_2", "Embarked_2"]

# Training matrix and labels as NumPy arrays
data = np.array(train_data[feature])
y = np.array(train_data["Survived"])

# One integer zero per feature column as the starting weights
theta = np.zeros(len(data[0]), dtype=int)

In [265]:
# Fit the weights, then store predictions for the training rows
theta = iteration(iter=iter, theta=theta, data=data)
prediction(feature=feature, dataset=train_data, theta=theta)

theta_0(feature : Constant) : 0.785713630778885
theta_1(feature : Pclass) : -1.5047932434984543
theta_2(feature : Sex) : 1.2886062762134263
theta_3(feature : Age) : -3.3859110865792763
theta_4(feature : Embarked) : 1.5579246619388971
theta_5(feature : Pclass_2) : -0.763875319934654
theta_6(feature : Sex_2) : 1.2886062762134263
theta_7(feature : Age_2) : 1.108270825353279
theta_8(feature : Embarked_2) : -1.0082200866128888


In [266]:
# result from training data: True where the prediction matches the label
train_data["result"] = train_data["Survived_predicted"].eq(train_data["Survived"])
train_data["result"].value_counts()

True     707
False    184
Name: result, dtype: int64

In [267]:
# predict test data
prediction(feature, test_data, theta)
# Build the label column in one step; writing strings into the integer column
# through a masked .loc assignment is a deprecated incompatible-dtype setitem
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
test_data["Survived_predicted"].value_counts()

Dead     261
Alive    157
Name: Survived_predicted, dtype: int64

In [268]:
# Write the passenger details and predictions to ./<filename>.csv
test_to_csv(filename=filename, test=test, test_data=test_data)

## (Optional) What happens if you reduce the amount of features to just Sex and Age?

In [269]:
# Settings for question 3: intercept, Sex and Age only
filename = "Question3"
lr = 0.001
iter = 5000
feature = ["Constant", "Sex", "Age"]

# Training matrix and labels as NumPy arrays
data = np.array(train_data[feature])
y = np.array(train_data["Survived"])

# One integer zero per feature column as the starting weights
theta = np.zeros(len(data[0]), dtype=int)

In [270]:
# Fit the weights, then store predictions for the training rows
theta = iteration(iter=iter, theta=theta, data=data)
prediction(feature=feature, dataset=train_data, theta=theta)

theta_0(feature : Constant) : -1.3174995712370055
theta_1(feature : Sex) : 2.505313792611764
theta_2(feature : Age) : -0.37701875698102755


In [271]:
# result from training data: True where the prediction matches the label
train_data["result"] = train_data["Survived_predicted"].eq(train_data["Survived"])
train_data["result"].value_counts()

True     701
False    190
Name: result, dtype: int64

In [272]:
# predict test data
prediction(feature, test_data, theta)
# Build the label column in one step; writing strings into the integer column
# through a masked .loc assignment is a deprecated incompatible-dtype setitem
test_data["Survived_predicted"] = test_data["Survived_predicted"].map({0: "Dead", 1: "Alive"})
test_data["Survived_predicted"].value_counts()

Dead     266
Alive    152
Name: Survived_predicted, dtype: int64

In [273]:
# Write the passenger details and predictions to ./<filename>.csv
test_to_csv(filename=filename, test=test, test_data=test_data)