# Project Exercise - Solution

In [None]:
import pandas as pd

## Titanic dataset: predict who would survive the Titanic tragedy.

In [None]:
# Load the passenger table from the CSV file located in the working directory.
df = pd.read_csv("titanic.csv")
# Preview the first five rows.
df.head()

---

## Inspect features
Check for missing values. Which features can be dropped?

In [None]:
# Print each column's dtype and non-null count; columns whose non-null count is
# lower than the number of rows contain missing values.
df.info()

---

## 1.)  Drop the features 'PassengerId', 'Name', 'Ticket', 'Fare', and 'Cabin'.

In [None]:
# Remove the columns that will not be used as model inputs.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Fare", "Cabin"])
df.head()

## 2.) Drop all rows with missing values.

In [None]:
# Keep only the rows in which every remaining column has a value.
df = df.dropna(axis="index", how="any")
df.head()

## 3.) Check the size of the dataset after dropping the rows with missing values.

In [None]:
# (number of rows, number of columns) of the current frame.
df.shape

## 4.) Rename the feature "SibSp" to "Sibling/Spouse", and rename "Parch" to "Parent/Child".

In [None]:
# Give the two family-related columns human-readable names.
df = df.rename(
    {"SibSp": "Sibling/Spouse", "Parch": "Parent/Child"},
    axis="columns",
)
df.head()

---

# Feature Transformation

## 5.) Transform the categorical features "Sex" and "Embarked" to numerical values.

### Binary Encode ("Sex")

In [None]:
# List the distinct categories present in "Sex" before encoding them.
df["Sex"].unique()

In [None]:
# Binary-encode "Sex" with an explicit lookup table (female -> 1, male -> 0).
# Series.map is used instead of Series.replace: replacing strings with integers
# relies on implicit object -> int downcasting, which pandas 2.2 deprecates
# (FutureWarning). The explicit cast pins the integer dtype and also makes the
# cell fail loudly, rather than silently producing NaN, if a label is missing
# from the mapping or the cell is re-run on already-encoded data.
df["Sex"] = df["Sex"].map({"female": 1, "male": 0}).astype("int64")
df.head()

### One-Hot Encode ("Embarked")

In [None]:
# List the distinct categories present in "Embarked" before one-hot encoding.
df["Embarked"].unique()

In [None]:
# Replace "Embarked" with one integer 0/1 indicator column per category.
df = pd.get_dummies(data=df, columns=["Embarked"], dtype=int)
df.head()

---

# Feature Engineering

## 6.) Create a new feature called "Family Size" by summing the features "Sibling/Spouse" and "Parent/Child" and adding 1 (for the passenger themselves).
#### Family Size is the passenger plus their siblings, spouse, parents, and children.

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
df["Family Size"] = df["Sibling/Spouse"].add(df["Parent/Child"]).add(1)
df.head()

---

## 7.) The features "Sibling/Spouse" and "Parent/Child" are no longer needed. Drop them.

In [None]:
# Both columns are now summarized by "Family Size", so remove them.
df = df.drop(columns=["Sibling/Spouse", "Parent/Child"])
df.head()

---

## 8.) Move the target "Survived" so that it is the last column.

In [None]:
# Move the target column to the end by position without hard-coding the column
# count: look up where "Survived" currently sits and put that position last.
# Unlike a fixed index list, this keeps working if the number of feature
# columns changes, and it leaves the order untouched when the cell is re-run.
target_pos = df.columns.get_loc("Survived")
column_positions = [pos for pos in range(df.shape[1]) if pos != target_pos]
df = df.iloc[:, column_positions + [target_pos]]
df.head()

### Alternative solution
#### List comprehension

In [None]:
# Collect every column name except the target, then put the target at the end.
feature_order = [name for name in df.columns if name != "Survived"]
feature_order.append("Survived")

# Selecting with the ordered list returns the columns in that order.
df = df[feature_order]

df.head()

---

## 9.) Separate the data into X (features) and y (target)

In [None]:
# Feature matrix: every column except the last one, returned as a NumPy array
# rather than a DataFrame. to_numpy() is the accessor pandas recommends over
# the older .values attribute.

X = df.iloc[:, :-1].to_numpy()

In [None]:
# Show the first five rows of the feature array.
X[:5]

In [None]:
 # Target vector: the last column, returned as a NumPy array rather than a
 # Series. to_numpy() is the accessor pandas recommends over .values.

y = df.iloc[:, -1].to_numpy()

In [None]:
# Show the first ten target values.
y[:10]

---

### Once you believe that your data is appropriately prepared, "Run All Below" (under the Cell menu) to confirm that your data can train a machine learning model.

---

# Machine Learning

## Split the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

## Standardize the data

In [None]:
# Standardize the features to zero mean and unit variance so that they are on
# a comparable scale. (Standardized values are not bounded to a fixed range;
# most simply fall within a few standard deviations of 0.)

from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()

# Fit the scaler on the training split only, then apply those same fitted
# statistics to the test split so no test-set information leaks into training.
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

---

# Machine Learning Algorithm

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Build a classifier with scikit-learn's default settings and fit it on the
# standardized training split. fit() returns the estimator itself, and ending
# the cell with an assignment keeps it from displaying any output.
log_reg = LogisticRegression().fit(X_train_std, y_train)

 ### Evaluate the model's performance on the training set.
Compares the model's predictions against the true labels in `y_train` and returns the accuracy (the fraction of correct predictions).

In [None]:
# Mean accuracy on the standardized training split (the data the model was fit on).
log_reg.score(X_train_std, y_train)

 ### Evaluate the model's performance on the test set.
Compares the model's predictions against the true labels in `y_test` and returns the accuracy (the fraction of correct predictions).

In [None]:
# Mean accuracy on the standardized held-out test split.
log_reg.score(X_test_std, y_test)