In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import & Explore Data

In [None]:
# Load the Titanic train and test splits from the Kaggle input directory
# (the paths are relative to the notebook's working directory)
train_data = pd.read_csv(r'../input/titanic/train.csv')
test_data = pd.read_csv(r'../input/titanic/test.csv')

In [None]:
# Preview the first rows of the raw training data
train_data.head()

The attributes have the following meaning:
* **PassengerId**: a unique identifier for each passenger
* **Survived**: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* **Pclass**: passenger class.
* **Name**, **Sex**, **Age**: self-explanatory
* **SibSp**: how many siblings & spouses of the passenger aboard the Titanic.
* **Parch**: how many children & parents of the passenger aboard the Titanic.
* **Ticket**: ticket id
* **Fare**: price paid (in pounds)
* **Cabin**: passenger's cabin number
* **Embarked**: where the passenger embarked the Titanic

### Set `PassengerId` column as the index column

In [None]:
# Use the unique passenger identifier as the row index of both splits
train_data, test_data = (
    frame.set_index("PassengerId") for frame in (train_data, test_data)
)

In [None]:
# Preview the training data again, now indexed by PassengerId
train_data.head()

In [None]:
# General information: print a concise summary of the training DataFrame
train_data.info()

In [None]:
# Number of missing entries in each column of the training data
train_data.isnull().sum()

- **Cabin** has 687 missing values out of 891 instances. It's better to exclude the column.
- **Embarked** has only 2 missing values. We can safely replace them with the `mode` of the variable without much side effect.
- **Age** is more complicated. It has **177 missing values** out of 891. If we replace the missing values with imputed numbers, we risk adding bias to our model; if we exclude the variable without good justification, we risk losing important information. We will dig further into this variable later before making a decision.
<br>For now, we will remove **Cabin** from our training data.

In [None]:
# Drop "Cabin": most of its values are missing.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it a second time no longer raises KeyError.
train_data = train_data.drop(columns="Cabin", errors="ignore")

In [None]:
# "Name" and "Ticket" are identity data of passengers; they are not going to
# help us predict survival, so we remove them from the training data.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it a second time no longer raises KeyError.
train_data = train_data.drop(columns=['Name', 'Ticket'], errors="ignore")

In [None]:
# Columns that remain in the training data after the drops above
train_data.columns

In [None]:
# Pairwise scatter plots / distributions of the numerical variables,
# coloured by the survival label
import seaborn as sns
sns.pairplot(data=train_data, hue='Survived')

In [None]:
# Keep only the numerical columns (the label "Survived" is deliberately kept:
# we want features to correlate strongly with the label and weakly with
# each other)
X_train_num = train_data.select_dtypes(include=np.number)

# Pairwise correlations; annot=True writes the value into each cell
numeric_corr = X_train_num.corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Everything that is not numeric is treated as categorical data
X_train_cat = train_data.select_dtypes(exclude=[np.number])
X_train_cat.describe()

# Preprocessing Data (Build a pipeline)

We want to treat numerical & categorical data differently.<br>
**Numerical Attributes:** we can replace missing values with the column average, and we may want to standardize the numbers so that our estimators are not biased towards features with larger magnitudes.

## Preprocessing Numerical Attributes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numerical preprocessing, applied in order:
#   "imputer" fills missing values with the column mean
#   "scaler"  standardizes each numerical attribute
num_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

## Preprocessing Categorical Attributes

**Categorical Attributes:** We cannot replace missing values with the sample average, because a value such as 0.65 makes no sense for gender even if you encode male as '0' and female as '1'. So we're going to replace missing values with the `mode` instead.
<br>
Numerical attributes need to be normalized; categorical attributes need to be **encoded**. Some estimators, such as `logistic regression`, only accept numerical predictors even though they are classifiers, i.e. these models cannot take `male` or `female` as input, so we need to convert them to numbers (e.g. `male -> 0` and `female -> 1`).

In [None]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back to the legacy one so the cell runs on both.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    cat_encoder = OneHotEncoder(sparse=False)

# `imputer` replaces missing values with the mode
# `cat_encoder` one-hot encodes the categories (dense output); encoding is
#   needed to feed categorical data into many scikit-learn estimators like
#   linear models
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", cat_encoder),
    ])

## Selecting features

Let's choose the attributes we're going to feed into the preprocessing pipelines 

In [None]:
# Candidate numerical attributes: every column with a numeric dtype
X_train_num = train_data.select_dtypes(include=[np.number])
X_train_num

In [None]:
# Although Pclass & Survived are stored as integers, remove them from the
# numerical attributes: Pclass is a category and Survived is the label.
# errors="ignore" keeps the cell re-runnable: once Pclass has been cast to
# str (done further down) it is no longer selected as numeric, and a plain
# drop would raise KeyError.
X_train_num = (
    train_data
    .select_dtypes(include=np.number)
    .drop(columns=['Pclass', 'Survived'], errors="ignore")
)
X_train_num

What about 'Age'? We weren't sure whether we wanted to include it in our training set.

## Analysis on 'Age'

In [None]:
import matplotlib.pyplot as plt

# Age of the passengers who died / survived (missing ages are skipped by the
# plotting function)
age_not_survived = train_data.loc[train_data['Survived'] == 0, 'Age']
age_survived = train_data.loc[train_data['Survived'] == 1, 'Age']

# `sns.distplot` is deprecated and scheduled for removal. `histplot` with
# kde=True and stat="density" draws the same density-normalised histogram
# with a KDE curve on top; cut=3 lets the KDE extend past the data range as
# distplot did.
hist_kwargs = dict(kde=True, stat="density", kde_kws=dict(cut=3))

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

sns.histplot(age_not_survived, ax=axes[0, 0], **hist_kwargs)
axes[0, 0].set_title('1. Age distibution (Not Survived)')

sns.histplot(age_survived, ax=axes[0, 1], **hist_kwargs)
axes[0, 1].set_title('2. Age distibution (Survived)')

# Both groups on the same axes; explicit colours keep them distinguishable
sns.histplot(age_not_survived, label='Not survived', color='C0',
             ax=axes[1, 0], **hist_kwargs)
sns.histplot(age_survived, label='Survived', color='C1',
             ax=axes[1, 0], **hist_kwargs)
axes[1, 0].legend()
axes[1, 0].set_title('3. Overlapped Age distibution')

sns.histplot(train_data['Age'], ax=axes[1, 1], **hist_kwargs)
axes[1, 1].set_title('4. Age distibution (Survived & Not Survived)')

Pay attention to the shape of the distribution curves for "Survived" and "Not Survived". Can we make any interesting **hypothesis** about the effect of 'Age' on the probability of survival?
<br>
- Passengers with 0 < Age < 5 had a higher chance of survival: the density in figure 2 (f(x) > 0.020) is higher than in figure 4 (f(x) around 0.015). This could mean that babies were prioritized to board a lifeboat.
- The other parts of the distribution are more or less similar to the overall age distribution of the passengers.
- We might (or might not) do better by adding a column that indicates whether age < 10.
- Here, however, we simply keep this column and replace the missing values with the average. Missing values could also be replaced using conditional mean imputation (i.e. the average age given the passenger's sex).


In [None]:
# Names of the columns routed through the numerical pipeline
num_attribs = list(X_train_num.columns)
num_attribs

In [None]:
# "Pclass" is a category, not a quantity: store it as str in both splits so
# that it is picked up as a non-numerical column
for frame in (train_data, test_data):
    frame['Pclass'] = frame['Pclass'].astype(str)
X_train_cat = train_data.select_dtypes(exclude=[np.number])
X_train_cat

In [None]:
# Names of the columns routed through the categorical pipeline
cat_attribs = list(X_train_cat.columns)
cat_attribs

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own preprocessing pipeline
column_routes = [
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
]
preprocess_pipeline = ColumnTransformer(transformers=column_routes)

## Apply preprocess_pipeline.fit_transform()

In [None]:
# Fit the preprocessing on the selected training columns and transform them
selected_columns = train_data[num_attribs + cat_attribs]
X_train = preprocess_pipeline.fit_transform(selected_columns)
X_train

In [None]:
# The label to predict: the "Survived" column of the training data
y_train = train_data.loc[:, 'Survived']
y_train

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled data for validation.
# random_state makes the split (and every score computed from it) the same
# on each run; stratify keeps the survived / not-survived ratio equal in
# both parts.
# NOTE: this cell overwrites X_train / y_train with the 90% subset, so re-run
# the cells that build them before running this cell again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

# Modelling

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Defining grid search parameters
param = [
    {'n_estimators': [100, 200, 300, 400],
     'max_depth': [6, 8, 10, 12, 15, 20],
     'max_leaf_nodes': [15, 20, 25]},
]
# Random forest classifier; a fixed random_state makes the search (and the
# selected best estimator) reproducible from run to run
rf = RandomForestClassifier(random_state=42)
# Grid Search with 5-fold cross validation
gs_rf = GridSearchCV(rf, param, cv = 5, n_jobs = -1, verbose = 1)
gs_rf.fit(X_train, y_train)
# Best estimator, accuracy on the validation set, accuracy on the training set
gs_rf.best_estimator_, gs_rf.score(X_valid, y_valid), gs_rf.score(X_train, y_train)

In [None]:
# Keep the tuned random forest and its accuracy on the validation set
rf_best, rf_score = gs_rf.best_estimator_, gs_rf.score(X_valid, y_valid)
print("Best score = ", rf_score)
print("Best parameters = ",rf_best)

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# RBF-kernel grid. The original gamma list contained a stray 0, which is not
# a valid gamma for SVC (it must be positive), so every fit for that
# candidate failed; it is removed here.
param = [
    {
        'kernel': ['rbf'], 'C': [0.3, 1, 2, 3, 4, 6, 8],
        'gamma': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
    },
]

# probability=True fits an internal, randomized calibration step, so the
# random_state is fixed to keep results reproducible
svc = SVC(probability = True, random_state = 42)
# Grid Search with 5-fold cross validation
gs_svc = GridSearchCV(svc, param, cv = 5, n_jobs = -1, verbose = 1)
gs_svc.fit(X_train, y_train)
# Best estimator, accuracy on the validation set, accuracy on the training set
gs_svc.best_estimator_, gs_svc.score(X_valid, y_valid), gs_svc.score(X_train, y_train)

In [None]:
# Keep the tuned SVC and its accuracy on the validation set
svc_best, svc_score = gs_svc.best_estimator_, gs_svc.score(X_valid, y_valid)
print("Best score = ", svc_score)
print("Best parameters = ",svc_best)

## Extra Trees Classifier (Extremely Randomized Trees)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Grid over forest size and tree complexity
param = [
    {'n_estimators': range(8, 28, 4),
     'max_depth': range(4, 20, 4),
     'max_leaf_nodes': range(4, 20, 4),
    }
]

# A fixed random_state makes the search (and the selected best estimator)
# reproducible from run to run
et = ExtraTreesClassifier(random_state=42)
# Grid Search with 5-fold cross validation
gs_et = GridSearchCV(et, param, cv = 5, n_jobs = -1, verbose = 1)
gs_et.fit(X_train, y_train)
# Best estimator, accuracy on the validation set, accuracy on the training set
gs_et.best_estimator_, gs_et.score(X_valid, y_valid), gs_et.score(X_train, y_train)

In [None]:
# Keep the tuned extra-trees model and its accuracy on the validation set
et_best, et_score = gs_et.best_estimator_, gs_et.score(X_valid, y_valid)
print("Best score = ", et_score)
print("Best parameters = ",et_best)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier with default hyper-parameters (no grid search)
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Accuracy on the held-out validation set
lr_score = lr.score(X_valid, y_valid)
print("Best score = ", lr_score)

# The original computed this tuple in the middle of the cell, where its value
# was silently discarded; as the last expression it is actually displayed.
# (validation accuracy, training accuracy)
lr_score, lr.score(X_train, y_train)

# Model Selection

## Accuracy across different folds of cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of each candidate on the training split,
# evaluated in the order: random forest, SVC, extra trees, logistic regression
forest_scores, svc_scores, et_scores, lr_scores = (
    cross_val_score(candidate, X_train, y_train, cv=10)
    for candidate in (rf_best, svc_best, et_best, lr)
)

In [None]:
import matplotlib.pyplot as plt

model_names = ("Random Forest", "SVM", "Extra Trees", "Logistic Regression")
cv_scores = [forest_scores, svc_scores, et_scores, lr_scores]

plt.figure(figsize=(10, 5))
# Overlay the individual fold scores on each box; boxplot places its boxes at
# x = 1, 2, ... by default, and the number of points follows the number of
# folds instead of being hard-coded
for position, fold_scores in enumerate(cv_scores, start=1):
    plt.plot([position] * len(fold_scores), fold_scores, ".")
plt.boxplot(cv_scores)
# Name the boxes through the tick labels: the `labels=` keyword of boxplot is
# deprecated in recent Matplotlib releases
plt.xticks(range(1, len(model_names) + 1), model_names)
plt.ylabel("Accuracy", fontsize=14)
plt.show()

- SVM is slightly better than random forest, since it has a higher upside with almost the same downside across all folds of cross-validation.

## Accuracy on validation data

In [None]:
import matplotlib.pyplot as plt

models = ["Random Forest", "SVM", "Extra Trees", "Logistic Regression"]
scores = [rf_score, svc_score, et_score, lr_score]

# plt.subplots() leaves room for tick labels (an axes rect of [0, 0, 1, 1]
# fills the whole canvas), and the axis label and title let the figure be
# read on its own
fig, ax = plt.subplots()
ax.bar(models, scores)
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy on validation data")
plt.show()

- Here, random forest is slightly better than SVM with higher accuracy on validation data

# Final Model

- In practice, Random Forest and SVM perform about equally well.
- So it is okay to choose either one.
- I choose SVM, based on our conclusion in the cross-validation section above.

# Submission

In [None]:
# Transform the test data with the preprocessing fitted on the training data
X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])
predictions = svc_best.predict(X_test)
# Pair every prediction with the PassengerId of the row it was computed from
# (test_data is indexed by PassengerId), instead of pasting the predictions
# positionally into gender_submission.csv and relying on that file having the
# same row order as test.csv
sub = pd.DataFrame({"PassengerId": test_data.index, "Survived": predictions})

In [None]:
# Write the submission to the working directory, without the row index
sub.to_csv('./submission.csv',index=False)

In [None]:
# Read the file back to check what was written
pd.read_csv("./submission.csv")