### 1. Preparation & Data Import

Import required packages

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

Read in training and test data

In [None]:
# Load the Titanic training CSV from the Kaggle input directory.
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")

In [None]:
# Load the Titanic test CSV from the Kaggle input directory.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")

### 2. Exploratory Data Analysis

Rows & Columns

In [None]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train_data, test_data))

Display the first few lines of both

In [None]:
# Preview the first five training rows.
train_data.head(n=5)

In [None]:
# Preview the first five test rows.
test_data.head(n=5)

#### Sex

In [None]:
# 'Survived' values of the female passengers in the training set.
women = train_data.loc[train_data['Sex'] == 'female', 'Survived']
# Survivors divided by head count: a fraction, not a percentage.
rate_women = sum(women) / len(women)

print("% of women who survived:", rate_women)

In [None]:
# 'Survived' values of the male passengers in the training set.
men = train_data.loc[train_data['Sex'] == 'male', 'Survived']
# Survivors divided by head count: a fraction, not a percentage.
rate_men = sum(men) / len(men)

print("% of men who survived:", rate_men)

#### Passenger Class

In [None]:
# 'Survived' values of the first-class passengers in the training set.
first_class = train_data.loc[train_data['Pclass'] == 1, 'Survived']
# Survivors divided by head count: a fraction, not a percentage.
rate_first_class = sum(first_class) / len(first_class)

print("% of people in the first class who survived:", rate_first_class)

In [None]:
# 'Survived' values of the second-class passengers in the training set.
second_class = train_data.loc[train_data['Pclass'] == 2, 'Survived']
# Survivors divided by head count: a fraction, not a percentage.
rate_second_class = sum(second_class) / len(second_class)

print("% of people in the second class who survived:", rate_second_class)

In [None]:
# 'Survived' values of the third-class passengers in the training set.
third_class = train_data.loc[train_data['Pclass'] == 3, 'Survived']
# Survivors divided by head count: a fraction, not a percentage.
rate_third_class = sum(third_class) / len(third_class)

print("% of people in the third class who survived:", rate_third_class)

#### Age

Dividing Age into four groups to see if there is any correlation

In [None]:
# Number of training passengers falling into each age quartile.
pd.qcut(train_data['Age'], q=4).value_counts()

In [None]:
# Mean of 'Survived' within each age quartile of the training set.
age_quartiles = pd.qcut(train_data['Age'], q=4)
train_data['Survived'].groupby(age_quartiles).mean()

#### Heatmaps - Correlation of independent variables with Survivability

In [None]:
# Correlate numeric columns only. The frame also holds string columns (e.g. Name, Sex),
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently dropping them.
numeric_corr = train_data.select_dtypes(include='number').corr()

plt.figure(figsize=(6, 9))
# Single-column heatmap: every numeric variable against 'Survived', strongest positive first.
heatmap = sns.heatmap(
    numeric_corr[['Survived']].sort_values(by='Survived', ascending=False),
    vmin=-1, vmax=1, annot=True, cmap='YlGnBu',
)

heatmap.set_title('Variables Correlating with Survivability', fontdict={'fontsize':12}, pad=16);

In [None]:
# Correlate numeric columns only. The frame also holds string columns (e.g. Name, Sex),
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently dropping them.
numeric_corr = train_data.select_dtypes(include='number').corr()

plt.figure(figsize=(9, 9))
# Full pairwise correlation matrix of the numeric training columns.
heatmap = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True, cmap='YlGnBu')

heatmap.set_title('Correlating Variables', fontdict={'fontsize':12}, pad=16);

### 3. Imputation

Display missing values in both the test and training data set

In [None]:
# Per-column count of missing values, training frame first, then test frame.
tuple(frame.isna().sum() for frame in (train_data, test_data))

Filling in missing values of Age with Median

In [None]:
# Median age computed from the training set only.
train_ages = train_data['Age']
age_median = train_ages.median()
age_median

In [None]:
# Impute missing ages in both sets with the training median.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas 2.x and does not
# modify the frame under Copy-on-Write (the default from pandas 3.0).
train_data['Age'] = train_data['Age'].fillna(age_median)
test_data['Age'] = test_data['Age'].fillna(age_median)

Display rows with missing values in 'Embarked'

In [None]:
# Training rows whose 'Embarked' value is missing.
train_data.loc[train_data['Embarked'].isna()]

Look for similar Rows and take a majority vote on what to put into the missing values for 'Embarked'

In [None]:
# Surviving first-class women travelling without siblings/spouse or parents/children,
# ordered by fare.
train_data[
    train_data['Survived'].eq(1)
    & train_data['Pclass'].eq(1)
    & train_data['Sex'].eq('female')
    & train_data['SibSp'].eq(0)
    & train_data['Parch'].eq(0)
].sort_values(by='Fare')

In [None]:
# Most frequent 'Embarked' value(s) among surviving first-class women travelling alone.
# The mode does not depend on row order, so no sort is needed before taking it.
embarked_nn_mode = train_data.loc[
    train_data['Survived'].eq(1)
    & train_data['Pclass'].eq(1)
    & train_data['Sex'].eq('female')
    & train_data['SibSp'].eq(0)
    & train_data['Parch'].eq(0),
    'Embarked',
].mode()

In [None]:
# Display the mode(s) of 'Embarked' computed from the comparable passengers.
embarked_nn_mode

Fill in the missing values for 'Embarked'

In [None]:
# Fill the missing embarkation ports with 'C'.
# NOTE(review): 'C' is hard-coded, presumably read off embarked_nn_mode - confirm they agree.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas 2.x and does not
# modify the frame under Copy-on-Write (the default from pandas 3.0).
train_data['Embarked'] = train_data['Embarked'].fillna('C')

Display rows with missing values in 'Fare'

In [None]:
# Test rows whose 'Fare' value is missing.
test_data.loc[test_data['Fare'].isna()]

Look for similar rows and use their median 'Fare' to fill in the missing value

In [None]:
# Median fare of third-class men travelling without siblings/spouse or parents/children
# in the test set. The median does not depend on row order, so no sort is needed.
fare_nn_median = test_data.loc[
    test_data['Pclass'].eq(3)
    & test_data['Sex'].eq('male')
    & test_data['SibSp'].eq(0)
    & test_data['Parch'].eq(0),
    'Fare',
].median()

In [None]:
# Display the median fare of the comparable test passengers.
fare_nn_median

In [None]:
# Fill the missing test fare with the median fare of comparable passengers.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas 2.x and does not
# modify the frame under Copy-on-Write (the default from pandas 3.0).
test_data['Fare'] = test_data['Fare'].fillna(fare_nn_median)

Cabin - Missing Values

In [None]:
# Mark passengers without a recorded cabin with the placeholder string '0'.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas 2.x and does not
# modify the frame under Copy-on-Write (the default from pandas 3.0).
train_data['Cabin'] = train_data['Cabin'].fillna('0')
test_data['Cabin'] = test_data['Cabin'].fillna('0')

Final check for missing values

In [None]:
# Re-check the per-column count of missing values after imputation (train, then test).
tuple(frame.isna().sum() for frame in (train_data, test_data))

### 4. Feature Engineering

Defining features & converting them to indicator values

In [None]:
# def extract_second_position_from_id(df):  # input is a DataFrame (with 1 column)
#    """Returns the second position of a string column"""
#    first_char = df.iloc[:, 0].str[1].astype(int)
#    return first_char.values.reshape(-1, 1) # output has to be a 2D matrix

In [None]:
# trans = ColumnTransformer([
#    ('my_id', FunctionTransformer(extract_second_position_from_id), ['Individual ID']),    
#    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Island', 'Sex']),
#    ('my_binning', KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile'), ['Culmen Depth (mm)']),   # like pd.qcut()
#    ('do_nothing', 'passthrough', ['Pclass'])
#    ])
#
# , 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked'
#
# trans.fit(train_data)
# X_train = trans.transform(train_data)
# X_train

In [None]:
# p = make_pipeline(
#    trans,
#    MinMaxScaler(),
#    RandomForestClassifier()
#)

In [None]:
# X_train.shape

In [None]:
# p.fit(X_train, y_train)
# p.score(X_train, y_train)

In [None]:
# Inspect the raw passenger names in the training set.
train_data.loc[:, 'Name']

In [None]:
# Character count of each passenger name.
train_data['Name_Length'] = train_data['Name'].map(len)
# Mean of 'Survived' within each tercile of name length.
name_length_terciles = pd.qcut(train_data['Name_Length'], q=3)
train_data['Survived'].groupby(name_length_terciles).mean()

In [None]:
# Number of training passengers in each name-length tercile.
pd.qcut(train_data['Name_Length'], q=3).value_counts()

In [None]:
# Labels for the three name-length terciles, shortest first.
name_cut_labels = ['short', 'medium', 'long']
# Replace the numeric length with its tercile label.
train_data['Name_Length'] = pd.qcut(
    x=train_data['Name_Length'],
    q=3,
    labels=name_cut_labels,
)

In [None]:
# Number of training passengers per name-length label.
train_data.Name_Length.value_counts()

In [None]:
# Bin the test-set name lengths on the SAME boundaries as the training set.
# The training column was binned with quantile-based pd.qcut, whereas this cell used
# equal-width pd.cut on the test values, so 'short'/'medium'/'long' covered different
# length ranges in the two sets. The training tercile edges are recomputed from the
# raw names because train_data['Name_Length'] already holds labels at this point.
train_name_lengths = train_data['Name'].map(len)
name_length_edges = pd.qcut(train_name_lengths, 3, retbins=True)[1]
# Open the outer edges so lengths outside the training range still receive a label.
name_length_edges[0], name_length_edges[-1] = -np.inf, np.inf

test_data['Name_Length'] = pd.cut(
    test_data['Name'].map(len),
    bins=name_length_edges,
    labels=name_cut_labels,
)

#### Fare

In [None]:
# Mean of 'Survived' within each fare quartile of the training set.
fare_quartiles = pd.qcut(train_data['Fare'], q=4)
train_data['Survived'].groupby(fare_quartiles).mean()

In [None]:
# Number of training passengers in each fare quartile.
pd.qcut(train_data['Fare'], q=4).value_counts()

In [None]:
fare_cut_labels = ['0', '1', '2', '3']
train_data['Fare'] = pd.qcut(train_data['Fare'], q=4, labels = fare_cut_labels)

In [None]:
train_data['Fare'].value_counts()

In [None]:
test_data['Fare'] = pd.qcut(test_data['Fare'], q=4, labels = fare_cut_labels)

#### Cabin First Letter / Deck

In [None]:
# First character of the cabin string.
train_data['Deck'] = train_data['Cabin'].astype(str).str[0]
test_data['Deck'] = test_data['Cabin'].astype(str).str[0]

In [None]:
# Collapse 'Deck' to a 0/1 indicator: 0 when the value is the string "0", 1 otherwise.
train_data['Deck'] = train_data['Deck'].ne("0").astype('int64')
test_data['Deck'] = test_data['Deck'].ne("0").astype('int64')

In [None]:
# Distribution of the 'Deck' indicator in the training set, then in the test set.
# (The second element previously repeated the training counts.)
train_data['Deck'].value_counts(), test_data['Deck'].value_counts()

In [None]:
# Inspect the 'Deck' column of the training set.
train_data['Deck']

#### Normalize Age

In [None]:
# Spread between the largest and smallest training age; used as the scaling denominator.
train_ages = train_data['Age']
age_range = train_ages.max() - train_ages.min()
age_range

In [None]:
# Min-max scale Age using statistics from the TRAINING set only.
# The test set was previously shifted by its own minimum while being divided by the
# training range, so the same age mapped to different values in the two sets.
age_min = train_data['Age'].min()
train_data['Age_normalized'] = (train_data['Age'] - age_min) / age_range
test_data['Age_normalized'] = (test_data['Age'] - age_min) / age_range

In [None]:
# Show the scaled ages for the training set, then the test set.
tuple(frame['Age_normalized'] for frame in (train_data, test_data))

#### Normalize Pclass

In [None]:
# scaler = MinMaxScaler()
# scaler.fit(train_data[['Pclass']].reshape(-1, 1))
# print(scaler.transform(train_data[['Pclass']].reshape(-1, 1)))

In [None]:
# Columns fed to the models; string and categorical columns are one-hot encoded below.
features = ['Pclass', 'Sex', 'Age_normalized', 'SibSp', 'Parch', 'Embarked', 'Deck', 'Name_Length', 'Fare']
X_train = pd.get_dummies(train_data[features])
# Encoding the two frames separately does not guarantee the same dummy columns in the
# same order. Align the test matrix to the training layout: add missing columns as 0
# and drop any column the models were never trained on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X_train.columns, fill_value=0)
print(X_train)
print(X_test)

#### Heatmaps - Correlation of engineered features with Survivability

In [None]:
# Model features plus the target, so correlations with 'Survived' can be computed.
# Note: this rebinds `features` to a list that includes 'Survived'.
features = [
    'Pclass', 'Sex', 'Age_normalized', 'SibSp', 'Parch',
    'Embarked', 'Fare', 'Survived', 'Deck', 'Name_Length',
]
X_heat = pd.get_dummies(train_data.loc[:, features])

In [None]:
plt.figure(figsize=(6, 9))

# Correlation of every encoded feature with 'Survived', strongest positive first.
survival_corr = X_heat.corr()[['Survived']].sort_values(by='Survived', ascending=False)
heatmap = sns.heatmap(
    survival_corr,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='YlGnBu',
)
heatmap.set_title('Features Correlating with Survivability', fontdict={'fontsize':12}, pad=16);

### 5. Creating model(s)

In [None]:
# (rows, columns) of the training and test design matrices.
tuple(matrix.shape for matrix in (X_train, X_test))

In [None]:
# Display the encoded training design matrix.
X_train

In [None]:
# Display the encoded test design matrix.
X_test

In [None]:
# (rows, columns) of the training and test design matrices.
tuple(matrix.shape for matrix in (X_train, X_test))

Define y for the training data

In [None]:
# Target vector: the 'Survived' column of the training set.
y_train = train_data.loc[:, 'Survived']
y_train

#### Creating Baseline Model

In [None]:
# Baseline that always predicts the most frequent class seen during fit.
model_BL = DummyClassifier(
    strategy='most_frequent',
    random_state=10,
)

In [None]:
# Fit the baseline on the full training set.
model_BL.fit(X=X_train, y=y_train)

#### Logistic Regression

In [None]:
# Logistic regression with default settings and a fixed seed, fitted on the full training set.
model_LR = LogisticRegression(random_state=23)
model_LR.fit(X=X_train, y=y_train)

In [None]:
# Decision tree limited to depth 7, fitted on the full training set.
model_DT = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=2,
    random_state=23,
)
model_DT.fit(X=X_train, y=y_train)

#### Random Forest

In [None]:
# Random forest of 900 depth-limited trees, fitted on the full training set.
model_RF = RandomForestClassifier(
    n_estimators=900,
    max_depth=7,
    min_samples_split=10,
    random_state=23,
)
model_RF.fit(X=X_train, y=y_train)

### 6. Model predictions

In [None]:
# Predict on the training matrix with each fitted model, in the order BL, LR, DT, RF.
# NOTE(review): these are in-sample predictions, so metrics computed from them
# describe the fit to the training data rather than generalisation.
predictions_BL, predictions_LR, predictions_DT, predictions_RF = (
    fitted.predict(X_train)
    for fitted in (model_BL, model_LR, model_DT, model_RF)
)

### 7. Model Performance

#### Baseline Performance

In [None]:
# Per-class precision/recall/F1 of the baseline on the training data.
print(classification_report(y_true=y_train, y_pred=predictions_BL))

In [None]:
# Confusion matrix of the baseline on the training data.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
plot_confusion_matrix(estimator=model_BL, X=X_train, y_true=y_train)

#### Logistic Regression Performance

In [None]:
# Per-class precision/recall/F1 of the logistic regression on the training data.
print(classification_report(y_true=y_train, y_pred=predictions_LR))

In [None]:
# Confusion matrix of the logistic regression on the training data.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
plot_confusion_matrix(estimator=model_LR, X=X_train, y_true=y_train)

#### Decision Tree Performance

In [None]:
# Per-class precision/recall/F1 of the decision tree on the training data.
print(classification_report(y_true=y_train, y_pred=predictions_DT))

In [None]:
# Confusion matrix of the decision tree on the training data.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
plot_confusion_matrix(estimator=model_DT, X=X_train, y_true=y_train)

#### Random Forest Performance

In [None]:
# Per-class precision/recall/F1 of the random forest on the training data.
print(classification_report(y_true=y_train, y_pred=predictions_RF))

In [None]:
# Confusion matrix of the random forest on the training data.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
plot_confusion_matrix(estimator=model_RF, X=X_train, y_true=y_train)

### 8. Hyperparameter Optimization / Cross Validation

Defining Hyperparameters for the Decision Tree

In [None]:
# Search grid for the decision tree: depths 2..8 and even split thresholds 2..18.
hyperparams_DT = dict(
    max_depth=[*range(2, 9)],
    min_samples_split=[*range(2, 20, 2)],
)

Creating an estimator for Decision Tree

In [None]:
# 5-fold cross-validated grid search over the decision-tree grid.
g = GridSearchCV(estimator=model_DT, param_grid=hyperparams_DT, cv=5)
g.fit(X=X_train, y=y_train)

Show me the best parameters for the Decision Tree model

In [None]:
# Parameter combination selected by the most recently fitted grid search `g`.
g.best_params_

Defining Hyperparameters for the Random Forest

In [None]:
# Search grid for the random forest: depths 3..7, split thresholds 5..30 in steps
# of 5, and forests of 900 or 1000 trees.
hyperparams_RF = dict(
    max_depth=[*range(3, 8)],
    min_samples_split=[*range(5, 31, 5)],
    n_estimators=[*range(900, 1001, 100)],
)

Creating an estimator for Random Forest

In [None]:
# 5-fold cross-validated grid search over the random-forest grid, using all CPU cores.
# Note: this rebinds `g`, replacing the decision-tree search result.
g = GridSearchCV(
    estimator=model_RF,
    param_grid=hyperparams_RF,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
g.fit(X=X_train, y=y_train)

Show me the best parameters for the Random Forest model

In [None]:
# Parameter combination selected by the most recently fitted grid search `g`.
g.best_params_

### 9. Formatting for Export

In [None]:
# Predict survival for the test passengers with the random forest fitted earlier.
# NOTE(review): this uses model_RF as originally configured, not the grid-search winner.
predictions = model_RF.predict(X_test)
# Two-column frame (PassengerId, Survived) written without the index.
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions,
    }
)
output.to_csv(path_or_buf='my_submission.csv', index=False)