1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [Initial EDA](#Initial-EDA)
    1. [object feature EDA](#object-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target)
    1. [number feature EDA](#number-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target2)
        1. [Correlation](#Correlation)
        1. [Pair plot](#Pair-plot)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    


# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [2]:
# standard libary and settings
import sys
import time
rundate = time.strftime("%Y%m%d")

import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Import mlmachine and prettierplot. Installed packages are preferred; if they
# are missing, fall back to local source checkouts located relative to this
# notebook. Only when both attempts fail is the install hint printed, and the
# original error is re-raised so later cells do not fail with a confusing
# NameError.
try:
    import mlmachine as mlm
    from prettierplot.plotter import PrettierPlot
    import prettierplot.style as style
except ModuleNotFoundError:
    # make the local checkouts importable, without adding duplicate entries
    for local_checkout in ("../../mlmachine", "../../prettierplot"):
        if local_checkout not in sys.path:
            sys.path.append(local_checkout)

    try:
        import mlmachine as mlm
        from prettierplot.plotter import PrettierPlot
        import prettierplot.style as style
    except ModuleNotFoundError:
        print('This notebook relies on the libraries mlmachine and prettierplot. Please run:')
        print('\tpip install mlmachine')
        print('\tpip install prettierplot')
        raise

## Data

<a id = 'Data'></a>

In [4]:
# read the Titanic training and validation CSV files
train_csv_path = "../data/kaggleTitanic/train.csv"
valid_csv_path = "../data/kaggleTitanic/test.csv"
df_train = pd.read_csv(train_csv_path)
df_valid = pd.read_csv(valid_csv_path)

# report (rows, columns) for each frame
train_shape_message = "Training data dimensions: {}"
valid_shape_message = "Validation data dimensions: {}"
print(train_shape_message.format(df_train.shape))
print(valid_shape_message.format(df_valid.shape))

Training data dimensions: (891, 12)
Validation data dimensions: (418, 11)


In [5]:
# Wrap the training frame in an mlmachine Machine: "Survived" is the target,
# the identifier-like columns are removed, and the listed columns are
# identified as object features.
machine_settings = {
    "data": df_train,
    "target": "Survived",
    "remove_features": ["PassengerId", "Ticket"],
    "identify_as_object": ["Pclass", "SibSp", "Parch"],
    "target_type": "object",
}
train = mlm.Machine(**machine_settings)

# shape of the feature data held by the Machine
print(train.data.shape)

******************
object label encoding

0 --> 0
1 --> 1
(891, 9)


# Initial EDA

<a id = 'Initial-EDA'></a>

## object feature EDA

<a id = 'object-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target'></a>

In [None]:
# run the categorical-feature vs. categorical-target EDA for every feature
# registered under the "object" mlmachine dtype
object_features = train.data.feature_by_mlm_dtype["object"]
for feature in object_features:
    train.eda_cat_target_cat_feat(
        feature=feature,
        level_count_cap=50,
    )

## number feature EDA

<a id = 'number-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target2'></a>

In [None]:
# run the numeric-feature vs. categorical-target EDA for every feature
# registered under the "number" mlmachine dtype
number_features = train.data.feature_by_mlm_dtype["number"]
for feature in number_features:
    train.eda_cat_target_num_feat(
        feature=feature,
    )

### Correlation

<a id = 'Correlation'></a>

##### Correlation (all samples)

In [None]:
# annotated correlation heat map over the training feature data
p = PrettierPlot()
ax = p.make_canvas()
heatmap_options = {"df": train.data, "annot": True, "ax": ax}
p.pretty_corr_heatmap(**heatmap_options)

##### Correlation (top vs. target)

In [None]:
# annotated heat map of feature correlation relative to the target,
# drawn on a tall canvas and using a threshold of 0.01
p = PrettierPlot(plot_orientation='tall', chart_prop=10)
ax = p.make_canvas()
target_heatmap_options = {
    "df": train.data,
    "target": train.target,
    "thresh": 0.01,
    "annot": True,
    "ax": ax,
}
p.pretty_corr_heatmap_target(**target_heatmap_options)

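> Remarks - There are three pairs of highly correlated features:
    - 'GarageArea' and 'GarageCars'
    - 'TotRmsAbvGrd' and 'GrLivArea'
    - '1stFlrSF' and 'TotalBsmtSF'
This makes sense, given what each feature represents and how the items in each pair relate to each other. We likely only need one feature from each pair. NOTE: these feature names do not appear among the Titanic columns used elsewhere in this notebook (e.g. Age, Fare, Pclass), so these remarks appear to be carried over from a different dataset and should be revisited.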

### Pair plot

<a id = 'Pair-plot'></a>

In [None]:
# pair plot across the training feature data
p = PrettierPlot(chart_prop=12)
pair_plot_options = {"df": train.data, "diag_kind": "auto"}
p.pretty_pair_plot(**pair_plot_options)

In [None]:
# pair plot of selected columns, split by the target, with KDE diagonals;
# rows containing nulls are dropped from the frame passed to the plot
p = PrettierPlot(chart_prop=12)
pair_plot_columns = ["Age", "Fare", "Pclass", "Parch", "SibSp"]
complete_rows = train.data.dropna()
p.pretty_pair_plot(
    df=complete_rows,
    diag_kind="kde",
    target=train.target,
    columns=pair_plot_columns,
    legend_labels=["Died", "Survived"],
    bbox=(2.0, 0.0),
)

## Faceting

<a id = 'Faceting'></a>

##### object by object

In [None]:
# faceted bar chart: target by Embarked, split by Pclass
p = PrettierPlot(chart_prop=12)
ax = p.make_canvas(
    title="Survivorship, embark location by passenger class",
    y_shift=0.7,
)
# features and target recombined into a single frame for plotting
facet_frame = train.recombine_data(train.data, train.target)
p.pretty_facet_two_cat_bar(
    df=facet_frame,
    x="Embarked",
    y=train.target.name,
    split="Pclass",
    y_units="ff",
    ax=ax,
)

In [None]:
# faceted bar chart: target by Pclass, split by Sex
p = PrettierPlot(chart_prop=12)
ax = p.make_canvas(
    title="Survivorship, passenger class by gender",
    y_shift=0.7,
)
# features and target recombined into a single frame for plotting
facet_frame = train.recombine_data(train.data, train.target)
p.pretty_facet_two_cat_bar(
    df=facet_frame,
    x="Pclass",
    y=train.target.name,
    split="Sex",
    y_units="ff",
    ax=ax,
)

In [None]:
# faceted bar chart: target by Embarked, split by Sex
p = PrettierPlot(chart_prop=12)
ax = p.make_canvas(
    title="Survivorship,embark location by gender",
    y_shift=0.7,
)
# features and target recombined into a single frame for plotting
facet_frame = train.recombine_data(train.data, train.target)
p.pretty_facet_two_cat_bar(
    df=facet_frame,
    x="Embarked",
    y=train.target.name,
    split="Sex",
    y_units="ff",
    ax=ax,
)

In [None]:
# faceted point plot: target by Sex, split by Pclass, one row per Embarked value
p = PrettierPlot()
class_labels = ["1st class", "2nd class", "3rd class"]
# features and target recombined into a single frame for plotting
facet_frame = train.recombine_data(train.data, train.target)
p.pretty_facet_two_cat_point(
    df=facet_frame,
    x="Sex",
    y=train.target.name,
    split="Pclass",
    cat_row="Embarked",
    aspect=1.0,
    height=5,
    bbox=(1.3, 1.2),
    legend_labels=class_labels,
)

In [None]:
# faceted point plot: target by Embarked, split by Pclass, one row per Sex value
p = PrettierPlot()
class_labels = ["1st class", "2nd class", "3rd class"]
# recombine features and target, then drop rows where Embarked is null
facet_frame = train.recombine_data(train.data, train.target)
facet_frame_embarked = facet_frame.dropna(subset=["Embarked"])
p.pretty_facet_two_cat_point(
    df=facet_frame_embarked,
    x="Embarked",
    y=train.target.name,
    split="Pclass",
    cat_row="Sex",
    aspect=1.0,
    height=5,
    bbox=(1.5, 0.8),
    legend_labels=class_labels,
)

##### object by number

In [None]:
# faceted histograms of Age, split by the target, with rows by Sex and
# columns by Embarked
p = PrettierPlot()
outcome_labels = ["Died", "Lived"]
# features and target recombined into a single frame for plotting
facet_frame = train.recombine_data(train.data, train.target)
p.pretty_facet_cat_num_hist(
    df=facet_frame,
    split=train.target.name,
    legend_labels=outcome_labels,
    cat_row="Sex",
    cat_col="Embarked",
    num_col="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1,
)

In [None]:
# faceted scatter plots of Fare (x) against Age (y), split by the target,
# with rows by Sex and columns by Embarked
p = PrettierPlot(chart_prop=15)
outcome_labels = ["Died", "Lived"]
# features and target recombined into a single frame for plotting
facet_frame = train.recombine_data(train.data, train.target)
p.pretty_facet_cat_num_scatter(
    df=facet_frame,
    split=train.target.name,
    legend_labels=outcome_labels,
    cat_row="Sex",
    cat_col="Embarked",
    xNum="Fare",
    yNum="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1,
)

## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# null score: relative frequency of each target value
target_series = pd.Series(train.target)
target_series.value_counts(normalize=True)