# EDA (Exploratory Data Analysis)

![EDA](../image/eda.png)


<font color="Blue">Refer to [this GitHub repo](https://github.com/sesise0307/pydata2021-eda) for a detailed tutorial.</font>

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Data Loading

<div class="alert alert-block alert-warning">
We have to copy and paste the <code>load_adult_data</code> function from the 1st notebook.
</div>

In [None]:
def load_adult_data(data_file='../data/adult_data.csv'):
    """Read the Adult dataset CSV into a DataFrame with named columns.

    The fifteen column names below are supplied through ``names=``, and
    whitespace that follows each delimiter is skipped
    (``skipinitialspace=True``).

    Args:
        data_file: Path or file-like object handed to ``pd.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    columns = (
        'age',
        'workclass',
        'fnlwgt',
        'education',
        'education_num',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'capital_gain',
        'capital_loss',
        'hours_per_week',
        'native_country',
        'income',
    )

    frame = pd.read_csv(data_file, skipinitialspace=True, names=columns)
    return frame

In [None]:
# Load the dataset from the function's default path ('../data/adult_data.csv').
adult_df = load_adult_data()

# Preprocessing

<div class="alert alert-block alert-warning">
We have to copy & paste the functions from the 2nd notebook.
</div>

In [None]:
def add_age_group(adult_df):
    """Return a copy of ``adult_df`` with a decade-wide ``age_group`` column.

    Ages are binned into half-open intervals [10, 20), [20, 30), ...,
    [90, 100) and labelled '10~19', '20~29', ..., '90~99'. Ages outside
    [10, 100) get a missing value. The input frame is not modified.

    Args:
        adult_df: DataFrame with a numeric ``age`` column.

    Returns:
        A new DataFrame with the additional categorical ``age_group`` column.
    """
    # One label per bin: nine decade starts for the nine intervals below.
    decade_labels = [
        f'{age_start}~{age_start + 9}' for age_start in range(10, 100, 10)
    ]

    return adult_df.assign(
        age_group=pd.cut(
            adult_df['age'],
            bins=range(10, 101, 10),
            labels=decade_labels,
            right=False,  # left-closed bins, so age 20 falls in '20~29'
        )
    )

def change_education_type_to_category(adult_df):
    """Return a copy with ``education`` and ``education_num`` as ordered categoricals.

    The category order of ``education`` is taken from the ``education_num``
    values observed for each education level, sorted ascending.

    NOTE(review): sorting the per-level arrays of unique numbers presumably
    relies on each education level having exactly one ``education_num``;
    confirm this holds for the data.

    Args:
        adult_df: DataFrame with ``education`` and ``education_num`` columns.

    Returns:
        A new DataFrame in which both columns have an ordered categorical
        dtype.
    """
    # For every education level, collect the distinct education_num values.
    nums_per_level = adult_df.groupby('education')['education_num'].unique()

    # Sorting by those values puts the education labels in ascending order.
    education_order = nums_per_level.sort_values().index

    new_dtypes = {
        "education": pd.CategoricalDtype(
            categories=education_order, ordered=True
        ),
        "education_num": pd.CategoricalDtype(ordered=True),
    }
    return adult_df.astype(new_dtypes)

In [None]:
# Add the age_group column first, then convert the education columns
# to ordered categoricals.
adult_df = change_education_type_to_category(add_age_group(adult_df))

# Histogram / KDE / Boxplot

In [None]:
# Defaults applied to the figures drawn below.
plt.rc('figure', figsize=(10, 5))  # default figure size
sns.set_style('whitegrid')  # default figure style

In [None]:
# Three views of `age` split by `income`, side by side in a 1x3 grid.

# Left: stacked histogram
hist_ax = plt.subplot(1, 3, 1)
sns.histplot(data=adult_df, x='age', hue='income', multiple="stack", ax=hist_ax)

# Middle: stacked kernel density estimate
kde_ax = plt.subplot(1, 3, 2)
sns.kdeplot(data=adult_df, x='age', hue='income', multiple="stack", ax=kde_ax)

# Right: one box per income class (the trailing `;` hides the cell's text output)
box_ax = plt.subplot(1, 3, 3)
sns.boxplot(data=adult_df, y='age', x='income', ax=box_ax);

<div class="alert alert-block alert-info">
What about other variables? Let's define a function.
</div>

In [None]:
def statistical_plots(data, var, separate_by="income"):
    """Draw a histogram, a KDE plot and a box plot of ``var`` side by side.

    The three plots fill a 1x3 subplot grid on the current matplotlib
    figure. The histogram and KDE are stacked by ``separate_by``; the box
    plot shows one box per ``separate_by`` value.

    Args:
        data: Dataset passed as ``data=`` to the seaborn plotting functions.
        var: Name of the variable to plot.
        separate_by: Name of the variable used to split the plots.
    """
    # Left panel: stacked histogram.
    hist_ax = plt.subplot(1, 3, 1)
    sns.histplot(data=data, x=var, hue=separate_by, multiple="stack", ax=hist_ax)

    # Middle panel: stacked kernel density estimate.
    kde_ax = plt.subplot(1, 3, 2)
    sns.kdeplot(data=data, x=var, hue=separate_by, multiple="stack", ax=kde_ax)

    # Right panel: box plot with the grouping variable on the x axis.
    box_ax = plt.subplot(1, 3, 3)
    sns.boxplot(data=data, y=var, x=separate_by, ax=box_ax)

In [None]:
# Same three plots for hours_per_week, split by the default `income` column.
statistical_plots(adult_df, "hours_per_week")