In [None]:
import pandas as pd

# Data Loading

<div class="alert alert-block alert-warning">
We have to copy & paste the `load_adult_data` function from the 1st notebook.
</div>

In [None]:
def load_adult_data(data_file='../data/adult_data.csv'):
    """Read the adult dataset CSV into a dataframe with named columns.

    The column names are supplied explicitly, so every line of the file is
    read as a data row. Whitespace that follows a delimiter is skipped.

    Parameters
    ----------
    data_file : str or file-like, default '../data/adult_data.csv'
        Anything `pd.read_csv` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the 15 columns named below.
    """
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'income',
    ]
    read_options = {'names': column_names, 'skipinitialspace': True}

    return pd.read_csv(data_file, **read_options)

In [None]:
# Load the dataset from the function's default path and preview the first rows.
adult_df = load_adult_data()
adult_df.head()

# Age Grouping

In [None]:
# Summary statistics for the `age` column.
adult_df['age'].describe()

In [None]:
# Bucket ages into decade-wide bins: [10, 20), [20, 30), ..., [90, 100).
# `right=False` makes each bin include its lower edge and exclude its upper
# edge, which is why the labels read '10~19', '20~29', ..., '90~99'.
# Ages outside [10, 100) get no group (NaN).
adult_df['age_group'] = pd.cut(
    adult_df['age'],
    bins=range(10, 101, 10),
    right=False,
    labels=[f'{age_start}~{age_start + 9}'
            for age_start in range(10, 100, 10)]
)

In [None]:
# Show the category labels of the `age_group` column.
adult_df['age_group'].cat.categories

In [None]:
# Show `age` next to its `age_group` for the last 10 rows.
adult_df[['age', 'age_group']].tail(10)

## Refactoring: To a Function

Original code:

```python
adult_df['age_group'] = pd.cut(
    adult_df['age'],
    bins=range(10, 101, 10),
    right=False,
    labels=[f'{age_start}~{age_start + 9}'
            for age_start in range(10, 100, 10)]
)
```

Improvements:

- Do not modify the original `adult_df`, to avoid side effects. Use `assign`, which returns a new dataframe, instead.

In [None]:
def add_age_group(adult_df, *, start=10, stop=100, step=10):
    """Return a copy of `adult_df` with a categorical `age_group` column.

    Ages are bucketed into left-closed bins of width `step`:
    ``[start, start + step)``, ``[start + step, start + 2 * step)``, ...
    Each bin is labelled ``'<low>~<high>'`` with both ends inclusive, so the
    defaults produce ``'10~19'``, ``'20~29'``, ..., ``'90~99'``.
    Ages that fall outside the bins get a missing value.

    Parameters
    ----------
    adult_df : pandas.DataFrame
        Dataframe with a numeric `age` column. It is not modified.
    start : int, default 10
        Lower edge of the first bin (inclusive).
    stop : int, default 100
        Upper bound for the bin edges; an edge equal to `stop` is included.
    step : int, default 10
        Width of each bin.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the added `age_group` column.

    Raises
    ------
    ValueError
        If `start`, `stop` and `step` do not describe at least one bin.
    """
    bin_edges = list(range(start, stop + 1, step))
    if len(bin_edges) < 2:
        raise ValueError(
            'start, stop and step must describe at least one bin; '
            f'got start={start}, stop={stop}, step={step}'
        )

    # Derive the labels from the edges so the two can never drift apart.
    labels = [f'{low}~{high - 1}'
              for low, high in zip(bin_edges, bin_edges[1:])]

    age_group = pd.cut(
        adult_df['age'],
        bins=bin_edges,
        right=False,
        labels=labels,
    )

    # Use `assign` to avoid modifying the original `adult_df` dataframe
    return adult_df.assign(age_group=age_group)

# Category and Ordering

In [None]:
# For each `education` value, collect its distinct `education_num` values,
# then sort the education levels by those values.
(
    adult_df
    .groupby('education')['education_num']
    .unique()
    .sort_values()
)

In [None]:
# Keep only the index of the sorted result: the `education` labels ordered
# by their `education_num`.
education_order = (
    adult_df
    .groupby('education')['education_num']
    .unique()
    .sort_values()
    .index
)
education_order

In [None]:
# Convert `education` to an ordered categorical whose category order is
# `education_order`.
adult_df['education'] = adult_df['education'].astype(
    pd.CategoricalDtype(categories=education_order,
                        ordered=True)
)

In [None]:
# Preview the converted `education` column.
adult_df['education'].head()

In [None]:
# Convert `education_num` to an ordered categorical. No categories are
# given, so they are inferred from the values present in the column.
adult_df['education_num'] = adult_df['education_num'].astype(
    pd.CategoricalDtype(ordered=True)
)

In [None]:
# Preview the converted `education_num` column.
adult_df['education_num'].head()

## Refactoring: To a Function

Original code:

```python
education_order = (
    adult_df
    .groupby('education')['education_num']
    .unique()
    .sort_values()
    .index
)

adult_df['education'] = adult_df['education'].astype(
    pd.CategoricalDtype(categories=education_order,
                        ordered=True)
)

adult_df['education_num'] = adult_df['education_num'].astype(
    pd.CategoricalDtype(ordered=True)
)
```

Improvements:

- Do not modify the original `adult_df`, to avoid side effects. Instead, return the result of `adult_df.astype()` directly.
- Change types of `education` and `education_num` at once.

In [None]:
def change_education_type_to_category(adult_df):
    """Return a copy of `adult_df` with ordered categorical education columns.

    `education` becomes an ordered categorical whose categories are sorted
    by the `education_num` each level maps to. `education_num` becomes an
    ordered categorical whose categories are inferred from its values.

    Parameters
    ----------
    adult_df : pandas.DataFrame
        Dataframe with `education` and `education_num` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with both columns converted.

    Raises
    ------
    ValueError
        If any education level maps to more than one `education_num`, since
        the order of the levels would then be ambiguous.
    """
    nums_by_level = adult_df.groupby('education')['education_num']

    # Sorting only makes sense for a one-to-one mapping, so fail with a clear
    # message instead of an obscure error from comparing arrays.
    ambiguous = nums_by_level.nunique() > 1
    if ambiguous.any():
        raise ValueError(
            'each education level must map to a single education_num; '
            f'ambiguous levels: {list(ambiguous[ambiguous].index)}'
        )

    # Sort scalar numbers (one per level) rather than arrays of unique values.
    education_order = (
        nums_by_level
        .min()
        .sort_values(kind='stable')
        .index
    )

    return adult_df.astype({
        "education": pd.CategoricalDtype(categories=education_order,
                                         ordered=True),
        "education_num": pd.CategoricalDtype(ordered=True),
    })