In [1]:
import pandas as pd

# Data Loading

We have to copy & paste the `load_adult_data` function from the 1st notebook.

In [2]:
def load_adult_data(data_file='../data/adult_data.csv'):
    """Read the adult dataset CSV into a dataframe with named columns.

    Column names are supplied explicitly, so the first line of the file is
    parsed as a data row rather than as a header.

    Parameters
    ----------
    data_file : str or path-like, default '../data/adult_data.csv'
        Location of the comma-separated data file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the fifteen columns named below.
    """
    column_names = [
        'age',
        'workclass',
        'fnlwgt',
        'education',
        'education_num',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'capital_gain',
        'capital_loss',
        'hours_per_week',
        'native_country',
        'income',
    ]

    # `skipinitialspace` skips any spaces that follow a delimiter, so string
    # values are read without a leading blank.
    loaded_df = pd.read_csv(data_file, names=column_names, skipinitialspace=True)
    return loaded_df

In [3]:
# Load the dataset from the default path and preview the first rows.
adult_df = load_adult_data()
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Age Grouping

In [4]:
# Summary statistics for `age`; the min and max show the range the age bins
# have to cover.
adult_df['age'].describe()

count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64

In [5]:
# Bucket ages into decades. Bin edges run 10, 20, ..., 100, which gives nine
# bins and nine matching labels. `right=False` makes every bin half-open,
# [start, start + 10), so for integer ages a label such as '30~39' covers
# exactly 30 through 39.
adult_df['age_group'] = pd.cut(
    adult_df['age'],
    bins=range(10, 101, 10),
    right=False,
    labels=[f'{age_start}~{age_start + 9}'
            for age_start in range(10, 100, 10)]
)

In [6]:
adult_df['age_group'].cat.categories

Index(['10~19', '20~29', '30~39', '40~49', '50~59', '60~69', '70~79', '80~89',
       '90~99'],
      dtype='object')

In [7]:
adult_df[['age', 'age_group']].tail(10)

Unnamed: 0,age,age_group
32551,32,30~39
32552,43,40~49
32553,32,30~39
32554,53,50~59
32555,22,20~29
32556,27,20~29
32557,40,40~49
32558,58,50~59
32559,22,20~29
32560,52,50~59


## Refactoring: To a Function

Original code:

```python
adult_df['age_group'] = pd.cut(
    adult_df['age'],
    bins=range(10, 101, 10),
    right=False,
    labels=[f'{age_start}~{age_start + 9}'
            for age_start in range(10, 100, 10)]
)
```

Improvements:

- Do not modify the original `adult_df` to avoid side effects. For that, use `assign`.

In [8]:
def add_age_group(adult_df, age_min=10, age_max=100, bin_width=10):
    """Return a new dataframe equal to ``adult_df`` plus an ``age_group`` column.

    Ages are binned into half-open intervals ``[start, start + bin_width)``
    and labelled ``'<start>~<start + bin_width - 1>'`` (for example
    ``'30~39'``). Ages that fall outside every bin get a missing value.

    Parameters
    ----------
    adult_df : pandas.DataFrame
        Dataframe with a numeric ``age`` column. It is not modified.
    age_min : int, default 10
        Inclusive lower edge of the first bin.
    age_max : int, default 100
        Upper limit of the binning. The bin edges are
        ``range(age_min, age_max + 1, bin_width)``.
    bin_width : int, default 10
        Width of every bin.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the categorical column ``age_group`` added
        (or replaced, if it already existed).

    Raises
    ------
    ValueError
        If ``bin_width`` is not positive, or the arguments do not define at
        least one complete bin.
    """
    if bin_width <= 0:
        raise ValueError('bin_width must be a positive integer')

    # Build the edges once and derive the labels from them, so the number of
    # labels always matches the number of bins.
    bin_edges = list(range(age_min, age_max + 1, bin_width))
    if len(bin_edges) < 2:
        raise ValueError(
            'age_min, age_max and bin_width must define at least one bin'
        )
    labels = [f'{start}~{start + bin_width - 1}' for start in bin_edges[:-1]]

    age_group = pd.cut(
        adult_df['age'],
        bins=bin_edges,
        right=False,  # bins are [start, start + bin_width)
        labels=labels,
    )

    # Use `assign` to avoid modifying the original `adult_df` dataframe
    return adult_df.assign(age_group=age_group)

In [9]:
new_df = load_adult_data()

In [10]:
# `add_age_group` returns a new dataframe rather than editing its argument,
# so the result has to be assigned to keep the `age_group` column.
new_df = add_age_group(new_df)

In [11]:
new_df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income', 'age_group'],
      dtype='object')

# Category and Category Ordering

In [12]:
# List the `education_num` code(s) recorded for each `education` label,
# sorted by code, to see how the labels rank.
(
    adult_df
    .groupby('education')['education_num']
    .unique()
    .sort_values()
)

education
Preschool        [1]
1st-4th          [2]
5th-6th          [3]
7th-8th          [4]
9th              [5]
10th             [6]
11th             [7]
12th             [8]
HS-grad          [9]
Some-college    [10]
Assoc-voc       [11]
Assoc-acdm      [12]
Bachelors       [13]
Masters         [14]
Prof-school     [15]
Doctorate       [16]
Name: education_num, dtype: object

In [13]:
# Keep only the index of the sorted result: the education labels ordered from
# the lowest `education_num` code to the highest.
education_order = (
    adult_df
    .groupby('education')['education_num']
    .unique()
    .sort_values()
    .index
)

In [14]:
# Convert `education` to an ordered categorical that uses `education_order`
# as its category order, so comparisons and sorting follow that order.
adult_df['education'] = adult_df['education'].astype(
    pd.CategoricalDtype(categories=education_order,
                        ordered=True)
)

In [15]:
adult_df['education'].head()

0    Bachelors
1    Bachelors
2      HS-grad
3         11th
4    Bachelors
Name: education, dtype: category
Categories (16, object): ['Preschool' < '1st-4th' < '5th-6th' < '7th-8th' ... 'Bachelors' < 'Masters' < 'Prof-school' < 'Doctorate']

In [16]:
# No explicit categories are passed, so they are inferred from the values in
# the column; `ordered=True` makes the resulting categories comparable.
adult_df['education_num'] = adult_df['education_num'].astype(
    pd.CategoricalDtype(ordered=True)
)

In [17]:
adult_df['education_num'].head()

0    13
1    13
2     9
3     7
4    13
Name: education_num, dtype: category
Categories (16, int64): [1 < 2 < 3 < 4 ... 13 < 14 < 15 < 16]

## Refactoring: To a Function

Original code:

```python
education_order = (
    adult_df
    .groupby('education')['education_num']
    .unique()
    .sort_values()
    .index
)

adult_df['education'] = adult_df['education'].astype(
    pd.CategoricalDtype(categories=education_order,
                        ordered=True)
)

adult_df['education_num'] = adult_df['education_num'].astype(
    pd.CategoricalDtype(ordered=True)
)
```

Improvements:

- Do not modify the original `adult_df` to avoid side effects. Instead, return the result of `adult_df.astype()` directly.
- Change types of `education` and `education_num` at once.

In [None]:
def change_education_type_to_category(adult_df):
    """Return a new dataframe with the education columns as ordered categoricals.

    ``education`` becomes an ordered categorical whose category order follows
    the numeric ``education_num`` code paired with each label (lowest code
    first). ``education_num`` becomes an ordered categorical of its own
    values.

    Parameters
    ----------
    adult_df : pandas.DataFrame
        Dataframe with an ``education`` column of labels and an
        ``education_num`` column of numeric codes. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New dataframe in which both columns have an ordered categorical dtype.
    """
    # Rank each label by its smallest numeric code. Aggregating to a scalar
    # (instead of collecting `unique()` arrays and sorting those) keeps the
    # sort well-defined even if a label is paired with more than one code.
    education_order = (
        adult_df
        .groupby('education')['education_num']
        .min()
        .sort_values()
        .index
    )

    # A single `astype` call converts both columns and returns a new
    # dataframe, leaving `adult_df` untouched.
    return adult_df.astype({
        'education': pd.CategoricalDtype(categories=education_order,
                                         ordered=True),
        # Categories are inferred from the values present in the column.
        'education_num': pd.CategoricalDtype(ordered=True),
    })