In [None]:
import pandas as pd

# Data Loading

<div class="alert alert-block alert-warning">
We have to copy & paste the `load_adult_data` function from the 1st notebook.
</div>

In [None]:
def load_adult_data(data_file='../data/adult_data.csv'):
    """Read the adult dataset CSV into a DataFrame with named columns.

    The fifteen column names listed below are supplied explicitly to
    ``pd.read_csv``, and whitespace following each delimiter is skipped.

    Parameters
    ----------
    data_file : default '../data/adult_data.csv'
        Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One column per name in the list below, in that order.
    """
    column_names = [
        'age', 'workclass', 'fnlwgt',
        'education', 'education_num', 'marital_status',
        'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week',
        'native_country', 'income',
    ]
    read_options = {'names': column_names, 'skipinitialspace': True}

    return pd.read_csv(data_file, **read_options)

In [None]:
# Read the dataset from the function's default location ('../data/adult_data.csv').
adult_df = load_adult_data()

# Preprocessing

<div class="alert alert-block alert-warning">
We have to copy & paste the functions from the 2nd notebook.
</div>

In [None]:
def add_age_group(adult_df):
    """Return a new frame with a decade-wide ``age_group`` column added.

    The ``age`` column is bucketed into left-closed ten-year bins running
    from 10 up to (but not including) 100, labelled '10~19', '20~29', ...,
    '90~99'. The input frame itself is left untouched because the column
    is attached with ``assign``.
    """
    bin_edges = range(10, 101, 10)
    # Every edge except the last one is the lower bound of a bin.
    decade_labels = [f'{lower}~{lower + 9}' for lower in bin_edges[:-1]]

    return adult_df.assign(
        age_group=pd.cut(
            adult_df['age'],
            bins=bin_edges,
            right=False,
            labels=decade_labels,
        )
    )

def change_education_type_to_category(adult_df):
    """Return a new frame whose education columns are ordered categoricals.

    ``education`` is converted to an ordered categorical whose category
    order follows the numeric code in ``education_num`` (lowest code
    first). ``education_num`` is converted to an ordered categorical as
    well. The input frame is not modified; ``astype`` returns a new one.

    Parameters
    ----------
    adult_df : pandas.DataFrame
        Must contain the columns ``education`` and ``education_num``.

    Returns
    -------
    pandas.DataFrame
        Same data as ``adult_df`` with the two columns re-typed.
    """
    # Rank each education label by a single scalar (its smallest code).
    # Sorting scalars is always well defined, whereas sorting per-label
    # arrays of codes compares NumPy arrays element-wise and becomes
    # ambiguous as soon as one label carries more than one code.
    education_order = (
        adult_df
        .groupby('education')['education_num']
        .min()
        .sort_values()
        .index
    )

    return adult_df.astype({
        "education": pd.CategoricalDtype(categories=education_order,
                                         ordered=True),
        "education_num": pd.CategoricalDtype(ordered=True),
    })

In [None]:
# Preprocess in two steps: derive `age_group` first, then convert the
# education columns to ordered categoricals.
adult_df = change_education_type_to_category(add_age_group(adult_df))

# Feature Engineering

In [None]:
# Numeric columns are used as-is; the categorical columns are expanded into
# indicator (dummy) columns before both parts are placed side by side.
numeric_features = adult_df[["age", "hours_per_week"]]
categorical_features = adult_df[["education", "sex", "race", "marital_status"]]

x = pd.concat(
    [numeric_features, pd.get_dummies(categorical_features)],
    axis="columns",
)

x.head()

In [None]:
# Prediction target: the `income` column.
y = adult_df["income"]

y.head()

# Prediction

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Random forest with trees capped at depth 5; the seed is fixed so repeated
# runs build the same model.
clf = RandomForestClassifier(
    max_depth=5,
    random_state=123,
)

In [None]:
# Train on the training portion only.
clf.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out rows.
y_pred = clf.predict(X=x_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

---

The accuracy looks OK, but it is not that satisfactory.

Now we want to improve predictions with:

- Other feature engineering approaches
- Other models (SVM, XGBoost, CatBoost, deep learning, etc.)
- Different cross-validation strategies
- Hyperparameter tuning
- Another dataset with the same format
- ...

There are tons of choices and combinations.

Do we have to copy & paste the common functions every time we create a new notebook?

No, that does not sound very efficient.

<div class="alert alert-block alert-info">
Fortunately, we can turn the common functions into a python package.
</div>