# Import

In [None]:
import numpy as np
import pandas as pd

# [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult)

Predict whether income exceeds $50K/yr based on census data.
Also known as "Census Income" dataset.

**Attributes**

- age
- workclass
  - Represents the employment status of an individual
- fnlwgt
  - Final weight which is the number of people the census believes the entry represents
  - People with similar demographic characteristics should have similar weights
    - This only applies within state.
- education
  - The highest level of education achieved by an individual
- education-num
  - The highest level of education achieved in numerical form
- marital-status
- occupation
- relationship
  - Represents what this individual is relative to others
- race
- sex
- capital-gain
- capital-loss
- hours-per-week
  - The hours an individual has reported to work per week
- native-country

# Data Load

## Data Files

I have cut the [original data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data)
into 4 separate files.

https://github.com/sesise0307/pydata2021-eda/tree/main/data

<img src="../image/data_list.png" alt="Data List" width="300"/>

## Loading a file

In [None]:
# First attempt: read the chunk with read_csv defaults (no column names given).
# Remote alternative to the local path:
#     'https://raw.githubusercontent.com/sesise0307/pydata2021-eda/main/data/adult-0.data'
df_0 = pd.read_csv(filepath_or_buffer='../data/adult-0.data')

df_0.shape

In [None]:
# Preview the first rows. The read above passed no `names=`, so look at what
# ended up as the column header.
df_0.head()

In [None]:
# Column labels for the header-less data files, in file order.
names = [
    # who
    'age', 'workclass', 'fnlwgt',
    # schooling
    'education', 'education-num',
    # household / job
    'marital-status', 'occupation', 'relationship',
    # demographics
    'race', 'sex',
    # money and working time
    'capital-gain', 'capital-loss', 'hours-per-week',
    # origin
    'native-country',
    # target
    'income',
]

In [None]:
# Second attempt: supply the column labels explicitly and strip the blank
# that follows each delimiter.
# Remote alternative to the local path:
#     'https://raw.githubusercontent.com/sesise0307/pydata2021-eda/main/data/adult-0.data'
df_0 = pd.read_csv(
    filepath_or_buffer='../data/adult-0.data',
    skipinitialspace=True,  # Skip spaces after delimiter
    names=names,
)

df_0.shape

In [None]:
# Same preview, now after reading with `names=` and `skipinitialspace=True`.
df_0.head()

## Merging many files

TODO: Quiz?

In [None]:
# Read the four chunks with identical parsing options.
# Remote alternative to the local path:
#     f'https://raw.githubusercontent.com/sesise0307/pydata2021-eda/main/data/adult-{i}.data'
df_list = [
    pd.read_csv(f'../data/adult-{i}.data', names=names, skipinitialspace=True)
    for i in range(4)
]

# Stack the chunks; ignore_index=True gives the result one fresh 0..n-1 index
# instead of repeating each chunk's own row numbers.
df = pd.concat(objs=df_list, ignore_index=True)

In [None]:
# (rows, columns) of the concatenated frame.
df.shape

In [None]:
# First 10 rows of the merged data.
df.head(10)

# Essential Check & Preprocessing

## info() and describe()

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Summary statistics; with default arguments only the numeric columns are covered.
df.describe()

In [None]:
# Summary of every column that is NOT numeric (the complement of the cell above).
df.describe(exclude=np.number)

## Unique Values

In [None]:
# Print the number of distinct values of every column. Columns with fewer than
# 50 distinct values also get the values themselves listed; for the others the
# count alone is shown to keep the output readable.
for column in df.columns:
    # Count once and reuse it for both the threshold test and the message.
    # nunique() skips NaN by default while unique() includes it, so once NaN is
    # present the count can be one lower than the number of values listed.
    n_unique = df[column].nunique()

    if n_unique < 50:
        print(f'{column} ({n_unique}): {df[column].unique()}')
    else:
        print(f'{column} ({n_unique})')

## NaN Values

In [None]:
import missingno as msno

# Missing-value matrix of the raw data. Missing entries are still the literal
# string '?' at this point (they are converted to NaN in a later cell), so no
# gaps are expected here. The trailing `;` keeps the notebook from echoing the
# call's return value.
msno.matrix(df);

In [None]:
# Per-column bar view of the same information, still before '?' is converted to NaN.
msno.bar(df);

In [None]:
# Turn the '?' placeholder into a real missing value (NaN) so that pandas and
# missingno recognise it as missing.
df = df.replace(to_replace='?', value=np.nan)
df.head()

In [None]:
# Re-draw the matrix now that '?' has been replaced with NaN.
msno.matrix(df);

In [None]:
# Re-draw the per-column bars now that '?' has been replaced with NaN.
msno.bar(df);

In [None]:
# Manual computation of NaN ratio
# NaN count per column divided by the number of rows, largest share first.
df.isna().sum().div(len(df)).sort_values(ascending=False)

How to handle missing data

Drop or fill
- How to fill

Row vs column

# (Primitive) Feature Engineering

## Age Grouping

In [None]:
# Bucket ages into decades. right=False makes every bin left-closed:
# [10, 20), [20, 30), ..., [90, 100), labelled '10~19', '20~29', ..., '90~99'.
df['age_group'] = pd.cut(
    x=df['age'],
    bins=list(range(10, 101, 10)),
    labels=[f'{lower}~{lower + 9}' for lower in range(10, 100, 10)],
    right=False,
)

In [None]:
# Category labels of the new column, i.e. the labels passed to pd.cut above.
df['age_group'].cat.categories

In [None]:
# Spot-check 10 randomly drawn rows: each age next to the bucket it was given.
df[['age', 'age_group']].sample(10)

## Category and Category Ordering

In [None]:
# For each education label, list the distinct education-num values it occurs
# with, ordered by those values.
df.groupby('education')['education-num'].unique().sort_values()

In [None]:
# Education labels ordered from lowest to highest education-num.
# Each group is reduced to a single number with .min() before sorting.
# Sorting the arrays returned by .unique() only works while every label maps
# to exactly one number; comparing longer arrays raises an ambiguous-truth
# ValueError. With a one-to-one mapping the resulting order is unchanged.
education_order = (
    df
    .groupby('education')['education-num']
    .min()
    .sort_values()
    .index
)

In [None]:
# Store education as an ordered categorical whose levels follow education_order.
df['education'] = pd.Categorical(
    df['education'],
    categories=education_order,
    ordered=True,
)

In [None]:
# Check the result: the dtype footer should now show an ordered category.
df['education'].head()

In [None]:
# Treat education-num as an ordered categorical too. No categories are given,
# so they are inferred from the values present in the column.
df['education-num'] = pd.Categorical(df['education-num'], ordered=True)

In [None]:
# Check the result: the dtype footer should now show an ordered category.
df['education-num'].head()

## Capital gain and loss

In [None]:
(  # Check if capital-gain and capital-loss appear at the same time
    # As written, this cell only displays the two columns. Uncommenting the
    # steps below one at a time builds the actual check: non-zero -> True,
    # count the True values per row, then take the maximum over all rows.
    # A maximum of 1 means no row has both a gain and a loss.
    df[['capital-gain', 'capital-loss']]
#     .astype(bool)
#     .sum(axis='columns')
#     .max()
)

In [None]:
# Net capital result per row: gain minus loss.
df['capital-gain-loss'] = df['capital-gain'].sub(df['capital-loss'])

# Look at 10 random rows of the new column.
df['capital-gain-loss'].sample(n=10)

TODO: Add some links for more feature engineering

# Save

pandas supports various file types for both reading and saving your DataFrame.

![Pandas I/O](../image/pandas_io.png)

Source: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

[The Best Format to Save Pandas Data](https://towardsdatascience.com/the-best-format-to-save-pandas-data-414dca023e0d)

"It seems that feather format is an ideal candidate to store the data between Jupyter sessions. It shows high I/O speed, doesn’t take too much memory on the disk and doesn’t need any unpacking when loaded back into RAM."

In [None]:
# Save the preprocessed frame in Feather format.
# to_feather does not create missing parent folders, so make sure the target
# directory exists first (a no-op when it is already there).
import os

os.makedirs('../data/preprocessed', exist_ok=True)
df.to_feather('../data/preprocessed/adult.feather')

In [None]:
# Read the saved file back to confirm the round trip works.
df_feather = pd.read_feather(path='../data/preprocessed/adult.feather')
df_feather.head()

In [None]:
# Inspect the dtypes of the reloaded frame, e.g. to see whether the
# categorical columns created above came back as categories.
df_feather.info()