In [None]:
import plotly.express as px
import pandas as pd
import numpy as np

We'll be using the [Plotly library](https://plotly.com/python/) to create visualizations.

# Use bar charts for categorical data

In [None]:
# Load the raw 2011-2012 income questionnaire CSV into a DataFrame,
# then print a summary of its columns, dtypes and non-null counts.
income_csv_path = '../../data/raw/2011-2012_income.csv'
income_df = pd.read_csv(income_csv_path)
income_df.info()

Find documentation on the data at https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire.
The documentation for 2011-2012 Income is [here](https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/INQ_G.htm).
Let's keep track of what the variables represent.

In [None]:
# Human-readable description for each column code in income_df,
# listed in the same order as the original mapping.
income_df_columns = dict(
    SEQN='Respondent sequence number',
    # Income sources
    INQ020='Income from wages/salaries',
    INQ012='Income from self employment',
    INQ030='Income from Social Security or RR',
    INQ060='Income from other disability pension',
    INQ080='Income from retirement/survivor pension',
    INQ090='Income from Supplemental Security Income',
    INQ132='Income from state/county cash assistance',
    INQ140='Income from interest/dividends or rental',
    INQ150='Income from other sources',
    # Family income and poverty level
    IND235='Monthly family income',
    INDFMMPI='Family monthly poverty level index',
    INDFMMPC='Family monthly poverty level category',
    # Savings
    INQBOX1='CHECK ITEM',
    INQ244='Family has savings more than $5000',
    IND247='Total savings/cash assets for the family',
)

Let's take a look at the unique values recorded for income from wages/salaries (INQ020).

In [None]:
# Distinct raw response codes present in the INQ020 column
income_df.loc[:, 'INQ020'].unique()

We will also need to rely on the documentation to tell us what these values mean.

In [None]:
# Response code -> label pairs; '.' is the code used for a missing answer.
value_defs = dict([
    ('1', 'Yes'),
    ('2', 'No'),
    ('7', 'Refused'),
    ('9', "Don't know"),
    ('.', 'Missing'),
])

Now that we know the definitions of these values, we can create a bar plot with the counts!

In [None]:
# Convert raw numeric codes into the labels defined in value_defs.
# - NaN entries receive the '.' label from value_defs rather than a second,
#   hard-coded copy of that string.
# - A code that is absent from value_defs is kept visible as "Unknown (<code>)"
#   instead of aborting the cell with a KeyError.
INC020_values = (
    income_df['INQ020']
    .map(
        lambda code: value_defs.get(str(int(code)), f'Unknown ({int(code)})'),
        na_action='ignore',  # leave NaN untouched so fillna can label it below
    )
    .fillna(value_defs['.'])
)
INC020_values

In [None]:
# Calculate value counts.
# pandas >= 2.0 names the counts column 'count' and the label column after the
# Series, whereas older versions produce 'index' and the Series name. Naming
# both columns explicitly yields the columns ('index', 'INQ020') on every
# pandas version: 'index' holds the response label, 'INQ020' holds its count.
INC020_value_count = (
    INC020_values
    .value_counts()
    .rename_axis('index')
    .reset_index(name='INQ020')
)
INC020_value_count

Do the results above match the [documentation](https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/INQ_G.htm#INQ020)?

In [None]:
# Create and show plotly bar chart.
# The 'index' column holds the response label and 'INQ020' holds its count;
# the title and `labels` mapping replace those raw column names on the figure
# so the chart can be read on its own.
fig = px.bar(
    INC020_value_count,
    x='index',
    y='INQ020',
    title='Income from wages/salaries (INQ020)',
    labels={'index': 'Response', 'INQ020': 'Number of respondents'},
)
fig.show()