# Census Estimates
---

**Purpose:**

Working with "wide" data

**Data Source:**

https://www.census.gov/data/tables.html

## Prepare environment to view wide formats

In [None]:
# This trick widens the notebook container to 97% and shrinks the font to 90%
# so that more columns will fit on the screen.
# Note: sometimes you have to run this cell again in order for it to work.

# `IPython.display` is the public module for these helpers; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:97% !important; font-size:90%;}</style>"))

In [None]:
import numpy as np
import pandas as pd

# These pandas display options make wide frames more legible: floats get
# thousands separators and four decimals, and the row/column limits are raised.
# The limits are spelled with their full "display." prefix because the bare
# names 'max_rows'/'max_columns' also match the "styler.render.*" options, and
# pandas raises OptionError when a pattern matches more than one option.

pd.options.display.float_format = "{:,.4f}".format
pd.set_option('display.precision', 4)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 250)

In [None]:
import matplotlib.pyplot as plt

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and the
# old aliases were later removed, so use the new name when this installation
# has it and fall back to the legacy name otherwise.
style_name = 'seaborn-v0_8-whitegrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-whitegrid'
plt.style.use(style_name)

In [None]:
# the year range appears twice in the address: as "{start}-{end}" in the folder
# name and as the end year alone in the file name
start_year = 2010
end_year = 2019
site = "http://www2.census.gov/programs-surveys/popest/datasets/{0}-{1}/national/totals/nst-est{1}-alldata.csv"

url = site.format(start_year, end_year)
print(url)

In [None]:
# read the CSV at `url` into a dataframe, then print its column/dtype summary
census_wide = pd.read_csv(url)
census_wide.info()

In [None]:
# preview the first few rows of the raw, wide table
census_wide.head()

## Data Preparation (to reduce columns)

In [None]:
# peek at the column names as they currently stand in `census_wide`

census_wide.columns

In [None]:
# shorten the column names in three steps:
# lower-case them, swap spaces for underscores, abbreviate 'estimate' to 'est'

lowered = census_wide.columns.str.lower()
underscored = lowered.str.replace(" ","_")
census_wide.columns = underscored.str.replace('estimate','est')

In [None]:
# peek at the data type pandas holds for each column

census_wide.dtypes

In [None]:
# reduce to the columns needed: any column whose name contains 'sumlev',
# 'division', 'name', or 'est' followed by four digits
# (raw string so that `\d` reaches the regex engine instead of being treated
# as an invalid string escape)

df = census_wide.filter(regex=r'sumlev|division|name|est\d{4}')
df.head()

In [None]:
# summarize by rows: total the `popest2019` column within each `sumlev` group

df.groupby('sumlev')['popest2019'].sum()

In [None]:
# count the rows for each `sumlev` value (dropna=False also counts missing values)
df['sumlev'].value_counts(dropna=False)

In [None]:
# replace the dataframe with a subset: only rows whose `sumlev` equals 40,
# and only the columns from `name` through `popest2019`

sumlev_is_40 = df['sumlev'] == 40
df = df[sumlev_is_40].loc[:, 'name':'popest2019']
df.head()

In [None]:
# rename the `name` column to `state_name` (inplace=True modifies `df` directly)

df.rename(columns={'name':'state_name'}, inplace=True)
df.head()

In [None]:
# set an index based on a column with unique values
# NOTE(review): uniqueness of `state_name` is assumed here, not checked -- confirm

df.set_index('state_name', inplace=True)
df.head()

In [None]:
# select specific rows & columns by label: every row from 'South Dakota'
# onward and every column from 'popest2015' onward

df.loc['South Dakota':, 'popest2015':]

In [None]:
# select specific rows & columns by integer position: the last 11 rows and the last 5 columns

df.iloc[-11:, -5:]

In [None]:
# highlight the maximum of each column (axis=0 applies the highlight column by column),
# showing every value as a whole number with thousands separators

df.style.highlight_max(axis=0).format("{:,.0f}")

In [None]:
# highlight the maximum of each row (axis=1 applies the highlight row by row),
# showing every value as a whole number with thousands separators

df.style.highlight_max(axis=1).format("{:,.0f}")

In [None]:
# transpose the dataframe (rows become columns and columns become rows)

df.T

In [None]:
# review descriptive statistics of the transposed frame (one set of statistics
# per original row), shown as whole numbers with thousands separators

df.T.describe().style.format("{:,.0f}")

In [None]:
def format_number(x):
    """Format a number with a B (billions), M (millions), or K (thousands) suffix.

    The suffix is chosen from the magnitude of ``x``, so negative values are
    abbreviated the same way as positive ones (``-5_000_000`` -> ``"-5.0M"``).
    Values whose magnitude is below 1,000 -- and NaN, which fails every
    comparison -- are returned unchanged via ``str(x)``.

    Parameters
    ----------
    x : int or float
        The number to abbreviate.

    Returns
    -------
    str
        ``x`` scaled to one decimal place with a thousands separator and a
        suffix, or ``str(x)`` when no suffix applies.
    """
    # compare on the magnitude so the sign does not defeat the thresholds;
    # the sign is preserved because the original value is what gets scaled
    magnitude = abs(x)
    if magnitude >= 1_000_000_000:
        return "{:,.1f}B".format(x*1e-9)
    if magnitude >= 1_000_000:
        return "{:,.1f}M".format(x*1e-6)
    if magnitude >= 1_000:
        return "{:,.1f}K".format(x*1e-3)
    return str(x)

# sample values spanning every suffix, mapped to their formatted text
test_numbers = [5, 500, 5000, 50_000, 500_000, 5_000_000, 5e9, 5e12]

{number: format_number(number) for number in test_numbers}

In [None]:
# view the dataframe using the format_number function, applied to every cell
# `DataFrame.applymap` is deprecated in favour of `DataFrame.map`, so use `map`
# when this pandas version provides it and fall back to `applymap` otherwise
if hasattr(pd.DataFrame, "map"):
    formatted = df.map(format_number)
else:
    formatted = df.applymap(format_number)
formatted

In [None]:
# review by plotting (notice plotting problems)
# the trailing semicolon keeps the notebook from echoing the returned object

df.plot();

In [None]:
# review by plotting against columns to correct problems: draw one line per
# row of `df`, limited to rows whose mean is at least 15,000,000
# (`iteritems()` was removed in pandas 2.0, so `items()` is used; `legend`
# expects a boolean, so the row label is passed through `label` instead)

for row_label, values in df.T.items():
    if values.mean() >= 15_000_000:
        values.plot(label=row_label, legend=True, figsize=(10,6), rot=45)

In [None]:
# average each index/row across its columns, then chart the averages from
# largest to smallest

row_means = df.T.mean()
ranked_means = row_means.sort_values(ascending=False)
ranked_means.plot.bar(figsize=(14,6), rot=90, color='darkblue', grid=False)
plt.title("Average Population by State");

In [None]:
# chart the per-row averages, colouring bars crimson where the average is at
# least 10,000,000 and grey otherwise

row_means = df.T.mean()
color_map = np.where(row_means >= 10_000_000, 'crimson', 'grey')

row_means.plot.bar(figsize=(14,6), rot=90, color=color_map, alpha= .7, grid=False)
plt.title("Average Population by State");

In [None]:
# review the difference between each column and the one before it;
# columns that come out entirely NaN are dropped before display

df.diff(axis=1).dropna(how='all', axis=1).style.format("{:,.0f}")

In [None]:
# review the percent change between each column and the one before it;
# NaN results are replaced by 0 and values are shown as percentages with three decimals

df.pct_change(axis=1).fillna(0).style.format("{:,.3%}")

In [None]:
# make a new dataframe (`xf`) holding the column-to-column percent changes, with NaN replaced by 0

xf = df.pct_change(axis=1).fillna(0)
xf

In [None]:
# add a `pct_chg_` prefix to the column names to distinguish them from the prior dataframe

xf = xf.add_prefix('pct_chg_')
xf

In [None]:
# create a new dataframe which concatenates both side by side (axis=1 joins along the columns)

dd = pd.concat([df, xf], axis=1)
dd