### DO NOT EDIT: This file was generated by a script that combines all .ipynb files in the current directory.

## Table of Contents
<a id='top-of-page'></a>

- [1-workspace_setup](#1-workspace_setup)
- [2-load_raw_db](#2-load_raw_db)
- [3-county_and_dist_tables](#3-county_and_dist_tables)
- [4-county_demographics](#4-county_demographics)
- [5-education](#5-education)
- [6-crime](#6-crime)
- [7-prepare_for_dashboard](#7-prepare_for_dashboard)


---
---
# `1-workspace_setup`

[BACK TO TOP ^](#top-of-page)

---
---


# Setup

### Table of Contents
1. [SQL Interface](#sql-interface)
2. [Helper Methods](#helper-methods)
2. [Pandas, extended inplace](#pandas-extended-inplace)
2. [Format District](#format-district)
2. [GeoDF](#geodf)
2. [GroupedDF](#groupeddf)

In [None]:
# pip install numpy pandas

#### `extend-inplace` - a simple library I built, inspired by this project.
- Docs & source here: [github.com/ryayoung/extend-inplace](https://github.com/ryayoung/extend-inplace)

In [None]:
# pip install extend-inplace

In [3]:
from typing import Iterable, Any
import pandas as pd
import numpy as np
import sqlite3 as sqlite
import re
from extend_inplace import Extend

## SQL Interface

- End-user will use `read_raw()`, `read_main()`, `write_raw()`, and `write_main()` to communicate with sqlite

In [None]:
get_con_raw = lambda: sqlite.connect("data_raw.db")
get_con_main = lambda: sqlite.connect("data_main.db")
sql = dict(
    raw = dict(con = get_con_raw, cache = dict()),
    main = dict(con = get_con_main, cache = dict()),
)

Helper methods for the end-user methods

In [None]:
def _to_sql(
    df: pd.DataFrame,
    name: str,
    con: str
) -> None | int:
    """
    Save/replace `df` under `name` in sqlite. Database depends on `con`
    """
    res = df.to_sql(name, con=sql[con]['con'](), index=False, if_exists='replace')
    return res


def _format_query(
    *args: tuple[str], # only one string arg processed
    **kwargs: dict[str, str],
) -> str:
    """
    Allow for more pythonic style of writing sql queries, for better
    readability, user experience, and user-error prevention. Underscores
    can be used to prefix/suffix python reserved word kwargs like FROM, IF, etc.
    --
    >>> _format_query("SELECT name, age FROM my_table")
    SELECT name, age FROM my_table
    >>> _format_query("my_table")
    SELECT * FROM my_table
    >>> _format_query("my_table", WHERE = "something = something")
    SELECT * FROM my_table WHERE something = something
    >>> _format_query("name, age", from_ = "my_table")
    SELECT name, age FROM my_table
    """
    kwarg_query = " ".join([f'{k.strip("_").upper()} {v}' for k,v in kwargs.items()])
    if len(args) == 1:
        if len(args[0]) == len(re.sub(r"\s+", "", args[0])):
            query = f"SELECT * FROM {args[0]} "
        elif "select" not in args[0].lower():
            query = f"SELECT {args[0]} "
        else:
            query = args[0] + " "
        return query + kwarg_query
    else:
        return kwarg_query


def _read_sql(
    *args: tuple[str, ...],
    con: str,
    **kwargs: dict[str, str],
) -> pd.DataFrame:
    """
    This method is specific to this script. It references global variables and funcs
    """
    query = _format_query(*args, **kwargs)

    if (cached := sql[con]['cache'].get(query, None)) is not None:
        return cached.copy()

    df = sql[con]['cache'][query] = pd.read_sql(query, con=sql[con]['con']())
    return df

End-user will use these read/write methods to communicate with sqlite

In [6]:
read_raw = lambda *args, **kwargs: _read_sql(*args, **kwargs, con='raw')
read_main = lambda *args, **kwargs: _read_sql(*args, **kwargs, con='main')
# def write_raw(*args, **kwargs):
    # _to_sql(*args, **kwargs, con='raw')
write_raw = lambda *args, **kwargs: _to_sql(*args, **kwargs, con='raw')
write_main = lambda *args, **kwargs: _to_sql(*args, **kwargs, con='main')

## Helper methods

In [None]:
def head(
    *dfs: tuple[pd.DataFrame, ...],
    n: int = 3,
    with_tail: bool = False
) -> None:
    '''
    Like display() and pd.DataFrame.head got married and had a kid.
    - `with_tail` will concat head and tail together.
    '''
    for df in dfs:
        print(f'{df.shape[1]} cols x {df.shape[0]} rows')
        if with_tail:
            display(pd.concat([df.head(n-1), df.tail(n-1)], axis=0))
        else:
            display(df.head(3))


def _flatten_iterable(
    args: Any
) -> tuple[Any, ...]:
    """
    Turns anything into a flattened tuple of non-iterable (str, bytes excluded) values

    >>> _flatten_iterable(int)
    (<class 'int'>,)
    >>> _flatten_iterable(['hi'])
    ('hi',)
    >>> _flatten_iterable(['hi', (1,3, [8]), [((3,3,3))]])
    ('hi', 1, 3, 8, 3, 3, 3)
    """

    def valid_iterable(e: Any) -> bool:
        if isinstance(e, Iterable) and not isinstance(e, (str, bytes)):
            return True
        return False

    def flatten(elems: Iterable[Any]):
        for e in elems:
            if valid_iterable(e):
                yield from flatten(e)
            else:
                yield e

    args = args if valid_iterable(args) else (args,)

    return tuple(flatten(args))


## Pandas, extended inplace
- Modifying pandas `DataFrame` and `Series` classes in-place with the following extra instance methods

In [None]:
@Extend(pd.Series)
def rename_vals_from_df(self, changes:pd.DataFrame) -> pd.Series:
    cols = changes.columns
    old, new = changes[cols[0]], changes[cols[1]]
    return self.map(dict(zip(old, new)))

In [None]:
@Extend(pd.DataFrame)
class _:
    def set_columns(self, *new) -> pd.DataFrame:
        new = _flatten_iterable(new)
        self.columns = new
        return self


    def rename_col(self, old:str, new:str) -> pd.DataFrame:
        """
        Simplify syntax for renaming one column
        """
        return self.rename(columns={old:new})


    def drop_cols(self, *columns) -> pd.DataFrame:
        """
        Simplify syntax of dropping columns
        """
        cols = _flatten_iterable(columns)
        cols = [c for c in cols if c in self.columns]
        return self.drop(columns=cols)


    def prefix_cols(self, cols:str or list, prefix:str) -> pd.DataFrame:
        """
        Like df.add_prefix(), but takes a subset of columns as first positional
        """
        if isinstance(cols, str):
            cols = [cols]
        return self.rename(columns={col: f'{prefix}{col}' for col in cols})


    def reset_multilevel_columns(
        self,
        *new_columns: tuple[str | Iterable[str], ...],
    ) -> pd.DataFrame:
        """
        Use this after df.pivot() to flatten and rename columns.
        """
        new_columns = _flatten_iterable(new_columns)
        self = self.reset_index()
        self.columns = self.columns.droplevel()
        self.columns.name = None
        self = self.set_columns(new_columns)
        return self


    def col_replace(
        self,
        text: str | dict,
        replacement: str | None = None
    ) -> pd.DataFrame:
        """
        Replace a pattern in each column NAME
        """
        if replacement is None:
            if isinstance(text, dict):
                to_replace = text
            else:
                raise ValueError("Multiple values must be passed as dict")
        else:
            to_replace = {text: replacement}
        
        for old, new in to_replace.items():
            for c in self.columns:
                if c != old:
                    self = self.rename(columns={c: c.replace(old, new)})
        return self


    def coerce_type(
        self,
        dtype: str or type,
        subset: list = None,
        exclude: list = None
    ) -> pd.DataFrame:
        """
        Iteratively try to set all columns to type
        """
        df = self.copy()
        cols = tuple(subset) if subset else tuple(self.columns)
        if exclude:
            cols = [c for c in cols if c not in exclude]
        for c in cols:
            try:
                df[c] = df[c].astype(dtype)
            except Exception:
                pass
        return df


    def insert_at(
        self,
        target: str | int,
        name: str,
        col: pd.Series
    ) -> pd.DataFrame:
        """
        Insert col before target col name, or to index.
        Like df.insert(), but takes a column name as location, instead of int """
        df = self.copy()
        if isinstance(target, int):
            idx = target
        else:
            idx = list(df.columns).index(target)
        df.insert(idx, name, col)
        return df


    def move_col(
        self,
        name: str,
        target: str | int
    ) -> pd.DataFrame:
        """
        Move col to before target col name, or to index.
        - Must not mutate original dataframe (so we can chain the func
        and re-run cells).
        - Placement must be correct: if target column is string, always
        place our column before the target column. If target is an index,
        the RESULTING dataframe must have our new column in the specified index.
        """
        cols = list(self.columns)

        if isinstance(target, int):
            idx = target
            cols.remove(name)
            cols.insert(idx, name)
        elif isinstance(target, str):
            idx = list(self.columns).index(target)
            cols[cols.index(name)] = "7dwIFmVgq5f1z"
            cols.insert(idx, name)
            cols.remove("7dwIFmVgq5f1z")

        return self[cols]


    def separate_by(
        self,
        to_match: list or str,
        index: list = [],
        keep: list = [],
        start: bool = False,
        end: bool = False,
        mode: str = "include",
    ) -> pd.DataFrame:
        """
        Given a df and a substring, filter for columns whose name does, or
        does not, contain a substring

        to_match
            text(s) to match
        index
            key columns to ignore (year, county, district, etc.)
        keep
            columns to ignore in matching. If "exclude", these columns will be removed
        start
            match only if column starts with `to_match` element, instead of contains
        end
            match only if column ends with `to_match` element, instead of contains
        mode
            If "include", returned df will include columns in `index`, `keep`, and matches
            If "exclude", returned df will exclude columns in `keep`, and matches
        """
        if not isinstance(to_match, list):
            to_match = [to_match]

        names = [item for sublist in [[c for c in self.columns if (
                c.startswith(txt) if start else c.endswith(txt) if end else txt in c
            )] for txt in to_match] for item in sublist]

        if mode == 'include':
            return self.copy()[index + keep + names]
        if mode == 'exclude':
            return self.copy().drop(columns = keep + names)


    def display(self, text: bool = None, head: bool = True) -> pd.DataFrame:
        """
        Display while chaining methods. Return self
        """
        if text is not None:
            print(text)
        to_display = self.head(3) if head else self
        display(to_display)
        return self
    

    def rename_vals_from_df(self, column: str, changes: pd.DataFrame) -> pd.DataFrame:
        """
        Renames values in a column using a 2-column dataframe (old, new)
        instead of dictionary.
        """
        df = self.copy()
        df[column] = df[column].rename_vals_from_df(changes)
        return df


    def rename_cols_from_df(self, changes: pd.DataFrame) -> pd.DataFrame:
        """
        Renames columns in df, mapping changes from a 2-column df (old, new)
        """
        cols = changes.columns
        old, new = changes[cols[0]], changes[cols[1]]
        return self.rename(columns=dict(zip(old, new)))

## Format District
- Apply the below function to all school district columns from source, to standardize their naming conventions to allow joins

In [None]:
def standardize_district_name(name: str) -> str:
    """
    Apply this iteratively (with pd.Series.apply())
    to school district name columns to standardize their naming conventions
    as best as possible prior to merging datasets with potentially
    very different naming conventions.
    """
    import re
    name = name.upper()
    name = re.sub('S/D', '', name)
    name = re.sub('-|\.|\(|\)|/|:', '', name)
    name = re.sub(' CONSOLIDATED', '', name)
    name = re.sub('\s?SCHOOL DISTRICT', '', name)
    
    # Number patterns
    name = re.sub(r' RENO\s?(\d+)', r'\1', name)
    name = re.sub(r' NO\s?(\d+)', r'\1', name)
    name = re.sub(r' RD\s?(\d+)', r'\1', name)
    name = re.sub(r' RJ\s?(\d+)', r'\1', name)
    name = re.sub(r' RE\s?(\d+)J?T?', r'\1', name)
    name = re.sub(r' R\s?(\d+)J?', r'\1', name)
    name = re.sub(r' C\s?(\d+)', r'\1', name)

    # Remove spaces
    name = re.sub('\s', '', name)

    # Number patterns (text at end)
    name = re.sub(r'(\d+)R', r'\1', name)
    name = re.sub(r'(\d+)J', r'\1', name)
    name = re.sub(r'(\d+)JT', r'\1', name)

    # Delete text parts
    name = re.sub('RURAL', '', name)
    name = re.sub('SCHOOLS', '', name)
    # name = re.sub('SCHOOLDISTRICT', '', name)
    name = re.sub('SCHOOLDIST', '', name)
    name = re.sub('WATERSHED', '', name)
    name = name.strip()

    # Replace full
    name = name.replace(r'GILCREST', 'WELDCOUNTY')
    name = name.replace(r'FLORENCE', 'FREMONT')
    name = name.replace(r'CONSOLIDATED1', 'CUSTERCOUNTY1')
    name = re.sub(r'(PUEBLOCITY)(\d+)', r'\1', name)
    name = re.sub(r'^CREEDE$', r'CREEDE1', name)

    name = name.strip()
    # Push number out
    name = re.sub(r'(.*?)(\d+)(.*)', r'\1\3 \2', name)
    return name


def join_conflicts(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    col: str
) -> pd.DataFrame:
    '''
    Use when trying to join columns and see values that aren't shared.
    There's DEFINITELY a better way to do this. But I'm lazy, and nobody cares!
    '''
    import pandas as pd

    col_items1 = sorted(df1[col])
    col_items2 = sorted(df2[col])

    items1_diff = [i for i in col_items1 if i not in col_items2]
    items2_diff = [i for i in col_items2 if i not in col_items1]

    # Make lists same length.
    if len(items1_diff) > len(items2_diff):
        items2_diff += [None] * (len(items1_diff) - len(items2_diff))
    elif len(items2_diff) > len(items1_diff):
        items1_diff += [None] * (len(items2_diff) - len(items1_diff))
    
    return pd.DataFrame(list(zip(items1_diff, items2_diff)))

## GeoDF

- Child class of `geopy.GeoDataFrame` to provide ease of use and additional functionality

In [None]:
import geopandas as gp
from shapely import wkt

class GeoDF(gp.GeoDataFrame):

    def __init__(self, df, geo=None, crs='epsg:4326'):
        if type(df) == str:
            df = pd.read_csv(df)
        if type(df) == pd.DataFrame:
            df = df.copy()

            cols = [c for c in df.columns if c.startswith('geo_')]
            for c in cols:
                df[c] = df[c].fillna('GEOMETRYCOLLECTION EMPTY')
                df[c] = gp.GeoSeries(df[c].apply(wkt.loads))

            if not geo:
                geo = cols[0]

            df['geometry'] = df[geo]

        super(GeoDF, self).__init__(df, crs=crs)
    

    def explore(self, tooltip=None, geo=None, **kwargs):
        if geo:
            self.set_geo(geo)
        
        if not tooltip:
            tooltip = self.columns[0]
            if 'county' in self.columns and 'dist' in self.columns:
                tooltip = ['county', 'dist']

        return super().loc[self['geometry'].astype(str) != 'GEOMETRYCOLLECTION EMPTY'].explore(tooltip=tooltip, **kwargs)


    def df(self):
        return self[self.geometry.astype(str) != 'GEOMETRYCOLLECTION EMPTY']


    def set_geo(self, geo, crs='epsg:4326'):
        self['geometry'] = self[geo]
    

    def copy(self):
        return GeoDF(super().copy())

## GroupedDF

- Lets us split a dataframe into organized groups based on column names.
- Useful for massive dataframes with too many columns to conveniently select columns manually

In [None]:
from copy import deepcopy

class GroupedDF:
    default_index = []
    groups: dict = None

    def __init__(self, df, index=[], custom={}, show_g_names=True):
        self.index = index
        if self.index == []: self.index = GroupedDF.default_index

        self.index = index
        self._df = deepcopy(df)
        self._show_g_names = show_g_names

        self._custom = custom
        self.refresh_groups()
    

    def refresh_groups(self):
        self._dict = {g: self._df.separate_by(g, self.index, start=True, mode='include') for g in GroupedDF.groups.keys()}

        if self._show_g_names == False:
            for k, v in self._dict.items():
                self._dict[k] = v.col_replace(f'{k}_', '')

        for name, cols in self._custom.items():
            self._dict[name] = self._df[cols]

        for k, v in self._dict.items():
            setattr(self, k, v)
    

    @classmethod
    def set_groups(cls, items: dict or list):

        if type(items) == list:
            cls.groups = {k: "" for k in items}
            return
        
        cls.groups = items


    @property
    def df(self):
        return self._df
    
    @df.setter
    def df(self, new):
        self._df = new
        self.refresh_groups()


    def __getattr__(self, name):
        return self._dict.get(name)

    def __getitem__(self, name):
        return self._dict[name]
    

    @property
    def dict(self):
        return self._dict
    
    @property
    def show_g_names(self):
        return self._show_g_names
    
    @show_g_names.setter
    def show_g_names(self, val:bool):
        self._show_g_names = val
        self.refresh_groups()
    

    def display(self, rows=3, exclude=[]):
        for k, v in self._dict.items():
            print(k, GroupedDF.groups[k], sep=': ')
            display(v.drop(columns=exclude).head(rows))
            print()

---
---
# `2-load_raw_db`

[BACK TO TOP ^](#top-of-page)

---
---


# Load data into sqlite
---
#### Note: by default, this will pull data directly from the source. Source dataset formatting can change at any time. If you encounter errors elsewhere in this pipeline, use backup data instead by setting `use_backup=True` below
(edit: `use_backup` defaults to True)

- With the Socrata ID and desired key name for all datasets:
  - Format a request url with the ID
  - Use `pd.read_csv()` to download the dataset into a dataframe.
  - Insert the table into sql, using dictionary key as table name

In [2]:
use_backup = True

In [3]:
dataset_info = {
    'districts': 'mm2p-ag5w',
    'counties': 'ahgn-r8s5',
    'census_counties_2012': 'f4n4-vnyx',
    'census_counties_2013': 'm3j7-raj9',
    'census_counties_2014': 'wshk-29g7',
    'census_counties_2015': 't48m-528x',
    'census_counties_2016': 'eghh-ua8y',
    'census_counties_2017': 'ewkj-ipn7',
    'census_counties_2018': 'xum2-smvh',
    'census_counties_2019': '8j3i-rjn4',
    'census_counties_field_desc': 'qten-sdpn',
    'county_population': 'eeah-cmy8',
    'crime_16_19': 'j6g4-gayk',
    'crime_97_15': '6vnq-az4b',
    'dist_grad_rate': 'cfyh-6xxg',
    'dist_mobility_demographics': 'rg84-k4d3',
    'dist_student_mobility': '6wcd-ysh5',
}

In [4]:
format_url = lambda id: f'https://data.colorado.gov/api/views/{id}/rows.csv?accessType=DOWNLOAD'
backup_file_fmt = lambda name: f"backup/{name}.parquet.gzip"


def query(txt: str, con: str) -> list:
    cur = sql[con]['con']().cursor()
    return cur.execute(txt).fetchall()


def table_exists(con: str, name):
    c = sql[con]['con']().cursor()
                
    # get the count of tables with the name
    c.execute(f"""
    SELECT
        count(name)
    FROM sqlite_master
    WHERE type='table'
        AND name='{name}'
    """)

    # if the count is 1, then table exists
    if c.fetchone()[0] == 1:
        return True
    return False

## Load tables and write to raw

In [5]:
for name, id in dataset_info.items():
    if table_exists('raw', name):
        continue
    if use_backup == True:
        df = pd.read_parquet(backup_file_fmt(name))
    else:
        df = pd.read_csv(format_url(id))
    write_raw(df, name)

## See tables in raw and main

In [6]:
raw_tables = query('''SELECT name FROM sqlite_master WHERE type='table';''', con='raw')
main_tables = query('''SELECT name FROM sqlite_master WHERE type='table';''', con='main')

print("RAW:")
print(*[t[0] for t in raw_tables], sep='\n')
print("\nMAIN:")
print(*[t[0] for t in main_tables], sep='\n')

RAW:
districts
counties
census_counties_2012
census_counties_2013
census_counties_2014
census_counties_2015
census_counties_2016
census_counties_2017
census_counties_2018
census_counties_2019
census_counties_field_desc
county_population
crime_16_19
crime_97_15
dist_grad_rate
dist_mobility_demographics
dist_student_mobility

MAIN:



### Save to backup?

In [7]:
def save_raw_data_to_backup():
    """
    No need to use this unless you want to backup current
    state of raw datasets as parquet files
    """
    for name in dataset_info.keys():
        df = read_raw(name)
        df.to_parquet(backup_file_fmt(name), compression='gzip', index=False)

# save_raw_data_to_backup()

### Clear databases

In [8]:
def empty_raw():
    raw_tables = query("SELECT name FROM sqlite_master WHERE type='table'", con='raw')
    con = get_con_raw()
    cur = con.cursor()
    for name in raw_tables:
        print(name[0])
        cur.execute(f"DROP TABLE IF EXISTS {name[0]}")
    con.commit()
    cur.execute("VACUUM;")

# empty_raw()

In [9]:
def empty_main():
    main_tables = query("SELECT name FROM sqlite_master WHERE type='table'", con='main')
    con = get_con_main()
    cur = con.cursor()
    for name in main_tables:
        print(name[0])
        cur.execute(f"DROP TABLE IF EXISTS {name[0]}")
    con.commit()
    cur.execute("VACUUM;")

# empty_main()

---
---
# `3-county_and_dist_tables`

[BACK TO TOP ^](#top-of-page)

---
---


# Reference tables: `county` and `district`
---
- Standardize key to reference with other tables in the future
- Store additional info for each county and district
- Use **Google Maps V3 API** to collect geographic points for the center of each county and district

## School Districts
- This will be difficult. There is no standard naming convention for districts, and many datasets have typos.
- Also, geo borders and address info are found in two different datasets (which, of course, use different naming conventions for districts)
- The `dist_grad_rate` dataset is the only one we've found which provides a county for each school district

In [25]:
head(
    read_raw('districts'),
    read_raw('dist_grad_rate')
)

19 cols x 178 rows


Unnamed: 0,the_geom,gid,lgid,source,modneeded,generalize,priority,lastupdate,id,lgtypeid,lgstatusid,abbrev_nam,mail_addre,alt_addres,mail_city,mail_state,mail_zip,url,prev_name
0,MULTIPOLYGON (((-106.59904239399998 39.0362792...,443,8900,Census TIGER SHP 2014,,,0,December 2015,1793,99,1,Buena Vista R-31 School District,PO Box 2027,,Buena Vista,CO,81211,www.bvschools.org,
1,MULTIPOLYGON (((-102.43672348799998 39.5852515...,444,64929,Census TIGER SHP 2014,,,0,December 2015,217,99,1,Burlington RE-6J School District,PO Box 369,,Burlington,CO,80807,www.burlingtonk12.org,
2,MULTIPOLYGON (((-104.28505092499995 39.5649584...,445,64908,Census TIGER SHP 2014,,,0,December 2015,857,99,1,Byers 32J School District,444 East Front Street,,Byers,CO,80103,byers32j.k12.co.us,


38 cols x 185 rows


Unnamed: 0,County Name,Organization Code,Organization Name,Students with Disabilities Final Grad Base,Students with Disabilities Graduates Total,Students with Disabilities Graduation Rate,Students with Disabilities Completers Total,Students with Disabilities Completion Rate,Limited English Proficient Final Grad Base,Limited English Proficient Graduates Total,...,Homeless Final Grad Base,Homeless Graduates Total,Homeless Graduation Rate,Homeless Completers Total,Homeless Completion Rate,Gifted-Talented Final Grad Base,Gifted-Talented Graduates Total,Gifted-Talented Graduation Rate,Gifted-Talented Completers Total,Gifted-Talented Completion Rate
0,,9999,STATE TOTAL,5775,3099,53.7,3222,55.8,6171,3289,...,2394,1175,49.1,1262,52.7,6604,6048,91.6,6156,93.2
1,ADAMS,10,MAPLETON 1,49,18,36.7,19,38.8,219,73,...,41,12,29.3,16,39.0,44,27,61.4,27,61.4
2,ADAMS,20,ADAMS 12 FIVE STAR SCHOOLS,250,118,47.2,127,50.8,379,257,...,106,62,58.5,65,61.3,227,201,88.5,208,91.6


In [26]:
dist_info = read_raw("""
    abbrev_nam AS district,
    the_geom AS geo_border,
    gid,
    lgid,
    id,
    mail_addre AS street,
    mail_city AS city,
    mail_state AS state,
    mail_zip AS zip,
    url
""",
FROM='districts'
)
head(dist_info)

10 cols x 178 rows


Unnamed: 0,district,geo_border,gid,lgid,id,street,city,state,zip,url
0,Buena Vista R-31 School District,MULTIPOLYGON (((-106.59904239399998 39.0362792...,443,8900,1793,PO Box 2027,Buena Vista,CO,81211,www.bvschools.org
1,Burlington RE-6J School District,MULTIPOLYGON (((-102.43672348799998 39.5852515...,444,64929,217,PO Box 369,Burlington,CO,80807,www.burlingtonk12.org
2,Byers 32J School District,MULTIPOLYGON (((-104.28505092499995 39.5649584...,445,64908,857,444 East Front Street,Byers,CO,80103,byers32j.k12.co.us


In [27]:
dist_county = read_raw("""
    `Organization Name` AS district,
    `Organization Code` AS code,
    `County Name` AS in_county
""",
FROM="dist_grad_rate",
WHERE="district != 'STATE TOTAL'"
)
head(dist_county)

3 cols x 183 rows


Unnamed: 0,district,code,in_county
0,MAPLETON 1,10,ADAMS
1,ADAMS 12 FIVE STAR SCHOOLS,20,ADAMS
2,ADAMS COUNTY 14,30,ADAMS


## Standardize district naming conventions
---

In [28]:
# standardize_district_name and join_conflicts defined in workspace setup
dist_info.district = dist_info.district.apply(standardize_district_name)
dist_county.district = dist_county.district.apply(standardize_district_name)

# See keys that aren't shared
dist_diff = join_conflicts(dist_info, dist_county, 'district')
dist_diff

Unnamed: 0,0,1
0,,CENTENNIALBOCES
1,,CHARTERSCHOOLINSTITUTE
2,,EXPEDITIONARYBOCES
3,,MOUNTAINBOCES
4,,SANJUANBOCES


#### Some districts have invalid counties listed. Fix it

In [29]:
dist_to_county_map = {
    # Where district == key, set county to val
    'CUSTERCOUNTY 1': 'CUSTER',
    'CHARTERSCHOOLINSTITUTE': 'DENVER',
    'MOUNTAINBOCES': 'CHAFFEE',
    'CENTENNIALBOCES': 'WELD',
    'SANJUANBOCES': 'LA PLATA',
    'EXPEDITIONARYBOCES': 'DENVER',
}
for dist_name, new_county_name in dist_to_county_map.items():
    dist_county.loc[dist_county.district == dist_name, 'in_county'] = new_county_name

### Merge district tables

In [30]:
district = dist_county.merge(dist_info, on='district', how='left')
head(district, with_tail=True)

12 cols x 183 rows


Unnamed: 0,district,code,in_county,geo_border,gid,lgid,id,street,city,state,zip,url
0,MAPLETON 1,10,ADAMS,MULTIPOLYGON (((-105.01581612299998 39.8144774...,593.0,1900.0,2210.0,5910 East 80th Avenue,Denver,CO,80229.0,www.mapleton.us/
1,ADAMSFIVESTAR 12,20,ADAMS,MULTIPOLYGON (((-105.05310614499996 39.9302934...,648.0,1901.0,2211.0,1500 E 128th Avenue,Thornton,CO,80241.0,www.adams12.org
181,SANJUANBOCES,9050,LA PLATA,,,,,,,,,
182,EXPEDITIONARYBOCES,9130,DENVER,,,,,,,,,


## County

In [31]:
county = read_raw("""
    COUNTY as county,
    the_geom as geo_border
""",
FROM="counties"
)
head(county)

2 cols x 64 rows


Unnamed: 0,county,geo_border
0,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...
1,ALAMOSA,MULTIPOLYGON (((-105.59917426201822 37.7521648...
2,ARAPAHOE,MULTIPOLYGON (((-103.70653410023402 39.7398580...


# Geocoding - Google Maps V3 API

In [32]:
from geopy.geocoders import GoogleV3
import geopandas as gp
api_kwargs = dict(provider = 'google', api_key = 'NVmnFlQnc_MiOFcCoz7dwIFmVgq5f1zncXCdwJp')

### County

In [33]:
try:
    county['geo_point'] = (
        gp.tools
        .geocode(county.county.str.capitalize() + " County, Colorado", **api_kwargs)
        ['geometry']
        .astype(str)
    )
except Exception:
    county['geo_point'] = np.nan
head(county)

3 cols x 64 rows


Unnamed: 0,county,geo_border,geo_point
0,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,GEOMETRYCOLLECTION EMPTY
1,ALAMOSA,MULTIPOLYGON (((-105.59917426201822 37.7521648...,GEOMETRYCOLLECTION EMPTY
2,ARAPAHOE,MULTIPOLYGON (((-103.70653410023402 39.7398580...,GEOMETRYCOLLECTION EMPTY


In [34]:
write_main(county, 'county')

64

### School District

Add full address column for geocoder to use

In [35]:
district.insert(0, 'address', (
    district.loc[
        ~ district.street.isna(),
        ['street', 'city', 'state', 'zip']
    ]
    .agg(', '.join, axis=1)
))

In [36]:
head(district, with_tail=True)

13 cols x 183 rows


Unnamed: 0,address,district,code,in_county,geo_border,gid,lgid,id,street,city,state,zip,url
0,"5910 East 80th Avenue, Denver, CO, 80229",MAPLETON 1,10,ADAMS,MULTIPOLYGON (((-105.01581612299998 39.8144774...,593.0,1900.0,2210.0,5910 East 80th Avenue,Denver,CO,80229.0,www.mapleton.us/
1,"1500 E 128th Avenue, Thornton, CO, 80241",ADAMSFIVESTAR 12,20,ADAMS,MULTIPOLYGON (((-105.05310614499996 39.9302934...,648.0,1901.0,2211.0,1500 E 128th Avenue,Thornton,CO,80241.0,www.adams12.org
181,,SANJUANBOCES,9050,LA PLATA,,,,,,,,,
182,,EXPEDITIONARYBOCES,9130,DENVER,,,,,,,,,


In [37]:
district['geo_point'] = np.nan
try:
    district.loc[ ~ district.address.isna(), 'geo_point'] = (
        gp.tools
        .geocode(district.address, **api_kwargs)
        ['geometry']
        .astype(str)
    )
except Exception:
    district['geo_point'] = np.nan

district = (district
    .drop_cols('address')
    .move_col('geo_point', 4)
)
head(district, with_tail=True)

13 cols x 183 rows


Unnamed: 0,district,code,in_county,geo_border,geo_point,gid,lgid,id,street,city,state,zip,url
0,MAPLETON 1,10,ADAMS,MULTIPOLYGON (((-105.01581612299998 39.8144774...,GEOMETRYCOLLECTION EMPTY,593.0,1900.0,2210.0,5910 East 80th Avenue,Denver,CO,80229.0,www.mapleton.us/
1,ADAMSFIVESTAR 12,20,ADAMS,MULTIPOLYGON (((-105.05310614499996 39.9302934...,GEOMETRYCOLLECTION EMPTY,648.0,1901.0,2211.0,1500 E 128th Avenue,Thornton,CO,80241.0,www.adams12.org
181,SANJUANBOCES,9050,LA PLATA,,,,,,,,,,
182,EXPEDITIONARYBOCES,9130,DENVER,,,,,,,,,,


In [38]:
write_main(district, 'district')

183

---
---
# `4-county_demographics`

[BACK TO TOP ^](#top-of-page)

---
---


# County Demographics
---

### Why use a separate dataset for population?
- Most population groups are present in the census data (gender, age, etc.) So why use an additional population dataset instead?
  - The population dataset claims to provide "actual" numbers, whereas the census data claims to provide "estimates"
  - The population dataset is more precise, with age groups of each individual age number, allowing us to make our own aggregated bins (adult, minor). The census data has age groups defined already, but in increments of 5, so the middle group is "15 to 19", but we need 18 and under!
  - The population dataset offers sub-aggregations: we have `minor_female` and `minor_male`, for instance, whereas the census data only offers age populations and gender populations separately
- So instead, we will use population dataset first, and add in additional groups from census data

In [19]:
INDEX = ['year', 'county']

## Population

> This data will supplement our census data in the next step of data prep. We're using the Population dataset because it's more accurate (census is just estimates), and it lets us create the age grouping ourselves. There are nearly 400,000 rows, because they give us population by year, county, and EACH individual age. In our case, we want to create an age grouping that separates students in school, and adults. So we chose >= 19, and < 19. It also has a 60 year timeframe. So number of rows = 60 years * 64 counties * 90 years of age.

In [20]:
df_raw = read_raw("""
    year,
    UPPER(county) AS county,
    age,
    malePopulation AS male,
    femalePopulation AS female,
    totalPopulation AS total
""",
FROM="county_population"
)
head(df_raw)

6 cols x 381504 rows


Unnamed: 0,year,county,age,male,female,total
0,1990,ADAMS,0,2354,2404,4758
1,1990,ADAMS,1,2345,2375,4720
2,1990,ADAMS,2,2413,2219,4632


### Age groups (< 19, >= 19)

In [21]:
df = df_raw.coerce_type(float, exclude=['year'])
df['age_range'] = "over18"
df.loc[df.age <= 18, 'age_range'] = 'under19'
df = df.drop_cols('age')
head(df)

df = (df
    .groupby(INDEX + ['age_range'])
    .sum()
    .reset_index()
)
df_grouped = df
head(df_grouped)

6 cols x 381504 rows


Unnamed: 0,year,county,male,female,total,age_range
0,1990,ADAMS,2354.0,2404.0,4758.0,under19
1,1990,ADAMS,2345.0,2375.0,4720.0,under19
2,1990,ADAMS,2413.0,2219.0,4632.0,under19


6 cols x 7808 rows


Unnamed: 0,year,county,age_range,male,female,total
0,1990,ADAMS,over18,90383.0,94282.0,184665.0
1,1990,ADAMS,under19,41519.0,39525.0,81044.0
2,1990,ALAMOSA,over18,4488.0,4823.0,9311.0


### Notice the `age_range` column. We should pivot those values out to their own columns, and mix with our existing columns
- First, pivot age_range into the male, female, and total columns
- We're left with a multilevel column index, so we drop a level and rename everything by hand.
- Lastly, restore the total, male, and female columns since they got split in half when pivoting.

In [22]:
df = (df_grouped
    .pivot(
        index=INDEX,
        columns='age_range',
        values=['male', 'female', 'total']
    )
    .reset_multilevel_columns(INDEX + ['over18', 'under19', 'under19_male', 'under19_female', 'over18_male', 'over18_female'])
)

df = (df
    .insert_at(2, 'female', df.under19_female + df.over18_female)
    .insert_at(2, 'male', df.under19_male + df.over18_male)
    .insert_at(2, 'total', df.under19 + df.over18)
)

pop_raw = df
head(pop_raw)

11 cols x 3904 rows


Unnamed: 0,year,county,total,male,female,over18,under19,under19_male,under19_female,over18_male,over18_female
0,1990,ADAMS,131902.0,278947.0,120569.0,90383.0,41519.0,94282.0,39525.0,184665.0,81044.0
1,1990,ALAMOSA,6677.0,14134.0,6423.0,4488.0,2189.0,4823.0,2117.0,9311.0,4306.0
2,1990,ARAPAHOE,191722.0,428121.0,166735.0,134481.0,57241.0,146820.0,54747.0,281301.0,111988.0


In [23]:
write_main(pop_raw, 'county_population')

3904

---
---
---

## Census Field Descriptions
---
- To supplement the "Census Counties ..." datasets, they've provided us a table with descriptions of each column name, for each historical standard of the census. Fortunately, the 2019 and 2012 census data (that's what we're using) uses the same standard: `acs_standard`
- This script does the following:
  - Filters the source dataframe to only include `acs_standard` column descriptions
  - Selects only necessary columns (column name, description)
  - Renames some values in column name, and removes some column name values we'll never use (geonum, geojson)

In [24]:
desc = read_raw("""
        apifieldname AS field_name,
        description
    """,
    FROM = "census_counties_field_desc",
    WHERE = "type = 'acs_standard' AND field_name NOT IN ('geonum', 'geojson')"
)
desc.loc[desc.field_name == 'geoname', 'field_name'] = 'county'

write_main(desc, 'census_counties_field_desc')

head(desc)

2 cols x 155 rows


Unnamed: 0,field_name,description
0,county,Geographic Area common name
1,pop,Population Estimate for the given time range
2,hispanic,Estimate for the Hispanic Population


---
---
---

# Engineering Census data
- Prepare census data for a variety of uses, such as visualization, aggregation and predictive modeling

In [25]:
# Each census year comes in a separate dataset
dem = (
    pd.concat([
            (read_raw(f'census_counties_{year}')
                .drop_cols('pop', 'geonum', 'the_geom')
                .assign(year=year)
                .rename_col('civ_ni_','civ_ni_p')
            )
        for year in range(2012, 2020)
    ])
    .copy() # avoid fragmentation caused by assign()
    .rename_col('geoname','county')
    .move_col('year', 0)
    .coerce_type(float, exclude='year')
)

dem.county = (
    dem.county
    .str.replace(" County, Colorado", "")
    .str.upper()
)
head(dem)

155 cols x 512 rows


Unnamed: 0,year,county,hispanic,white_nh,black_nh,ntvam_nh,asian_nh,hawpi_nh,other_nh,twoplus_nh,...,civ_ni_pop,disabled,pop16_pls,laborforce,civ_lf,emp,unemp,armedfrcs,not_lf,civ_ni_p
0,2012,ARAPAHOE,105174.0,364766.0,55629.0,2211.0,28067.0,1166.0,1267.0,16077.0,...,568663.0,49870.0,444215.0,320199.0,318041.0,292089.0,25952.0,2158.0,124016.0,568663.0
1,2012,MINERAL,15.0,671.0,9.0,5.0,1.0,0.0,0.0,1.0,...,702.0,129.0,681.0,391.0,391.0,370.0,21.0,0.0,290.0,702.0
2,2012,MONTROSE,8037.0,31799.0,186.0,74.0,227.0,49.0,33.0,589.0,...,40552.0,5649.0,32334.0,20137.0,20124.0,18110.0,2014.0,13.0,12197.0,40552.0


#### Select desired columns from census data

In [26]:
df = dem.copy()[INDEX + [
    'med_age',
    'households', 'avghhsize',
    'civ_lf', 'emp', 'unemp',
    'hispanic', 'white_nh', 'black_nh', 'asian_nh', 'ntvam_nh', 'hawpi_nh', 'other_nh', 'twoplus_nh',
    'pop25plus', 'hsgrad_sc',
    'med_hh_inc', 'per_cap_in',
    'citz_birth', 'citz_nat', 'born_in_co',
    'pop_3pl', 'enrolled', 'undergrad',
    'gr_1_4', 'gr_5_8', 'gr_9_12',
    'med_hm_val', 'med_yr_blt',
    'housing_un', 'occ_hu',
    'own_occ_hu', 'v_l_50k', 'v50k_100k', 'v100k_150k', 'v150k_200k', 'v200k_250k', 'v250k_300k',
    'v300k_400k', 'v400k_500k', 'v500k_750k', 'v750k_1m', 'v_1m_plus',
    'b2000_2009', 'b1990_1999', 'b1980_1989', 'b1970_1979',
    'b1960_1969', 'b1950_1959', 'b1940_1949', 'b1939_e',
    'ps_uni', 'ps_below',
    'tot_l18', 'pov_l18',
]]

#### Group bins together

In [27]:
from typing import Callable
@Extend(pd.DataFrame)
def combine_cols(
    self,
    name: str | None = None,
    cols: list | None = None,
    items: dict | None = None,
    replace: bool = True,
    func: Callable[[Iterable[Any]], int | float] = sum
) -> pd.DataFrame:
    """
    Given a list of column names, create a new column with their sum, and
    position it before the first col in 'cols'. So if replace=True, then
    the old columns will effectively be replaced in their original position.
    To do multiple sums, pass 'items' as dict with names as keys and col list as vals
    """
    df = self.copy()
    if not items:
        items = {name: cols}
    for name, cols in items.items():
        new = func([df[c] for c in cols])
        df = df.insert_at(cols[0], name, new)
        if replace:
            df = df.drop_cols(cols)
    return df

In [28]:
# Create new variable for total citizens. Place it next to citz_birth
df = (df
    .insert_at('citz_birth', 'citz', df.citz_birth + df.citz_nat)
    .drop_cols('citz_nat')
    .combine_cols(items={
        'race_other': ['ntvam_nh', 'hawpi_nh', 'other_nh', 'twoplus_nh'],
        'b1949_e': ['b1939_e', 'b1940_1949'],
        'v50k_150k':  ['v50k_100k', 'v100k_150k'],
        'v150k_250k': ['v150k_200k', 'v200k_250k'],
        'v250k_400k': ['v250k_300k', 'v300k_400k'],
        'v400k_750k': ['v400k_500k', 'v500k_750k'],
        'v750k_plus': ['v750k_1m', 'v_1m_plus'],
    })
)
head(df)

48 cols x 512 rows


Unnamed: 0,year,county,med_age,households,avghhsize,civ_lf,emp,unemp,hispanic,white_nh,...,b1990_1999,b1980_1989,b1970_1979,b1960_1969,b1950_1959,b1949_e,ps_uni,ps_below,tot_l18,pov_l18
0,2012,ARAPAHOE,35.7,223747.0,2.55,318041.0,292089.0,25952.0,105174.0,364766.0,...,33989.0,56011.0,62253.0,22258.0,16519.0,7165.0,568999.0,66945.0,144576.0,23054.0
1,2012,MINERAL,60.3,363.0,1.83,391.0,370.0,21.0,15.0,671.0,...,232.0,239.0,203.0,100.0,75.0,240.0,702.0,47.0,26.0,0.0
2,2012,MONTROSE,42.6,16732.0,2.41,20124.0,18110.0,2014.0,8037.0,31799.0,...,3750.0,2106.0,3581.0,1298.0,920.0,2333.0,40368.0,5565.0,9788.0,1927.0


#### Create nominal variables for housing price and housing age
- First, create a categorical variable whose values are the COLUMN NAME of the bin with the max value. For instance, if a given county has more houses in the `v50k_100k` range than any other range, the value at that row in the new column will be "v50k_100k"
- Next, create a nominal column from that categorical column, ordered so that a lower number means less desirable. So for prices, "v_l_50k" -> 1, and for year built, "b1939_e" -> 1

In [29]:
@Extend(pd.DataFrame)
class _:
    def add_ordinal(
        self,
        col: str,
        order: list,
        replace: bool = False
    ) -> pd.DataFrame:
        """
        Create ordinal col from existing categorical col. Pass an ascending list
        of categories. Example: Input ['A', 'B', 'C'] -> New column map: {'A': 1, 'B': 2, 'C': 3}
        """
        df = self.copy()
        new = self[col].map({k: i+1 for i, k in enumerate(order)})
        df = df.insert_at(col, f'{col}_ord', new)
        if replace:
            df = df.drop_cols(col)
        return df

    def add_binmax(
        self,
        name: str,
        cols: list,
        replace: bool = False
    ) -> pd.DataFrame:
        """
        Shorthand for df.idxmax(), but lets you choose location and replace given columns
        """
        df = self.copy()
        new = self[cols].idxmax(axis=1)
        df = df.insert_at(cols[0], name, new)
        if replace:
            df = df.drop_cols(cols)
        return df

In [30]:
blt_ascending = ['b1949_e','b1950_1959','b1960_1969','b1970_1979','b1980_1989','b1990_1999','b2000_2009']
prices_ascending = ['v_l_50k', 'v50k_150k', 'v150k_250k', 'v250k_400k', 'v400k_750k', 'v750k_plus']
df = (df
    .add_binmax('blt_freq_yr', blt_ascending)
    .add_ordinal('blt_freq_yr', blt_ascending)
    .add_binmax('hu_freq_val', prices_ascending)
    .add_ordinal('hu_freq_val', prices_ascending)
)

---

#### Rename columns in population and census data with a naming system that uses consistent prefixes to let us easily select groups and sub-groups of columns with a simple string match

In [31]:
rename_pop = pd.read_excel('column_renaming.xlsx', sheet_name='population_rename')

rename_dem = pd.read_excel('column_renaming.xlsx', sheet_name='census_rename')

display(rename_pop.fillna(''), rename_dem.fillna(''))

pop = pop_raw.rename_cols_from_df(rename_pop.dropna())
df = df.rename_cols_from_df(rename_dem.dropna())

Unnamed: 0,OLD,NEW
0,total,pop
1,,
2,male,gend_m
3,female,gend_f
4,,
5,over18,age_over18
6,under19,age_undr19
7,,
8,over18_male,gend_m_age_over18
9,over18_female,gend_f_age_over18


Unnamed: 0,OLD,NEW
0,med_age,age_median
1,,
2,per_cap_in,inc_per_cap
3,med_hh_inc,inc_hh_median
4,,
5,households,hh
6,avghhsize,hh_size_avg
7,,
8,pop25plus,hsgrad_pool
9,hsgrad_sc,hsgrad_graduated


## Merge population and census data

In [32]:
main = (pop
    .merge(df, on=INDEX)
    .move_col('age_median', 'age_over18')
)
head(main)

61 cols x 512 rows


Unnamed: 0,year,county,pop,gend_m,gend_f,age_median,age_over18,age_undr19,gend_m_age_undr19,gend_f_age_undr19,...,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_freq_yr_ord,hu_blt_freq_yr,hu_blt_lt_1950,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012,ADAMS,231571.0,487410.0,201960.0,32.4,162109.0,69462.0,162653.0,66249.0,...,30185.0,19615.0,20369.0,7,b2000_2009,6158.0,438171.0,62008.0,124375.0,25278.0
1,2012,ALAMOSA,7823.0,17115.0,6283.0,32.2,5622.0,2201.0,5748.0,2044.0,...,1405.0,654.0,591.0,1,b1949_e,1536.0,14622.0,3191.0,3817.0,758.0
2,2012,ARAPAHOE,292548.0,666719.0,233180.0,35.7,212207.0,80341.0,227254.0,76419.0,...,62253.0,22258.0,16519.0,4,b1970_1979,7165.0,568999.0,66945.0,144576.0,23054.0


## Calculations for groups
---

In [33]:
# GroupedDF defined in workspace setup
GroupedDF.default_index = INDEX
GroupedDF.set_groups(['age', 'gend', 'race', 'inc', 'hh', 'citz', 'hsgrad', 'civ_lf', 'ps', 'stud', 'hu', 'hu_blt', 'hu_oo'])

In [34]:
gd = GroupedDF(main, INDEX, custom={'hu': INDEX + ['hu', 'hu_occ']})
gd.display(3)

age: 


Unnamed: 0,year,county,age_median,age_over18,age_undr19
0,2012,ADAMS,32.4,162109.0,69462.0
1,2012,ALAMOSA,32.2,5622.0,2201.0
2,2012,ARAPAHOE,35.7,212207.0,80341.0



gend: 


Unnamed: 0,year,county,gend_m,gend_f,gend_m_age_undr19,gend_f_age_undr19,gend_m_age_over18,gend_f_age_over18
0,2012,ADAMS,487410.0,201960.0,162653.0,66249.0,324757.0,135711.0
1,2012,ALAMOSA,17115.0,6283.0,5748.0,2044.0,11367.0,4239.0
2,2012,ARAPAHOE,666719.0,233180.0,227254.0,76419.0,439465.0,156761.0



race: 


Unnamed: 0,year,county,race_hispanic,race_white,race_black,race_asian,race_other
0,2012,ADAMS,167556.0,235991.0,12970.0,15304.0,11175.0
1,2012,ALAMOSA,7185.0,7767.0,110.0,59.0,629.0
2,2012,ARAPAHOE,105174.0,364766.0,55629.0,28067.0,20721.0



inc: 


Unnamed: 0,year,county,inc_hh_median,inc_per_cap
0,2012,ADAMS,56633.0,24357.0
1,2012,ALAMOSA,38045.0,19657.0
2,2012,ARAPAHOE,60400.0,32845.0



hh: 


Unnamed: 0,year,county,hh,hh_size_avg
0,2012,ADAMS,151034.0,2.91
1,2012,ALAMOSA,5853.0,2.49
2,2012,ARAPAHOE,223747.0,2.55



citz: 


Unnamed: 0,year,county,citz,citz_birth,citz_co
0,2012,ADAMS,396172.0,376454.0,223907.0
1,2012,ALAMOSA,15122.0,14868.0,9542.0
2,2012,ARAPAHOE,519940.0,487576.0,223433.0



hsgrad: 


Unnamed: 0,year,county,hsgrad_pool,hsgrad_graduated
0,2012,ADAMS,275628.0,166731.0
1,2012,ALAMOSA,9424.0,5946.0
2,2012,ARAPAHOE,378792.0,199197.0



civ_lf: 


Unnamed: 0,year,county,civ_lf,civ_lf_employed
0,2012,ADAMS,236110.0,213794.0
1,2012,ALAMOSA,7171.0,6449.0
2,2012,ARAPAHOE,318041.0,292089.0



ps: 


Unnamed: 0,year,county,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012,ADAMS,438171.0,62008.0,124375.0,25278.0
1,2012,ALAMOSA,14622.0,3191.0,3817.0,758.0
2,2012,ARAPAHOE,568999.0,66945.0,144576.0,23054.0



stud: 


Unnamed: 0,year,county,stud_enroll_pool,stud_enrolled,stud_undergrad,stud_1_4,stud_5_8,stud_9_12
0,2012,ADAMS,420756.0,117499.0,19299.0,28761.0,26645.0,24342.0
1,2012,ALAMOSA,14903.0,5362.0,2285.0,736.0,801.0,890.0
2,2012,ARAPAHOE,549701.0,153854.0,29388.0,33703.0,30902.0,33425.0



hu: 


Unnamed: 0,year,county,hu,hu_occ
0,2012,ADAMS,163245.0,151034.0
1,2012,ALAMOSA,6572.0,5853.0
2,2012,ARAPAHOE,238160.0,223747.0



hu_blt: 


Unnamed: 0,year,county,hu_blt_2000_plus,hu_blt_1990_1999,hu_blt_1980_1989,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_freq_yr_ord,hu_blt_freq_yr,hu_blt_lt_1950
0,2012,ADAMS,38682.0,27598.0,20368.0,30185.0,19615.0,20369.0,7,b2000_2009,6158.0
1,2012,ALAMOSA,650.0,866.0,862.0,1405.0,654.0,591.0,1,b1949_e,1536.0
2,2012,ARAPAHOE,39415.0,33989.0,56011.0,62253.0,22258.0,16519.0,4,b1970_1979,7165.0



hu_oo: 


Unnamed: 0,year,county,hu_oo,hu_oo_freq_val_ord,hu_oo_freq_val,hu_oo_lt_50,hu_oo_50_150,hu_oo_150_250,hu_oo_250_400,hu_oo_400_750,hu_oo_750_plus
0,2012,ADAMS,100108.0,3,v150k_250k,8578.0,19838.0,47583.0,17779.0,5427.0,903.0
1,2012,ALAMOSA,3702.0,2,v50k_150k,435.0,1599.0,1077.0,397.0,177.0,17.0
2,2012,ARAPAHOE,143158.0,3,v150k_250k,4207.0,22174.0,55935.0,38213.0,16339.0,6290.0





## Calculations
---

- **age, and gend**
  - `age_median`: (Existing)
  - `age_undr19_prop`: What percent of the population is under 19?
  - `gend_m_prop`: What percent of the population is male?
  - `age_undr19_gend_m_prop`: What percent of under-19 year old are male? (divide m_undr19 by undr19)
- **inc**
  - `inc_hh_med`: (Existing) Median household income
  - `inc_per_cap`: (Existing) Per capita income
- **hh**
  - `hh_size_avg`: (Existing) Average household size
- **race**
  - `race_{x}_prop`: What percent of the population is race x?
  - `race_prop_stdev`: What is the standard deviation of the race proportions? We need to calculate the proportions first, to normalize for the population size, that way, we can compare the standard deviations across groups
- **hsgrad**
  - `hsgrad_graduated_prop`: What percent of adults (age 25+) have a high school diploma or equivalent?
- **civ_lf**
  - `civ_lf_prop`: What percent of the population is in the civilian labor force?
  - `civ_lf_employed_prop`: What percent of the civilian labor force is employed?
- **ps**
  - `ps_total_prop`: What percent of people whose poverty status is known are below the poverty line?
  - `ps_undr18_total_prop`: What percent of under-18 people whose poverty status is known are below the poverty line?
  - `ps_undr18_prop`: What percent of people below the poverty line are under 18?
- **stud**
  - `stud_enrolled_prop`: Percent of people who could be enrolled in school that actually are enrolled
  - `stud_hs_prop`: What percent of gradeschool students (1-12) are high schoolers? (lower number indicates dropouts, which may associate with crime)
  - `stud_undergrad_prop`: What percent of enrolled students are undergraduates?
- **citz**
  - `citz_prop`: What percent of the population is a us citizen?
  - `citz_birth_prop`: What percent of us citizens were born in the us?
  - `citz_co_prop`: What percent of citizens were born in Colorado?
- **hu**
  - `hu_occ_prop`: Percent of homes which are occupied
  - `hu_blt_after1989`: Percent of homes which were built in the past 20 years
  - `hu_blt_nominal`: Convert hu_blt_mode_range to nominal, where the highest number corresponds to highest year range
- **hu_oo**
  - `hu_oo_prop`: Percent of occupied properties occupied by owner. The remaining percent is renter occupied
  - `hu_oo_lt_50_prop`: Percent of owner occupied properties worth less than $50,000
  - `hu_oo_750_plus_prop`: Percent of owner occupied properties worth $750,000 or more


In [35]:
df = main.copy()

df['age_over18_prop'] = df.age_over18 / df['pop']
df['age_undr19_prop'] = df.age_undr19 / df['pop']
df['gend_m_prop'] = df.gend_m / df['pop']
df['gend_f_prop'] = df.gend_f / df['pop']
df['age_undr19_gend_m_prop'] = df.gend_m_age_undr19 / df.age_undr19
df['age_undr19_gend_f_prop'] = df.gend_f_age_undr19 / df.age_undr19
df['age_over18_gend_m_prop'] = df.gend_m_age_over18 / df.age_over18
df['age_over18_gend_f_prop'] = df.gend_f_age_over18 / df.age_over18

df['gend_m_age_undr19_prop'] = df.gend_m_age_undr19 / df.gend_m
df['gend_m_age_over18_prop'] = df.gend_m_age_over18 / df.gend_m
df['gend_f_age_undr19_prop'] = df.gend_f_age_undr19 / df.gend_f
df['gend_f_age_over18_prop'] = df.gend_f_age_over18 / df.gend_f

race_base = GroupedDF(df, INDEX).race
race = df.copy()[INDEX]
for c in [c for c in race_base.columns if c not in INDEX]:
    race[f'{c}_prop'] = race_base[c] / df['pop']

race['race_prop_stdev'] = np.std(race.drop(columns=INDEX), axis=1)
df = df.merge(race, how='inner', on=INDEX)

df['hsgrad_graduated_prop'] = df.hsgrad_graduated / df.hsgrad_pool

df['civ_lf_prop'] = df.civ_lf / df['pop']
df['civ_lf_employed_prop'] = df.civ_lf_employed / df.civ_lf

df['ps_total_prop'] = df.ps_below / df.ps_known
df['ps_undr18_total_prop'] = df.ps_undr18_below / df.ps_undr18_known
df['ps_undr18_prop'] = df.ps_undr18_below / df.ps_below

df['stud_enrolled_prop'] = df.stud_enrolled / df.stud_enroll_pool
df['stud_hs_prop'] = df.stud_9_12 / (df.stud_1_4 + df.stud_5_8 + df.stud_9_12)
df['stud_undergrad_prop'] = df.stud_undergrad / df.stud_enrolled

df['citz_per_cap'] = df.citz / df['pop']
df['citz_birth_prop'] = df.citz_birth / df.citz
df['citz_co_prop'] = df.citz_co / df.citz

df['hu_per_cap'] = df.hu / df['pop']
df['hu_occ_prop'] = df.hu_occ / df.hu
df['hu_blt_2000_plus_prop'] = df.hu_blt_2000_plus / df.hu

df['hu_oo_prop'] = df.hu_oo / df.hu_occ

for hval in ['hu_oo_lt_50', 'hu_oo_50_150', 'hu_oo_150_250', 'hu_oo_250_400', 'hu_oo_400_750', 'hu_oo_750_plus']:
    df[f'{hval}_prop'] = df[hval] / df.hu_oo

for hyear in [
        'hu_blt_lt_1950', 'hu_blt_1950_1959', 'hu_blt_1960_1969',
        'hu_blt_1970_1979', 'hu_blt_1980_1989', 'hu_blt_1990_1999', 'hu_blt_2000_plus'
    ]:
    df[f'{hyear}_prop'] = df[hyear] / df.hu

# prop, counts = separate_by(df, ['prop', 'per_cap', 'median', 'avg', 'freq', 'med_hm_val', 'med_yr_blt'], index=INDEX)
filter_cols = ['prop', 'per_cap', 'median', 'avg', 'freq', 'med_hm_val', 'med_yr_blt']
prop = df.separate_by(filter_cols, index=INDEX, mode='include')
counts = df.separate_by(filter_cols, index=INDEX, mode='exclude')

gprop = GroupedDF(prop, INDEX, custom={'hu': INDEX + ['hu_per_cap', 'hu_occ_prop']})
gprop.display()

age: 


Unnamed: 0,year,county,age_over18_prop,age_undr19_prop,age_undr19_gend_m_prop,age_undr19_gend_f_prop,age_over18_gend_m_prop,age_over18_gend_f_prop,age_median
0,2012,ADAMS,0.70004,0.29996,2.341611,0.953744,2.003325,0.837159,32.4
1,2012,ALAMOSA,0.71865,0.28135,2.61154,0.928669,2.021878,0.754002,32.2
2,2012,ARAPAHOE,0.725375,0.274625,2.828618,0.951183,2.070926,0.738717,35.7



gend: 


Unnamed: 0,year,county,gend_m_prop,gend_f_prop,gend_m_age_undr19_prop,gend_m_age_over18_prop,gend_f_age_undr19_prop,gend_f_age_over18_prop
0,2012,ADAMS,2.104797,0.87213,0.333709,0.666291,0.32803,0.67197
1,2012,ALAMOSA,2.18778,0.803145,0.335846,0.664154,0.325322,0.674678
2,2012,ARAPAHOE,2.279007,0.797066,0.340854,0.659146,0.327725,0.672275



race: 


Unnamed: 0,year,county,race_hispanic_prop,race_white_prop,race_black_prop,race_asian_prop,race_other_prop,race_prop_stdev
0,2012,ADAMS,0.723562,1.019087,0.056009,0.066088,0.048257,0.409877
1,2012,ALAMOSA,0.918446,0.992842,0.014061,0.007542,0.080404,0.452841
2,2012,ARAPAHOE,0.35951,1.246859,0.190153,0.09594,0.070829,0.438949



inc: 


Unnamed: 0,year,county,inc_per_cap,inc_hh_median
0,2012,ADAMS,24357.0,56633.0
1,2012,ALAMOSA,19657.0,38045.0
2,2012,ARAPAHOE,32845.0,60400.0



hh: 


Unnamed: 0,year,county,hh_size_avg
0,2012,ADAMS,2.91
1,2012,ALAMOSA,2.49
2,2012,ARAPAHOE,2.55



citz: 


Unnamed: 0,year,county,citz_birth_prop,citz_co_prop,citz_per_cap
0,2012,ADAMS,0.950229,0.565176,1.710801
1,2012,ALAMOSA,0.983203,0.631001,1.933018
2,2012,ARAPAHOE,0.937754,0.429728,1.777281



hsgrad: 


Unnamed: 0,year,county,hsgrad_graduated_prop
0,2012,ADAMS,0.604913
1,2012,ALAMOSA,0.630942
2,2012,ARAPAHOE,0.525874



civ_lf: 


Unnamed: 0,year,county,civ_lf_prop,civ_lf_employed_prop
0,2012,ADAMS,1.019601,0.905485
1,2012,ALAMOSA,0.916656,0.899317
2,2012,ARAPAHOE,1.087141,0.9184



ps: 


Unnamed: 0,year,county,ps_total_prop,ps_undr18_total_prop,ps_undr18_prop
0,2012,ADAMS,0.141516,0.20324,0.407657
1,2012,ALAMOSA,0.218233,0.198585,0.237543
2,2012,ARAPAHOE,0.117654,0.159459,0.344372



stud: 


Unnamed: 0,year,county,stud_enrolled_prop,stud_hs_prop,stud_undergrad_prop
0,2012,ADAMS,0.279257,0.305236,0.164248
1,2012,ALAMOSA,0.359793,0.366708,0.426147
2,2012,ARAPAHOE,0.279887,0.340967,0.191012



hu: 


Unnamed: 0,year,county,hu_per_cap,hu_occ_prop
0,2012,ADAMS,0.704946,0.925198
1,2012,ALAMOSA,0.840087,0.890596
2,2012,ARAPAHOE,0.814089,0.939482



hu_blt: 


Unnamed: 0,year,county,hu_blt_2000_plus_prop,hu_blt_lt_1950_prop,hu_blt_1950_1959_prop,hu_blt_1960_1969_prop,hu_blt_1970_1979_prop,hu_blt_1980_1989_prop,hu_blt_1990_1999_prop,hu_blt_freq_yr_ord,hu_blt_freq_yr
0,2012,ADAMS,0.236957,0.037722,0.124776,0.120157,0.184906,0.12477,0.169059,7,b2000_2009
1,2012,ALAMOSA,0.098904,0.233719,0.089927,0.099513,0.213786,0.131163,0.131771,1,b1949_e
2,2012,ARAPAHOE,0.165498,0.030085,0.069361,0.093458,0.261392,0.235182,0.142715,4,b1970_1979



hu_oo: 


Unnamed: 0,year,county,hu_oo_prop,hu_oo_lt_50_prop,hu_oo_50_150_prop,hu_oo_150_250_prop,hu_oo_250_400_prop,hu_oo_400_750_prop,hu_oo_750_plus_prop,hu_oo_freq_val_ord,hu_oo_freq_val
0,2012,ADAMS,0.662818,0.085687,0.198166,0.475317,0.177598,0.054211,0.00902,3,v150k_250k
1,2012,ALAMOSA,0.632496,0.117504,0.431929,0.290924,0.107239,0.047812,0.004592,2,v50k_150k
2,2012,ARAPAHOE,0.639821,0.029387,0.154892,0.390722,0.266929,0.114133,0.043937,3,v150k_250k





In [36]:
write_main(prop, 'county_stats_normalized')
write_main(counts, 'county_stats_counts')
write_main(df, 'county_stats')

512

---
---
# `5-education`

[BACK TO TOP ^](#top-of-page)

---
---


# Education data prep

### This script combines the following 3 datasets, aggregates them by county, redesigns column naming structure, and re-calculates rates:
1. District Student Mobility/Stability Statistics 2011-2012 **by Instructional Program/Service Type**
2. District Student Mobility/Stability Statistics 2011-2012 **by Gender & Race/Ethnicity**
3. District Graduation Data Statistics 2011-2012 **by Instructional Program Service Type**
## Reference: Column Naming conventions

- This dataset is designed so you should never have to look at the columns to find the name of one (since there are around 140 columns). Just look here for reference instead.
- For instance, to get the rate for any variable, just use `_rate` after a variable. So `graduated` becomes `graduated_rate`

| Type | Naming | Example |
| - | - | - |
| County Total | variable | `stable` |
| Count | group + variable | `disabled_stable` |
| Rate | group + variable + "rate" | `disabled_stable_rate` |
| Group Total | group + group total | `disabled_pupil_total` |

<br>

#### Mobility/Stability columns

| GROUPS | VARIABLES | GROUP TOTALS |
| - | - | - |
| disabled | stable | pupil_total |
| limited_eng | mobile | 
| poor | mobile_instances |
| migrant | 
| title_1 | 
| homeless |
| gifted |
| male |
| female |
| white |
| asian |
| black |
| hispanic |

<br>

#### Graduation columns

| GROUPS | VARIABLES | GROUP TOTALS |
| - | - | - |
| disabled | graduated | grad_base_total |
| limited_eng | completed |
| poor |
| migrant |
| title_1 |
| homeless |
| gifted |

<br>

**What are group totals?**
- Notice they aren't just called "total". This is because, for graduation data, we don't care about the total number of students. We care about the total number of students who are actually in the pool for graduation. So, we call it `grad_base_total` and use that when calculating rate

**Rates are calculated by dividing a variable by its group total, then multiplying by 100**

---
---
---

In [38]:
filtr = "`Organization Name` != 'STATE TOTAL'"

grad_raw = read_raw('dist_grad_rate', WHERE=filtr)
mob_raw = read_raw('dist_student_mobility', WHERE=filtr)
mob_dem_raw = read_raw('dist_mobility_demographics', WHERE=filtr)

head(grad_raw, mob_raw, mob_dem_raw)

38 cols x 183 rows


Unnamed: 0,County Name,Organization Code,Organization Name,Students with Disabilities Final Grad Base,Students with Disabilities Graduates Total,Students with Disabilities Graduation Rate,Students with Disabilities Completers Total,Students with Disabilities Completion Rate,Limited English Proficient Final Grad Base,Limited English Proficient Graduates Total,...,Homeless Final Grad Base,Homeless Graduates Total,Homeless Graduation Rate,Homeless Completers Total,Homeless Completion Rate,Gifted-Talented Final Grad Base,Gifted-Talented Graduates Total,Gifted-Talented Graduation Rate,Gifted-Talented Completers Total,Gifted-Talented Completion Rate
0,ADAMS,10,MAPLETON 1,49,18,36.7,19,38.8,219,73,...,41,12,29.3,16,39.0,44,27,61.4,27,61.4
1,ADAMS,20,ADAMS 12 FIVE STAR SCHOOLS,250,118,47.2,127,50.8,379,257,...,106,62,58.5,65,61.3,227,201,88.5,208,91.6
2,ADAMS,30,ADAMS COUNTY 14,59,32,54.2,32,54.2,170,86,...,99,52,52.5,57,57.6,30,27,90.0,27,90.0


60 cols x 183 rows


Unnamed: 0,School Year,Org. Code,Organization Name,Category,Total Pupil Count (All students),Total Stable Pupil Count (All Students),Total Stability Rate (All Students),Total Mobile Student Count (All students),Total Student Mobility Rate (All students),Total Instances of Mobility (All students),...,Homeless Student Mobility Rate,Homeless Instances of Mobility,Homeless Mobility Incidence Rate,Gifted & Talented Pupil Count,Gifted & Talented Stable Student Count,Gifted & Talented Stability Rate,Gifted & Talented Mobile Student Count,Gifted & Talented Student Mobility Rate,Gifted & Talented Instances of Mobility,Gifted & Talented Mobility Incidence Rate
0,20112012,10,MAPLETON 1,1DISTRICT TOTALS (INCLUDING ALTERNATIVE SCHOOLS),9037,5077,56.2,3919,43.4,4133,...,32.7,79,36.9,250,205,82.0,44,17.6,47,18.8
1,20112012,20,ADAMS 12 FIVE STAR SCHOOLS,1DISTRICT TOTALS (INCLUDING ALTERNATIVE SCHOOLS),49889,34283,68.7,15424,30.9,16854,...,57.2,481,68.2,3590,3225,89.8,361,10.1,404,11.3
2,20112012,30,ADAMS COUNTY 14,1DISTRICT TOTALS (INCLUDING ALTERNATIVE SCHOOLS),8265,5510,66.7,3038,36.8,3397,...,49.7,529,59.7,377,317,84.1,75,19.9,89,23.6


74 cols x 183 rows


Unnamed: 0,School Year,Org. Code,Organization Name,Category,Total Pupil Count,Total Stable Student Count,Total Stability Rate,Total Mobile Student Count,Total Student Mobility Rate,Total Instances Of Mobility,...,Total Native Hawaiian or Other Pacific Islander Student Mobility Rate,Total Native Hawaiian or Other Pacific Islander Instances Of Mobility,Total Native Hawaiian or Other Pacific Islander Mobility Incidence Rate,Total Two or More Races Pupil Count,Total Two or More Races Stable Student Count,Total Two or More Races Stability Rate,Total Two or More Races Mobile Student Count,Total Two or More Races Student Mobility Rate,Total Two or More Races Instances Of Mobility,Total Two or More Races Mobility Incidence Rate
0,20112012,10,MAPLETON 1,1DISTRICT TOTALS (INCLUDING ALTERNATIVE SCHOOLS),9037,5077,56.2,3919,43.4,4133,...,70.8,17,70.8,219,129,58.9,90,41.1,91,41.6
1,20112012,20,ADAMS 12 FIVE STAR SCHOOLS,1DISTRICT TOTALS (INCLUDING ALTERNATIVE SCHOOLS),49889,34283,68.7,15424,30.9,16854,...,45.3,42,48.8,662,455,68.7,203,30.7,222,33.5
2,20112012,30,ADAMS COUNTY 14,1DISTRICT TOTALS (INCLUDING ALTERNATIVE SCHOOLS),8265,5510,66.7,3038,36.8,3397,...,0.0,0,0.0,55,28,50.9,26,47.3,28,50.9


In [39]:
def format_cols(df):
    df = (df
        .drop_cols('County Name', 'Organization Code', 'School Year', 'Org. Code', 'Category')
        .rename_col('Organization Name', 'district')
        .rename(columns={c: re.sub(r"\s|-", "_", c.lower()) for c in df.columns})
    )
    df = df.rename(columns={c: re.sub(r"\.|\(|\)|\&", "", c).replace('__', '_') for c in df.columns})
    return df

grad_raw = format_cols(grad_raw)
mob_raw = format_cols(mob_raw)
mob_dem_raw = format_cols(mob_dem_raw)
head(grad_raw)

36 cols x 183 rows


Unnamed: 0,district,students_with_disabilities_final_grad_base,students_with_disabilities_graduates_total,students_with_disabilities_graduation_rate,students_with_disabilities_completers_total,students_with_disabilities_completion_rate,limited_english_proficient_final_grad_base,limited_english_proficient_graduates_total,limited_english_proficient_graduation_rate,limited_english_proficient_completers_total,...,homeless_final_grad_base,homeless_graduates_total,homeless_graduation_rate,homeless_completers_total,homeless_completion_rate,gifted_talented_final_grad_base,gifted_talented_graduates_total,gifted_talented_graduation_rate,gifted_talented_completers_total,gifted_talented_completion_rate
0,MAPLETON 1,49,18,36.7,19,38.8,219,73,33.3,76,...,41,12,29.3,16,39.0,44,27,61.4,27,61.4
1,ADAMS 12 FIVE STAR SCHOOLS,250,118,47.2,127,50.8,379,257,67.8,261,...,106,62,58.5,65,61.3,227,201,88.5,208,91.6
2,ADAMS COUNTY 14,59,32,54.2,32,54.2,170,86,50.6,88,...,99,52,52.5,57,57.6,30,27,90.0,27,90.0


### Before joining, make sure all district columns match

In [40]:
sorted(grad_raw.district) == sorted(mob_raw.district) == sorted(mob_dem_raw.district)

True

### Merge

In [41]:
# Remove the columns duplicated across mobility demographics and mobility datasets
mob_dem = mob_dem_raw.drop_cols(
    'total_pupil_count', 'total_stable_student_count', 'total_stability_rate', 'total_mobile_student_count',
    'total_student_mobility_rate', 'total_instances_of_mobility', 'total_mobility_incidence_rate'
)

# Combine the two mobility datasets
df_raw_dist = (
    mob_raw
    .merge(mob_dem, on=['district'])
    .merge(grad_raw, on=['district'])
)
head(df_raw_dist)

155 cols x 183 rows


Unnamed: 0,district,total_pupil_count_all_students,total_stable_pupil_count_all_students,total_stability_rate_all_students,total_mobile_student_count_all_students,total_student_mobility_rate_all_students,total_instances_of_mobility_all_students,total_mobility_incidence_rate_all_students,students_with_disabilities_pupil_count,students_with_disabilities_stable_student_count,...,homeless_final_grad_base,homeless_graduates_total,homeless_graduation_rate,homeless_completers_total,homeless_completion_rate,gifted_talented_final_grad_base,gifted_talented_graduates_total,gifted_talented_graduation_rate,gifted_talented_completers_total,gifted_talented_completion_rate
0,MAPLETON 1,9037,5077,56.2,3919,43.4,4133,45.7,735,469,...,41,12,29.3,16,39.0,44,27,61.4,27,61.4
1,ADAMS 12 FIVE STAR SCHOOLS,49889,34283,68.7,15424,30.9,16854,33.8,4339,3001,...,106,62,58.5,65,61.3,227,201,88.5,208,91.6
2,ADAMS COUNTY 14,8265,5510,66.7,3038,36.8,3397,41.1,876,636,...,99,52,52.5,57,57.6,30,27,90.0,27,90.0


## Column Name Manipulation
---

In [42]:
df = df_raw_dist.copy()

## Filter out columns based on substrings
1. Remove all rates. They got messed up when we aggregated by county
2. Remove native american and native hawaiian because the group sizes are very small and values are 0 for a lot of counties.
3. Remove "two_or_more_races" because it's inconsistent, and difficult to compare groups

In [43]:
df = (df
    .separate_by("rate", mode='exclude')
    .separate_by("american_indian", mode='exclude')
    .separate_by("native_hawaiian", mode='exclude')
    .separate_by("two_or_more", mode='exclude')
)

### Standardize group names, then shorten group names
- Graduation data has `limited_english_proficient` and `econ_disadvant` 
- Mobility data `english_language_learners` and `economically_disadvantaged`

**Standardize these to `limited_english` and `econ_disadvant`, and shorten the others**

In [44]:
df = (df
    .col_replace({
        # Mobility/Stability groups
        "limited_english_proficient": "limited_eng",
        "english_language_learners": "limited_eng",
        "economically_disadvantaged": "poor",
        "econ_disadvant": "poor",
        "students_with_disabilities": "disabled",
        "gifted_talented": "gifted",
        # Demographics
        "black_or_african_american": "black",
        "hispanic_or_latino": "hispanic",
        # Graduation data
        "final_grad_base": "grad_base_total",
        "graduates_total": "graduated",
        "completers_total": "completed",
        # Mobility/Stability data
        "instances_of_mobility": "mobile_instances",
        "pupil_count": "pupil_total",
        "_student_count": "",
        # Variable totals
        "_all_students": "",
        "total_": "",
    })
    .rename_col('stable_pupil_total', 'stable')
)
df_dist_counts = df
head(df_dist_counts, with_tail=True)

78 cols x 183 rows


Unnamed: 0,district,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,MAPLETON 1,9037,5077,3919,4133,735,469,261,279,2863,...,5,218,118,124,41,12,16,44,27,27
1,ADAMS 12 FIVE STAR SCHOOLS,49889,34283,15424,16854,4339,3001,1325,1501,6141,...,12,224,80,98,106,62,65,227,201,208
181,SAN JUAN BOCES,84,0,84,84,5,0,5,5,0,...,0,0,0,0,0,0,0,1,0,1
182,EXPEDITIONARY BOCES,402,302,100,101,40,34,6,6,16,...,0,0,0,0,0,0,0,0,0,0


#### Standardize district names

In [45]:
# standardize_district_name defined in workspace setup
df.district = df.district.apply(standardize_district_name)
head(df)

78 cols x 183 rows


Unnamed: 0,district,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,MAPLETON 1,9037,5077,3919,4133,735,469,261,279,2863,...,5,218,118,124,41,12,16,44,27,27
1,ADAMSFIVESTAR 12,49889,34283,15424,16854,4339,3001,1325,1501,6141,...,12,224,80,98,106,62,65,227,201,208
2,ADAMSCOUNTY 14,8265,5510,3038,3397,876,636,266,311,3826,...,4,419,296,301,99,52,57,30,27,27


In [46]:
write_main(df_dist_counts, 'education_dist_counts')

183

#### Bring in county column

In [47]:
dist_county = read_main('select district, in_county as county from district')
head(dist_county)

2 cols x 183 rows


Unnamed: 0,district,county
0,MAPLETON 1,ADAMS
1,ADAMSFIVESTAR 12,ADAMS
2,ADAMSCOUNTY 14,ADAMS


#### Make sure all districts match across datasets

In [48]:
# join_conflicts defined in workspace setup
conflicts = join_conflicts(df, dist_county, 'district')
assert len(conflicts) == 0

df = df.merge(dist_county, on='district').move_col('county', 1)
head(df)

79 cols x 183 rows


Unnamed: 0,district,county,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,MAPLETON 1,ADAMS,9037,5077,3919,4133,735,469,261,279,...,5,218,118,124,41,12,16,44,27,27
1,ADAMSFIVESTAR 12,ADAMS,49889,34283,15424,16854,4339,3001,1325,1501,...,12,224,80,98,106,62,65,227,201,208
2,ADAMSCOUNTY 14,ADAMS,8265,5510,3038,3397,876,636,266,311,...,4,419,296,301,99,52,57,30,27,27


### Create county grouping

In [49]:
df_county_counts = (df
    .groupby(['county'])
    .sum(numeric_only=True)
    .reset_index()
)
head(df_county_counts)

78 cols x 63 rows


Unnamed: 0,county,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,ADAMS,98546,67272,31222,33925,8848,6263,2588,2896,20773,...,33,935,529,559,360,190,204,402,337,345
1,ALAMOSA,2775,1882,885,950,223,159,63,66,368,...,4,28,22,23,6,6,6,0,0,0
2,ARAPAHOE,124639,94109,30134,32269,11842,9461,2354,2568,25370,...,9,488,202,213,243,96,102,909,820,828


In [50]:
write_main(df_county_counts, 'education_county_counts')

63

## Calculate Rates
---

- This code is very confusing, but basically I'm just trying to dynamically divide each statistic by its parent's group total to get a percentage, and multiply by 100 to get a rate.
- For example, `disabled_stable` / `stable` gets the percent of stable students who are disabled. Then, `stable` / `pupil_total` gets the percent of all students who are stable, and so on.

In [51]:
def get_rates(df, index):
    df = df.copy()
    df_rates = df.copy()[index]

    for c in ['stable', 'mobile', 'mobile_instances']:
        group_rate = (df[c] / df['pupil_total'] * 100).round(2).fillna(0)
        df_rates[f"{c}_rate"] = group_rate
        df[f"{c}_rate"] = group_rate

    # Calculate rates dynamically
    for group in [
            'disabled', 'limited_eng', 'poor', 'migrant', 'title_1', 'homeless', 'gifted',
            'male', 'female', 'white', 'black', 'hispanic', 'asian']:

        for c in [c for c in df.columns if group in c and "total" not in c]:
            var = c.replace(f"{group}_", '')

            if var in ['graduated', 'completed']:
                new = df[c] / df[f"{group}_grad_base_total"]
            else:
                new = df[c] / df[f"{group}_pupil_total"]
            
            new = (new * 100).round(2).fillna(0)
            df_rates[f"{c}_rate"] = new
            df[f"{c}_rate"] = new

    return df, df_rates

In [52]:
df_dist_all, df_dist_rates = get_rates(df_dist_counts, ['district'])
df_county_all, df_county_rates = get_rates(df_county_counts, ['county'])

In [53]:
head(df_dist_all, df_dist_counts, df_dist_rates, df_county_all, df_county_counts, df_county_rates)

137 cols x 183 rows


Unnamed: 0,district,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,MAPLETON 1,9037,5077,3919,4133,735,469,261,279,2863,...,52.08,48.04,51.4,53.63,60.31,39.19,42.19,47.22,52.78,53.7
1,ADAMSFIVESTAR 12,49889,34283,15424,16854,4339,3001,1325,1501,6141,...,32.27,55.98,43.94,47.41,67.3,32.23,36.81,81.07,18.84,21.15
2,ADAMSCOUNTY 14,8265,5510,3038,3397,876,636,266,311,3826,...,47.45,48.2,51.8,53.6,68.67,35.2,39.57,80.0,20.0,20.0


78 cols x 183 rows


Unnamed: 0,district,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,MAPLETON 1,9037,5077,3919,4133,735,469,261,279,2863,...,5,218,118,124,41,12,16,44,27,27
1,ADAMSFIVESTAR 12,49889,34283,15424,16854,4339,3001,1325,1501,6141,...,12,224,80,98,106,62,65,227,201,208
2,ADAMSCOUNTY 14,8265,5510,3038,3397,876,636,266,311,3826,...,4,419,296,301,99,52,57,30,27,27


60 cols x 183 rows


Unnamed: 0,district,stable_rate,mobile_rate,mobile_instances_rate,disabled_stable_rate,disabled_mobile_rate,disabled_mobile_instances_rate,disabled_graduated_rate,disabled_completed_rate,limited_eng_stable_rate,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,MAPLETON 1,56.18,43.37,45.73,63.81,35.51,37.96,36.73,38.78,66.01,...,52.08,48.04,51.4,53.63,60.31,39.19,42.19,47.22,52.78,53.7
1,ADAMSFIVESTAR 12,68.72,30.92,33.78,69.16,30.54,34.59,47.2,50.8,67.56,...,32.27,55.98,43.94,47.41,67.3,32.23,36.81,81.07,18.84,21.15
2,ADAMSCOUNTY 14,66.67,36.76,41.1,72.6,30.37,35.5,54.24,54.24,73.05,...,47.45,48.2,51.8,53.6,68.67,35.2,39.57,80.0,20.0,20.0


137 cols x 63 rows


Unnamed: 0,county,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,ADAMS,98546,67272,31222,33925,8848,6263,2588,2896,20773,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,ALAMOSA,2775,1882,885,950,223,159,63,66,368,...,35.54,57.14,42.86,42.86,70.15,29.47,32.69,64.0,36.0,36.0
2,ARAPAHOE,124639,94109,30134,32269,11842,9461,2354,2568,25370,...,21.16,67.47,32.03,34.57,72.76,26.75,29.2,78.59,21.3,22.56


78 cols x 63 rows


Unnamed: 0,county,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,ADAMS,98546,67272,31222,33925,8848,6263,2588,2896,20773,...,33,935,529,559,360,190,204,402,337,345
1,ALAMOSA,2775,1882,885,950,223,159,63,66,368,...,4,28,22,23,6,6,6,0,0,0
2,ARAPAHOE,124639,94109,30134,32269,11842,9461,2354,2568,25370,...,9,488,202,213,243,96,102,909,820,828


60 cols x 63 rows


Unnamed: 0,county,stable_rate,mobile_rate,mobile_instances_rate,disabled_stable_rate,disabled_mobile_rate,disabled_mobile_instances_rate,disabled_graduated_rate,disabled_completed_rate,limited_eng_stable_rate,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,ADAMS,68.26,31.68,34.43,70.78,29.25,32.73,47.54,50.1,69.99,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,ALAMOSA,67.82,31.89,34.23,71.3,28.25,29.6,86.67,93.33,72.01,...,35.54,57.14,42.86,42.86,70.15,29.47,32.69,64.0,36.0,36.0
2,ARAPAHOE,75.51,24.18,25.89,79.89,19.88,21.69,51.26,52.06,74.94,...,21.16,67.47,32.03,34.57,72.76,26.75,29.2,78.59,21.3,22.56


## Save
---

In [54]:
write_main(df_dist_all, 'education_dist')
write_main(df_dist_rates, 'education_dist_rates')

write_main(df_county_all, 'education_county')
write_main(df_county_rates, 'education_county_rates')

63

---
---
# `6-crime`

[BACK TO TOP ^](#top-of-page)

---
---


# Crime data prep
---
- Cleaning records from source
- Refactoring
- Standardizing county names
- Generating new columns based on date (day of week, quarter, year)
- Combining the 97-15 data and 16-19 data

## Crime 16-19

In [34]:
head(read_raw('crime_16_19'))

9 cols x 1851996 rows


Unnamed: 0,pub_agency_name,county_name,incident_date,incident_hour,offense_name,crime_against,offense_category_name,offense_group,age_num
0,Westminster,JEFFERSON; ADAMS,08/26/2017,17.0,Aggravated Assault,Person,Assault Offenses,A,52.0
1,Westminster,JEFFERSON; ADAMS,11/22/2017,20.0,Aggravated Assault,Person,Assault Offenses,A,29.0
2,Westminster,JEFFERSON; ADAMS,12/28/2017,21.0,Motor Vehicle Theft,Property,Motor Vehicle Theft,A,


In [35]:
df = read_raw("""
    NULL AS year,
    county_name AS county,
    pub_agency_name AS police_dept,
    incident_date AS date,
    NULL AS quarter,
    NULL AS month,
    NULL AS day_of_week,
    incident_hour AS hour,
    age_num AS age,
    crime_against,
    offense_name,
    offense_category_name AS offense_category
FROM crime_16_19
""")
head(df)

12 cols x 1851996 rows


Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,,JEFFERSON; ADAMS,Westminster,08/26/2017,,,,17.0,52.0,Person,Aggravated Assault,Assault Offenses
1,,JEFFERSON; ADAMS,Westminster,11/22/2017,,,,20.0,29.0,Person,Aggravated Assault,Assault Offenses
2,,JEFFERSON; ADAMS,Westminster,12/28/2017,,,,21.0,,Property,Motor Vehicle Theft,Motor Vehicle Theft


In [36]:
# Filter
df = df.loc[
    (~ df.police_dept.isin(['State Patrol', 'Colorado Bureau of Investigation'])) &
    (~ df.police_dept.isna())
]

df = df.copy() # This avoids setting on copy of a slice warning later

# Since we're focused on county and not police department, replace dual county
# police department county values (Ex: "JEFFERSON; ADAMS") with just the primary (Ex: "JEFFERSON")
df.county = (
    df.county
    .str.split('; ')
    .str[0]
    .str.upper()
)

# Convert to datetime and parse out date parts
df['date'] = pd.to_datetime(df.date, infer_datetime_format=True)

df.year = df.date.dt.year.copy()
df.quarter = df.date.dt.quarter.copy()
df.month = df.date.dt.month.copy()
df.day_of_week = df.date.dt.day_of_week.copy()

df_16_19 = df
head(df_16_19)

12 cols x 1845650 rows


Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,2017,JEFFERSON,Westminster,2017-08-26,3,8,5,17.0,52.0,Person,Aggravated Assault,Assault Offenses
1,2017,JEFFERSON,Westminster,2017-11-22,4,11,2,20.0,29.0,Person,Aggravated Assault,Assault Offenses
2,2017,JEFFERSON,Westminster,2017-12-28,4,12,3,21.0,,Property,Motor Vehicle Theft,Motor Vehicle Theft


## Crime 97-15

In [37]:
head(read_raw('crime_97_15'))

10 cols x 4952282 rows


Unnamed: 0,agency_name,agency_type_name,city_name,primary_county,incident_hour,offense_name,crime_against,offense_category_name,age_num,incident_date
0,Lyons Police Department,City,Lyons,Boulder,,,,,,
1,Kremmling Police Department,City,Kremmling,Grand,,,,,,
2,Oak Creek Police Department,City,Oak Creek,Routt,,,,,,


In [38]:
df = read_raw("""
    NULL AS year,
    UPPER(primary_county) AS county,
    agency_name AS police_dept,
    incident_date AS date,
    NULL AS quarter,
    NULL AS month,
    NULL AS day_of_week,
    incident_hour AS hour,
    age_num AS age,
    crime_against,
    offense_name,
    offense_category_name AS offense_category
    """,
    FROM="crime_97_15"
)
head(df)

12 cols x 4952282 rows


Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,,BOULDER,Lyons Police Department,,,,,,,,,
1,,GRAND,Kremmling Police Department,,,,,,,,,
2,,ROUTT,Oak Creek Police Department,,,,,,,,,


In [39]:
# Filter
df = df.loc[
    (~ df.police_dept.isin(['State Patrol', 'Colorado Bureau of Investigation'])) &
    (~ df.date.isna())
]
df = df.copy() # This avoids setting on copy of a slice warning later

df.date = pd.to_datetime(df.date, infer_datetime_format=True)

df.year = df.date.dt.year
df.quarter = df.date.dt.quarter
df.month = df.date.dt.month
df.day_of_week = df.date.dt.day_of_week

df_97_15 = df
head(df_97_15)

12 cols x 4925016 rows


Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
24,1997,BOULDER,Longmont Police Department,1997-03-14,1,3,4,,15.0,Person,Fondling,Sex Offenses
25,1997,BOULDER,Longmont Police Department,1997-07-02,3,7,2,21.0,14.0,Property,Arson,Arson
26,1997,KIT CARSON,Kit Carson County Sheriff's Office,1997-01-20,1,1,0,22.0,58.0,Person,Simple Assault,Assault Offenses


## Combine into single dataset
---

In [40]:
# columns must be the same before concat on axis 0
assert list(df_16_19.columns) == list(df_97_15.columns)
df_all = (
    pd.concat([df_97_15, df_16_19], axis=0)
    .drop_cols('police_dept')
    .reset_index(drop=True)
)
head(df_all)

11 cols x 6770666 rows


Unnamed: 0,year,county,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,1997,BOULDER,1997-03-14,1,3,4,,15.0,Person,Fondling,Sex Offenses
1,1997,BOULDER,1997-07-02,3,7,2,21.0,14.0,Property,Arson,Arson
2,1997,KIT CARSON,1997-01-20,1,1,0,22.0,58.0,Person,Simple Assault,Assault Offenses


In [41]:
write_main(df_all, 'crime_records')

6770666

---
---
---

# Aggregating crime data

`crime_agg_category.csv`: 32 columns (crime counts broken down by offense **category**)

`crime_agg_name.csv`: 54 columns (crime counts broken down by offense **name**)

### Creates two aggregated datasets. Both include:
- First, ALL values in `crime_against`, `offense_name`, and `offense_category` are first renamed to shorter alternatives in *snake_case*. This was done in preparation for dummifying those columns, to make for friendly column names.
- Data is grouped by year and county, and include the following aggregated columns:
  - Crime count
  - Average age
  - Mode quarter (which quarter had the most crimes?)
  - Mode month (which month had the most crimes?)
  - Mode day of week (mon-fri => 1-7)
  - Mode hour of day (military time)
  - The original `crime_against` column was dummified, and summed during aggregation, to show total crime counts for each:
    - `against_person`
    - `against_society`
    - `against_property`
    - `not_a_crime`

#### Additional columns in `crime_agg_category` data:
- The original `offense_category` column was dummified, and then summed during aggregation, showing total crime counts broken up by offense category

#### Additional columns in `crime_agg_name` data:
- The original `offense_name` column was dummified, and then summed during aggregation, showing total crime counts broken up by offense name.

In [43]:
df_raw = read_main('crime_records')
head(df_raw)

11 cols x 6770666 rows


Unnamed: 0,year,county,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,1997,BOULDER,1997-03-14 00:00:00,1,3,4,,15.0,Person,Fondling,Sex Offenses
1,1997,BOULDER,1997-07-02 00:00:00,3,7,2,21.0,14.0,Property,Arson,Arson
2,1997,KIT CARSON,1997-01-20 00:00:00,1,1,0,22.0,58.0,Person,Simple Assault,Assault Offenses


#### Remap all values in categorical columns based on excel sheet
- We created 3 tables by hand in excel to rename EACH value in offense_name, offense_category and crime_against. This needed to be done in order to create dummy columns with friendly names.

In [44]:
read_sht = lambda sheet: pd.read_excel('column_renaming.xlsx', sheet_name=sheet)

display( read_sht('offense_name'), read_sht('offense_category'), read_sht('crime_against'))


Unnamed: 0,OLD,NEW
0,Simple Assault,assault_simple
1,Intimidation,intimidation
2,Fondling,fondling
3,Rape,rape
4,Impersonation,impersonation
5,Robbery,robbery
6,Arson,arson
7,Destruction/Damage/Vandalism of Property,property_damage
8,Theft From Motor Vehicle,theft_from_vehicle
9,Burglary/Breaking & Entering,burglary


Unnamed: 0,OLD,NEW
0,Assault Offenses,assault
1,Sex Offenses,sex_offense
2,Fraud Offenses,fraud
3,Robbery,robbery
4,Arson,arson
5,Destruction/Damage/Vandalism of Property,property_damage
6,Larceny/Theft Offenses,larceny_theft
7,Burglary/Breaking & Entering,burglary
8,Homicide Offenses,homicide
9,Drug/Narcotic Offenses,drug


Unnamed: 0,OLD,NEW
0,Person,against_person
1,Property,against_property
2,Society,against_society
3,Not a Crime,not_a_crime


In [45]:
df_refactored = (df_raw.copy()
    .rename_vals_from_df("offense_name", read_sht("offense_name"))
    .rename_vals_from_df("offense_category", read_sht("offense_category"))
    .rename_vals_from_df("crime_against", read_sht("crime_against"))
)
head(df_refactored)

11 cols x 6770666 rows


Unnamed: 0,year,county,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,1997,BOULDER,1997-03-14 00:00:00,1,3,4,,15.0,against_person,fondling,sex_offense
1,1997,BOULDER,1997-07-02 00:00:00,3,7,2,21.0,14.0,against_property,arson,arson
2,1997,KIT CARSON,1997-01-20 00:00:00,1,1,0,22.0,58.0,against_person,assault_simple,assault


### Aggregated datasets
1. Version 1: includes crime_category dummy sums
2. Version 2: includes crime_name dummy sums

In [46]:
INDEX = ['year', 'county']
df = df_refactored.copy()

@Extend(pd.DataFrame)
def dummies_special(self, include:str, exclude:str) -> pd.DataFrame:
    return (
        pd.get_dummies(self,
            columns=['crime_against', include],
            prefix="", prefix_sep=""
        )
        .drop_cols(exclude, 'date', 'quarter', 'month', 'day_of_week', 'hour', 'age')
        .groupby(INDEX)
        .sum()
        .reset_index()
    )

dum_cat = df.dummies_special('offense_category', 'offense_name')
dum_name = df.dummies_special('offense_name', 'offense_category')

In [47]:
# Convert these to modes
mode_cols = ['quarter', 'month', 'day_of_week', 'hour']
df_modes = df[INDEX + mode_cols]

# pd.Series.mode() is tough. When there's multiple modes, the cell value
# becomes a np.ndarray. So there's mixed values... You can't even safely index
# it because sometimes those arrays are EMPTY :(
# I applied the following function to fix this.
def first_in_list(x):
    """ pd.Series.mode returns ndarray when multiple modes. Safely convert to float """
    if type(x) == np.ndarray:
        if x.size > 0:
            return float(x[0])
        return np.nan
    return float(x)

# df.applymap() is just like apply but instead of acting on an axis, it acts on each cell in df
df_modes = (df_modes
    .groupby(INDEX)
    .agg(pd.Series.mode)
    .applymap(first_in_list)
    .reset_index()
    .rename(columns={c: f'{c}_mode' for c in mode_cols})
)

head(df_modes)

6 cols x 1397 rows


Unnamed: 0,year,county,quarter_mode,month_mode,day_of_week_mode,hour_mode
0,1997,ADAMS,1.0,3.0,0.0,17.0
1,1997,ALAMOSA,3.0,8.0,5.0,18.0
2,1997,ARAPAHOE,3.0,8.0,4.0,18.0


In [48]:
# Convert count and average
df_count = (df
    [INDEX + ['date']]
    .groupby(INDEX)
    .count()
    .reset_index()
    .rename(columns={'date': 'cr_count'})
)
df_avg = (df
    [INDEX + ['age']]
    .groupby(INDEX)
    .mean()
    .reset_index()
    .rename(columns={'age': 'age_avg'})
)
head(df_count, df_avg)

3 cols x 1397 rows


Unnamed: 0,year,county,cr_count
0,1997,ADAMS,22947
1,1997,ALAMOSA,404
2,1997,ARAPAHOE,37555


3 cols x 1397 rows


Unnamed: 0,year,county,age_avg
0,1997,ADAMS,24.582071
1,1997,ALAMOSA,27.098901
2,1997,ARAPAHOE,25.209156


#### Stitching everything together

In [49]:
# Numerical aggregations: counts, avgs, modes
df = (df_count
    .merge(df_avg, on=INDEX)
    .merge(df_modes, on=INDEX)
)
head(df)

8 cols x 1397 rows


Unnamed: 0,year,county,cr_count,age_avg,quarter_mode,month_mode,day_of_week_mode,hour_mode
0,1997,ADAMS,22947,24.582071,1.0,3.0,0.0,17.0
1,1997,ALAMOSA,404,27.098901,3.0,8.0,5.0,18.0
2,1997,ARAPAHOE,37555,25.209156,3.0,8.0,4.0,18.0


### Bring in total county population by year. This will be needed to calculate proportions

In [50]:
pop = read_main("""
    year,
    county,
    total as pop
    """,
    FROM="county_population",
    WHERE="year >= 1997 AND year <= 2019"
)

head(pop)

3 cols x 1472 rows


Unnamed: 0,year,county,pop
0,1997,ADAMS,167740.0
1,1997,ALAMOSA,7173.0
2,1997,ARAPAHOE,225524.0


In [51]:
# Summed aggregations for dummies: 2 versions
def create_merged_version(df, dummy_df) -> pd.DataFrame:
    global pop
    return (df
        .merge(dummy_df, on=INDEX)
        .merge(pop, on=INDEX)
        .move_col('pop', 2)
    )
df_cat = create_merged_version(df, dum_cat)
df_name = create_merged_version(df, dum_name)
head(df_cat, df_name)

33 cols x 1397 rows


Unnamed: 0,year,county,pop,cr_count,age_avg,quarter_mode,month_mode,day_of_week_mode,hour_mode,against_person,...,kidnapping,larceny_theft,porn,property_damage,prostitution,robbery,sex_offense,stolen_property,vehicle_theft,weapon_law
0,1997,ADAMS,167740.0,22947,24.582071,1.0,3.0,0.0,17.0,3047,...,50,8023,1,5467,14,189,316,245,1317,274
1,1997,ALAMOSA,7173.0,404,27.098901,3.0,8.0,5.0,18.0,101,...,0,165,0,73,0,0,9,0,6,4
2,1997,ARAPAHOE,225524.0,37555,25.209156,3.0,8.0,4.0,18.0,4568,...,249,14345,0,5856,198,238,434,421,2537,695


55 cols x 1397 rows


Unnamed: 0,year,county,pop,cr_count,age_avg,quarter_mode,month_mode,day_of_week_mode,hour_mode,against_person,...,shoplifting,sodomy,stolen_property,theft_from_building,theft_from_vehicle,theft_from_vending_machine,vehicle_part_theft,vehicle_theft,weapon_law,wire_fraud
0,1997,ADAMS,167740.0,22947,24.582071,1.0,3.0,0.0,17.0,3047,...,1323,3,245,1232,1733,53,1293,1317,274,0
1,1997,ALAMOSA,7173.0,404,27.098901,3.0,8.0,5.0,18.0,101,...,0,0,0,14,7,10,1,6,4,1
2,1997,ARAPAHOE,225524.0,37555,25.209156,3.0,8.0,4.0,18.0,4568,...,4134,0,421,2283,3867,72,677,2537,695,2


In [52]:
INDEX = ['year', 'county']

BASE_COLS = ['pop', 'cr_count', 'age_avg', 'quarter_mode', 'month_mode', 'day_of_week_mode', 'hour_mode', 'against_person', 'against_property', 'against_society', 'not_a_crime']

df_base = (df_cat
    .copy()
    [INDEX + BASE_COLS]
    .insert_at(5, 'cr_rate', df_cat.cr_count / df_cat['pop'] * 100_000)
)
for i in ['against_person', 'against_property', 'against_society']:
    df_base[f'{i}_rate'] = df_base[i] / df_base['pop'] * 100_000

df_cat = df_cat.drop(columns=BASE_COLS)
df_name = df_name.drop(columns=BASE_COLS)

head(df_base, df_cat, df_name)

17 cols x 1397 rows


Unnamed: 0,year,county,pop,cr_count,age_avg,cr_rate,quarter_mode,month_mode,day_of_week_mode,hour_mode,against_person,against_property,against_society,not_a_crime,against_person_rate,against_property_rate,against_society_rate
0,1997,ADAMS,167740.0,22947,24.582071,13680.100155,1.0,3.0,0.0,17.0,3047,17766,2134,0,1816.501729,10591.391439,1272.206987
1,1997,ALAMOSA,7173.0,404,27.098901,5632.231981,3.0,8.0,5.0,18.0,101,264,39,0,1408.057995,3680.468423,543.705563
2,1997,ARAPAHOE,225524.0,37555,25.209156,16652.329686,3.0,8.0,4.0,18.0,4568,28573,4414,0,2025.505046,12669.605009,1957.219631


22 cols x 1397 rows


Unnamed: 0,year,county,arson,assault,bribery,burglary,drug,embezzlement,extortion,forgery,...,kidnapping,larceny_theft,porn,property_damage,prostitution,robbery,sex_offense,stolen_property,vehicle_theft,weapon_law
0,1997,ADAMS,86,2672,2,1931,1845,26,5,244,...,50,8023,1,5467,14,189,316,245,1317,274
1,1997,ALAMOSA,1,92,0,13,35,0,0,3,...,0,165,0,73,0,0,9,0,6,4
2,1997,ARAPAHOE,138,3872,0,3200,3514,191,12,672,...,249,14345,0,5856,198,238,434,421,2537,695


44 cols x 1397 rows


Unnamed: 0,year,county,arson,assault_aggravated,assault_simple,bribery,burglary,credit_card_machine_fraud,drug_equipment,drug_narcotic,...,shoplifting,sodomy,stolen_property,theft_from_building,theft_from_vehicle,theft_from_vending_machine,vehicle_part_theft,vehicle_theft,weapon_law,wire_fraud
0,1997,ADAMS,86,602,1954,2,1931,44,725,1120,...,1323,3,245,1232,1733,53,1293,1317,274,0
1,1997,ALAMOSA,1,14,72,0,13,0,8,27,...,0,0,0,14,7,10,1,6,4,1
2,1997,ARAPAHOE,138,671,2956,0,3200,194,916,2598,...,4134,0,421,2283,3867,72,677,2537,695,2


### Calculate rates and separate them

In [53]:
def add_rates(df):
    result = df.copy()[INDEX]
    for c in [c for c in df.columns if c not in df_base.columns]:
        result[f'{c}_rate'] = df[c] / df_base['pop'] * 100_000
    return result

df_cat_rate = add_rates(df_cat)
df_name_rate = add_rates(df_name)
df_base = df_base.drop_cols('pop')

In [54]:
df_base_rate = df_base[INDEX + ['cr_rate', 'age_avg', 'quarter_mode', 'month_mode', 'day_of_week_mode', 'hour_mode', 'against_person_rate', 'against_property_rate', 'against_society_rate']]
df_base_count = df_base[INDEX + ['cr_count', 'against_person', 'against_property', 'against_society', 'not_a_crime']]

In [55]:
head(df_base_rate)

11 cols x 1397 rows


Unnamed: 0,year,county,cr_rate,age_avg,quarter_mode,month_mode,day_of_week_mode,hour_mode,against_person_rate,against_property_rate,against_society_rate
0,1997,ADAMS,13680.100155,24.582071,1.0,3.0,0.0,17.0,1816.501729,10591.391439,1272.206987
1,1997,ALAMOSA,5632.231981,27.098901,3.0,8.0,5.0,18.0,1408.057995,3680.468423,543.705563
2,1997,ARAPAHOE,16652.329686,25.209156,3.0,8.0,4.0,18.0,2025.505046,12669.605009,1957.219631


In [56]:
head(df_base_count)

7 cols x 1397 rows


Unnamed: 0,year,county,cr_count,against_person,against_property,against_society,not_a_crime
0,1997,ADAMS,22947,3047,17766,2134,0
1,1997,ALAMOSA,404,101,264,39,0
2,1997,ARAPAHOE,37555,4568,28573,4414,0


In [57]:
head(df_cat)

22 cols x 1397 rows


Unnamed: 0,year,county,arson,assault,bribery,burglary,drug,embezzlement,extortion,forgery,...,kidnapping,larceny_theft,porn,property_damage,prostitution,robbery,sex_offense,stolen_property,vehicle_theft,weapon_law
0,1997,ADAMS,86,2672,2,1931,1845,26,5,244,...,50,8023,1,5467,14,189,316,245,1317,274
1,1997,ALAMOSA,1,92,0,13,35,0,0,3,...,0,165,0,73,0,0,9,0,6,4
2,1997,ARAPAHOE,138,3872,0,3200,3514,191,12,672,...,249,14345,0,5856,198,238,434,421,2537,695


In [58]:
head(df_cat_rate)

22 cols x 1397 rows


Unnamed: 0,year,county,arson_rate,assault_rate,bribery_rate,burglary_rate,drug_rate,embezzlement_rate,extortion_rate,forgery_rate,...,kidnapping_rate,larceny_theft_rate,porn_rate,property_damage_rate,prostitution_rate,robbery_rate,sex_offense_rate,stolen_property_rate,vehicle_theft_rate,weapon_law_rate
0,1997,ADAMS,51.269822,1592.941457,1.192321,1151.18636,1099.916537,15.500179,2.980804,145.463217,...,29.808036,4782.997496,0.596161,3259.210683,8.34625,112.674377,188.386789,146.059378,785.143675,163.348039
1,1997,ALAMOSA,13.941168,1282.587481,0.0,181.235188,487.940889,0.0,0.0,41.823505,...,0.0,2300.292765,0.0,1017.705284,0.0,0.0,125.470514,0.0,83.64701,55.764673
2,1997,ARAPAHOE,61.190827,1716.890442,0.0,1418.917721,1558.149022,84.691651,5.320941,297.972721,...,110.409535,6360.742094,0.0,2596.619429,87.795534,105.532005,192.440716,186.676363,1124.935705,308.171192


In [59]:
head(df_name)

44 cols x 1397 rows


Unnamed: 0,year,county,arson,assault_aggravated,assault_simple,bribery,burglary,credit_card_machine_fraud,drug_equipment,drug_narcotic,...,shoplifting,sodomy,stolen_property,theft_from_building,theft_from_vehicle,theft_from_vending_machine,vehicle_part_theft,vehicle_theft,weapon_law,wire_fraud
0,1997,ADAMS,86,602,1954,2,1931,44,725,1120,...,1323,3,245,1232,1733,53,1293,1317,274,0
1,1997,ALAMOSA,1,14,72,0,13,0,8,27,...,0,0,0,14,7,10,1,6,4,1
2,1997,ARAPAHOE,138,671,2956,0,3200,194,916,2598,...,4134,0,421,2283,3867,72,677,2537,695,2


In [60]:
head(df_name_rate)

44 cols x 1397 rows


Unnamed: 0,year,county,arson_rate,assault_aggravated_rate,assault_simple_rate,bribery_rate,burglary_rate,credit_card_machine_fraud_rate,drug_equipment_rate,drug_narcotic_rate,...,shoplifting_rate,sodomy_rate,stolen_property_rate,theft_from_building_rate,theft_from_vehicle_rate,theft_from_vending_machine_rate,vehicle_part_theft_rate,vehicle_theft_rate,weapon_law_rate,wire_fraud_rate
0,1997,ADAMS,51.269822,358.888756,1164.898057,1.192321,1151.18636,26.231072,432.216526,667.700012,...,788.720639,1.788482,146.059378,734.470013,1033.146536,31.596518,770.835817,785.143675,163.348039,0.0
1,1997,ALAMOSA,13.941168,195.176356,1003.764115,0.0,181.235188,0.0,111.529346,376.411543,...,0.0,0.0,0.0,195.176356,97.588178,139.411683,13.941168,83.64701,55.764673,13.941168
2,1997,ARAPAHOE,61.190827,297.52931,1310.725244,0.0,1418.917721,86.021887,406.165197,1151.983824,...,1833.06433,0.0,186.676363,1012.309111,1714.673383,31.925649,300.18978,1124.935705,308.171192,0.886824


### Output

In [61]:
write_main(df_base_count, 'crime_agg_base_count')
write_main(df_base_rate, 'crime_agg_base_rate')
write_main(df_cat, 'crime_agg_category')
write_main(df_name, 'crime_agg_name')
write_main(df_cat_rate, 'crime_agg_category_rate')
write_main(df_name_rate, 'crime_agg_name_rate')

1397

---
---
# `7-prepare_for_dashboard`

[BACK TO TOP ^](#top-of-page)

---
---


In [3]:
dem_norm = read_main('county_stats_normalized')
dem_cnt = read_main('county_stats_counts')

edu_norm = read_main('education_county_rates')
edu_cnt = read_main('education_county_counts')

cr_base_rate = read_main('crime_agg_base_rate')
cr_base_count = read_main('crime_agg_base_count')
cr_cat = read_main('crime_agg_category')
cr_name = read_main('crime_agg_name')
cr_cat_rate = read_main('crime_agg_category_rate')
cr_name_rate = read_main('crime_agg_name_rate')

head(dem_norm, dem_cnt, edu_norm, edu_cnt, cr_base_count, cr_base_rate, cr_cat, cr_name, cr_cat_rate, cr_name_rate)

58 cols x 512 rows


Unnamed: 0,year,county,age_over18_prop,age_undr19_prop,gend_m_prop,gend_f_prop,age_undr19_gend_m_prop,age_undr19_gend_f_prop,age_over18_gend_m_prop,age_over18_gend_f_prop,...,hu_per_cap,age_median,inc_hh_median,hh_size_avg,hu_oo_freq_val_ord,hu_oo_freq_val,hu_blt_freq_yr_ord,hu_blt_freq_yr,med_hm_val,med_yr_blt
0,2012,ADAMS,0.70004,0.29996,2.104797,0.87213,2.341611,0.953744,2.003325,0.837159,...,0.704946,32.4,56633.0,2.91,3,v150k_250k,7,b2000_2009,188100.0,1983.0
1,2012,ALAMOSA,0.71865,0.28135,2.18778,0.803145,2.61154,0.928669,2.021878,0.754002,...,0.840087,32.2,38045.0,2.49,2,v50k_150k,1,b1949_e,133200.0,1974.0
2,2012,ARAPAHOE,0.725375,0.274625,2.279007,0.797066,2.828618,0.951183,2.070926,0.738717,...,0.814089,35.7,60400.0,2.55,3,v150k_250k,4,b1970_1979,230900.0,1982.0


51 cols x 512 rows


Unnamed: 0,year,county,pop,gend_m,gend_f,age_over18,age_undr19,gend_m_age_undr19,gend_f_age_undr19,gend_m_age_over18,...,hu_blt_1990_1999,hu_blt_1980_1989,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_lt_1950,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012,ADAMS,231571.0,487410.0,201960.0,162109.0,69462.0,162653.0,66249.0,324757.0,...,27598.0,20368.0,30185.0,19615.0,20369.0,6158.0,438171.0,62008.0,124375.0,25278.0
1,2012,ALAMOSA,7823.0,17115.0,6283.0,5622.0,2201.0,5748.0,2044.0,11367.0,...,866.0,862.0,1405.0,654.0,591.0,1536.0,14622.0,3191.0,3817.0,758.0
2,2012,ARAPAHOE,292548.0,666719.0,233180.0,212207.0,80341.0,227254.0,76419.0,439465.0,...,33989.0,56011.0,62253.0,22258.0,16519.0,7165.0,568999.0,66945.0,144576.0,23054.0


60 cols x 63 rows


Unnamed: 0,county,stable_rate,mobile_rate,mobile_instances_rate,disabled_stable_rate,disabled_mobile_rate,disabled_mobile_instances_rate,disabled_graduated_rate,disabled_completed_rate,limited_eng_stable_rate,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,ADAMS,68.26,31.68,34.43,70.78,29.25,32.73,47.54,50.1,69.99,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,ALAMOSA,67.82,31.89,34.23,71.3,28.25,29.6,86.67,93.33,72.01,...,35.54,57.14,42.86,42.86,70.15,29.47,32.69,64.0,36.0,36.0
2,ARAPAHOE,75.51,24.18,25.89,79.89,19.88,21.69,51.26,52.06,74.94,...,21.16,67.47,32.03,34.57,72.76,26.75,29.2,78.59,21.3,22.56


78 cols x 63 rows


Unnamed: 0,county,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,disabled_mobile_instances,limited_eng_pupil_total,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,ADAMS,98546,67272,31222,33925,8848,6263,2588,2896,20773,...,33,935,529,559,360,190,204,402,337,345
1,ALAMOSA,2775,1882,885,950,223,159,63,66,368,...,4,28,22,23,6,6,6,0,0,0
2,ARAPAHOE,124639,94109,30134,32269,11842,9461,2354,2568,25370,...,9,488,202,213,243,96,102,909,820,828


7 cols x 1397 rows


Unnamed: 0,year,county,cr_count,against_person,against_property,against_society,not_a_crime
0,1997,ADAMS,22947,3047,17766,2134,0
1,1997,ALAMOSA,404,101,264,39,0
2,1997,ARAPAHOE,37555,4568,28573,4414,0


11 cols x 1397 rows


Unnamed: 0,year,county,cr_rate,age_avg,quarter_mode,month_mode,day_of_week_mode,hour_mode,against_person_rate,against_property_rate,against_society_rate
0,1997,ADAMS,13680.100155,24.582071,1.0,3.0,0.0,17.0,1816.501729,10591.391439,1272.206987
1,1997,ALAMOSA,5632.231981,27.098901,3.0,8.0,5.0,18.0,1408.057995,3680.468423,543.705563
2,1997,ARAPAHOE,16652.329686,25.209156,3.0,8.0,4.0,18.0,2025.505046,12669.605009,1957.219631


22 cols x 1397 rows


Unnamed: 0,year,county,arson,assault,bribery,burglary,drug,embezzlement,extortion,forgery,...,kidnapping,larceny_theft,porn,property_damage,prostitution,robbery,sex_offense,stolen_property,vehicle_theft,weapon_law
0,1997,ADAMS,86,2672,2,1931,1845,26,5,244,...,50,8023,1,5467,14,189,316,245,1317,274
1,1997,ALAMOSA,1,92,0,13,35,0,0,3,...,0,165,0,73,0,0,9,0,6,4
2,1997,ARAPAHOE,138,3872,0,3200,3514,191,12,672,...,249,14345,0,5856,198,238,434,421,2537,695


44 cols x 1397 rows


Unnamed: 0,year,county,arson,assault_aggravated,assault_simple,bribery,burglary,credit_card_machine_fraud,drug_equipment,drug_narcotic,...,shoplifting,sodomy,stolen_property,theft_from_building,theft_from_vehicle,theft_from_vending_machine,vehicle_part_theft,vehicle_theft,weapon_law,wire_fraud
0,1997,ADAMS,86,602,1954,2,1931,44,725,1120,...,1323,3,245,1232,1733,53,1293,1317,274,0
1,1997,ALAMOSA,1,14,72,0,13,0,8,27,...,0,0,0,14,7,10,1,6,4,1
2,1997,ARAPAHOE,138,671,2956,0,3200,194,916,2598,...,4134,0,421,2283,3867,72,677,2537,695,2


22 cols x 1397 rows


Unnamed: 0,year,county,arson_rate,assault_rate,bribery_rate,burglary_rate,drug_rate,embezzlement_rate,extortion_rate,forgery_rate,...,kidnapping_rate,larceny_theft_rate,porn_rate,property_damage_rate,prostitution_rate,robbery_rate,sex_offense_rate,stolen_property_rate,vehicle_theft_rate,weapon_law_rate
0,1997,ADAMS,51.269822,1592.941457,1.192321,1151.18636,1099.916537,15.500179,2.980804,145.463217,...,29.808036,4782.997496,0.596161,3259.210683,8.34625,112.674377,188.386789,146.059378,785.143675,163.348039
1,1997,ALAMOSA,13.941168,1282.587481,0.0,181.235188,487.940889,0.0,0.0,41.823505,...,0.0,2300.292765,0.0,1017.705284,0.0,0.0,125.470514,0.0,83.64701,55.764673
2,1997,ARAPAHOE,61.190827,1716.890442,0.0,1418.917721,1558.149022,84.691651,5.320941,297.972721,...,110.409535,6360.742094,0.0,2596.619429,87.795534,105.532005,192.440716,186.676363,1124.935705,308.171192


44 cols x 1397 rows


Unnamed: 0,year,county,arson_rate,assault_aggravated_rate,assault_simple_rate,bribery_rate,burglary_rate,credit_card_machine_fraud_rate,drug_equipment_rate,drug_narcotic_rate,...,shoplifting_rate,sodomy_rate,stolen_property_rate,theft_from_building_rate,theft_from_vehicle_rate,theft_from_vending_machine_rate,vehicle_part_theft_rate,vehicle_theft_rate,weapon_law_rate,wire_fraud_rate
0,1997,ADAMS,51.269822,358.888756,1164.898057,1.192321,1151.18636,26.231072,432.216526,667.700012,...,788.720639,1.788482,146.059378,734.470013,1033.146536,31.596518,770.835817,785.143675,163.348039,0.0
1,1997,ALAMOSA,13.941168,195.176356,1003.764115,0.0,181.235188,0.0,111.529346,376.411543,...,0.0,0.0,0.0,195.176356,97.588178,139.411683,13.941168,83.64701,55.764673,13.941168
2,1997,ARAPAHOE,61.190827,297.52931,1310.725244,0.0,1418.917721,86.021887,406.165197,1151.983824,...,1833.06433,0.0,186.676363,1012.309111,1714.673383,31.925649,300.18978,1124.935705,308.171192,0.886824


In [4]:
INDEX = ['year', 'county']

def non_idx_cols(df, index=INDEX):
    return [col for col in df.columns if col not in index]

In [5]:
cr_name = cr_name.prefix_cols(non_idx_cols(cr_name), 'CRIME_NAME_COUNT__')
cr_cat = cr_cat.prefix_cols(non_idx_cols(cr_cat), 'CRIME_CATEGORY_COUNT__')

cr_name_rate = cr_name_rate.prefix_cols(non_idx_cols(cr_name_rate), 'CRIME_NAME_NORM__')
cr_cat_rate = cr_cat_rate.prefix_cols(non_idx_cols(cr_cat_rate), 'CRIME_CATEGORY_NORM__')

cr_base_count = cr_base_count.prefix_cols(non_idx_cols(cr_base_count), 'CRIME_BASE_COUNT__')
cr_base_rate = cr_base_rate.prefix_cols(non_idx_cols(cr_base_rate), 'CRIME_BASE_NORM__')

dem_cnt = dem_cnt.prefix_cols(non_idx_cols(dem_cnt), 'CENSUS_COUNT__')
dem_norm = dem_norm.prefix_cols(non_idx_cols(dem_norm), 'CENSUS_NORM__')

edu_cnt = edu_cnt.prefix_cols(non_idx_cols(edu_cnt, ['county']), 'EDU_COUNT__')
edu_norm = edu_norm.prefix_cols(non_idx_cols(edu_norm, ['county']), 'EDU_NORM__')

head(cr_name, cr_cat, cr_name_rate, cr_cat_rate, cr_base_count, cr_base_rate, dem_cnt, dem_norm, edu_cnt, edu_norm)

44 cols x 1397 rows


Unnamed: 0,year,county,CRIME_NAME_COUNT__arson,CRIME_NAME_COUNT__assault_aggravated,CRIME_NAME_COUNT__assault_simple,CRIME_NAME_COUNT__bribery,CRIME_NAME_COUNT__burglary,CRIME_NAME_COUNT__credit_card_machine_fraud,CRIME_NAME_COUNT__drug_equipment,CRIME_NAME_COUNT__drug_narcotic,...,CRIME_NAME_COUNT__shoplifting,CRIME_NAME_COUNT__sodomy,CRIME_NAME_COUNT__stolen_property,CRIME_NAME_COUNT__theft_from_building,CRIME_NAME_COUNT__theft_from_vehicle,CRIME_NAME_COUNT__theft_from_vending_machine,CRIME_NAME_COUNT__vehicle_part_theft,CRIME_NAME_COUNT__vehicle_theft,CRIME_NAME_COUNT__weapon_law,CRIME_NAME_COUNT__wire_fraud
0,1997,ADAMS,86,602,1954,2,1931,44,725,1120,...,1323,3,245,1232,1733,53,1293,1317,274,0
1,1997,ALAMOSA,1,14,72,0,13,0,8,27,...,0,0,0,14,7,10,1,6,4,1
2,1997,ARAPAHOE,138,671,2956,0,3200,194,916,2598,...,4134,0,421,2283,3867,72,677,2537,695,2


22 cols x 1397 rows


Unnamed: 0,year,county,CRIME_CATEGORY_COUNT__arson,CRIME_CATEGORY_COUNT__assault,CRIME_CATEGORY_COUNT__bribery,CRIME_CATEGORY_COUNT__burglary,CRIME_CATEGORY_COUNT__drug,CRIME_CATEGORY_COUNT__embezzlement,CRIME_CATEGORY_COUNT__extortion,CRIME_CATEGORY_COUNT__forgery,...,CRIME_CATEGORY_COUNT__kidnapping,CRIME_CATEGORY_COUNT__larceny_theft,CRIME_CATEGORY_COUNT__porn,CRIME_CATEGORY_COUNT__property_damage,CRIME_CATEGORY_COUNT__prostitution,CRIME_CATEGORY_COUNT__robbery,CRIME_CATEGORY_COUNT__sex_offense,CRIME_CATEGORY_COUNT__stolen_property,CRIME_CATEGORY_COUNT__vehicle_theft,CRIME_CATEGORY_COUNT__weapon_law
0,1997,ADAMS,86,2672,2,1931,1845,26,5,244,...,50,8023,1,5467,14,189,316,245,1317,274
1,1997,ALAMOSA,1,92,0,13,35,0,0,3,...,0,165,0,73,0,0,9,0,6,4
2,1997,ARAPAHOE,138,3872,0,3200,3514,191,12,672,...,249,14345,0,5856,198,238,434,421,2537,695


44 cols x 1397 rows


Unnamed: 0,year,county,CRIME_NAME_NORM__arson_rate,CRIME_NAME_NORM__assault_aggravated_rate,CRIME_NAME_NORM__assault_simple_rate,CRIME_NAME_NORM__bribery_rate,CRIME_NAME_NORM__burglary_rate,CRIME_NAME_NORM__credit_card_machine_fraud_rate,CRIME_NAME_NORM__drug_equipment_rate,CRIME_NAME_NORM__drug_narcotic_rate,...,CRIME_NAME_NORM__shoplifting_rate,CRIME_NAME_NORM__sodomy_rate,CRIME_NAME_NORM__stolen_property_rate,CRIME_NAME_NORM__theft_from_building_rate,CRIME_NAME_NORM__theft_from_vehicle_rate,CRIME_NAME_NORM__theft_from_vending_machine_rate,CRIME_NAME_NORM__vehicle_part_theft_rate,CRIME_NAME_NORM__vehicle_theft_rate,CRIME_NAME_NORM__weapon_law_rate,CRIME_NAME_NORM__wire_fraud_rate
0,1997,ADAMS,51.269822,358.888756,1164.898057,1.192321,1151.18636,26.231072,432.216526,667.700012,...,788.720639,1.788482,146.059378,734.470013,1033.146536,31.596518,770.835817,785.143675,163.348039,0.0
1,1997,ALAMOSA,13.941168,195.176356,1003.764115,0.0,181.235188,0.0,111.529346,376.411543,...,0.0,0.0,0.0,195.176356,97.588178,139.411683,13.941168,83.64701,55.764673,13.941168
2,1997,ARAPAHOE,61.190827,297.52931,1310.725244,0.0,1418.917721,86.021887,406.165197,1151.983824,...,1833.06433,0.0,186.676363,1012.309111,1714.673383,31.925649,300.18978,1124.935705,308.171192,0.886824


22 cols x 1397 rows


Unnamed: 0,year,county,CRIME_CATEGORY_NORM__arson_rate,CRIME_CATEGORY_NORM__assault_rate,CRIME_CATEGORY_NORM__bribery_rate,CRIME_CATEGORY_NORM__burglary_rate,CRIME_CATEGORY_NORM__drug_rate,CRIME_CATEGORY_NORM__embezzlement_rate,CRIME_CATEGORY_NORM__extortion_rate,CRIME_CATEGORY_NORM__forgery_rate,...,CRIME_CATEGORY_NORM__kidnapping_rate,CRIME_CATEGORY_NORM__larceny_theft_rate,CRIME_CATEGORY_NORM__porn_rate,CRIME_CATEGORY_NORM__property_damage_rate,CRIME_CATEGORY_NORM__prostitution_rate,CRIME_CATEGORY_NORM__robbery_rate,CRIME_CATEGORY_NORM__sex_offense_rate,CRIME_CATEGORY_NORM__stolen_property_rate,CRIME_CATEGORY_NORM__vehicle_theft_rate,CRIME_CATEGORY_NORM__weapon_law_rate
0,1997,ADAMS,51.269822,1592.941457,1.192321,1151.18636,1099.916537,15.500179,2.980804,145.463217,...,29.808036,4782.997496,0.596161,3259.210683,8.34625,112.674377,188.386789,146.059378,785.143675,163.348039
1,1997,ALAMOSA,13.941168,1282.587481,0.0,181.235188,487.940889,0.0,0.0,41.823505,...,0.0,2300.292765,0.0,1017.705284,0.0,0.0,125.470514,0.0,83.64701,55.764673
2,1997,ARAPAHOE,61.190827,1716.890442,0.0,1418.917721,1558.149022,84.691651,5.320941,297.972721,...,110.409535,6360.742094,0.0,2596.619429,87.795534,105.532005,192.440716,186.676363,1124.935705,308.171192


7 cols x 1397 rows


Unnamed: 0,year,county,CRIME_BASE_COUNT__cr_count,CRIME_BASE_COUNT__against_person,CRIME_BASE_COUNT__against_property,CRIME_BASE_COUNT__against_society,CRIME_BASE_COUNT__not_a_crime
0,1997,ADAMS,22947,3047,17766,2134,0
1,1997,ALAMOSA,404,101,264,39,0
2,1997,ARAPAHOE,37555,4568,28573,4414,0


11 cols x 1397 rows


Unnamed: 0,year,county,CRIME_BASE_NORM__cr_rate,CRIME_BASE_NORM__age_avg,CRIME_BASE_NORM__quarter_mode,CRIME_BASE_NORM__month_mode,CRIME_BASE_NORM__day_of_week_mode,CRIME_BASE_NORM__hour_mode,CRIME_BASE_NORM__against_person_rate,CRIME_BASE_NORM__against_property_rate,CRIME_BASE_NORM__against_society_rate
0,1997,ADAMS,13680.100155,24.582071,1.0,3.0,0.0,17.0,1816.501729,10591.391439,1272.206987
1,1997,ALAMOSA,5632.231981,27.098901,3.0,8.0,5.0,18.0,1408.057995,3680.468423,543.705563
2,1997,ARAPAHOE,16652.329686,25.209156,3.0,8.0,4.0,18.0,2025.505046,12669.605009,1957.219631


51 cols x 512 rows


Unnamed: 0,year,county,CENSUS_COUNT__pop,CENSUS_COUNT__gend_m,CENSUS_COUNT__gend_f,CENSUS_COUNT__age_over18,CENSUS_COUNT__age_undr19,CENSUS_COUNT__gend_m_age_undr19,CENSUS_COUNT__gend_f_age_undr19,CENSUS_COUNT__gend_m_age_over18,...,CENSUS_COUNT__hu_blt_1990_1999,CENSUS_COUNT__hu_blt_1980_1989,CENSUS_COUNT__hu_blt_1970_1979,CENSUS_COUNT__hu_blt_1960_1969,CENSUS_COUNT__hu_blt_1950_1959,CENSUS_COUNT__hu_blt_lt_1950,CENSUS_COUNT__ps_known,CENSUS_COUNT__ps_below,CENSUS_COUNT__ps_undr18_known,CENSUS_COUNT__ps_undr18_below
0,2012,ADAMS,231571.0,487410.0,201960.0,162109.0,69462.0,162653.0,66249.0,324757.0,...,27598.0,20368.0,30185.0,19615.0,20369.0,6158.0,438171.0,62008.0,124375.0,25278.0
1,2012,ALAMOSA,7823.0,17115.0,6283.0,5622.0,2201.0,5748.0,2044.0,11367.0,...,866.0,862.0,1405.0,654.0,591.0,1536.0,14622.0,3191.0,3817.0,758.0
2,2012,ARAPAHOE,292548.0,666719.0,233180.0,212207.0,80341.0,227254.0,76419.0,439465.0,...,33989.0,56011.0,62253.0,22258.0,16519.0,7165.0,568999.0,66945.0,144576.0,23054.0


58 cols x 512 rows


Unnamed: 0,year,county,CENSUS_NORM__age_over18_prop,CENSUS_NORM__age_undr19_prop,CENSUS_NORM__gend_m_prop,CENSUS_NORM__gend_f_prop,CENSUS_NORM__age_undr19_gend_m_prop,CENSUS_NORM__age_undr19_gend_f_prop,CENSUS_NORM__age_over18_gend_m_prop,CENSUS_NORM__age_over18_gend_f_prop,...,CENSUS_NORM__hu_per_cap,CENSUS_NORM__age_median,CENSUS_NORM__inc_hh_median,CENSUS_NORM__hh_size_avg,CENSUS_NORM__hu_oo_freq_val_ord,CENSUS_NORM__hu_oo_freq_val,CENSUS_NORM__hu_blt_freq_yr_ord,CENSUS_NORM__hu_blt_freq_yr,CENSUS_NORM__med_hm_val,CENSUS_NORM__med_yr_blt
0,2012,ADAMS,0.70004,0.29996,2.104797,0.87213,2.341611,0.953744,2.003325,0.837159,...,0.704946,32.4,56633.0,2.91,3,v150k_250k,7,b2000_2009,188100.0,1983.0
1,2012,ALAMOSA,0.71865,0.28135,2.18778,0.803145,2.61154,0.928669,2.021878,0.754002,...,0.840087,32.2,38045.0,2.49,2,v50k_150k,1,b1949_e,133200.0,1974.0
2,2012,ARAPAHOE,0.725375,0.274625,2.279007,0.797066,2.828618,0.951183,2.070926,0.738717,...,0.814089,35.7,60400.0,2.55,3,v150k_250k,4,b1970_1979,230900.0,1982.0


78 cols x 63 rows


Unnamed: 0,county,EDU_COUNT__pupil_total,EDU_COUNT__stable,EDU_COUNT__mobile,EDU_COUNT__mobile_instances,EDU_COUNT__disabled_pupil_total,EDU_COUNT__disabled_stable,EDU_COUNT__disabled_mobile,EDU_COUNT__disabled_mobile_instances,EDU_COUNT__limited_eng_pupil_total,...,EDU_COUNT__migrant_completed,EDU_COUNT__title_1_grad_base_total,EDU_COUNT__title_1_graduated,EDU_COUNT__title_1_completed,EDU_COUNT__homeless_grad_base_total,EDU_COUNT__homeless_graduated,EDU_COUNT__homeless_completed,EDU_COUNT__gifted_grad_base_total,EDU_COUNT__gifted_graduated,EDU_COUNT__gifted_completed
0,ADAMS,98546,67272,31222,33925,8848,6263,2588,2896,20773,...,33,935,529,559,360,190,204,402,337,345
1,ALAMOSA,2775,1882,885,950,223,159,63,66,368,...,4,28,22,23,6,6,6,0,0,0
2,ARAPAHOE,124639,94109,30134,32269,11842,9461,2354,2568,25370,...,9,488,202,213,243,96,102,909,820,828


60 cols x 63 rows


Unnamed: 0,county,EDU_NORM__stable_rate,EDU_NORM__mobile_rate,EDU_NORM__mobile_instances_rate,EDU_NORM__disabled_stable_rate,EDU_NORM__disabled_mobile_rate,EDU_NORM__disabled_mobile_instances_rate,EDU_NORM__disabled_graduated_rate,EDU_NORM__disabled_completed_rate,EDU_NORM__limited_eng_stable_rate,...,EDU_NORM__white_mobile_instances_rate,EDU_NORM__black_stable_rate,EDU_NORM__black_mobile_rate,EDU_NORM__black_mobile_instances_rate,EDU_NORM__hispanic_stable_rate,EDU_NORM__hispanic_mobile_rate,EDU_NORM__hispanic_mobile_instances_rate,EDU_NORM__asian_stable_rate,EDU_NORM__asian_mobile_rate,EDU_NORM__asian_mobile_instances_rate
0,ADAMS,68.26,31.68,34.43,70.78,29.25,32.73,47.54,50.1,69.99,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,ALAMOSA,67.82,31.89,34.23,71.3,28.25,29.6,86.67,93.33,72.01,...,35.54,57.14,42.86,42.86,70.15,29.47,32.69,64.0,36.0,36.0
2,ARAPAHOE,75.51,24.18,25.89,79.89,19.88,21.69,51.26,52.06,74.94,...,21.16,67.47,32.03,34.57,72.76,26.75,29.2,78.59,21.3,22.56


In [6]:
df = (
    # CRIME
    cr_name
    .merge(cr_cat, on=INDEX)
    .merge(
        (
            cr_name_rate
            .merge(cr_cat_rate, on=INDEX)
        ),
        on=INDEX
    )
    .merge(
        (
            cr_base_count
            .merge(cr_base_rate, on=INDEX)
        ),
        on=INDEX
    )
    # DEMOGRAPHICS
    .merge(
        (
            dem_cnt
            .merge(dem_norm, on=INDEX)
        ),
        on=INDEX
    )
    # EDUCATION
    .merge(
        (
            edu_cnt
            .merge(edu_norm, on=['county'])
        ),
        on=['county']
    )
)

In [7]:
# Change all numbers to floats, and round
numeric_cols = df.select_dtypes(include=[np.number])
df[numeric_cols.columns] = numeric_cols.astype(float).round(2)
df.year = df.year.astype(int)
head(df)

381 cols x 490 rows


Unnamed: 0,year,county,CRIME_NAME_COUNT__arson,CRIME_NAME_COUNT__assault_aggravated,CRIME_NAME_COUNT__assault_simple,CRIME_NAME_COUNT__bribery,CRIME_NAME_COUNT__burglary,CRIME_NAME_COUNT__credit_card_machine_fraud,CRIME_NAME_COUNT__drug_equipment,CRIME_NAME_COUNT__drug_narcotic,...,EDU_NORM__white_mobile_instances_rate,EDU_NORM__black_stable_rate,EDU_NORM__black_mobile_rate,EDU_NORM__black_mobile_instances_rate,EDU_NORM__hispanic_stable_rate,EDU_NORM__hispanic_mobile_rate,EDU_NORM__hispanic_mobile_instances_rate,EDU_NORM__asian_stable_rate,EDU_NORM__asian_mobile_rate,EDU_NORM__asian_mobile_instances_rate
0,2012,ADAMS,105.0,1029.0,2835.0,2.0,2311.0,535.0,3211.0,3522.0,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,2013,ADAMS,73.0,899.0,2649.0,7.0,2144.0,437.0,1965.0,2204.0,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
2,2014,ADAMS,123.0,1033.0,2863.0,8.0,2073.0,526.0,1945.0,2145.0,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54


In [8]:
## TEMPORARY - REVERSE OUR WORK
df.columns = [c if '__' not in c else c.split('__')[1] for c in df.columns]
df = df.loc[:, ~ df.columns.duplicated()]
head(df)

353 cols x 490 rows


Unnamed: 0,year,county,arson,assault_aggravated,assault_simple,bribery,burglary,credit_card_machine_fraud,drug_equipment,drug_narcotic,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,2012,ADAMS,105.0,1029.0,2835.0,2.0,2311.0,535.0,3211.0,3522.0,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,2013,ADAMS,73.0,899.0,2649.0,7.0,2144.0,437.0,1965.0,2204.0,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
2,2014,ADAMS,123.0,1033.0,2863.0,8.0,2073.0,526.0,1945.0,2145.0,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54


In [9]:
write_main(df, 'county_EVERYTHING')

490

## District

In [10]:
df_dist = read_main('education_dist')
df_geo_dist = read_main('''
    d.district,
    c.county,
    d.geo_border AS geo_district_border,
    c.geo_border AS geo_county_border,
    d.geo_point AS geo_district_point,
    c.geo_point AS geo_county_point
FROM district AS d
INNER JOIN county AS c
    ON c.county = d.in_county
''')

In [11]:
df_dist = (df_geo_dist
    .merge(df_dist, on=['district'])
    [ ~ df_geo_dist.geo_district_border.isna()]
)

head(df_dist)

142 cols x 178 rows


Unnamed: 0,district,county,geo_district_border,geo_county_border,geo_district_point,geo_county_point,pupil_total,stable,mobile,mobile_instances,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,MAPLETON 1,ADAMS,MULTIPOLYGON (((-105.01581612299998 39.8144774...,MULTIPOLYGON (((-103.70574149517748 39.9999110...,GEOMETRYCOLLECTION EMPTY,GEOMETRYCOLLECTION EMPTY,9037,5077,3919,4133,...,52.08,48.04,51.4,53.63,60.31,39.19,42.19,47.22,52.78,53.7
1,ADAMSFIVESTAR 12,ADAMS,MULTIPOLYGON (((-105.05310614499996 39.9302934...,MULTIPOLYGON (((-103.70574149517748 39.9999110...,GEOMETRYCOLLECTION EMPTY,GEOMETRYCOLLECTION EMPTY,49889,34283,15424,16854,...,32.27,55.98,43.94,47.41,67.3,32.23,36.81,81.07,18.84,21.15
2,ADAMSCOUNTY 14,ADAMS,MULTIPOLYGON (((-104.96883410999999 39.7910064...,MULTIPOLYGON (((-103.70574149517748 39.9999110...,GEOMETRYCOLLECTION EMPTY,GEOMETRYCOLLECTION EMPTY,8265,5510,3038,3397,...,47.45,48.2,51.8,53.6,68.67,35.2,39.57,80.0,20.0,20.0


In [12]:
write_main(df_dist, 'district_EVERYTHING')

178

## Generate combined notebook output

In [13]:
from combine_notebooks import combine

combine(
    filename="8-combined (AUTO GENERATED FILE)",
)