These notebooks are used to compare a base and a scenario, from expanded surveys or model outputs, in H5 format. To run: from the menu bar above, choose **Cell -> Run All** or run cells individually. Use the toggle button below to hide/show the raw Python code.

## Person and Household

*Summaries:*

- Total People & Workers
- Household Size
- County Population
- Age Distribution
- Worker Types
- Distance to Work
- Transit Pass Ownership
- Vehicle Ownership

---

In [None]:
from IPython.display import HTML

# Render a script plus a button that hides/shows the notebook input cells
# (elements matching 'div.input'). code_toggle is registered to run when the
# document is ready, so the code cells start out hidden; each button click
# flips the state. The script uses the page's `$` object.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [None]:
import os
import numpy as np
import pandas as pd
import h5py
import pylab as P
from IPython.display import display, display_pretty, Javascript, HTML
from pandas_highcharts.core import serialize
from pandas_highcharts.display import display_charts
import matplotlib.pyplot as plt

# Show charts in notebook
%pylab inline

In [None]:
# Set main model directory to parent directory

# 2006 survey
survey06_dir = r'R:\SoundCast\releases\TransportationFutures2010\scripts\summarize'


# 2014 survey
# NOTE(review): this is the same directory as survey06_dir, and both datasets are
# read from 'survey-expanded.h5' inside it, so base and scenario would be the
# same file. Confirm whether the commented-out path below was intended.
survey14_dir = r'R:\SoundCast\releases\TransportationFutures2010\scripts\summarize'
# survey14_dir = r'R:\SoundCast\estimation\2014\P5'

In [None]:
# Read Model Scenario Results
# Opened read-only ('r'): this notebook only summarizes the data, so the
# survey file should not be opened in a mode that allows modifying it.
scen = h5py.File(survey06_dir + r'/survey-expanded.h5', 'r')
scen_name = '2006 Survey'

In [None]:
# Read Base Data
base_file = r'/survey-expanded.h5'
# base_dir = survey14_dir

# Opened read-only ('r'): this notebook only summarizes the data, so the
# survey file should not be opened in a mode that allows modifying it.
base = h5py.File(survey14_dir + base_file, 'r')
base_name = '2014 Survey'

In [None]:
def build_df(h5file, h5table, var_dict, survey_file=False):
    ''' Convert one table of an H5 file into a dataframe.

    h5file      -- open H5 file (any nested mapping of table -> field -> records works)
    h5table     -- name of the table to read, e.g. 'Person'
    var_dict    -- {output column name: field name inside the table}
    survey_file -- if True, every record of a field is itself a sequence and only
                   its first element is kept; if False, records are used as they are

    Returns a pandas DataFrame with one column per entry in var_dict.
    '''
    table = h5file[h5table]
    data = {}
    # .items() (not .iteritems()) so the function runs on both Python 2 and 3
    for col_name, var in var_dict.items():
        records = table[var][:]
        if survey_file:
            # survey h5 have nested data structure, different than daysim_outputs
            data[col_name] = [record[0] for record in records]
        else:
            data[col_name] = list(records)

    return pd.DataFrame(data)

In [None]:
# Trip table fields to read: {dataframe column name: H5 field name}
tripdict = {
    'Household ID': 'hhno',
    'Person Number': 'pno',
    'Travel Time': 'travtime',
    'Travel Cost': 'travcost',
    'Travel Distance': 'travdist',
    'Mode': 'mode',
    'Purpose': 'dpurp',
    'Departure Time': 'deptm',
    'Expansion Factor': 'trexpfac',
}

In [None]:
# List the top-level entries of the base H5 file (inspection only).
base.keys()

In [None]:
# trip_scen = build_df(h5file=scen, h5table='Trip', var_dict=tripdict, survey_file=False)
# trip_base = build_df(h5file=base, h5table='Trip', var_dict=tripdict, survey_file=False)

In [None]:
# Person table fields to read: {dataframe column name: H5 field name}
persondict={'Household ID': 'hhno',
            'Person Number': 'pno',
            'Transit Pass': 'ptpass',
            'Auto Time to Work': 'pwautime',
            'Auto Distance to Work': 'pwaudist',
            'Worker Type': 'pwtyp',
            'Student Type': 'pstyp',
            # Was 'pwtaz', which duplicated the Workplace TAZ field below.
            # NOTE(review): 'puwmode' is presumably the usual-commute-mode field of the
            # person table; confirm it is present in the survey H5 files.
            'Usual Commute Mode': 'puwmode',
            'Workplace TAZ': 'pwtaz',
            'Age': 'pagey',
            'Person Type': 'pptyp',
            'Expansion Factor': 'psexpfac'}

In [None]:
# Person records for both datasets; survey_file is left at its default (False).
person_scen = build_df(scen, 'Person', persondict)
person_base = build_df(base, 'Person', persondict)

In [None]:
# Build a person ID in each table by writing the household ID and the person
# number as text, joining the two strings, and converting the result to int.
for person_table in (person_scen, person_base):
    household_txt = person_table['Household ID'].astype('str')
    person_txt = person_table['Person Number'].astype('str')
    person_table['personID'] = (household_txt + person_txt).astype('int')

In [None]:
# Household table fields to read: {dataframe column name: H5 field name}
hhdict = {
    'Household ID': 'hhno',
    'Household Size': 'hhsize',
    'Household Vehicles': 'hhvehs',
    'Household Workers': 'hhwkrs',
    'Household Income': 'hhincome',
    'Household TAZ': 'hhtaz',
    'Expansion Factor': 'hhexpfac',
}

In [None]:
# Household records for both datasets; survey_file is left at its default (False).
hh_scen = build_df(scen, 'Household', hhdict)
hh_base = build_df(base, 'Household', hhdict)

In [None]:
# Add labels for worker type
labels = {
  0: "Not a worker",
  1: "Full-time worker",
  2: "Part-time worker",
}

# Values that are already label text (written by an earlier run of this cell)
# are kept as they are, so the cell can be re-run. Any other value that is
# missing from `labels` still raises KeyError.
label_text = set(labels.values())

person_base['Worker Type'] = [x if x in label_text else labels[x] for x in person_base['Worker Type']]
person_scen['Worker Type'] = [x if x in label_text else labels[x] for x in person_scen['Worker Type']]

In [None]:
# # Add labels for student type
# labels = {
#   0: "Not a student",  
#   1: "Full-time student",
#   2: "Part-time student",
# }

# person_base['Student Type'] = ([labels[x] for x in person_base['Student Type']])
# person_scen['Student Type'] = ([labels[x] for x in person_scen['Student Type']])

In [None]:
# Join household records to person records
hh_per_scen = pd.merge(left=person_scen, right=hh_scen,on='Household ID',suffixes=('_p','_h'))
hh_per_base = pd.merge(left=person_base, right=hh_base,on='Household ID',suffixes=('_p','_h'))

In [None]:
# Join household geography
taz_geog = pd.read_csv(r'utils/taz_lookup.csv')
taz_geog.reindex
hh_per_scen_home_geog = pd.merge(hh_per_scen, taz_geog, left_on='Household TAZ', right_on='TAZ')
hh_per_base_home_geog = pd.merge(hh_per_base, taz_geog, left_on='Household TAZ', right_on='TAZ')

In [None]:
# Join workplace geography
hh_per_scen_work_geog = pd.merge(hh_per_scen, taz_geog, left_on='Workplace TAZ', right_on='TAZ')
hh_per_base_work_geog = pd.merge(hh_per_base, taz_geog, left_on='Workplace TAZ', right_on='TAZ')

## Total People

In [None]:
# Number of person records in each dataset.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the bare print statement.
print(scen_name + ": "+ str(person_scen['personID'].count()).split('.')[0])
print(base_name + ": "+ str(person_base['personID'].count()).split('.')[0])

## Average Household Size

In [None]:
# Mean of the household size column over the household records of each dataset.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the bare print statement.
print(scen_name + ": "+ str(hh_scen['Household Size'].mean()))
print(base_name + ": "+ str(hh_base['Household Size'].mean()))

## Household Size Distribution

In [None]:
# Number of household records for each household size, one column per dataset.
scen_size_counts = hh_scen.groupby('Household Size')['Household ID'].count()
base_size_counts = hh_base.groupby('Household Size')['Household ID'].count()
df = pd.DataFrame([scen_size_counts, base_size_counts]).T
df.columns = [scen_name, base_name]
df

In [None]:
# Distribution: share of households in each household-size category.
# Each column is divided by its own total number of households so the shares
# sum to 1. (The previous denominator was the number of person records, which
# is not the total of these household counts.)
df_dist = pd.DataFrame([df[scen_name]/df[scen_name].sum(),
                   df[base_name]/df[base_name].sum()]).T
display_charts(df_dist, kind='bar', title='Household Size Distribution')

---

## Age Distribution

In [None]:
# 2006 survey age is continuous variable
# 2014 is categorical
from collections import OrderedDict


# lookup for 2014 names (which are averaged based on min and max of bin)
# Keys are the 2014 age codes, values are the bin labels; the order matters
# because the labels are reused, in this order, for the 2006 age bins.
age_cat = OrderedDict([(2, '   <5'),
                       (8, ' 5-11'),
                       (14, '12-15'),
                       (17, '16-17'),
                       (21, '18-24'),
                       (30, '25-34'),
                       (40, '35-44'),
                       (50, '45-54'),   # was mislabelled '44-54'
                       (60, '55-64'),
                       (70, '65-74'),
                       (80, '75-84'),
                       (90, '85+')])

# Values that are already label text (written by an earlier run of this cell)
# are kept as they are, so the cell can be re-run. Any other value that is
# missing from the lookup still raises KeyError.
age_label_text = set(age_cat.values())

# Iterating over the column directly replaces the deprecated .as_matrix() call.
person_base['Age'] = [x if x in age_label_text else age_cat[x] for x in person_base['Age']]

In [None]:
# convert 2006 into categorical
age_bins = [0,4,11,15,17,24,34,44,54,64,74,84,1000]
# include_lowest=True keeps age 0 in the first bin. By default the first
# interval is open on the left, (0, 4], so an age of exactly 0 would become NaN.
person_scen['Age'] = pd.cut(person_scen['Age'], age_bins,
                            labels=list(age_cat.values()), include_lowest=True)

In [None]:
# Number of person records in each age group, one column per dataset.
scen_age_counts = person_scen.groupby('Age')['Household ID'].count()
base_age_counts = person_base.groupby('Age')['Household ID'].count()
df = pd.DataFrame([scen_age_counts, base_age_counts]).T
df.columns = [scen_name, base_name]
df

In [None]:
# Share of each age group within its own dataset (each column sums to 1).
scen_age_share = df[scen_name] / df[scen_name].sum()
base_age_share = df[base_name] / df[base_name].sum()
df_new = pd.DataFrame([scen_age_share, base_age_share]).T

display_charts(df_new, kind='bar', title='Age')

---

## Worker Types

In [None]:
# Number of person records for each worker type, one column per dataset.
scen_type_counts = person_scen.groupby('Worker Type')['personID'].count()
base_type_counts = person_base.groupby('Worker Type')['personID'].count()
df = pd.DataFrame([scen_type_counts, base_type_counts]).T
df.columns = [scen_name, base_name]
df

---

**Distribution of Worker Types**

In [None]:
# Worker-type counts divided by the number of person records in each dataset.
scen_person_total = person_scen.count()['personID']
base_person_total = person_base.count()['personID']
df_dist = pd.DataFrame([df[scen_name] / scen_person_total,
                        df[base_name] / base_person_total]).T

In [None]:
# Bar chart of the worker-type shares computed above.
display_charts(df_dist, title='Worker Type Distribution', kind='bar')

---

## Workers by County

In [None]:
# Preview the joined person/household/workplace-geography table.
# Only the first rows are shown instead of the whole table.
hh_per_scen_work_geog.head()

In [None]:
# Workers by Workplace County

# Keep workers only. '!=' replaces the '<>' operator, which exists only in Python 2.
scen_workers = hh_per_scen_work_geog[hh_per_scen_work_geog['Worker Type'] != 'Not a worker']
base_workers = hh_per_base_work_geog[hh_per_base_work_geog['Worker Type'] != 'Not a worker']

# Count worker records per county, dropping the row whose county value is 0.
df = scen_workers.groupby('County').count()['personID']
df = df[df.index != 0]
df = pd.DataFrame(df)
df.columns = [scen_name]

df_base = base_workers.groupby('County').count()['personID']
df_base = df_base[df_base.index != 0]
df[base_name] = df_base

# Also add in the CTPP here
# 2006-2010 CTPP

ctpp_workers = {'King':53625,'Kitsap':6475,'Pierce':15705,'Snohomish':15810}

# A Series built from the dict is indexed by county name, so the assignment
# aligns it with df's county index.
df['CTPP 06-10'] = pd.Series(ctpp_workers)

In [None]:
# Show the workers-by-county table (both datasets plus the CTPP column).
df

In [None]:
# Share of workers by county: every column is divided by its own total.
share_columns = [scen_name, base_name, 'CTPP 06-10']
df_dist = pd.DataFrame([df[column] / df[column].sum() for column in share_columns]).T

In [None]:
# Bar chart of the county shares computed above.
display_charts(df_dist, title='Worker Distribution by County', kind='bar')

---

## Distance to Work

In [None]:
# Filter out non-workers. '!=' replaces the '<>' operator, which exists only in Python 2.
scen_df = person_scen[person_scen['Worker Type'] != 'Not a worker']
base_df = person_base[person_base['Worker Type'] != 'Not a worker']

# Mean auto distance to work for each age group. The column is selected before
# aggregating so that only this column is averaged, not every column of the table.
df = pd.DataFrame([scen_df.groupby('Age')['Auto Distance to Work'].mean(),
                   base_df.groupby('Age')['Auto Distance to Work'].mean()]).T
df.columns = [scen_name, base_name]
# Missing values (for example an age group absent from one dataset) are shown as 0.
df = df.fillna(0)
df

In [None]:
# Bar chart of the mean distance to work by age group.
display_charts(df, kind='bar', title='Distance to Work')

---

## Transit Pass Ownership

In [None]:
# Mean auto distance to work, grouped by the transit pass value.
# The column is selected before aggregating so that only this column is
# averaged, not every column of the table.
# NOTE(review): the section is titled "Transit Pass Ownership" but this table
# holds mean commute distances, not counts or shares of pass holders. Confirm
# which summary is intended.
df = pd.DataFrame([person_scen.groupby('Transit Pass')['Auto Distance to Work'].mean(),
                   person_base.groupby('Transit Pass')['Auto Distance to Work'].mean()]).T
df.columns = [scen_name, base_name]

In [None]:
# Label the rows by their code instead of by position, so an unexpected extra
# group cannot shift or break the labels, and re-running the cell is harmless.
# NOTE(review): assumes the transit pass field is coded 0 = no pass, 1 = pass; confirm.
df = df.rename(index={0: 'No Pass', 1: 'Transit Pass'})
df

---

## Auto Ownership

In [None]:
# Household records by number of vehicles, one column per dataset.
# NOTE(review): the filter below tests the row index, not the vehicle count;
# check whether a filter on 'Household Vehicles' was intended.
new_hh_scen = hh_scen[hh_scen.index >= 0]
scen_veh_counts = new_hh_scen.groupby('Household Vehicles')['Household ID'].count()
base_veh_counts = hh_base.groupby('Household Vehicles')['Household ID'].count()
df = pd.DataFrame([scen_veh_counts, base_veh_counts]).T
df.columns = [scen_name, base_name]
# Keep only rows whose vehicle count is zero or more.
df = df[df.index >= 0]
df

In [None]:
# Share of households by vehicle count: every column is divided by its own total.
df_new = pd.DataFrame([df[column] / df[column].sum()
                       for column in (scen_name, base_name)]).T

In [None]:
# Bar chart of the vehicle-count shares, with the x axis limited to 0-6.
display_charts(df_new, kind='bar', xlim=(0,6), title='Vehicles per Household')

### Auto Ownership by Income

In [None]:
# Create common income ranges
def map_income(df, in_field, out_field):
    
    # Define categories
    incmap = {}
    for i in range(0, 20000):
        incmap.update({i: ' <20k'})
    for i in range(20000, 40000):
        incmap.update({i: '20k-40k'})
    for i in range(40000, 60000):
        incmap.update({i: '40k-60k'})
    for i in range(60000, 75000):
        incmap.update({i: '60k-75k'})
    for i in range(75000, 100000):
        incmap.update({i: '75k-100k'})
    for i in range(100000, 150000):
        incmap.update({i: '100k-150k'})
    for i in range(150000, int(df[in_field].max())+1):
        incmap.update({i: '>150k'})

    df[out_field] = df[in_field].map(incmap)
    return df

In [None]:
# Add the 'Income' range column, derived from 'Household Income', to both household tables.
hh_scen = map_income(hh_scen, 'Household Income', 'Income')
hh_base = map_income(hh_base, 'Household Income', 'Income')

In [None]:
# Inspection: record counts for every distinct raw household income value in the base data.
# NOTE(review): this groups on the raw 'Household Income' value, not on the 'Income'
# ranges created above; confirm which one was meant.
hh_base.groupby('Household Income').count()

In [None]:
# Household counts by number of vehicles (rows) and income range (columns).
# values='Household ID' counts that one column only, instead of pivoting every
# column of the table and then selecting one. Empty cells are filled with 0.
income_order = [' <20k','20k-40k','40k-60k','60k-75k',
                '75k-100k','100k-150k','>150k']

df_scen = pd.pivot_table(data=hh_scen, values='Household ID', index='Household Vehicles',
                         columns=['Income'], aggfunc='count', fill_value=0, margins=True)
df_base = pd.pivot_table(data=hh_base, values='Household ID', index='Household Vehicles',
                         columns=['Income'], aggfunc='count', fill_value=0, margins=True)

# Sort the columns
# reindex keeps this order and yields a zero-filled column for an income range
# with no households, where plain column selection would fail. Only the listed
# ranges are kept, so the 'All' margin column is dropped; the 'All' row remains.
df_scen = df_scen.reindex(columns=income_order, fill_value=0)
df_base = df_base.reindex(columns=income_order, fill_value=0)

In [None]:
# Show the scenario table of households by vehicles and income range.
df_scen

---