# Cross-Classification Refinement 
For 2014 Survey data for 4k

Using a decision (classification) tree approach to update trip rates
Based on guidance from this report: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.539.3755&rep=rep1&type=pdf

Using python tools to build a decision tree for generating classifications
This process helps combine classes with small sample sizes to match existing cross-class definitions

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split

In [71]:
# Load household records to prepare data and generate cross-classification.
# The sheet name is passed positionally: the keyword was spelled `sheetname`
# in older pandas releases and `sheet_name` in newer ones, so the positional
# form is the spelling accepted by both.
hh = pd.read_excel(r'J:\Projects\Surveys\HHTravel\Survey2014\Data\Final database\Release 4\2014-pr3-M-hhsurvey-households.xlsx',
                   'Data')

# Reclassify household columns for cross-classification.
# `.loc` is used for the masked assignments; the `.ix` indexer used previously
# was deprecated and then removed from pandas.

# Number of workers: 0 -> 3+ (3 or more workers are collapsed into class 3)
hh['numworkers_crossclass'] = hh['numworkers']
hh.loc[hh['numworkers'] >= 3, 'numworkers_crossclass'] = 3
hh['numworkers_crossclass'] = hh['numworkers_crossclass'].astype('int')

# Household size: 4 or more persons are collapsed into class 4
hh['hhsize_crossclass'] = hh['hhsize']
hh.loc[hh['hhsize'] >= 4, 'hhsize_crossclass'] = 4
hh['hhsize_crossclass'] = hh['hhsize_crossclass'].astype('int')

# Household income
# Note that exact ranges from 2014 do not match 2006
# old ranges: 30, 60, 90+
# new ranges: 35, 75, 100+
# Every mask below is evaluated on the untouched source column
# (hh_income_detailed_imp), so the recodes cannot interfere with each other.
# NOTE(review): unlike the two columns above, income_crossclass is not cast
# to int here - confirm whether that is intentional.
hh['income_crossclass'] = hh['hh_income_detailed_imp']
hh.loc[hh['hh_income_detailed_imp'] <= 3, 'income_crossclass'] = 1    # up to $35k
hh.loc[(hh['hh_income_detailed_imp'] > 3) & (hh['hh_income_detailed_imp'] <= 5),    # $35-75k
       'income_crossclass'] = 2
hh.loc[(hh['hh_income_detailed_imp'] > 5) & (hh['hh_income_detailed_imp'] <= 6),    # $75-100k
       'income_crossclass'] = 3
hh.loc[hh['hh_income_detailed_imp'] >= 7, 'income_crossclass'] = 4    # >$100k

In [79]:
# Data needs to be in format of each household on a row and columns for trips by purpose

# Load trip data and compute 4k purposes

######
# NOTE: different bins will be created based on which trip file is used
# For now, we are using the GPS trip weights to create the bins
# and using this one throughout
# NOTE(review): the statement above says the GPS-weighted file is used, but
# the GPS read below is commented out and the active read loads the
# non-GPS-weighted file - confirm which one is intended.
######
# GPS Weighted trips (in Daysim format)
# trip = pd.read_csv(r'R:\SoundCastDocuments\2014Estimation\Files_From_Mark_2014\gps_weights_11_3_16\formatted\skims_attached\tripP14_w.dat')

# Non GPS weighted
# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated
# delim_whitespace=True flag and parses the file the same way.
trip = pd.read_csv(r'R:\SoundCast\estimation\2014\Surveys\2014 Survey\P5\tripP5.dat', sep=r'\s+')

# Separate college student trips from regional survey trips
college_trips = trip[trip['hhno'] < 14000000]
# Take an explicit copy: the '4k_purp' column is written into this frame
# below, and writing into a filtered slice triggers SettingWithCopyWarning.
trip = trip[trip['hhno'] >= 14000000].copy()

# The first masked assignment creates the '4k_purp' column; rows that match
# none of the rules below keep a null purpose.

# Home-Based Work (HBW) Trips, directly from home-to-work and work-to-home
trip.loc[(trip['opurp'] == 0) & (trip['dpurp'] == 1), '4k_purp'] = 'HBW'
trip.loc[(trip['opurp'] == 1) & (trip['dpurp'] == 0), '4k_purp'] = 'HBW'

# Home-Based shopping
trip.loc[(trip['opurp'] == 0) & (trip['dpurp'] == 5), '4k_purp'] = 'Home-Based Shopping'
trip.loc[(trip['opurp'] == 5) & (trip['dpurp'] == 0), '4k_purp'] = 'Home-Based Shopping'

# Home-Based School
trip.loc[(trip['opurp'] == 0) & (trip['dpurp'] == 2), '4k_purp'] = 'School'
trip.loc[(trip['opurp'] == 2) & (trip['dpurp'] == 0), '4k_purp'] = 'School'

# Home-Based Other
trip.loc[(trip['opurp'] == 0) & (trip['dpurp'].isin([3,4,6,7,8,9,10])), '4k_purp'] = 'Home-Based Other'
trip.loc[(trip['opurp'].isin([3,4,6,7,8,9,10])) & (trip['dpurp'] == 0), '4k_purp'] = 'Home-Based Other'

# NHB Work-to-Other
# Only rows that are still unlabeled are assigned, so the home-based labels
# set above are never overwritten.
trip.loc[(trip['opurp'] == 1) & (trip['dpurp'] != 0) & (trip['4k_purp'].isnull()), '4k_purp'] = 'NHB WtO'
trip.loc[(trip['opurp'] != 0) & (trip['dpurp'] == 1) & (trip['4k_purp'].isnull()), '4k_purp'] = 'NHB WtO'

# NHB Other-to-Other (Destination and Origins are neither work nor home)
trip.loc[(trip['opurp'] != 1) & (trip['dpurp'] != 0) & (trip['opurp'] != 0)
         & (trip['dpurp'] != 1) & (trip['4k_purp'].isnull()), '4k_purp'] = 'NHB OtO'




In [73]:
# Export GPS Trips with 4k_purp
# trip.to_csv(r'R:\4K\2014\Trip Generation\Trip Rates\trip_2014_survey_gps.csv', index=False)

In [74]:
# Apply a decision tree to figure out which rates should be averaged and combined

min_samples_size = 50    # minimum number of training records allowed in a tree leaf
seed_value = 99          # random seed for the tree
int_conversion = 1000    # scale factor applied before truncating trip totals to int

# Reorganize the trip record data:
# one row per household, one column of summed trip weights per 4k purpose.
# Only 'trexpfac' is aggregated, rather than summing every trip column and
# discarding all but one afterwards.
purp_cols = ['HBW','Home-Based Other','Home-Based Shopping','NHB OtO','NHB WtO','Home-Based School']
_df = trip.groupby(['hhno','4k_purp'])['trexpfac'].sum().unstack()
_df = _df.fillna(0)

# The trip records label school trips 'School'; downstream columns call it
# 'Home-Based School'. Rename by label (not by position) so the columns cannot
# be silently mislabeled if the set or sort order of purposes changes, and
# reindex so every expected purpose column exists even if it has no trips.
_df = _df.rename(columns={'School': 'Home-Based School'})
_df = _df.reindex(columns=purp_cols, fill_value=0)
_df.columns.name = None
_df = _df.reset_index()

# Join hh information
df = pd.merge(_df, hh[['hhid','hhsize_crossclass','numworkers_crossclass', 'income_crossclass']],
              left_on='hhno', right_on='hhid', how='left')

# Get total trips per household (purposes are added in purp_cols order)
df['tot_trips'] = 0.0
for purp_col in purp_cols:
    df['tot_trips'] = df['tot_trips'] + df[purp_col]

# For decision tree analysis, need a target (y) and features (x)
# We are modeling trips as a function of HH size, number of workers, and income classes

# Features for classification
features = ["hhsize_crossclass","numworkers_crossclass","income_crossclass"]
X = df[features]

# Decision trees are used to predict total trips
# since we are classifying based on household characteristics and not trips
# we can use total trips as dependent variable to generate bins

# Must be in integer form for prediction
# Convert to int by multiplying by factor (the fractional remainder is truncated)
df['tot_trips_int'] = (df['tot_trips']*int_conversion).astype('int')

# Define the target y value as total (scaled, integer) trips per household
# NOTE(review): a classifier treats every distinct integer as its own class;
# a regression tree may be a more natural fit for this target - confirm intent.
y = df['tot_trips_int']

# Create training and testing sample (maybe not necessary...)
# The held-out X_test / y_test are not used anywhere below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create decision tree, set min sample size and random seed
# This part generates the optimal category separations
estimator = DecisionTreeClassifier(min_samples_leaf=min_samples_size, random_state=seed_value)

# Export for visualization
# visualize_tree(estimator, features)

# Fit on the training sample only, then use the fitted tree to assign a leaf
# id (the new category) to every household in the original data
estimator.fit(X_train, y_train)
new_categories = estimator.apply(X)
df['optimum_grouping'] = new_categories

In [75]:
# Tabulate every combination of tree leaf ('optimum_grouping') and original
# cross-class cell (household size, income, workers) that occurs in df.
# Cells that share an 'optimum_grouping' value are the ones whose rates
# should be averaged together.
crossclass_keys = ['optimum_grouping', 'hhsize_crossclass', 'income_crossclass', 'numworkers_crossclass']
df_cat = df.groupby(crossclass_keys).count()

In [76]:
# Move the group-by keys out of the index and back into ordinary columns
# so they can be selected and used as merge keys.
df_cat = df_cat.reset_index(drop=False)

In [77]:
# Look up each household's tree-derived group from its cross-class cell
# (household size, income class, number of workers). Households whose cell
# is not present in df_cat get a null 'optimum_grouping'.
hh = hh.merge(
    df_cat[['hhsize_crossclass','income_crossclass','numworkers_crossclass','optimum_grouping']],
    on=['hhsize_crossclass','income_crossclass','numworkers_crossclass'],
    how='left'
)

In [89]:
# Total households by cross-classification group (hhsize, numworkers, and income)
# Select 'expwt_2' before summing so only the weight column is aggregated;
# summing every household column first is wasteful and can fail on
# non-numeric columns.
total_hh = hh.groupby(['optimum_grouping'])[['expwt_2']].sum()
total_hh = total_hh.reset_index()

# Join the category values
# A group can span several cross-class cells, so the group's total is
# repeated on each of its cell rows.
total_hh = pd.merge(total_hh, df_cat[['hhsize_crossclass','numworkers_crossclass','income_crossclass','optimum_grouping']],
                    how='left', on='optimum_grouping')
total_hh.to_clipboard()

In [87]:
# Number of household records with a non-null 'expwt_2' in each group,
# copied to the clipboard.
hh_sample_counts = hh.groupby(['optimum_grouping'])[['expwt_2']].count()
hh_sample_counts.to_clipboard()

In [80]:
# Calculate trip rates

# Attach the household columns (including the tree-derived
# 'optimum_grouping') to every trip record, matching the trip's 'hhno' to the
# household's 'hhid'. Trips without a matching household are kept, with null
# household columns.
trip_hh = trip.merge(hh, how='left', left_on='hhno', right_on='hhid')

In [86]:
# Trips by Purpose
# Uncomment exactly one `purp` line to choose the purpose to tabulate.
# purp = 'HBW'
# purp = 'Home-Based Shopping'
# purp = 'Home-Based Other'
# purp = 'School'
purp = 'NHB OtO'
# purp = 'NHB WtO'

# Sum the trip weights of the selected purpose within each group.
# 'trexpfac' is selected before summing so only that column is aggregated,
# instead of summing every trip/household column (including text ones).
_df = trip_hh[trip_hh['4k_purp'] == purp].groupby(['optimum_grouping'])[['trexpfac']].sum()
_df = _df.reset_index()

# Add the cross-class cells belonging to each group; the group total is
# repeated on each of its cell rows.
_df = pd.merge(_df, df_cat[['hhsize_crossclass','numworkers_crossclass','income_crossclass','optimum_grouping']],
                    how='left', on='optimum_grouping')
_df.to_clipboard()

# Get trip totals by purpose
# trip_hh[trip_hh['4k_purp'] == purp]['trexpfac'].sum()

In [56]:
# Unweighted record counts: for each 4k purpose, the number of non-null
# values in every column of the merged trip/household table.
trip_hh.groupby(by='4k_purp').count()

Unnamed: 0_level_0,hhno,pno,day,tour,half,tseg,tsvid,opurp,dpurp,oadtyp,...,h_parcelID_0,prev_parcelID_0,h_PIN_10,h_PSRCPIN_10,prev_PIN_10,prev_PSRCPIN_10,numworkers_crossclass,hhsize_crossclass,income_crossclass,optimum_grouping
4k_purp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HBW,7848,7848,7848,7848,7848,7848,7848,7848,7848,7848,...,7848,7848,7848,7848,2235,7848,7848,7848,7848,7848
Home-Based Other,16462,16462,16462,16462,16462,16462,16462,16462,16462,16462,...,16462,16462,16462,16462,3868,16462,16462,16462,16462,16462
Home-Based Shopping,5172,5172,5172,5172,5172,5172,5172,5172,5172,5172,...,5172,5172,5172,5172,1269,5172,5172,5172,5172,5172
NHB OtO,8202,8202,8202,8202,8202,8202,8202,8202,8202,8202,...,8202,8202,8202,8202,1865,8202,8202,8202,8202,8202
NHB WtO,5900,5900,5900,5900,5900,5900,5900,5900,5900,5900,...,5900,5900,5900,5900,1802,5900,5900,5900,5900,5900
School,2496,2496,2496,2496,2496,2496,2496,2496,2496,2496,...,2496,2496,2496,2496,642,2496,2496,2496,2496,2496


In [65]:
# Total weighted trips across all labeled purposes. Grouping by '4k_purp'
# leaves out trips with a null purpose; only 'trexpfac' is aggregated rather
# than summing every column first.
trip_hh.groupby('4k_purp')['trexpfac'].sum().sum()

12960491.149999935