The purpose of this notebook is to create a one-hot encoding of the dataset (one row per enrollment) and, from it, an aggregated feature vector for each person.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
%load_ext autoreload
# "%autoreload 2" reloads all modules (except those excluded via "%aimport")
# before each cell is executed, so edits to the homeless_dataset module are picked up
%autoreload 2

from __future__ import absolute_import, division, print_function
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
# from tqdm import tqdm
# import warnings

# sns.set_context("poster", font_scale=0.9)
sns.set_context("notebook", font_scale=1.0)
# show up to 100 columns when a wide DataFrame is displayed
pd.set_option('display.max_columns', 100)

# add the data functions to the path
# NOTE(review): the path is resolved relative to the current working directory,
# so this assumes the kernel was started in a sibling directory of src/ -- confirm
src_data_dir = os.path.join(os.getcwd(), os.pardir, 'src/data')
sys.path.append(src_data_dir)

# functions to load the data (must come after the sys.path tweak above)
import homeless_dataset as hd

In [2]:
# Load and pre-process every sheet of the raw data into its own DataFrame.
df_client = hd.process_data_client(simplify_strings=True)
df_enroll = hd.process_data_enrollment(simplify_strings=True)

# Restrict the enrollments to the study window:
#   * entry date on or after '2012' (presumably parsed by pandas as 2012-01-01)
#   * exit date on or before 2016-06-01
# NOTE(review): rows whose exit date is missing fail the second comparison and
# are therefore dropped -- confirm that excluding open enrollments is intended.
entered_in_window = df_enroll['Entry Date'] >= '2012'
exited_in_window = df_enroll['Exit Date'] <= '2016-06-01'
df_enroll = df_enroll[entered_in_window & exited_in_window]

df_disability = hd.process_data_disability(simplify_strings=True)
df_healthins = hd.process_data_healthins(simplify_strings=True)
df_benefit = hd.process_data_benefit(simplify_strings=True)
df_income = hd.process_data_income(simplify_strings=True)
df_project = hd.process_data_project(simplify_strings=True)
df_service = hd.process_data_service(simplify_strings=True)
df_bedinv = hd.process_data_bedinventory(simplify_strings=True)

In [3]:
def _first_record_per_person(sheet, column):
    """Return a one-column frame holding each person's first `column` value.

    The result is indexed by 'Personal ID' so it can be merged onto a frame
    that is indexed the same way.

    `sheet.reset_index()` must expose a 'Personal ID' column. Rows are taken in
    their existing order; the first row seen for each person wins, even when
    its value is missing. Rows without a Personal ID are ignored.

    This replaces ``groupby('Personal ID')[[column]].nth(0)``: since pandas 2.0
    ``nth`` acts as a filter and keeps the original row index, so a following
    ``right_index=True`` merge would match IDs against row positions.
    """
    return (sheet.reset_index()
                 .dropna(subset=['Personal ID'])
                 .drop_duplicates(subset='Personal ID', keep='first')
                 .set_index('Personal ID')[[column]])


# Join the client information with enrollment information.
# Inner join because we want to only keep individuals
# for whom we have both client and enrollment information.
df = df_client.merge(df_enroll, how='inner', left_index=True, right_index=True)

# just choose the first non-cash benefit per person; this is too simple!
# TODO: join on the exact Project ID, and possibly Date.
# (A join on ['Personal ID', 'Project Entry ID'] was tried and produced too
# many rows; it probably needs the date as well, but the dates do not align.)
df = df.merge(_first_record_per_person(df_benefit, 'Non-Cash Benefit'),
              how='left', left_index=True, right_index=True)
# people without any benefit record get the explicit category 'None'
df['Non-Cash Benefit'] = df['Non-Cash Benefit'].fillna('None')

# add information about their disability status
# just choose the first disability per person; this is too simple!
# TODO: join on the exact Project ID (same caveat as for the benefits above)
df = df.merge(_first_record_per_person(df_disability, 'Disability Type'),
              how='left', left_index=True, right_index=True)
# people without any disability record get the explicit category 'None'
df['Disability Type'] = df['Disability Type'].fillna('None')

# add Project Type Code (and project name / location) to the DataFrame.
# The join is an inner join (pandas' default, now spelled out): enrollments
# whose Project ID is not present in df_project are dropped.
df = df.merge(df_project[['Project Name',
                          'Project Type Code',
                          'Address City',
                          'Address Postal Code',
                         ]],
              how='inner', left_on=['Project ID'], right_index=True)

# sort by entry date
df = df.sort_values('Entry Date')

In [4]:
# rename the columns to have no spaces
# (the frame displayed below shows lowercase, underscore-separated names,
# e.g. 'Entry Date' -> 'entry_date')
df = hd.rename_columns(df)

In [5]:
# sanity check: (rows, columns) of the merged table, one row per enrollment
df.shape

(58209, 33)

In [6]:
# peek at the first two rows (the frame was sorted by entry date above)
df.head(2)

Unnamed: 0_level_0,race,ethnicity,gender,veteran_status,project_entry_id,client_age_at_entry,last_permanent_zip,entry_date,exit_date,project_id,housing_status_project_start,living_situation_before_program_entry,client_location,household_id,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,noncash_benefit,disability_type,project_name,project_type_code,address_city,address_postal_code
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
214224,white,nonlatino,male,False,203474,60,0,2012-01-01,2013-09-08,2938,cat1homeless,hotel,,203474,True,False,,,False,False,NaT,False,,False,616,,False,,physical,SCz - HSC - Page Smith Community House,transitionalhousing,Santa Cruz,95060
194592,white,nonlatino,male,True,150135,53,91103,2012-01-02,2013-08-31,2134,cat1homeless,family,,150135,True,False,,,False,False,NaT,False,,False,607,,False,foodstamps,mentalhealth,MOSBE Veterans Transition Center - Outreach,other,Marina,93933


In [7]:
# number of enrollments per project type
df['project_type_code'].value_counts()

emergencyshelter              43815
servicesonly                   4652
other                          2529
homelessnessprevention         2213
transitionalhousing            2012
rapidrehousing                 1767
streetoutreach                 1049
permanentsupportivehousing      172
Name: project_type_code, dtype: int64

In [8]:
# number of enrollments per non-cash benefit; 'None' is the fill value used
# above for people without any benefit record
df['noncash_benefit'].value_counts()

None                  36075
foodstamps            19760
other                   880
publichousing           696
wic                     542
tanfother               179
tanftransportation       40
tanfchildcare            32
temprental                5
Name: noncash_benefit, dtype: int64

# One-hot encoding

In [9]:
# Column groups, by how each column is treated in the rest of the notebook.

# True/False flags (cast to integers after the per-person aggregation)
features_binary = [
    'veteran_status',
    'disabling_condition',
    'continuously_homeless_one_year',
    'chronic_homeless',
    'in_permanent_housing',
    'domestic_violence_victim',
    'dv_currently_fleeing',
    'head_of_household',
]

# numeric columns
features_quant = [
    'client_age_at_entry',
    'times_homeless_past_three_years',
    'months_homeless_this_time',
    'months_ago_dv_occurred',
    'days_enrolled',
    'days_to_residential_move_in',
]

# date columns
features_date = [
    'entry_date',
    'exit_date',
    'residential_move_in_date',
]

# categorical columns that get expanded into one indicator column per category
features_one_hot = [
    'race',
    'ethnicity',
    'gender',
    'housing_status_project_start',
    'living_situation_before_program_entry',
    'noncash_benefit',
    'disability_type',
    'project_type_code',
]

# identifier / location / free-text columns that are dropped before encoding
features_not_using = [
    'project_entry_id',
    'last_permanent_zip',
    'project_id',
    'client_location',
    'household_id',
    'project_name',
    'address_city',
    'address_postal_code',
]

In [10]:
# Discard the columns that are not used as features.
df = df.drop(labels=features_not_using, axis='columns')

# One-hot encode the categorical columns as integer indicator columns.
# `cols` is iterated further down as the list of column names to average per
# person -- presumably the names of the newly created indicator columns.
# Judging by the frame displayed below, the original categorical columns are
# still present after this call.
encoded = hd.encode_categorical_features(df, features_one_hot, astype='int')
df, cols = encoded

In [12]:
# inspect the frame after encoding: original columns plus the indicator columns
df.head()

Unnamed: 0_level_0,race,ethnicity,gender,veteran_status,client_age_at_entry,entry_date,exit_date,housing_status_project_start,living_situation_before_program_entry,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,noncash_benefit,disability_type,project_type_code,race_other,race_pacificisl,race_black,race_asian,race_unknown,race_amerindian,race_white,ethnicity_nonlatino,ethnicity_unknown,ethnicity_latino,gender_transftom,gender_transmtof,gender_male,gender_female,gender_other,gender_unknown,housing_status_project_start_cat4fleeingdv,housing_status_project_start_unknown,housing_status_project_start_cat1homeless,housing_status_project_start_cat2risklosing,housing_status_project_start_cat3homelessfedstatutes,housing_status_project_start_atrisk,housing_status_project_start_housed,living_situation_before_program_entry_unknown,living_situation_before_program_entry_permanenthousing,living_situation_before_program_entry_rentalvash,living_situation_before_program_entry_hotel,living_situation_before_program_entry_family,living_situation_before_program_entry_friend,living_situation_before_program_entry_streets,living_situation_before_program_entry_owned,living_situation_before_program_entry_detoxcenter,living_situation_before_program_entry_safehaven,living_situation_before_program_entry_hospital,living_situation_before_program_entry_jail,living_situation_before_program_entry_rentalother,living_situation_before_program_entry_transitionalhousing,living_situation_before_program_entry_rental,living_situation_before_program_entry_other,living_situation_before_program_entry_rentalgdptip,living_situation_before_program_entry_ownedsubsidy,living_situation_before_program_entry_longtermcare,living_situation_before_program_entry_hospitalpsyc
h,living_situation_before_program_entry_foster,living_situation_before_program_entry_halfwayhouse,living_situation_before_program_entry_emershelter,noncash_benefit_tanfchildcare,noncash_benefit_publichousing,noncash_benefit_foodstamps,noncash_benefit_tanfother,noncash_benefit_none,noncash_benefit_tanftransportation,noncash_benefit_other,noncash_benefit_wic,noncash_benefit_temprental,disability_type_alcohol,disability_type_substance,disability_type_mentalhealth,disability_type_hivaids,disability_type_developmental,disability_type_drug,disability_type_alcoholdrug,disability_type_none,disability_type_physical,disability_type_chronichealth,project_type_code_emergencyshelter,project_type_code_other,project_type_code_streetoutreach,project_type_code_servicesonly,project_type_code_rapidrehousing,project_type_code_transitionalhousing,project_type_code_permanentsupportivehousing,project_type_code_homelessnessprevention
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1
214224,white,nonlatino,male,False,60,2012-01-01,2013-09-08,cat1homeless,hotel,True,False,,,False,False,NaT,False,,False,616,,False,,physical,transitionalhousing,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
194592,white,nonlatino,male,True,53,2012-01-02,2013-08-31,cat1homeless,family,True,False,,,False,False,NaT,False,,False,607,,False,foodstamps,mentalhealth,other,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
179384,white,latino,female,False,50,2012-01-03,2016-01-28,cat1homeless,streets,False,False,,,False,False,NaT,False,,False,1486,,True,,,servicesonly,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
183019,white,latino,male,False,28,2012-01-03,2012-10-26,cat1homeless,streets,False,False,,,False,False,NaT,False,,False,297,,False,,,servicesonly,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
182885,black,nonlatino,male,False,48,2012-01-04,2013-05-02,cat1homeless,emershelter,False,False,,,False,False,NaT,False,,False,484,,False,,,servicesonly,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [13]:
# The indicator columns now carry this information, so remove the original
# categorical (string) columns.
df = df.drop(labels=features_one_hot, axis='columns')

In [14]:
# sanity check: same number of rows as before, columns now one-hot encoded
df.shape

(58209, 90)

In [15]:
# save it for easy loading
# expanduser('~') resolves the home directory even where $HOME is not set;
# os.getenv('HOME') returns None in that case and os.path.join would raise.
filename = os.path.join(os.path.expanduser('~'), 'Dropbox', 'C4SF-datasci-homeless',
                        'processed', 'homeless_row_per_enrollment_one_hot_encode.csv')
# the index (Personal ID) is written as the first column
df.to_csv(filename)

In [16]:
# Tag every enrollment row with a 1; summing this column per person then
# gives the number of times that person was in the system.
df['enrollments'] = 1

# One-hot indicator columns are averaged per person, i.e. each value becomes
# the fraction of that person's enrollments falling in the category.
agg_one_hot = {indicator: 'mean' for indicator in cols}

# Aggregation recipe that collapses all enrollments of a person into a single
# feature vector (one row per person).
# 'last' relies on the row order: the frame was sorted by entry date earlier.
# NOTE(review): pandas' GroupBy 'last' returns the last non-missing value, so
# it is not necessarily taken from the final enrollment row -- confirm that
# this is acceptable.
agg = {
    # binary: 1 if the flag was set in any of the person's enrollments
    'veteran_status': 'max',
    'disabling_condition': 'max',
    'continuously_homeless_one_year': 'max',
    'chronic_homeless': 'max',
    'domestic_violence_victim': 'max',
    'dv_currently_fleeing': 'max',
    'head_of_household': 'max',
    # quantitative
    'enrollments': 'sum',
    'client_age_at_entry': 'last',
    'times_homeless_past_three_years': 'last',
    'months_homeless_this_time': 'last',
    'months_ago_dv_occurred': 'last',
    'days_enrolled': 'sum',
    # (the categorical columns are represented by their one-hot indicator
    #  columns, which are added to the recipe below)
    # outcome related
    'in_permanent_housing': 'last',
    'days_to_residential_move_in': 'last',
}
agg.update(agg_one_hot)

# 'Personal ID' is the index of df; expose it as a column to group on it.
per_enrollment = df.reset_index()
df_features = per_enrollment.groupby(by=['Personal ID']).agg(agg)

# convert booleans to integers
for flag in features_binary:
    df_features[flag] = df_features[flag].astype(int)

In [17]:
# number of people in the dataset: (people, features), one row per Personal ID
df_features.shape

(11362, 88)

In [18]:
# glance at the data: per-person feature vectors; fractional values in the
# one-hot columns come from averaging over a person's enrollments
df_features.head()

Unnamed: 0_level_0,race_unknown,living_situation_before_program_entry_transitionalhousing,client_age_at_entry,months_ago_dv_occurred,disability_type_substance,housing_status_project_start_cat1homeless,disability_type_none,living_situation_before_program_entry_rental,race_amerindian,ethnicity_unknown,gender_transmtof,ethnicity_latino,living_situation_before_program_entry_emershelter,project_type_code_homelessnessprevention,noncash_benefit_publichousing,race_other,continuously_homeless_one_year,race_asian,noncash_benefit_tanfchildcare,gender_male,living_situation_before_program_entry_rentalgdptip,disability_type_chronichealth,project_type_code_rapidrehousing,noncash_benefit_temprental,race_black,veteran_status,noncash_benefit_tanftransportation,living_situation_before_program_entry_longtermcare,dv_currently_fleeing,living_situation_before_program_entry_rentalother,living_situation_before_program_entry_hospitalpsych,housing_status_project_start_cat2risklosing,project_type_code_transitionalhousing,in_permanent_housing,gender_other,noncash_benefit_other,head_of_household,living_situation_before_program_entry_hotel,living_situation_before_program_entry_permanenthousing,living_situation_before_program_entry_friend,noncash_benefit_none,living_situation_before_program_entry_safehaven,race_white,disability_type_alcohol,noncash_benefit_tanfother,disability_type_physical,project_type_code_emergencyshelter,gender_transftom,enrollments,times_homeless_past_three_years,living_situation_before_program_entry_streets,days_to_residential_move_in,housing_status_project_start_cat4fleeingdv,living_situation_before_program_entry_unknown,living_situation_before_program_entry_detoxcenter,living_situation_before_program_entry_ownedsubsidy,chronic_homeless,living_situation_before_program_entry_halfwayhouse,housing_status_project_start_cat3homelessfedstatutes,housing_status_project_start_atrisk,living_situation_before_program_entry_family,months_homeless_this_time,ethnicity_nonlatino,project_ty
pe_code_permanentsupportivehousing,living_situation_before_program_entry_hospital,project_type_code_streetoutreach,disability_type_developmental,housing_status_project_start_unknown,living_situation_before_program_entry_rentalvash,housing_status_project_start_housed,gender_unknown,disability_type_hivaids,living_situation_before_program_entry_other,domestic_violence_victim,noncash_benefit_wic,living_situation_before_program_entry_owned,project_type_code_other,disability_type_mentalhealth,disability_type_drug,gender_female,living_situation_before_program_entry_foster,race_pacificisl,noncash_benefit_foodstamps,disabling_condition,project_type_code_servicesonly,days_enrolled,living_situation_before_program_entry_jail,disability_type_alcoholdrug
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1
173781,0,0.0,35,12.0,0,0.5,1,0.5,0,0,0,1,0.5,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0,0.0,1,0,0,0,0.5,0,2,2.0,0.0,,0.0,0.0,0.0,0.0,0,0.0,0.0,0.5,0.0,,0,0.0,0.0,0.5,0,0.0,0.0,0.0,0,0,0.0,1,0,0.0,0.0,0,0,1,0.0,0,1,0,0.0,147,0.0,0
173782,0,0.0,10,12.0,0,1.0,1,0.0,0,0,0,1,1.0,0.0,0,0,1,0,0,1,0,0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,1,0.0,1,0,0,0,1.0,0,1,2.0,0.0,,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,1,0,0.0,0.0,0,0,0,0.0,0,0,0,0.0,147,0.0,0
173783,0,0.0,12,12.0,0,1.0,1,0.0,0,0,0,1,1.0,0.0,0,0,1,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,1,0.0,1,0,0,0,1.0,0,1,2.0,0.0,,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,1,0,0.0,0.0,0,0,1,0.0,0,0,0,0.0,147,0.0,0
173803,0,0.0,32,12.0,0,1.0,1,0.0,0,0,0,1,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,1.0,0,0.0,1,0,0,0,1.0,0,1,,0.0,,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,1,0,0.0,0.0,0,0,1,0.0,0,1,0,0.0,78,0.0,0
173804,0,0.0,11,,0,1.0,1,0.0,0,0,0,1,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,1.0,1,0.0,1,0,0,0,1.0,0,1,,0.0,,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0,0,0.0,0.0,0,0,1,0.0,0,0,0,0.0,78,0.0,0


In [19]:
# summary statistics, with the columns in alphabetical order so that the
# indicator columns of one categorical feature sit next to each other
df_features[sorted(df_features.columns)].describe()

Unnamed: 0,chronic_homeless,client_age_at_entry,continuously_homeless_one_year,days_enrolled,days_to_residential_move_in,disability_type_alcohol,disability_type_alcoholdrug,disability_type_chronichealth,disability_type_developmental,disability_type_drug,disability_type_hivaids,disability_type_mentalhealth,disability_type_none,disability_type_physical,disability_type_substance,disabling_condition,domestic_violence_victim,dv_currently_fleeing,enrollments,ethnicity_latino,ethnicity_nonlatino,ethnicity_unknown,gender_female,gender_male,gender_other,gender_transftom,gender_transmtof,gender_unknown,head_of_household,housing_status_project_start_atrisk,housing_status_project_start_cat1homeless,housing_status_project_start_cat2risklosing,housing_status_project_start_cat3homelessfedstatutes,housing_status_project_start_cat4fleeingdv,housing_status_project_start_housed,housing_status_project_start_unknown,in_permanent_housing,living_situation_before_program_entry_detoxcenter,living_situation_before_program_entry_emershelter,living_situation_before_program_entry_family,living_situation_before_program_entry_foster,living_situation_before_program_entry_friend,living_situation_before_program_entry_halfwayhouse,living_situation_before_program_entry_hospital,living_situation_before_program_entry_hospitalpsych,living_situation_before_program_entry_hotel,living_situation_before_program_entry_jail,living_situation_before_program_entry_longtermcare,living_situation_before_program_entry_other,living_situation_before_program_entry_owned,living_situation_before_program_entry_ownedsubsidy,living_situation_before_program_entry_permanenthousing,living_situation_before_program_entry_rental,living_situation_before_program_entry_rentalgdptip,living_situation_before_program_entry_rentalother,living_situation_before_program_entry_rentalvash,living_situation_before_program_entry_safehaven,living_situation_before_program_entry_streets,living_situation_before_program_entry_transitionalhousing,living
_situation_before_program_entry_unknown,months_ago_dv_occurred,months_homeless_this_time,noncash_benefit_foodstamps,noncash_benefit_none,noncash_benefit_other,noncash_benefit_publichousing,noncash_benefit_tanfchildcare,noncash_benefit_tanfother,noncash_benefit_tanftransportation,noncash_benefit_temprental,noncash_benefit_wic,project_type_code_emergencyshelter,project_type_code_homelessnessprevention,project_type_code_other,project_type_code_permanentsupportivehousing,project_type_code_rapidrehousing,project_type_code_servicesonly,project_type_code_streetoutreach,project_type_code_transitionalhousing,race_amerindian,race_asian,race_black,race_other,race_pacificisl,race_unknown,race_white,times_homeless_past_three_years,veteran_status
count,11362.0,11362.0,11362.0,11362.0,747.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,1910.0,1726.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,5202.0,11362.0
mean,0.059849,33.62225,0.185443,207.853459,72.552878,0.027196,0.038021,0.058704,0.012322,0.019275,0.014082,0.049375,0.748108,0.032741,0.000176,0.390512,0.183154,0.001672,5.12313,0.458282,0.521827,0.019891,0.424749,0.571642,0.00044,0.000704,0.001936,0.000528,0.409347,0.040058,0.637567,0.138795,0.000668,0.002992,0.067985,0.111934,0.101039,0.021072,0.127916,0.139699,0.003,0.074934,0.001227,0.009835,0.007196,0.048752,0.008951,0.000198,0.024681,0.004601,0.000502,0.002629,0.168741,8.8e-05,0.025968,0.015308,0.004257,0.189169,0.038544,0.082733,16.102618,6.976246,0.251364,0.707446,0.01285,0.009505,0.001496,0.002552,0.00088,0.000264,0.013642,0.385074,0.130945,0.078596,0.005743,0.076366,0.17396,0.055596,0.093721,0.064601,0.020419,0.107551,0.001232,0.015842,0.020243,0.770111,1.34679,0.192836
std,0.237217,20.246602,0.388673,330.189494,81.557515,0.162661,0.191257,0.235081,0.110322,0.137495,0.117834,0.21666,0.434119,0.177965,0.013267,0.487887,0.38681,0.040861,18.586563,0.498278,0.499545,0.139631,0.494327,0.494863,0.020974,0.026527,0.043962,0.022975,0.491735,0.18867,0.467006,0.338497,0.02408,0.053675,0.238917,0.30679,0.301393,0.138403,0.318047,0.33939,0.053441,0.254198,0.033638,0.095248,0.079153,0.208796,0.090282,0.013472,0.149394,0.066101,0.020654,0.048399,0.367939,0.009382,0.155115,0.113651,0.062561,0.378361,0.181653,0.265918,9.192029,9.96952,0.433817,0.454955,0.112631,0.097035,0.038654,0.050459,0.029655,0.016248,0.116005,0.465628,0.327913,0.251764,0.068417,0.250502,0.366711,0.217953,0.270303,0.245832,0.141435,0.309827,0.035082,0.124871,0.140836,0.42078,1.122937,0.394543
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,16.0,0.0,26.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
50%,0.0,34.0,0.0,96.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,0.0,51.0,0.0,258.0,97.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,12.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0
max,1.0,101.0,1.0,5040.0,730.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,577.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,24.0,24.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0
