The purpose of this notebook is to encode categorical features using one-hot encoding.

Future usage of the dataset may need to consider dropping a reference category so this reference case is encoded when all values are zero.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
%load_ext autoreload
# # the "1" means: always reload modules marked with "%aimport"
%autoreload 2

from __future__ import absolute_import, division, print_function
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
# from tqdm import tqdm
# import warnings

# sns.set_context("poster", font_scale=0.9)
sns.set_context("notebook", font_scale=1.0)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# add the data functions to the path
src_data_dir = os.path.join(os.getcwd(), os.pardir, 'src/data')
sys.path.append(src_data_dir)

# functions to load the data
import homeless_dataset as hd

In [2]:
datadir_raw = os.path.join(os.getenv('HOME'), 'Dropbox', 'C4SF-datasci-homeless', 'raw')
datadir_proc = os.path.join(os.getenv('HOME'), 'Dropbox', 'C4SF-datasci-homeless', 'processed')

In [3]:
# load in and process the data in separate sheets

df_client = hd.process_data_client(simplify_strings=True, datadir=datadir_raw)

df_enroll = hd.process_data_enrollment(simplify_strings=True, datadir=datadir_raw)

# Only keep rows with entry dates starting in 2012
df_enroll = df_enroll[df_enroll['Entry Date'] >= '2012']
# Only keep rows with exit dates before 2016-06-01
df_enroll = df_enroll[df_enroll['Exit Date'] <= '2016-06-01']

df_project = hd.process_data_project(simplify_strings=True, datadir=datadir_raw)

# calculate the numer of times in permanent housing
# only choosing one entry per day if enrolled multiple times in one day
df_times_in_ph = df_enroll[['Entry Date', 'In Permanent Housing']].reset_index().groupby(
    by=['Personal ID', 'Entry Date'], as_index=False).max().set_index(
    'Personal ID')[['In Permanent Housing']].astype(int).groupby(
    level=0).sum()
df_times_in_ph = df_times_in_ph.rename(columns={'In Permanent Housing': 'times_in_permanent_housing'})
df_client = df_client.merge(df_times_in_ph, how='left', left_index=True, right_index=True)
df_client['times_in_permanent_housing'] = df_client['times_in_permanent_housing'].fillna(value=0).astype(int)

# not currently using
# df_income = hd.process_data_income(simplify_strings=True, datadir=datadir_raw)
# df_service = hd.process_data_service(simplify_strings=True, datadir=datadir_raw)
# df_bedinv = hd.process_data_bedinventory(simplify_strings=True, datadir=datadir_raw)

In [4]:
# Join the client information with enrollment information.
# Inner join because we want to only keep individuals
# for whom we have both client and enrollment information.
df = df_client.merge(df_enroll, how='inner', left_index=True, right_index=True)

# add Project Type Code to DataFrame
df = df.merge(df_project[['Project Name',
                          'Project Type Code',
                          'Address City',
                          'Address Postal Code',
                         ]], left_on=['Project ID'], right_index=True)

# sort by entry date
df = df.sort_values('Entry Date')

# rename the columns to have no spaces
df = hd.rename_columns(df)

In [5]:
df.head(3)

Unnamed: 0_level_0,race,ethnicity,gender,veteran_status,times_in_permanent_housing,project_entry_id,client_age_at_entry,last_permanent_zip,entry_date,exit_date,project_id,housing_status_project_start,living_situation_before_program_entry,client_location,household_id,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,project_name,project_type_code,address_city,address_postal_code
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
214224,white,nonlatino,male,False,0,203474,60,0,2012-01-01,2013-09-08,2938,cat1homeless,hotel,,203474,True,False,,,False,False,NaT,False,,False,616,,False,SCz - HSC - Page Smith Community House,transitionalhousing,Santa Cruz,95060
194592,white,nonlatino,male,True,1,150135,53,91103,2012-01-02,2013-08-31,2134,cat1homeless,family,,150135,True,False,,,False,False,NaT,False,,False,607,,False,MOSBE Veterans Transition Center - Outreach,other,Marina,93933
179384,white,latino,female,False,0,155873,50,93245,2012-01-03,2016-01-28,2086,cat1homeless,streets,CA-506,155873,False,False,,,False,False,NaT,False,,False,1486,,True,MOSBE The Salvation Army - Good Samaritan Center,servicesonly,,93955


# Encode yes/no for some categorical features

- noncash_benefit
- health_insurance
- disability_type

One row per person, with columns for each non-benefit, insurance, and disability type.

These tables describe whether a person was logged as having ever received a particular non-cash benefit, insurance type, or having had a disability. It is not possible to align when these were logged with the project entry date, so this is the best we can do to make use of these features.

TODO: If not one-hot encoding, another way to encode the above categories is to turn the category values from the last X years into a string of the sorted set of values.


In [6]:
# read data and process

# process non-cash benefits
df_benefit = hd.process_data_benefit(simplify_strings=True, datadir=datadir_raw)
# remove any empty rows
df_benefit = df_benefit.dropna(subset=['Non-Cash Benefit'])
# rename the columns to have no spaces
df_benefit = hd.rename_columns(df_benefit)

# process health insurance
df_healthins = hd.process_data_healthins(simplify_strings=True, datadir=datadir_raw)
df_healthins.loc[df_healthins['Health Insurance'] == 'unknown', 'Health Insurance'] = np.nan
# remove any empty rows
df_healthins = df_healthins.dropna(subset=['Health Insurance'])
# rename the columns to have no spaces
df_healthins = hd.rename_columns(df_healthins)

# process disability
df_disability = hd.process_data_disability(simplify_strings=True, datadir=datadir_raw)
# remove any empty rows
df_disability = df_disability.dropna(subset=['Disability Type'])
# rename the columns to have no spaces
df_disability = hd.rename_columns(df_disability)

# simplify into one row per person

# did they ever receive any of these benefits?
df_benefit, cols = hd.encode_categorical_features(df_benefit, ['noncash_benefit'], astype='int')
df_benefit = df_benefit[cols].groupby(level=0).max()

# did they ever have any of these health insurances?
df_healthins, cols = hd.encode_categorical_features(df_healthins, ['health_insurance'], astype='int')
df_healthins = df_healthins[cols].groupby(level=0).max()

# did they ever have any of these disabilities?
df_disability, cols = hd.encode_categorical_features(df_disability, ['disability_type'], astype='int')
df_disability = df_disability[cols].groupby(level=0).max()

# Categorical variables

Sum the number of times each of these categories occurs:
- housing_status_project_start
- living_situation_before_program_entry
- project_type_code

TODO: If not one-hot encoding, another way to encode the above categories is to turn the category values from the last X years into a string of the sorted set of values.

Boolean (1/0) for these:
- race
- ethnicity
- gender


In [7]:
# whether to encode categorical variables
# True = split each value into its own numeric 1/0 column
# False = keep as strings
encode_categ = False

# whether to drop a reference column
drop_ref = False

if encode_categ == True:
    # initialize
    cols_categorical_max = []
    cols_categorical_sum = []
    
    if drop_ref == True:
        ref_cols_max = [
            'race_white',
            'ethnicity_nonlatino',
            'gender_male',
            # 'race_unknown',
            # 'ethnicity_unknown',
            # 'gender_unknown',
        ]

        ref_cols_sum = [
            'housing_status_project_start_other',
            'living_situation_before_program_entry_other',
            'project_type_code_other',
            #'housing_status_project_start_unknown',
            #'living_situation_before_program_entry_unknown',
            #'project_type_code_other',
        ]


In [8]:
# collapse some categories to make things simpler

# race
df.loc[(df.loc[:, 'race'] == 'other'), 'race'] = 'unknown'

# gender
df.loc[(df.loc[:, 'gender'] == 'other'), 'gender'] = 'unknown'

df['transsexual'] = False
df.loc[(df.loc[:, 'gender'] == 'transmtof') | (df.loc[:, 'gender'] == 'transftom'), 'transsexual'] = True

df.loc[(df.loc[:, 'gender'] == 'transmtof'), 'gender'] = 'female'
df.loc[(df.loc[:, 'gender'] == 'transftom'), 'gender'] = 'male'

In [9]:
# drop those with unknown demographic information

# number of enrollment entries
# gender: 89
# ethnicity: 2242
# race: 1986

# df = df.loc[~(df.loc[:, 'gender'] == 'unknown') &
#             ~(df.loc[:, 'ethnicity'] == 'unknown') &
#             ~(df.loc[:, 'race'] == 'unknown'), :]


In [10]:
df.head(1)

Unnamed: 0_level_0,race,ethnicity,gender,veteran_status,times_in_permanent_housing,project_entry_id,client_age_at_entry,last_permanent_zip,entry_date,exit_date,project_id,housing_status_project_start,living_situation_before_program_entry,client_location,household_id,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,project_name,project_type_code,address_city,address_postal_code,transsexual
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
214224,white,nonlatino,male,False,0,203474,60,0,2012-01-01,2013-09-08,2938,cat1homeless,hotel,,203474,True,False,,,False,False,NaT,False,,False,616,,False,SCz - HSC - Page Smith Community House,transitionalhousing,Santa Cruz,95060,False


In [11]:
# these columns will receive the max value, resulting in a boolean 1/0
if encode_categ == True:
    df, cols = hd.encode_categorical_features(df, ['race'], astype='int')
    cols_categorical_max.extend(cols)
    df = df.drop(['race'], axis=1)

    df, cols = hd.encode_categorical_features(df, ['ethnicity'], astype='int')
    cols_categorical_max.extend(cols)
    df = df.drop(['ethnicity'], axis=1)

    df, cols = hd.encode_categorical_features(df, ['gender'], astype='int')
    cols_categorical_max.extend(cols)
    df = df.drop(['gender'], axis=1)

    if drop_ref == True:
        cols_categorical_max = [x for x in cols_categorical_max if x not in ref_cols_max]

        # drop a reference column, if it exists
        for rc in ref_cols_max:
            if rc in df.columns:
                df = df.drop([rc], axis=1)

# these columns will be encoded as the number of times one of these categories was logged
if encode_categ == True:
    df, cols = hd.encode_categorical_features(df, ['housing_status_project_start'], astype='int')
    cols_categorical_sum.extend(cols)
    df = df.drop(['housing_status_project_start'], axis=1)

    df, cols = hd.encode_categorical_features(df, ['living_situation_before_program_entry'], astype='int')
    cols_categorical_sum.extend(cols)
    df = df.drop(['living_situation_before_program_entry'], axis=1)

    df, cols = hd.encode_categorical_features(df, ['project_type_code'], astype='int')
    cols_categorical_sum.extend(cols)
    df = df.drop(['project_type_code'], axis=1)

    if drop_ref == True:
        cols_categorical_sum = [x for x in cols_categorical_sum if x not in ref_cols_sum]

        # drop a reference column, if it exists
        for rc in ref_cols_sum:
            if rc in df.columns:
                df = df.drop([rc], axis=1)


In [12]:
df.head(3)

Unnamed: 0_level_0,race,ethnicity,gender,veteran_status,times_in_permanent_housing,project_entry_id,client_age_at_entry,last_permanent_zip,entry_date,exit_date,project_id,housing_status_project_start,living_situation_before_program_entry,client_location,household_id,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,project_name,project_type_code,address_city,address_postal_code,transsexual
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
214224,white,nonlatino,male,False,0,203474,60,0,2012-01-01,2013-09-08,2938,cat1homeless,hotel,,203474,True,False,,,False,False,NaT,False,,False,616,,False,SCz - HSC - Page Smith Community House,transitionalhousing,Santa Cruz,95060,False
194592,white,nonlatino,male,True,1,150135,53,91103,2012-01-02,2013-08-31,2134,cat1homeless,family,,150135,True,False,,,False,False,NaT,False,,False,607,,False,MOSBE Veterans Transition Center - Outreach,other,Marina,93933,False
179384,white,latino,female,False,0,155873,50,93245,2012-01-03,2016-01-28,2086,cat1homeless,streets,CA-506,155873,False,False,,,False,False,NaT,False,,False,1486,,True,MOSBE The Salvation Army - Good Samaritan Center,servicesonly,,93955,False


In [13]:
# using these lists of features

features_binary = [
    'veteran_status',
    'disabling_condition',
    'continuously_homeless_one_year',
    'chronic_homeless',
    'in_permanent_housing',
    'domestic_violence_victim',
    'dv_currently_fleeing',
    'head_of_household',
    'transsexual',
]

features_not_using = [
    'project_entry_id',
    'last_permanent_zip',
    'project_id',
    'client_location',
    'household_id',
    'project_name',
    'address_city',
    'address_postal_code',
]

# just keeping track of other features here in comments

# features_categorical = [
#     'race',
#     'ethnicity',
#     'gender',
#     'housing_status_project_start',
#     'living_situation_before_program_entry',
#     'project_type_code',
# #     'noncash_benefit',
# #     'disability_type',
# #     'health_insurance',
# ]

# features_quant = [
#     'client_age_at_entry',
#     'times_homeless_past_three_years',
#     'months_homeless_this_time',
#     'months_ago_dv_occurred',
#     'days_enrolled',
#     'days_to_residential_move_in',
#     'times_in_permanent_housing',
# ]

# features_date = [
#     'entry_date',
#     'exit_date',
#     'residential_move_in_date',
#     ]


In [14]:
df = df.drop(features_not_using, axis=1)

In [15]:
# df, cols_categorical = hd.encode_categorical_features(df, features_categorical, astype='int')

In [16]:
# set up to count the number of times a person was in the system
df['enrollments'] = 1

if encode_categ == True:
    # for each person, aggregate their one-hot encoded features
    agg_categorical = {}
    for col in cols_categorical_sum:
        agg_categorical[col] = 'sum'
    for col in cols_categorical_max:
        agg_categorical[col] = 'max'

# create feature vectors for each person by subselecting or aggregating their enrollments;
# one row per person
agg = {
    # binary
    'veteran_status': 'max',
    'disabling_condition': 'max',
    'continuously_homeless_one_year': 'max',
    'chronic_homeless': 'max',
    'domestic_violence_victim': 'max',
    'dv_currently_fleeing': 'max',
    'head_of_household': 'max',
    'transsexual': 'max',
    # quantitative
    'enrollments': 'sum',
    'client_age_at_entry': 'last',
    'times_homeless_past_three_years': 'last',
    'months_homeless_this_time': 'last',
    'months_ago_dv_occurred': 'last',
    'days_enrolled': 'sum',
    'times_in_permanent_housing': 'max',
    # categorical
    'race': 'first',
    'ethnicity': 'first',
    'gender': 'first',
    'housing_status_project_start': 'last',
    'living_situation_before_program_entry': 'last',
    'project_type_code': 'last',
    #     'noncash_benefit': 'last',
    #     'disability_type': 'last',
    #     'health_insurance': 'last',
    # outcome related
    'in_permanent_housing': 'last',
    'days_to_residential_move_in': 'last',
    }

if encode_categ == True:
    agg.update(agg_categorical)

df_features = df.reset_index().groupby(by=['Personal ID']).agg(agg)

# convert booleans to integers
for col in features_binary:
    df_features[col] = df_features[col].astype(int)

# sort the columns
df_features = df_features[sorted(df_features.columns)]

In [19]:
# for col in df_features.columns:
#     if df_features[col].isnull().mean() > 0:
#         print(col, df_features[col].isnull().mean())

# # columsn with nans
# cols_fillna = [
#     'days_to_residential_move_in',
#     'times_homeless_past_three_years',
#     'months_homeless_this_time',
#     'months_ago_dv_occurred',
# ]

In [18]:
# join benefits, health insurance, diability

df_features = df_features.merge(df_benefit, how='left', left_index=True, right_index=True)
cols = df_benefit.columns.tolist()
df_features[cols] = df_features[cols].fillna(value=0)

df_features = df_features.merge(df_healthins, how='left', left_index=True, right_index=True)
cols = df_healthins.columns.tolist()
df_features[cols] = df_features[cols].fillna(value=0)

df_features = df_features.merge(df_disability, how='left', left_index=True, right_index=True)
cols = df_disability.columns.tolist()
df_features[cols] = df_features[cols].fillna(value=0)

In [19]:
# number of people in the dataset
df_features.shape

(11362, 52)

In [20]:
# glance at the data
df_features.head()

Unnamed: 0_level_0,chronic_homeless,client_age_at_entry,continuously_homeless_one_year,days_enrolled,days_to_residential_move_in,disabling_condition,domestic_violence_victim,dv_currently_fleeing,enrollments,ethnicity,gender,head_of_household,housing_status_project_start,in_permanent_housing,living_situation_before_program_entry,months_ago_dv_occurred,months_homeless_this_time,project_type_code,race,times_homeless_past_three_years,times_in_permanent_housing,transsexual,veteran_status,noncash_benefit_tanfother,noncash_benefit_wic,noncash_benefit_publichousing,noncash_benefit_temprental,noncash_benefit_other,noncash_benefit_tanftransportation,noncash_benefit_tanfchildcare,noncash_benefit_foodstamps,health_insurance_medicaid,health_insurance_statechild,health_insurance_stateadult,health_insurance_veteranadmin,health_insurance_pirvate,health_insurance_employer,health_insurance_medicare,health_insurance_cobra,disability_type_alcohol,disability_type_alcoholdrug,disability_type_physical,disability_type_mentalhealth,disability_type_hearing,disability_type_dualdiagnosis,disability_type_developmental,disability_type_hivaids,disability_type_substance,disability_type_vision,disability_type_other,disability_type_chronichealth,disability_type_drug
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
173781,0,35,0,147,,0,1,0,2,latino,female,0,cat1homeless,0,emershelter,12.0,,emergencyshelter,white,2.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173782,0,10,1,147,,0,1,0,1,latino,male,0,cat1homeless,0,emershelter,12.0,,emergencyshelter,white,2.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173783,0,12,1,147,,0,1,0,1,latino,female,0,cat1homeless,0,emershelter,12.0,,emergencyshelter,white,2.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173803,0,32,0,78,,0,1,0,1,latino,female,0,cat1homeless,0,friend,12.0,,emergencyshelter,white,,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173804,0,11,0,78,,0,0,0,1,latino,female,0,cat1homeless,0,friend,,,emergencyshelter,white,,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
df_features.describe()

Unnamed: 0,chronic_homeless,client_age_at_entry,continuously_homeless_one_year,days_enrolled,days_to_residential_move_in,disabling_condition,domestic_violence_victim,dv_currently_fleeing,enrollments,head_of_household,in_permanent_housing,months_ago_dv_occurred,months_homeless_this_time,times_homeless_past_three_years,times_in_permanent_housing,transsexual,veteran_status,noncash_benefit_tanfother,noncash_benefit_wic,noncash_benefit_publichousing,noncash_benefit_temprental,noncash_benefit_other,noncash_benefit_tanftransportation,noncash_benefit_tanfchildcare,noncash_benefit_foodstamps,health_insurance_medicaid,health_insurance_statechild,health_insurance_stateadult,health_insurance_veteranadmin,health_insurance_pirvate,health_insurance_employer,health_insurance_medicare,health_insurance_cobra,disability_type_alcohol,disability_type_alcoholdrug,disability_type_physical,disability_type_mentalhealth,disability_type_hearing,disability_type_dualdiagnosis,disability_type_developmental,disability_type_hivaids,disability_type_substance,disability_type_vision,disability_type_other,disability_type_chronichealth,disability_type_drug
count,11362.0,11362.0,11362.0,11362.0,747.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,1910.0,1726.0,5202.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0,11362.0
mean,0.059849,33.62225,0.185443,207.853459,72.552878,0.390512,0.183154,0.001672,5.12313,0.409347,0.101039,16.102618,6.976246,1.34679,0.139236,0.00264,0.192836,0.005457,0.026844,0.014522,0.002728,0.019187,0.007481,0.006689,0.265094,0.329255,0.031421,0.02658,0.137828,0.008801,0.009593,0.076571,0.0022,0.027196,0.040222,0.10007,0.124538,8.8e-05,8.8e-05,0.020331,0.015842,0.00044,0.000264,0.000264,0.076219,0.034765
std,0.237217,20.246602,0.388673,330.189494,81.557515,0.487887,0.38681,0.040861,18.586563,0.491735,0.301393,9.192029,9.96952,1.122937,1.042184,0.051319,0.394543,0.073671,0.161634,0.119635,0.052165,0.137187,0.086173,0.081516,0.441403,0.469964,0.174459,0.160859,0.344734,0.093405,0.097479,0.265921,0.046858,0.162661,0.196488,0.300107,0.330209,0.009382,0.009382,0.141136,0.124871,0.020974,0.016248,0.016248,0.26536,0.183192
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,16.0,0.0,26.0,16.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,34.0,0.0,96.0,48.0,0.0,0.0,0.0,2.0,0.0,0.0,24.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,51.0,0.0,258.0,97.0,1.0,0.0,0.0,2.0,1.0,0.0,24.0,12.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,101.0,1.0,5040.0,730.0,1.0,1.0,1.0,577.0,1.0,1.0,24.0,24.0,4.0,68.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# save it for easy loading
filename = '2017-01-22_homeless_summary_categorical_string.csv'
df_features.to_csv(os.path.join(datadir_proc, filename))