# Enrollment Preprocessing

Next we preprocess the enrollment data. Each enrollment row is handled differently depending on which super project it belongs to.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw enrollment export. The file is comma-separated, which is
# read_csv's default delimiter, so no separator is passed.
enrollment = pd.read_csv("data/raw/Enrollment.csv")

In [3]:
enrollment.head()

Unnamed: 0,Personal ID,Project Entry ID,Client Age at Entry,Last Permanent Zip,Project Entry Date Updated,Destination,Entry Date,Exit Date,Project ID,Housing Status @ Project Start,...,Continuously Homeless One Year,"If Yes for ""Client entering from streets, ES or SH"" Approximate date started:",Times Homeless Past Three Years,Months Homeless This Time,Chronic Homeless,In Permanent Housing,Residential Move In Date,Domestic Violence Victim,DV When Occurred,DV Currently Fleeing
0,173781,252608,34.0,93907.0,1/23/2015,Other (HUD),2/28/2014,2/28/2014,2104,At-risk of homelessness (HUD),...,,,,,No,,,Yes (HUD),More than a year ago (HUD),
1,173781,314080,35.0,93907.0,1/12/2015,"Emergency shelter, including hotel or motel pa...",5/6/2014,9/30/2014,2101,Category 1 - Homeless (HUD),...,No (HUD),,2.0,,No,,,Yes (HUD),From six to twelve months ago (HUD),
2,173782,122915,1.0,,12/30/2004,,11/29/2004,,2114,,...,,,,,No,,,,,
3,173782,314084,10.0,93905.0,1/7/2015,"Emergency shelter, including hotel or motel pa...",5/6/2014,9/30/2014,2101,Category 1 - Homeless (HUD),...,Yes (HUD),4/6/2014,2.0,,No,,,Yes (HUD),From six to twelve months ago (HUD),
4,173783,122916,2.0,,12/30/2004,,11/29/2004,,2114,,...,,,,,No,,,,,


In [4]:
enrollment.columns

Index([u'Personal ID', u'Project Entry ID', u'Client Age at Entry',
       u'Last Permanent Zip', u'Project Entry Date Updated', u'Destination',
       u'Entry Date', u'Exit Date', u'Project ID',
       u'Housing Status @ Project Start',
       u'Living situation before program entry?', u'Client Location',
       u'Household ID', u'Relationship to HoH', u'Disabling Condition',
       u'Continuously Homeless One Year',
       u'If Yes for "Client entering from streets, ES or SH" Approximate date started:',
       u'Times Homeless Past Three Years', u'Months Homeless This Time',
       u'Chronic Homeless', u'In Permanent Housing',
       u'Residential Move In Date', u'Domestic Violence Victim',
       u'DV When Occurred', u'DV Currently Fleeing'],
      dtype='object')

In [5]:
enrollment["Project ID"].value_counts()

3036    21439
2840     7103
2876     5243
3444     2640
2966     1963
2121     1786
2947     1627
2101      863
2110      709
2104      650
2939      649
3212      560
2872      501
2115      487
2112      469
2934      454
2945      445
2086      439
3214      432
3184      403
2889      382
3342      337
3213      309
2102      298
2111      287
2106      284
3387      262
3443      257
2862      242
2935      225
        ...  
2866        6
2807        5
2815        5
2955        5
3383        5
3462        4
2964        4
3008        4
2113        3
2996        3
2085        2
2943        2
2971        2
3256        2
3254        2
2870        2
2930        2
2117        2
2942        2
3340        2
2869        1
2932        1
2093        1
2968        1
2116        1
3325        1
2108        1
3201        1
2860        1
3080        1
Name: Project ID, dtype: int64

Now that we've loaded the enrollment dataset, we load the preprocessed Project dataset and label each enrollment row according to the super project category its Project ID falls under.

In [6]:
# Load the preprocessed project table.
# index_col=0 makes the first CSV column the row index instead of a data column.
projects = pd.read_csv("data/preprocessed/projects.csv", index_col=0)

In [7]:
projects.head()

Unnamed: 0,Project Name,Project ID,Organization Name,CoC Code,Project Type Code,Method for Tracking ES Utilization,Address City,Address Postal Code,Funder,Grant Start Date,Grant End Date,Super Project
0,MOSBE CHS - Elm House,2142,MOSBE Community Human Services (CHS),CA-506,Transitional housing (HUD),,,93942,,,,Temporary Housing
1,MOSBE CHS - Elm House,2142,MOSBE Community Human Services (CHS),CA-506,Transitional housing (HUD),,,93955,,,,Temporary Housing
2,MOSBE CHS - RHY - BCP ES,3417,MOSBE Community Human Services (CHS),CA-506,Emergency Shelter (HUD),,Monterey,93942,,,,Nightly Housing
3,MOSBE CHS - RHY - BCP ES,3417,MOSBE Community Human Services (CHS),CA-506,Emergency Shelter (HUD),,Seaside,93955,,,,Nightly Housing
4,MOSBE CHS - RHY - BCP - HP,3418,MOSBE Community Human Services (CHS),CA-506,Homelessness Prevention (HUD),,Monterey,93942,,,,External Funding


In [8]:
# For each super project, collect the set of Project IDs that belong to it.
# Sets de-duplicate repeated Project IDs and are what the membership tests
# in assign_super_project look the ID up in.
temp_housing = set(projects["Project ID"][projects["Super Project"].eq("Temporary Housing")])
ext_funding = set(projects["Project ID"][projects["Super Project"].eq("External Funding")])
long_stay = set(projects["Project ID"][projects["Super Project"].eq("Long Stay")])
nightly_housing = set(projects["Project ID"][projects["Super Project"].eq("Nightly Housing")])

In [9]:
# Sanity check: every super project must map to at least one Project ID.
# An empty set would make the matching enrollment subset come out empty
# without any error. All four sets are checked, including nightly_housing.
assert temp_housing, "no Project IDs found for 'Temporary Housing'"
assert ext_funding, "no Project IDs found for 'External Funding'"
assert long_stay, "no Project IDs found for 'Long Stay'"
assert nightly_housing, "no Project IDs found for 'Nightly Housing'"

In [10]:
def assign_super_project(row):
    """Return the super project label for one enrollment row.

    The row's "Project ID" is looked up in the notebook-level ID sets in a
    fixed order (temporary housing, external funding, long stay, nightly
    housing); the first set that contains it decides the label. An ID found
    in none of the sets is labelled "No Super Project".

    Note: members of ``ext_funding`` are labelled "Extended Stay", and the
    later split of the enrollment table filters on that exact string.
    """
    pid = row["Project ID"]
    # Order matters: an ID present in several sets gets the first match.
    ordered_groups = (
        (temp_housing, "Temporary Housing"),
        (ext_funding, "Extended Stay"),
        (long_stay, "Long Stay"),
        (nightly_housing, "Nightly Housing"),
    )
    for members, label in ordered_groups:
        if pid in members:
            return label
    return "No Super Project"

In [11]:
# Label every enrollment row: axis=1 hands each row to assign_super_project,
# producing a Series of super project names aligned with enrollment's index.
enrollment_super_projects = enrollment.apply(assign_super_project, axis=1)

In [12]:
# Attach the labels to the enrollment table as a new "Super Project" column.
enrollment["Super Project"] = enrollment_super_projects

## Misc preprocessing

Before going any further, I want to convert the entry and exit dates to Unix time. Unix time is often easier to work with than datetime objects.

In [13]:
enrollment.head()

Unnamed: 0,Personal ID,Project Entry ID,Client Age at Entry,Last Permanent Zip,Project Entry Date Updated,Destination,Entry Date,Exit Date,Project ID,Housing Status @ Project Start,...,"If Yes for ""Client entering from streets, ES or SH"" Approximate date started:",Times Homeless Past Three Years,Months Homeless This Time,Chronic Homeless,In Permanent Housing,Residential Move In Date,Domestic Violence Victim,DV When Occurred,DV Currently Fleeing,Super Project
0,173781,252608,34.0,93907.0,1/23/2015,Other (HUD),2/28/2014,2/28/2014,2104,At-risk of homelessness (HUD),...,,,,No,,,Yes (HUD),More than a year ago (HUD),,No Super Project
1,173781,314080,35.0,93907.0,1/12/2015,"Emergency shelter, including hotel or motel pa...",5/6/2014,9/30/2014,2101,Category 1 - Homeless (HUD),...,,2.0,,No,,,Yes (HUD),From six to twelve months ago (HUD),,Nightly Housing
2,173782,122915,1.0,,12/30/2004,,11/29/2004,,2114,,...,,,,No,,,,,,No Super Project
3,173782,314084,10.0,93905.0,1/7/2015,"Emergency shelter, including hotel or motel pa...",5/6/2014,9/30/2014,2101,Category 1 - Homeless (HUD),...,4/6/2014,2.0,,No,,,Yes (HUD),From six to twelve months ago (HUD),,Nightly Housing
4,173783,122916,2.0,,12/30/2004,,11/29/2004,,2114,,...,,,,No,,,,,,No Super Project


In [14]:
# Parse the month/day/year strings in both date columns into pandas datetimes.
for date_column in ("Entry Date", "Exit Date"):
    enrollment[date_column] = pd.to_datetime(enrollment[date_column], format="%m/%d/%Y")

In [15]:
# Confirm the parse produced real timestamps. pd.Timestamp is the public name
# of the class (the private pd.tslib path is not available in newer pandas),
# and .iloc[0] reads the first row by position rather than by the label 0.
assert isinstance(enrollment["Entry Date"].iloc[0], pd.Timestamp)

In [16]:
# This is a little hackish, sorry
import arrow
def safe_utc_localize(t):
    """Convert a naive Pacific-time timestamp to Unix epoch seconds.

    Parameters
    ----------
    t : pd.Timestamp or missing value
        A timezone-naive timestamp, interpreted as America/Los_Angeles
        wall-clock time.

    Returns
    -------
    int or float
        Whole seconds since the Unix epoch for a real timestamp, or np.inf
        for anything that is not a usable timestamp (NaT, NaN, None, ...).
        The infinity makes a missing date compare as later than any real one.

    The conversion uses pandas only, so the result does not depend on which
    version of arrow is installed.
    """
    # pd.Timestamp is the public class name; NaT is ruled out explicitly so a
    # missing date can never reach tz_localize.
    if not isinstance(t, pd.Timestamp) or pd.isnull(t):
        return np.inf
    localized = t.tz_localize("America/Los_Angeles")
    # .value is nanoseconds since the epoch (UTC); reduce to whole seconds.
    return localized.value // 10**9

In [17]:
# Convert both date columns to Unix epoch seconds, one value at a time.
# Values that are not timestamps (e.g. a missing exit date) come back as np.inf.
entry_unix_time = enrollment["Entry Date"].apply(safe_utc_localize)
exit_unix_time = enrollment["Exit Date"].apply(safe_utc_localize)

In [18]:
# Store the epoch-second values as new columns alongside the original dates.
enrollment["UTC Entry Time"] = entry_unix_time
enrollment["UTC Exit Time"] = exit_unix_time

In [19]:
# Find inconsistent records: enrollments whose entry time is later than
# their exit time.
enrollment[enrollment["UTC Entry Time"] > enrollment["UTC Exit Time"]]

Unnamed: 0,Personal ID,Project Entry ID,Client Age at Entry,Last Permanent Zip,Project Entry Date Updated,Destination,Entry Date,Exit Date,Project ID,Housing Status @ Project Start,...,Months Homeless This Time,Chronic Homeless,In Permanent Housing,Residential Move In Date,Domestic Violence Victim,DV When Occurred,DV Currently Fleeing,Super Project,UTC Entry Time,UTC Exit Time
12159,213071,234693,38.0,,12/24/2013,Client doesn't know (HUD),2013-12-24,2013-12-19,2947,Category 1 - Homeless (HUD),...,,No,,,Client doesn't know (HUD),,,Nightly Housing,1387872000,1387440000.0


In [20]:
# Keep only consistent rows (entry no later than exit). Enrollments without
# an exit date are kept because their exit time is np.inf.
enrollment = enrollment[enrollment["UTC Entry Time"] <= enrollment["UTC Exit Time"]]

In [22]:
enrollment["Entry Date"].min()

Timestamp('2002-09-06 00:00:00')

In [23]:
enrollment["Entry Date"].max()

Timestamp('2016-05-31 00:00:00')

## Splitting Dataset into 4

In [28]:
enrollment.head()

Unnamed: 0,Personal ID,Project Entry ID,Client Age at Entry,Last Permanent Zip,Project Entry Date Updated,Destination,Entry Date,Exit Date,Project ID,Housing Status @ Project Start,...,Months Homeless This Time,Chronic Homeless,In Permanent Housing,Residential Move In Date,Domestic Violence Victim,DV When Occurred,DV Currently Fleeing,Super Project,UTC Entry Time,UTC Exit Time
0,173781,252608,34.0,93907.0,1/23/2015,Other (HUD),2014-02-28,2014-02-28,2104,At-risk of homelessness (HUD),...,,No,,,Yes (HUD),More than a year ago (HUD),,No Super Project,1393574400,1393574000.0
1,173781,314080,35.0,93907.0,1/12/2015,"Emergency shelter, including hotel or motel pa...",2014-05-06,2014-09-30,2101,Category 1 - Homeless (HUD),...,,No,,,Yes (HUD),From six to twelve months ago (HUD),,Nightly Housing,1399359600,1412060000.0
2,173782,122915,1.0,,12/30/2004,,2004-11-29,NaT,2114,,...,,No,,,,,,No Super Project,1101715200,inf
3,173782,314084,10.0,93905.0,1/7/2015,"Emergency shelter, including hotel or motel pa...",2014-05-06,2014-09-30,2101,Category 1 - Homeless (HUD),...,,No,,,Yes (HUD),From six to twelve months ago (HUD),,Nightly Housing,1399359600,1412060000.0
4,173783,122916,2.0,,12/30/2004,,2004-11-29,NaT,2114,,...,,No,,,,,,No Super Project,1101715200,inf


In [30]:
# Split the enrollment table into one frame per super project label.
# Rows assigned "Extended Stay" are the ones whose Project ID is in ext_funding.
enrollment_temp_housing = enrollment[enrollment["Super Project"].eq("Temporary Housing")]
enrollment_ext_funding = enrollment[enrollment["Super Project"].eq("Extended Stay")]
enrollment_long_stay = enrollment[enrollment["Super Project"].eq("Long Stay")]
enrollment_nightly_housing = enrollment[enrollment["Super Project"].eq("Nightly Housing")]
enrollment_no_super_project = enrollment[enrollment["Super Project"].eq("No Super Project")]

In [32]:
# Sanity check: every subset must contain data. One assert per frame keeps
# the failing line in the traceback pointing at the subset that is empty.
assert not enrollment_temp_housing.empty
assert not enrollment_ext_funding.empty
assert not enrollment_long_stay.empty
assert not enrollment_nightly_housing.empty
assert not enrollment_no_super_project.empty

In [33]:
# Find out the sizes of these mini datasets, as (rows, columns).
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(enrollment_temp_housing.shape)
print(enrollment_ext_funding.shape)
print(enrollment_long_stay.shape)
print(enrollment_nightly_housing.shape)

(2314, 28)
(2051, 28)
(2092, 28)
(44822, 28)


In [35]:
# Write each super project subset to its own CSV under data/preprocessed/.
# Comma is to_csv's default separator, so it is not spelled out.
enrollment_temp_housing.to_csv("data/preprocessed/enrollment_temp_housing.csv")
enrollment_ext_funding.to_csv("data/preprocessed/enrollment_ext_funding.csv")
enrollment_long_stay.to_csv("data/preprocessed/enrollment_long_stay.csv")
enrollment_nightly_housing.to_csv("data/preprocessed/enrollment_nightly_housing.csv")