In [1]:
# render matplotlib figures inline, at "retina" resolution
%matplotlib inline
%config InlineBackend.figure_format='retina'
%load_ext autoreload
# mode "2" means: reload all modules (except those excluded with "%aimport")
# every time before executing code
%autoreload 2

from __future__ import absolute_import, division, print_function
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
# from tqdm import tqdm
# import warnings

# show up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)
# seaborn "poster" plotting context, with fonts scaled by a factor of 1.3
sns.set_context(context="poster", font_scale=1.3)

# add the data functions to the path
# The path is built one component at a time so the separator is correct on
# every OS, and the append is guarded so that re-running this cell does not
# pile up duplicate sys.path entries.
src_data_dir = os.path.join(os.getcwd(), os.pardir, 'src', 'data')
if src_data_dir not in sys.path:
    sys.path.append(src_data_dir)

# functions to load the data
import dataset as ds

In [2]:
# Path of the processed enrollment CSV inside the user's Dropbox folder.
# expanduser('~') is used rather than os.getenv('HOME'): getenv returns None
# when HOME is not set (e.g. on Windows), which makes os.path.join raise.
filename = os.path.join(os.path.expanduser('~'), 'Dropbox', 'C4SF-datasci-homeless', 'processed', 'homeless_row_per_enrollment.csv')
# The first CSV column becomes the index; the three date columns are parsed
# as datetimes.
df = pd.read_csv(filename, header=0, index_col=0, parse_dates=['Entry Date', 'Exit Date', 'Residential Move In Date'])

In [3]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0_level_0,Race,Ethnicity,Gender,Veteran Status,Project Entry ID,Client Age at Entry,Last Permanent Zip,Entry Date,Exit Date,Project ID,Housing Status @ Project Start,Living situation before program entry?,Client Location,Household ID,Relationship to HoH,Disabling Condition,Continuously Homeless One Year,Times Homeless Past Three Years,Months Homeless This Time,Chronic Homeless,In Permanent Housing,Residential Move In Date,Domestic Violence Victim,DV When Occurred,DV Currently Fleeing,Days Enrolled,Non-Cash Benefit,Disability Type,Project Name,Project Type Code,Address City,Address Postal Code,Enrollments
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
214224,White,Non-Hispanic/Non-Latino,Male,False,203474,60,0,2012-01-01,2013-09-08,2938,Category 1 - Homeless,Hotel or motel paid for without emergency shel...,,203474,,True,False,,,False,False,NaT,False,N/A - No Domestic Violence,False,616,,Physical,SCz - HSC - Page Smith Community House,Transitional housing,Santa Cruz,95060,1
194592,White,Non-Hispanic/Non-Latino,Male,True,150135,53,91103,2012-01-02,2013-08-31,2134,Category 1 - Homeless,"Staying or living in a family member's room, a...",,150135,,True,False,,,False,False,NaT,False,,False,607,Food Stamps,Mental Health Problem,MOSBE Veterans Transition Center - Outreach,Other,Marina,93933,1
194592,White,Non-Hispanic/Non-Latino,Male,True,150135,53,91103,2012-01-02,2013-08-31,2134,Category 1 - Homeless,"Staying or living in a family member's room, a...",,150135,,True,False,,,False,False,NaT,False,,False,607,Food Stamps,Mental Health Problem,MOSBE Veterans Transition Center - Outreach,Other,Marina,93933,1
179384,White,Hispanic/Latino,Female,False,155873,50,93245,2012-01-03,2016-01-28,2086,Category 1 - Homeless,Place not meant for habitation,CA-506,155873,Self (head of household),False,False,Data not collected,,False,False,NaT,False,,False,1486,,,MOSBE The Salvation Army - Good Samaritan Center,Services Only,,93955,1
183019,White,Hispanic/Latino,Male,False,155872,28,93950,2012-01-03,2012-10-26,2086,Category 1 - Homeless,Place not meant for habitation,CA-506,155872,,False,False,,,False,False,NaT,False,,False,297,,,MOSBE The Salvation Army - Good Samaritan Center,Services Only,,93955,1


In [18]:
# Put the rows in chronological order of 'Entry Date'.
# A stable algorithm is requested explicitly: the default quicksort gives no
# guarantee about the relative order of rows that share an Entry Date, so any
# later step that depends on row order (such as numbering each client's
# enrollments) would not be reproducible from run to run.
df = df.sort_values(by=['Entry Date'], kind='mergesort')

In [23]:
# number the enrollments: a 1-based running count of the rows that share an
# index label, following the current row order of df
rows_seen_before = df[['Entry Date']].groupby(level=0).cumcount()
df['Enrollment'] = rows_seen_before.values + 1

In [31]:
max_enrollments = 3

# True for every index label whose highest enrollment number is within the cap
choose_these = df['Enrollment'].groupby(level=0).max() <= max_enrollments

# keep all rows belonging to the labels that passed the test
df_max = df.loc[choose_these.index[choose_these.values], :]

In [34]:
# (rows, columns) of the full table
df.shape

(63324, 34)

In [32]:
# (rows, columns) of the subset kept after the max_enrollments filter
df_max.shape

(14973, 34)

In [58]:
n_projects = 3

field = 'Project Type Code'

# Columns taken from the row numbered Enrollment == 1, ending with the field
# that is tracked from one enrollment to the next.
first_cols = [
    'Race',
    'Ethnicity',
    'Gender',
    'Veteran Status',
    'Project Entry ID',
    'Client Age at Entry',
    'Entry Date',
    'Housing Status @ Project Start',
    field,
]

# The first enrollment provides the base table; its `field` column is
# renamed to 'Project 1'.
is_first = df_max['Enrollment'] == 1
df_out = df_max.loc[is_first, first_cols].rename(columns={field: 'Project {}'.format(1)})

# Every later enrollment adds one 'Project n' column. The left merge on the
# index keeps all rows of the base table and leaves a missing value where
# there is no n-th enrollment.
for n in range(2, n_projects + 1):
    is_nth = df_max['Enrollment'] == n
    df_enroll = df_max.loc[is_nth, [field]].rename(columns={field: 'Project {}'.format(n)})
    df_out = df_out.merge(df_enroll, left_index=True, right_index=True, how='left')

# Per-index maximum of 'In Permanent Housing' over the enrollments considered;
# the assignment below aligns it with df_out on the index.
in_window = df_max['Enrollment'] <= n_projects
perm = df_max.loc[in_window, 'In Permanent Housing'].groupby(level=0).max()

df_out['In Permanent Housing'] = perm

In [59]:
# Preview the first rows of the assembled output table
df_out.head()

Unnamed: 0_level_0,Race,Ethnicity,Gender,Veteran Status,Project Entry ID,Client Age at Entry,Entry Date,Housing Status @ Project Start,Project 1,Project 2,Project 3,In Permanent Housing
Personal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
173781,White,Hispanic/Latino,Female,False,252608,34,2014-02-28,At-risk of homelessness,Street Outreach,Emergency Shelter,,False
173782,White,Hispanic/Latino,Male,False,314084,10,2014-05-06,Category 1 - Homeless,Emergency Shelter,,,False
173783,White,Hispanic/Latino,Female,False,314085,12,2014-05-06,Category 1 - Homeless,Emergency Shelter,,,False
173803,White,Hispanic/Latino,Female,False,201286,32,2013-02-08,Category 1 - Homeless,Emergency Shelter,,,False
173804,White,Hispanic/Latino,Female,False,201288,11,2013-02-08,Category 1 - Homeless,Emergency Shelter,,,False


In [62]:
# Destination on the user's Desktop.
# expanduser('~') is used rather than os.getenv('HOME'): getenv returns None
# when HOME is not set (e.g. on Windows), which makes os.path.join raise.
# The directory and file name are passed as separate components so that
# os.path.join supplies the separator.
outfile = os.path.join(os.path.expanduser('~'), 'Desktop', 'homeless_3_projects_outcome.csv')

df_out.to_csv(outfile)