# Compare number of days enrolled with the previous year

The outcome of this analysis shows the reduction or an increase of the homelessness period compared to the preceding federal fiscal year.

The aim for organisations that provide services is to reduce the mean length of the homelessness episode with 90% from the mean length of the preceding federal fiscal year. 
A reduction in the mean length of a homelessness episode will provide further funding to the organisation. 

Two approaches:
1. Calculate the average reduction/increase of days that people are homeless compared with the preceding federal fiscal year.
2. Calculate the average reduction/increase of homeless days only of those people that have been homeless for two sequential years.  

### Load and prepare data

In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
%load_ext autoreload
# # the "1" means: always reload modules marked with "%aimport"
%autoreload 2

from __future__ import absolute_import, division, print_function
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
# from tqdm import tqdm
# import warnings

# sns.set_context("poster", font_scale=0.9)
sns.set_context("notebook", font_scale=1.0)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

# add the data functions to the path
annalie = 'data/c4sf/C4SF-datasci-homeless/raw/'
datadir = os.path.join(os.getenv('HOME'), annalie)

src_data_dir = os.path.join(os.getcwd(), os.pardir, 'src/data')
sys.path.append(src_data_dir)

# functions to load the data
import dataset_homeless as ds

In [2]:
datadir

'/Users/annalie/data/c4sf/C4SF-datasci-homeless/raw/'

In [3]:
# load in and process the data in separate sheets

df_client = ds.process_data_client(simplify_strings=True, datadir=datadir)

df_enroll = ds.process_data_enrollment(simplify_strings=True, datadir=datadir)

# Only keep rows with entry dates starting in 2012
df_enroll = df_enroll[df_enroll['Entry Date'] >= '2012']
# Only keep rows with exit dates before 2016-06-01
df_enroll = df_enroll[df_enroll['Exit Date'] <= '2016-06-01']

df_disability = ds.process_data_disability(simplify_strings=True, datadir=datadir)

df_healthins = ds.process_data_healthins(simplify_strings=True, datadir=datadir)

df_benefit = ds.process_data_benefit(simplify_strings=True, datadir=datadir)

df_income = ds.process_data_income(simplify_strings=True, datadir=datadir)

df_project = ds.process_data_project(simplify_strings=True, datadir=datadir)

df_service = ds.process_data_service(simplify_strings=True, datadir=datadir)

df_bedinv = ds.process_data_bedinventory(simplify_strings=True, datadir=datadir)

# Join the client information with enrollment information.
# Inner join because we want to only keep individuals
# for whom we have both client and enrollment information.
df = df_client.merge(df_enroll, how='inner', left_index=True, right_index=True)

# just choose the first non-cash benefit; this is too simple!
# TODO: join on the exact Project ID, and possible Date
df = df.merge(df_benefit.reset_index().groupby(by=['Personal ID'])[['Non-Cash Benefit']].nth(0),
              how='left', left_index=True, right_index=True)
# # possible fix for above, but this isn't working properly (results in too many rows);
# # probably need date too, but they do not align
# df.reset_index().merge(df_benefit.reset_index()[['Personal ID', 'Project Entry ID', 'Non-Cash Benefit']].drop_duplicates(),
#                        how='left',
#                        on=['Personal ID', 'Project Entry ID'],
#                       ).drop_duplicates().set_index('Personal ID')

df['Non-Cash Benefit'] = df['Non-Cash Benefit'].fillna('None')

# add information about their disability status
# just choose the first disability; this is too simple!
# TODO: join on the exact Project ID
df = df.merge(df_disability.reset_index().groupby(by=['Personal ID'])[['Disability Type']].nth(0),
              how='left', left_index=True, right_index=True)
# # possible fix for above, but this isn't working properly (results in too many rows);
# # probably need date too, but they do not align
# df.reset_index().merge(df_disability.reset_index()[['Personal ID', 'Project Entry ID', 'Disability Type']].drop_duplicates(),
#                        how='left',
#                        on=['Personal ID', 'Project Entry ID'],
#                       ).drop_duplicates().set_index('Personal ID')

df['Disability Type'] = df['Disability Type'].fillna('None')

# add Project Type Code to DataFrame
df = df.merge(df_project[['Project Name',
                          'Project Type Code',
                          'Address City',
                          'Address Postal Code',
                         ]], left_on=['Project ID'], right_index=True)

# sort by entry date
df = df.sort_values('Entry Date')

# rename the columns to have no spaces
df = ds.rename_columns(df)

In [4]:
df_resetidx = df.reset_index()

In [5]:
df_resetidx.head(2)

Unnamed: 0,Personal ID,race,ethnicity,gender,veteran_status,project_entry_id,client_age_at_entry,last_permanent_zip,entry_date,exit_date,project_id,housing_status_project_start,living_situation_before_program_entry,client_location,household_id,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,noncash_benefit,disability_type,project_name,project_type_code,address_city,address_postal_code
0,214224,white,nonlatino,male,False,203474,60,0,2012-01-01,2013-09-08,2938,cat1homeless,hotel,,203474,True,False,,,False,False,NaT,False,,False,616,,False,,physical,SCz - HSC - Page Smith Community House,transitionalhousing,Santa Cruz,95060
1,194592,white,nonlatino,male,True,150135,53,91103,2012-01-02,2013-08-31,2134,cat1homeless,family,,150135,True,False,,,False,False,NaT,False,,False,607,,False,foodstamps,mentalhealth,MOSBE Veterans Transition Center - Outreach,other,Marina,93933


In [6]:
df = df_resetidx.sort_values('Personal ID')
df.head(5)

Unnamed: 0,Personal ID,race,ethnicity,gender,veteran_status,project_entry_id,client_age_at_entry,last_permanent_zip,entry_date,exit_date,project_id,housing_status_project_start,living_situation_before_program_entry,client_location,household_id,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,noncash_benefit,disability_type,project_name,project_type_code,address_city,address_postal_code
19228,173781,white,latino,female,False,252608,34,93907,2014-02-28,2014-02-28,2104,atrisk,rental,,252608,False,False,,,False,False,NaT,True,24,False,0,,False,foodstamps,,MOSBE SOP - Mobile Outreach Shelter Program - ...,streetoutreach,,93933
23765,173781,white,latino,female,False,314080,35,93907,2014-05-06,2014-09-30,2101,cat1homeless,emershelter,CA-506,314080,False,False,2.0,,False,False,NaT,True,12,False,147,,False,foodstamps,,MOSBE SOP - Natividad Shelter,emergencyshelter,Marina,93933
23766,173782,white,latino,male,False,314084,10,93905,2014-05-06,2014-09-30,2101,cat1homeless,emershelter,CA-506,314080,False,True,2.0,,False,False,NaT,True,12,False,147,,False,,,MOSBE SOP - Natividad Shelter,emergencyshelter,Marina,93933
23767,173783,white,latino,female,False,314085,12,93905,2014-05-06,2014-09-30,2101,cat1homeless,emershelter,CA-506,314080,False,True,2.0,,False,False,NaT,True,12,False,147,,False,,,MOSBE SOP - Natividad Shelter,emergencyshelter,Marina,93933
3813,173803,white,latino,female,False,201286,32,93905,2013-02-08,2013-04-27,2101,cat1homeless,friend,,201286,False,False,,,False,False,NaT,True,12,False,78,,False,foodstamps,,MOSBE SOP - Natividad Shelter,emergencyshelter,Marina,93933


In [7]:
list(df['noncash_benefit'].unique())

['foodstamps',
 'None',
 'other',
 'wic',
 'publichousing',
 'tanfother',
 'tanftransportation',
 'tanfchildcare',
 'temprental']

In [8]:
list(df['project_type_code'].unique())

['streetoutreach',
 'emergencyshelter',
 'other',
 'transitionalhousing',
 'servicesonly',
 'homelessnessprevention',
 'rapidrehousing',
 'permanentsupportivehousing']

In [9]:
# select cases where noncash_benefit != None
print ('len df before removing None: ', len(df))
df = df[df['noncash_benefit']!='None']
print ('len df after removing None: ', len(df))

len df before removing None:  58209
len df after removing None:  22134


In [10]:
# change 0 in the column days_enrolled to 1
df['days_enrolled'] = df['days_enrolled'].replace(0, 1)

In [11]:
df.head()

Unnamed: 0,Personal ID,race,ethnicity,gender,veteran_status,project_entry_id,client_age_at_entry,last_permanent_zip,entry_date,exit_date,project_id,housing_status_project_start,living_situation_before_program_entry,client_location,household_id,disabling_condition,continuously_homeless_one_year,times_homeless_past_three_years,months_homeless_this_time,chronic_homeless,in_permanent_housing,residential_move_in_date,domestic_violence_victim,months_ago_dv_occurred,dv_currently_fleeing,days_enrolled,days_to_residential_move_in,head_of_household,noncash_benefit,disability_type,project_name,project_type_code,address_city,address_postal_code
19228,173781,white,latino,female,False,252608,34,93907,2014-02-28,2014-02-28,2104,atrisk,rental,,252608,False,False,,,False,False,NaT,True,24,False,1,,False,foodstamps,,MOSBE SOP - Mobile Outreach Shelter Program - ...,streetoutreach,,93933
23765,173781,white,latino,female,False,314080,35,93907,2014-05-06,2014-09-30,2101,cat1homeless,emershelter,CA-506,314080,False,False,2.0,,False,False,NaT,True,12,False,147,,False,foodstamps,,MOSBE SOP - Natividad Shelter,emergencyshelter,Marina,93933
3813,173803,white,latino,female,False,201286,32,93905,2013-02-08,2013-04-27,2101,cat1homeless,friend,,201286,False,False,,,False,False,NaT,True,12,False,78,,False,foodstamps,,MOSBE SOP - Natividad Shelter,emergencyshelter,Marina,93933
37922,173848,white,nonlatino,female,False,447694,48,93955,2015-03-20,2015-03-21,2840,cat1homeless,hospital,CA-506,447694,True,False,4.0,1.0,False,False,NaT,True,24,False,1,,True,foodstamps,physical,MOSBE Franciscan Workers - Women Alive! Shelter,emergencyshelter,Salinas,93901
10693,173848,white,nonlatino,female,False,399498,46,93955,2013-11-27,2013-12-29,2102,cat1homeless,hospital,CA-506,399498,True,False,4.0,1.0,False,False,NaT,True,24,False,32,,True,foodstamps,physical,MOSBE SOP - Hamilton ES,emergencyshelter,Marina,93933


In [12]:
# select columns
df = df[['Personal ID', 'entry_date', 'exit_date', 'living_situation_before_program_entry', 'in_permanent_housing', 'residential_move_in_date', 'days_enrolled', 'noncash_benefit', 'project_type_code']]

In [13]:
df.head()

Unnamed: 0,Personal ID,entry_date,exit_date,living_situation_before_program_entry,in_permanent_housing,residential_move_in_date,days_enrolled,noncash_benefit,project_type_code
19228,173781,2014-02-28,2014-02-28,rental,False,NaT,1,foodstamps,streetoutreach
23765,173781,2014-05-06,2014-09-30,emershelter,False,NaT,147,foodstamps,emergencyshelter
3813,173803,2013-02-08,2013-04-27,friend,False,NaT,78,foodstamps,emergencyshelter
37922,173848,2015-03-20,2015-03-21,hospital,False,NaT,1,foodstamps,emergencyshelter
10693,173848,2013-11-27,2013-12-29,hospital,False,NaT,32,foodstamps,emergencyshelter


In [39]:
df2 = df[['Personal ID', 'entry_date', 'exit_date', 'days_enrolled', 'noncash_benefit', 'project_type_code']]

In [40]:
# create column for the year of enrollment
df2['entry_date_year'] = df2['entry_date'].astype(str).str[:4]
df2['exit_date_year'] = df2['exit_date'].astype(str).str[:4]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [41]:
df2.head()

Unnamed: 0,Personal ID,entry_date,exit_date,days_enrolled,noncash_benefit,project_type_code,entry_date_year,exit_date_year
19228,173781,2014-02-28,2014-02-28,1,foodstamps,streetoutreach,2014,2014
23765,173781,2014-05-06,2014-09-30,147,foodstamps,emergencyshelter,2014,2014
3813,173803,2013-02-08,2013-04-27,78,foodstamps,emergencyshelter,2013,2013
37922,173848,2015-03-20,2015-03-21,1,foodstamps,emergencyshelter,2015,2015
10693,173848,2013-11-27,2013-12-29,32,foodstamps,emergencyshelter,2013,2013


In [44]:
# where entry_date_year is 2012, then end_year is 2012-12-31
# where entry_date_year is 2013, then end_year is 2013-12-31
# where entry_date_year is 2014, then end_year is 2014-12-31
# where entry_date_year is 2015, then end_year is 2015-12-31

# def retrieve_end_year(row):
#     val = []
#     if row['entry_date_year'] == '2012':
#         val = pd.Timestamp('2012-12-31')
#     elif row['entry_date_year'] == '2013':
#         val = pd.Timestamp('2013-12-31')
#     elif row['entry_date_year'] == '2014':
#         val = pd.Timestamp('2014-12-31')
#     elif row['entry_date_year'] == '2015':
#         val = pd.Timestamp('2015-12-31')
#     return val

# def retrieve_end_year(row):
#     val = []
#     if row['entry_date_year'] == '2012':
#         val = pd.to_datetime('2012-12-31', format="%Y/%m/%d")
#     elif row['entry_date_year'] == '2013':
#         val = pd.to_datetime('2013-12-31', format="%Y/%m/%d")
#     elif row['entry_date_year'] == '2014':
#         val = pd.to_datetime('2014-12-31', format="%Y/%m/%d")
#     elif row['entry_date_year'] == '2015':
#         val = pd.to_datetime('2015-12-31', format="%Y/%m/%d")
#     return val

# def retrieve_end_year(row):
#     val = []
#     if row['entry_date_year'] == '2012':
#         val = pd.to_datetime(pd.Series(['2012/12/31']), format="%Y/%m/%d")
#     elif row['entry_date_year'] == '2013':
#         val = pd.to_datetime(pd.Series(['2013/12/31']), format="%Y/%m/%d")
#     elif row['entry_date_year'] == '2014':
#         val = pd.to_datetime(pd.Series(['2014/12/31']), format="%Y/%m/%d")
#     elif row['entry_date_year'] == '2015':
#         val = pd.to_datetime(pd.Series(['2015/12/31']), format="%Y/%m/%d")
#     return val

# def retrieve_end_year(row):
#     val = []
#     if row['entry_date_year'] == '2012':
#         val = '2012-12-31'
#     elif row['entry_date_year'] == '2013':
#         val = '2013-12-31'
#     elif row['entry_date_year'] == '2014':
#         val = '2014-12-31'
#     elif row['entry_date_year'] == '2015':
#         val = '2015-12-31'
#     return val

import datetime
def retrieve_end_year(row):
    val = []
    if row['entry_date_year'] == '2012':
        val = datetime(2012,12,31)
    elif row['entry_date_year'] == '2013':
        val = datetime(2013,12,31)
    elif row['entry_date_year'] == '2014':
        val = datetime(2014,12,31)
    elif row['entry_date_year'] == '2015':
        val = datetime(2015,12,31)
    return val

In [45]:
df2['end_year_date'] = df2.apply(retrieve_end_year, axis=1)
df2.head()

TypeError: ("'module' object is not callable", u'occurred at index 19228')

In [32]:
# df2['end_year_date_64'] = pd.to_datetime(df2['end_year_date'])

In [28]:
df2.dtypes

Personal ID                   int64
entry_date           datetime64[ns]
exit_date            datetime64[ns]
days_enrolled                 int64
noncash_benefit              object
project_type_code            object
entry_date_year              object
exit_date_year               object
end_year_date                object
dtype: object

In [38]:
# find number of days enrolled per year

# df2['days_enrolled_year'] = df2['end_year_date'] - df2['entry_date']
end_year_date = df2['end_year_date'].values.astype('datetime64[ns]')
df2['days_enrolled_year'] = end_year_date - df2['entry_date'].values

ValueError: Could not convert object to NumPy datetime

In [27]:
df2[df2['noncash_benefit']=='publichousing']

Unnamed: 0,Personal ID,entry_date,exit_date,days_enrolled,noncash_benefit,project_type_code,entry_date_year,exit_date_year,days_enrolled_year
7104,178174,2013-05-29,2014-03-07,282,publichousing,transitionalhousing,2013,2014,282 days
7106,178174,2013-05-29,2014-03-07,282,publichousing,transitionalhousing,2013,2014,282 days
19774,178174,2014-03-07,2015-06-30,480,publichousing,permanentsupportivehousing,2014,2015,480 days
606,178174,2012-08-08,2013-05-28,293,publichousing,transitionalhousing,2012,2013,293 days
603,178174,2012-08-08,2013-05-28,293,publichousing,transitionalhousing,2012,2013,293 days
16476,178178,2014-02-01,2015-06-30,514,publichousing,permanentsupportivehousing,2014,2015,514 days
24858,178178,2014-07-02,2015-06-30,363,publichousing,other,2014,2015,363 days
14237,178178,2014-01-06,2015-06-30,540,publichousing,other,2014,2015,540 days
24859,178178,2014-07-02,2015-06-30,363,publichousing,other,2014,2015,363 days
916,178178,2012-10-01,2014-02-01,488,publichousing,permanentsupportivehousing,2012,2014,488 days


### Find number of days homeless per episode

In [30]:
# Find mean of 

In [46]:
# select cases where personal ID exists more than once in the dataset

# find duplicate values in the column Personal ID
# df_total_days_enrolled = df_total_days_enrolled[df_total_days_enrolled['Personal ID'].duplicated(keep=False)]

In [17]:
list(df_total_days_enrolled['entry_date_year'].unique())

['2014', '2013', '2015', '2016', '2012']

#### Calculate the average reduction/increase of days that people are homeless compared with the preceding federal fiscal year

In [None]:
df_2012 = df_total_days_enrolled[df_total_days_enrolled['entry_date_year']=='2012']
df_2013 = df_total_days_enrolled[df_total_days_enrolled['entry_date_year']=='2013']
df_2014 = df_total_days_enrolled[df_total_days_enrolled['entry_date_year']=='2014']
df_2015 = df_total_days_enrolled[df_total_days_enrolled['entry_date_year']=='2015']
df_2016 = df_total_days_enrolled[df_total_days_enrolled['entry_date_year']=='2016']

In [21]:
# groupby Personal ID and count days_enrolled
# df_2013['days_enrolled'].groupby(df_2013['Personal ID']).size()

# sum total days that people where enrolled per year
no_days_enrolled_2012 = df_2012['days_enrolled'].sum(axis=0)
no_days_enrolled_2013 = df_2013['days_enrolled'].sum(axis=0)
no_days_enrolled_2014 = df_2014['days_enrolled'].sum(axis=0)
no_days_enrolled_2015 = df_2015['days_enrolled'].sum(axis=0)
no_days_enrolled_2016 = df_2016['days_enrolled'].sum(axis=0)

print ('# enrollment days in 2012: ', df_2012['days_enrolled'].sum(axis=0))
print ('# enrollment days in 2013: ', df_2013['days_enrolled'].sum(axis=0))
print ('# enrollment days in 2014: ', df_2014['days_enrolled'].sum(axis=0))
print ('# enrollment days in 2015: ', df_2015['days_enrolled'].sum(axis=0))
print ('# enrollment days in 2016: ', df_2016['days_enrolled'].sum(axis=0))

# enrollment days in 2012:  192443
# enrollment days in 2013:  239877
# enrollment days in 2014:  257526
# enrollment days in 2015:  152937
# enrollment days in 2016:  20133


In [73]:
# calculate number of people enrolled per year
no_homeless_2012 = len(list(df_2012['Personal ID'].unique()))
print ('# people enrolled in 2012: ', len(list(df_2012['Personal ID'].unique())))

no_homeless_2013 = len(list(df_2013['Personal ID'].unique()))
print ('# people enrolled in 2013: ', len(list(df_2013['Personal ID'].unique())))

no_homeless_2014 = len(list(df_2014['Personal ID'].unique()))
print ('# people enrolled in 2014: ', len(list(df_2014['Personal ID'].unique())))

no_homeless_2015 = len(list(df_2015['Personal ID'].unique()))
print ('# people enrolled in 2015: ', len(list(df_2015['Personal ID'].unique())))

no_homeless_2016 = len(list(df_2016['Personal ID'].unique()))
print ('# people enrolled in 2016: ', len(list(df_2016['Personal ID'].unique())))

# people enrolled in 2012:  540
# people enrolled in 2013:  1147
# people enrolled in 2014:  1335
# people enrolled in 2015:  1079
# people enrolled in 2016:  382


#### Compare 2013, 2014 and 2015

In [74]:
# calculate reduction/increase in percentages compared with the preceding fiscal year
diff_2013_2012 = no_homeless_2013/no_homeless_2012 * 100
diff_2014_2013 = no_homeless_2014/no_homeless_2013 * 100
diff_2015_2014 = no_homeless_2015/no_homeless_2014 * 100

# print ('percentage difference between # days enrolled 2013 versus 2012: ', diff_2013_2012,'%')
print ('percentage difference between # days enrolled 2014 versus 2013: ', diff_2014_2013,'%')
print ('percentage difference between # days enrolled 2015 versus 2014: ', diff_2015_2014,'%')

percentage difference between # days enrolled 2014 versus 2013:  116.390584133 %
percentage difference between # days enrolled 2015 versus 2014:  80.8239700375 %


#### Calculate the average reduction/increase of homeless days only of those people that have been homeless for two sequential years

#### Get number of days enrolled per homeless person per year

In [29]:
# return highest value for days enrolled per Personal ID per year
def get_days_enrolled_per_year(year):

    days_enrolled_per_year = year.groupby('Personal ID').size()
    days_enrolled_per_year = days_enrolled_per_year.to_frame()
    days_enrolled_per_year['Personal ID'] = days_enrolled_per_year.index
    days_enrolled_per_year.columns = ['days_enrolled_total', 'Personal ID']
    days_enrolled_per_year = days_enrolled_per_year.reset_index(drop=True)

    return days_enrolled_per_year

In [30]:
df_days_enrolled_2012 = get_days_enrolled_per_year(df_2012)
df_days_enrolled_2012.columns = ['days_enrolled_total_2012', 'Personal ID']

df_days_enrolled_2013 = get_days_enrolled_per_year(df_2013)
df_days_enrolled_2013.columns = ['days_enrolled_total_2013', 'Personal ID']

df_days_enrolled_2014 = get_days_enrolled_per_year(df_2014)
df_days_enrolled_2014.columns = ['days_enrolled_total_2014', 'Personal ID']

df_days_enrolled_2015 = get_days_enrolled_per_year(df_2015)
df_days_enrolled_2015.columns = ['days_enrolled_total_2015', 'Personal ID']

df_days_enrolled_2016 = get_days_enrolled_per_year(df_2016)
df_days_enrolled_2016.columns = ['days_enrolled_total_2016', 'Personal ID']

In [37]:
# merge dataframes on Personal ID
df_days_enrolled_2012_2013 = df_days_enrolled_2012.merge(df_days_enrolled_2013, on='Personal ID', how='outer')
df_days_enrolled_2013_2014 = df_days_enrolled_2013.merge(df_days_enrolled_2014, on='Personal ID', how='outer')
df_days_enrolled_2014_2015 = df_days_enrolled_2014.merge(df_days_enrolled_2015, on='Personal ID', how='outer')
df_days_enrolled_2015_2016 = df_days_enrolled_2015.merge(df_days_enrolled_2016, on='Personal ID', how='outer')

In [38]:
df_days_enrolled_2012_2013.head()

Unnamed: 0,days_enrolled_total_2012,Personal ID,days_enrolled_total_2013
0,1.0,174125,4.0
1,2.0,174254,
2,2.0,174473,
3,2.0,174598,6.0
4,2.0,174757,


In [85]:
# 1. keep only the rows where both columsn are filled, so remove columns with missing values
df_days_enrolled_2012_2013_new = df_days_enrolled_2012_2013.dropna()
df_days_enrolled_2013_2014_new = df_days_enrolled_2013_2014.dropna()
df_days_enrolled_2014_2015_new = df_days_enrolled_2014_2015.dropna()
df_days_enrolled_2015_2016_new = df_days_enrolled_2015_2016.dropna()

# What percentage of enrolled people returned the next year? 
def percentage_returned_the_next_year(df, df_new):
    return (len(df_new)/len(df)*100)


print ('percentage that returned the next year for 2012-2013:', percentage_returned_the_next_year(df_days_enrolled_2012_2013, df_days_enrolled_2012_2013_new),'%')
print ('percentage that returned the next year for 2013-2014:', percentage_returned_the_next_year(df_days_enrolled_2013_2014, df_days_enrolled_2013_2014_new),'%')
print ('percentage that returned the next year for 2014-2015:', percentage_returned_the_next_year(df_days_enrolled_2014_2015, df_days_enrolled_2014_2015_new),'%')
print ('percentage that returned the next year for 2015-2016:', percentage_returned_the_next_year(df_days_enrolled_2015_2016, df_days_enrolled_2015_2016_new),'%')

percentage that returned the next year for 2012-2013: 12.0185922975 %
percentage that returned the next year for 2013-2014: 12.7669241254 %
percentage that returned the next year for 2014-2015: 15.5023923445 %
percentage that returned the next year for 2015-2016: 11.1872146119 %


In [80]:
# 2. create new column that computes the increase or reduction with the preceding year per person     
def get_difference_in_percentage_with_preceding_year(df):
    df['percentage_compared_to_preceding_year'] = df.iloc[:,2] / df.iloc[:,0] * 100
    return df

df_days_enrolled_2012_2013_new = get_difference_in_percentage_with_preceding_year(df_days_enrolled_2012_2013_new)
df_days_enrolled_2013_2014_new = get_difference_in_percentage_with_preceding_year(df_days_enrolled_2013_2014_new)
df_days_enrolled_2014_2015_new = get_difference_in_percentage_with_preceding_year(df_days_enrolled_2014_2015_new)
df_days_enrolled_2015_2016_new = get_difference_in_percentage_with_preceding_year(df_days_enrolled_2015_2016_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [84]:
# 3. average percentages
def average_percentages(df):
    return df['percentage_compared_to_preceding_year'].mean()

print ('average of homeless people that returned for 2012-2013:', average_percentages(df_days_enrolled_2012_2013_new),'%')
print ('average of homeless people that returned for 2013-2014:', average_percentages(df_days_enrolled_2013_2014_new),'%')
print ('average of homeless people that returned for 2014-2015:', average_percentages(df_days_enrolled_2014_2015_new),'%')
print ('average of homeless people that returned for 2015-2016:', average_percentages(df_days_enrolled_2015_2016_new),'%')

average of homeless people that returned for 2012-2013: 416.823289187 %
average of homeless people that returned for 2013-2014: 303.321250361 %
average of homeless people that returned for 2014-2015: 219.317152815 %
average of homeless people that returned for 2015-2016: 205.603321972 %
