In [1]:
import pandas as pd
from sqlalchemy import create_engine
from config import username, password
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

In [2]:
engine = create_engine('postgresql://{}:{}@localhost:5432/HMIS'.format(username, password))
conn = engine.connect()

## Clients table

#### data from 2002-2019

In [3]:
clients = pd.read_sql("SELECT * FROM Clients", conn)

In [5]:
print (clients.shape)
print(clients.columns)
clients.head(2)

(65822, 9)
Index(['Race', 'Ethnicity', 'Gender', 'Vet_Status', 'Vet_Discharge_Status',
       'Created_Date', 'Updated_Date', 'Birth_Date', 'Client_Id'],
      dtype='object')


Unnamed: 0,Race,Ethnicity,Gender,Vet_Status,Vet_Discharge_Status,Created_Date,Updated_Date,Birth_Date,Client_Id
0,White,Non-Hispanic/Non-Latino,Female,No,,2009-10-22,2010-02-06,7654-01-01,30066035
1,Black or African American,Non-Hispanic/Non-Latino,Male,,,2019-08-22,2019-08-22,2019-01-01,3889225129


In [6]:
clients.nunique(axis=0)

Race                        9
Ethnicity                   5
Gender                      8
Vet_Status                  5
Vet_Discharge_Status        9
Created_Date             5014
Updated_Date             4238
Birth_Date                108
Client_Id               65822
dtype: int64

In [7]:
clients.Vet_Status.unique()
# similar categories may need clean up if we want to use

array(['No', None, 'Yes', "Client doesn't know", 'Data not collected',
       'Client refused'], dtype=object)

In [9]:
clients.Vet_Discharge_Status.unique()
# similar categories may need clean up if we want to use

array([None, 'General under honorable conditions', 'Honorable',
       "Client doesn't know", 'Uncharacterized', 'Data not collected',
       'Under other than honorable conditions (OTH)', 'Bad Conduct',
       'Dishonorable', 'Client refused'], dtype=object)

In [16]:
clients.dtypes

Race                    object
Ethnicity               object
Gender                  object
Vet_Status              object
Vet_Discharge_Status    object
Created_Date            object
Updated_Date            object
Birth_Date              object
Client_Id                int64
dtype: object

In [28]:
clients.isnull().sum(axis = 0)

Clients_Race                  5994
Clients_Ethnicity             2850
Clients_Gender                 940
Clients_Vetern_Status        14099
Clients_Discharge_Status     61114
Clients_Date_Created_Date        0
Clients_Date_Updated             0
Birth_date_d                   157
Personal_Id_d                    0
dtype: int64

In [19]:
clients.Created_Date

<bound method Series.max of 0        2009-10-22
1        2019-08-22
2        2019-08-01
3        2019-07-29
4        2019-07-10
            ...    
65817    2018-03-17
65818    2018-11-20
65819    2003-07-24
65820    2018-08-16
65821    2019-07-09
Name: Created_Date, Length: 65822, dtype: object>

## Assessment

#### data fom 2015-2019

In [20]:
assess = pd.read_sql("SELECT * FROM Assessment", conn)

In [22]:
print (assess.shape)
print(assess.columns)
assess.head(2)

(8944, 5)
Index(['Client_Id', 'Assessment_Id', 'Assessment_Type', 'Assessment_Score',
       'Assessment_Date'],
      dtype='object')


Unnamed: 0,Client_Id,Assessment_Id,Assessment_Type,Assessment_Score,Assessment_Date
0,1122742152,15808214,VI-SPDAT Prescreen for Single Adults [v1],18,2016-03-03
1,721906958,10524533,VI-SPDAT Prescreen for Single Adults [v1],18,2015-12-09


In [29]:
assess.Assessment_Score.describe()

count    8944.000000
mean        9.584526
std         3.668045
min         0.000000
25%         7.000000
50%        10.000000
75%        13.000000
max        18.000000
Name: Assessment_Score, dtype: float64

In [27]:
assess.isnull().sum(axis = 0)

Client_Id           0
Assessment_Id       0
Assessment_Type     0
Assessment_Score    0
Assessment_Date     0
dtype: int64

In [28]:
assess.Assessment_Type.unique()

array(['VI-SPDAT Prescreen for Single Adults [v1]'], dtype=object)

## Enrollment Table

#### dates 2012-2019

In [30]:
enroll = pd.read_sql("SELECT * FROM Enrollment", conn)

In [32]:
print (enroll.shape)
print(enroll.columns)
enroll.head(2)

(144067, 11)
Index(['Client_Id', 'Enrollment_Id', 'Household_Id', 'Program_Id',
       'Added_Date', 'Housing_Status', 'LOS_Prior', 'Zip', 'Chronic_Homeless',
       'Prior_Residence', 'Last_Grade_Completed'],
      dtype='object')


Unnamed: 0,Client_Id,Enrollment_Id,Household_Id,Program_Id,Added_Date,Housing_Status,LOS_Prior,Zip,Chronic_Homeless,Prior_Residence,Last_Grade_Completed
0,321300535,1935521,1925701,3,2014-01-01,,Two to six nights,95204.0,No,"Emergency Shelter, including hotel/motel paid ...",
1,716937870,1939455,1929625,5,2014-01-01,At-risk of homelessness,"One month or more, but less than 90 days",,No,"Staying or living in a family member's room, a...",


In [33]:
enroll.isnull().sum(axis = 0)

Client_Id                    0
Enrollment_Id                0
Household_Id                 0
Program_Id                   0
Added_Date                   0
Housing_Status           52413
LOS_Prior                37140
Zip                      73323
Chronic_Homeless             0
Prior_Residence          21612
Last_Grade_Completed    141573
dtype: int64

In [35]:
enroll.Program_Id.unique()

array([  3,   5,   4,   6,   7,   9,   8,  10,  13,  11,  12,  14,  16,
        15,  19,  17,  18,  41,  23,  20,  22,  21,  24,  26,  25,  27,
        28,  29,  31,  30,  32,  33,  34,  35,  36,  38,  37,  48,  40,
        39,  65,  42,  44,  45,  43,  55,  46,  47,  49,  50,  52,  51,
        53,  54,  56,  57,  58,  60,  62,  63,  64,  66,  67,  68,  70,
        71,  73,  75,  76,  72,  77,  78,  80,  81,  83,  84,  82,  85,
        87,  86,  88,  89,  91,  90,  92,  95,  96,  97,  98,  99, 101,
       102, 103, 108, 104, 105, 109, 110, 111, 113, 112, 115, 114, 116,
       117, 118, 119, 120, 123, 125, 126, 127, 128, 129, 130, 132, 133,
       134, 135, 136, 138, 141, 144, 146, 151, 153, 155, 154, 157, 161,
       162, 163, 164, 165, 283, 315, 166, 167, 168, 171, 503, 176, 172,
       173, 174, 175, 177, 178, 179, 180, 183, 184, 185, 187, 186, 188,
       193, 196, 197, 256, 198, 288, 276, 294, 199, 314, 321, 206, 201,
       202, 331, 217, 251, 245, 257, 262, 207, 204, 203, 214, 21

In [37]:
enroll.Housing_Status.unique()

array([None, 'At-risk of homelessness',
       'Category 2 - At Imminent risk of losing housing',
       'Category 1 - Homeless', "Client doesn't know", 'Stably housed',
       'Client refused', 'Category 4 - Fleeing domestic violence',
       'Data not collected',
       'Category 3 - Homeless only under other federal statutes'],
      dtype=object)

In [38]:
enroll.Chronic_Homeless.unique()

array(['No', 'Yes'], dtype=object)

In [40]:
enroll.Last_Grade_Completed.unique()

array([None, 'Grades 5-6', 'Grades 9-11', 'Some college', 'Grade 12',
       'GED', 'School program does not have grade levels',
       'Vocational certification', 'Data not collected',
       "Bachelor's degree", 'Grades 7-8', 'Less than Grade 5',
       'Associates degree', 'Graduate degree', "Client doesn't know",
       'Client refused'], dtype=object)

In [66]:
enroll.LOS_Prior.unique()

array(['Two to six nights', 'One month or more, but less than 90 days',
       None, 'One week or more, but less than one month',
       '90 days or more, but less than one year', 'One year or longer',
       'Client refused', "Client doesn't know", 'Data not collected',
       'One night or Less'], dtype=object)

## Program Table

#### dates 1980-2019 (Program_Start)

In [41]:
programs = pd.read_sql("SELECT * FROM Programs", conn)

In [42]:
print (programs.shape)
print(programs.columns)
programs.head()

(298, 11)
Index(['Program_Id', 'Agency_Id', 'Program_Name', 'Program_Start',
       'Program_End', 'Continuum', 'Project_Type', 'Target_Pop',
       'Housing_Type', 'Added_Date', 'Updated_Date'],
      dtype='object')


Unnamed: 0,Program_Id,Agency_Id,Program_Name,Program_Start,Program_End,Continuum,Project_Type,Target_Pop,Housing_Type,Added_Date,Updated_Date
0,5,19,TLCS Carol's Place ES (9),1990-01-01,2012-02-29,1,Emergency Shelter,Not Applicable,,2014-01-01,2014-01-01
1,6,42,VOA Aid in Kind ES,1990-01-01,2012-07-22,1,Emergency Shelter,,,2014-01-01,2014-01-01
2,7,19,TLCS Palmer Apartments -TH (48),1990-01-01,2016-02-28,1,Transitional Housing,Not Applicable,,2014-01-01,2016-10-27
3,8,19,TLCS: New Direction - SSO,1990-01-01,,1,Services Only,Not Applicable,,2014-01-01,2019-02-20
4,10,41,DHA Emergency Shelter (0),1990-01-01,2013-12-31,1,Emergency Shelter,Not Applicable,,2014-01-01,2014-01-01


In [44]:
programs.nunique(axis=0)

Program_Id       298
Agency_Id         81
Program_Name     292
Program_Start    108
Program_End       72
Continuum          2
Project_Type      13
Target_Pop         3
Housing_Type       3
Added_Date       101
Updated_Date      81
dtype: int64

In [46]:
programs.Project_Type.unique()

array(['Emergency Shelter', 'Transitional Housing', 'Services Only',
       'PH - Permanent Supportive Housing (disability required)',
       'RETIRED (HPRP)', 'Other', 'Street Outreach',
       'PH - Rapid Re-Housing', 'Homeless Prevention',
       'PH - Housing Only',
       'PH - Housing with Services (no disability required)',
       'Day Shelter', 'Coordinated Assessment'], dtype=object)

In [47]:
programs.Target_Pop.unique()

array(['Not Applicable', None, 'Persons with HIV/AIDS',
       'Domestic Violence victims'], dtype=object)

In [48]:
programs.Housing_Type.unique()

array([None, 'Site-based – single site',
       'Site-based – clustered / multiple sites',
       'Tenant-based - scattered site'], dtype=object)

In [50]:
programs.isnull().sum(axis = 0)

Program_Id         0
Agency_Id          0
Program_Name       0
Program_Start      0
Program_End      164
Continuum          0
Project_Type       0
Target_Pop         6
Housing_Type     142
Added_Date         0
Updated_Date       0
dtype: int64

## Exit Table

#### dates 1986-2019

In [51]:
exit_df = pd.read_sql("SELECT * FROM Exit_Screen", conn) 

In [52]:
print (exit_df.shape)
print(exit_df.columns)
exit_df.head()

(135217, 6)
Index(['Client_Id', 'Enrollment_Id', 'Exit_Destination', 'Exit_Reason',
       'Exit_Date', 'es_id'],
      dtype='object')


Unnamed: 0,Client_Id,Enrollment_Id,Exit_Destination,Exit_Reason,Exit_Date,es_id
0,82216,758921321,Permanent housing (other than RRH) for formerl...,Completed Program,2010-05-25,1
1,170237,2746776022,"Rental by client, no ongoing housing subsidy",Completed Program,2010-07-31,2
2,453629,2554945387,"Emergency shelter, including hotel or motel pa...",Completed Program,2010-02-26,3
3,897131,1829973754,"Rental by client, no ongoing housing subsidy",Reached maximum time allowed by program,2010-07-31,4
4,907879,2743367629,"Rental by client, no ongoing housing subsidy",Reached maximum time allowed by program,2010-04-30,5


In [53]:
exit_df.isnull().sum(axis = 0)

Client_Id               0
Enrollment_Id           0
Exit_Destination    36791
Exit_Reason         26632
Exit_Date               0
es_id                   0
dtype: int64

In [54]:
exit_df.Exit_Reason.unique()

array(['Completed Program', 'Reached maximum time allowed by program',
       'Unknown/disappeared', 'Disagreement with rules/persons',
       'Left for a housing opportunity before completing program', None,
       'Death', 'Other',
       'Criminal activity/destruction of property/violence',
       'Non-compliance with program', 'Needs could not be met by program',
       'Non-payment of rent/occupancy charge'], dtype=object)

## Combining data

In [61]:
# Can link enrollment and programs in pandas due to missing info in programs table data
enroll_pgm = pd.merge(programs, enroll, on="Program_Id")
enroll_pgm.head(2)

Unnamed: 0,Program_Id,Agency_Id,Program_Name,Program_Start,Program_End,Continuum,Project_Type,Target_Pop,Housing_Type,Added_Date_x,...,Client_Id,Enrollment_Id,Household_Id,Added_Date_y,Housing_Status,LOS_Prior,Zip,Chronic_Homeless,Prior_Residence,Last_Grade_Completed
0,5,19,TLCS Carol's Place ES (9),1990-01-01,2012-02-29,1,Emergency Shelter,Not Applicable,,2014-01-01,...,716937870,1939455,1929625,2014-01-01,At-risk of homelessness,"One month or more, but less than 90 days",,No,"Staying or living in a family member's room, a...",
1,5,19,TLCS Carol's Place ES (9),1990-01-01,2012-02-29,1,Emergency Shelter,Not Applicable,,2014-01-01,...,913804588,1949314,1939459,2014-01-01,Category 2 - At Imminent risk of losing housing,"One week or more, but less than one month",,No,Transitional housing for homeless persons,


## Visualizations

## Other tools

In [None]:
# # get rid of redundant
# df_cleaned = df_cleaned.copy().drop(['url','image_url','city_url'], axis=1)

In [None]:
# #remove columns with more than 40% null value
# NA_val = df_cleaned.isna().sum()def na_filter(na, threshold = .4): #only select variables that passees the threshold
#     col_pass = []
#     for i in na.keys():
#         if na[i]/df_cleaned.shape[0]<threshold:
#             col_pass.append(i)
#     return col_passdf_cleaned = df_cleaned[na_filter(NA_val)]
# df_cleaned.columns

In [None]:
# # remove outliers
# df_cleaned = df_cleaned[df_cleaned['price'].between(999.99, 99999.00)]
# df_cleaned = df_cleaned[df_cleaned['year'] > 1990]
# df_cleaned = df_cleaned[df_cleaned['odometer'] < 899999.00]df_cleaned.describe().apply(lambda s: s.apply(lambda x: format(x, 'f')))

In [None]:
# #remove rows with null values
# df_cleaned = df_cleaned.dropna(axis=0)
# df_cleaned.shape

In [None]:
# calculate correlation matrix
corr = df_cleaned.corr()# plot the heatmap
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))