# Creating the data schema

In [1]:
# Raw CSV exports whose column names are listed by the schema cell below.
files = [
    "BedInventory.csv",
    "Benefit.csv",
    "Client.csv",
    "Disability.csv",
    "HealthInsurance.csv",
    "Income Entry & Exit.csv",
    "Project.csv",
    "Service.csv",
    "Enrollment.csv",
]

In [2]:
import pandas as pd

In [3]:
# Write a plain-text summary to data/schema.txt: each file name followed by a
# tab-indented, dash-prefixed list of its column names and a blank line.
# NOTE(review): this cell reads from "data/rawraw/" while every other cell in
# this notebook reads from "data/raw/" — confirm the directory is intentional.
with open("data/schema.txt", "w") as schema:
    for filename in files:
        # nrows=0 parses only the header row, so large exports are not loaded
        # in full just to list their column names.
        header = pd.read_csv("data/rawraw/" + filename, nrows=0)
        schema.write(filename + "\n")
        schema.write("\n".join("\t- " + column for column in header.columns))
        schema.write("\n\n")

# Client data

In [4]:
# Read the client table from the raw data directory into a DataFrame.
client = pd.read_csv("data/raw/Client.csv")

In [5]:
# Preview the first rows of the client table (head() defaults to five rows).
client.head()

Unnamed: 0,Personal ID,Name Data Quality,SSN Data Quality,Race,Ethnicity,Gender,Veteran Status,Theater of Operations,Year Entered Military Service,Year Separated from Military Service,Military Branch,Discharge Status
0,173781,,Full SSN Reported (HUD),White (HUD),Hispanic/Latino (HUD),Female,No (HUD),,,,,
1,173782,,Full SSN Reported (HUD),White (HUD),Hispanic/Latino (HUD),Male,No (HUD),,,,,
2,173783,,,White (HUD),Hispanic/Latino (HUD),Female,No (HUD),,,,,
3,173800,,Full SSN Reported (HUD),Black or African American (HUD),Non-Hispanic/Non-Latino (HUD),Female,No (HUD),,,,,
4,173803,,Full SSN Reported (HUD),White (HUD),Hispanic/Latino (HUD),Female,No (HUD),,,,,


In [6]:
# Count how often each military branch value occurs. Missing values are left
# out here because value_counts() drops NaN unless dropna=False is passed.
client["Military Branch"].value_counts()

Army (HUD)              473
Navy (HUD)              245
Air Force (HUD)         131
Marines (HUD)           115
Coast Guard (HUD)        18
Other (HUD)               8
National Guard            3
Client refused (HUD)      2
Name: Military Branch, dtype: int64

# Bed Inventory

In [7]:
# Read the bed inventory table from the raw data directory into a DataFrame.
bed_inventory = pd.read_csv("data/raw/BedInventory.csv")

In [8]:
# Preview the first rows of the bed inventory table (head() defaults to five rows).
bed_inventory.head()

Unnamed: 0,Project ID,Inventory ID,Inventory Household Type,HMIS Participating Beds,Inventory Start Date,Inventory End Date,Unit Inventory,Bed Inventory,Vet Bed Inventory,Youth Bed Inventory,Youth Bed Age Group
0,2088,144,Households with at least one adult and one child,36.0,1/1/2001,9/30/2013,9.0,36.0,,,
1,2088,655,Households with at least one adult and one child,36.0,10/1/2013,,9.0,36.0,,,
2,2095,194,Households without children,0.0,1/1/2004,9/30/2013,2.0,2.0,0.0,2.0,Only Ages 18-24
3,2095,678,Households without children,,10/1/2013,,4.0,4.0,,4.0,Only Under Age 18
4,2101,145,Households with at least one adult and one child,24.0,1/1/1992,9/30/2013,5.0,24.0,,,


In [9]:
# Frequency of each "Unit Inventory" value; dropna=False keeps missing values
# (NaN) as their own row so the amount of missing data is visible.
bed_inventory["Unit Inventory"].value_counts(dropna=False)

NaN       38
 0.0      25
 2.0      19
 6.0      18
 1.0      12
 4.0      10
 3.0       8
 12.0      7
 5.0       7
 28.0      7
 100.0     4
 14.0      4
 46.0      4
 40.0      4
 9.0       3
 16.0      3
 25.0      3
 8.0       3
 24.0      2
 30.0      2
 15.0      2
 21.0      2
 10.0      2
 23.0      2
 17.0      2
 188.0     1
 34.0      1
 26.0      1
 31.0      1
 11.0      1
 20.0      1
 13.0      1
 18.0      1
 104.0     1
 32.0      1
 54.0      1
 36.0      1
 7.0       1
 22.0      1
 35.0      1
 68.0      1
Name: Unit Inventory, dtype: int64

In [10]:
# Frequency of each "Vet Bed Inventory" value, with NaN counted via dropna=False.
bed_inventory["Vet Bed Inventory"].value_counts(dropna=False)

NaN       174
 1.0        6
 0.0        5
 5.0        3
 3.0        2
 46.0       2
 12.0       2
 6.0        2
 2.0        2
 16.0       2
 4.0        2
 188.0      1
 10.0       1
 11.0       1
 86.0       1
 18.0       1
 21.0       1
 51.0       1
Name: Vet Bed Inventory, dtype: int64

In [11]:
# Frequency of each "Youth Bed Inventory" value, with NaN counted via dropna=False.
bed_inventory["Youth Bed Inventory"].value_counts(dropna=False)

NaN      186
 0.0      19
 17.0      1
 6.0       1
 4.0       1
 2.0       1
Name: Youth Bed Inventory, dtype: int64

In [12]:
# Frequency of each household type; dropna=False would also surface any
# missing values as a NaN row.
bed_inventory["Inventory Household Type"].value_counts(dropna=False)

Households without children                         129
Households with at least one adult and one child     80
Name: Inventory Household Type, dtype: int64

# Income

In [13]:
# Read the income entry/exit table from the raw data directory into a DataFrame.
income = pd.read_csv("data/raw/Income Entry & Exit.csv")

In [14]:
# Preview only the first ten rows instead of rendering the whole table, in
# line with the head() previews used for the other tables in this notebook.
income.head(10)

Unnamed: 0,Personal ID,Project Entry ID,Entry Alimony,Entry Child Support,Entry Earned,Entry GA,Entry Other,Entry Pension,Entry Private Disability,Entry Social Security Retirement,...,Exit Social Security Retirement,Exit SSDI,Exit SSI,Exit TANF,Exit Unemployment,Exit VA Non-Service,Exit VA Service Connected,Exit Worker's Compensation,Exit Total Income,Income Change
0,173781,252608,,,,,,,,,...,,,,$607,,,,,$607,$0
1,173781,314080,,,,,,,,,...,,,,"$1,143",,,,,"$1,143",$0
2,173782,122915,,,,,,,,,...,,,,,,,,,,
3,173782,314084,,,,,,,,,...,,,,,,,,,,
4,173783,122916,,,,,,,,,...,,,,,,,,,,
5,173783,314085,,,,,,,,,...,,,,,,,,,,
6,173800,122917,,,,,,,,,...,,,,,,,,,,
7,173803,201286,,,,,,,,,...,,,,$571,$752,,,,"$1,323",$0
8,173804,201288,,,,,,,,,...,,,,,,,,,,
9,173805,201290,,,,,,,,,...,,,,,,,,,,


# Enrollment

In [15]:
import pandas as pd

In [16]:
# Read the enrollment table from the raw data directory into a DataFrame.
enrollment = pd.read_csv("data/raw/Enrollment.csv")

In [17]:
# Frequency of each client age at entry, with NaN counted via dropna=False.
# NOTE(review): the saved output includes a negative age (-54.0) and several
# records at 101.0 — these look like data-entry errors; confirm before analysis.
enrollment["Client Age at Entry"].value_counts(dropna=False)

 57.0     2142
 55.0     2090
 56.0     1975
 50.0     1948
 52.0     1865
 49.0     1837
 46.0     1630
 45.0     1619
 53.0     1572
 51.0     1479
 54.0     1473
 47.0     1467
 48.0     1429
 42.0     1413
 44.0     1396
 58.0     1337
 59.0     1268
 62.0     1165
 35.0     1153
 60.0     1030
 61.0     1015
 27.0     1008
 43.0      971
 28.0      926
 23.0      891
 36.0      869
 29.0      852
 34.0      852
 38.0      805
 65.0      766
          ... 
 10.0      216
 72.0      215
 69.0      212
 15.0      201
 13.0      190
 12.0      185
 14.0      179
 70.0      163
 74.0       94
 76.0       88
 81.0       79
 73.0       68
 75.0       56
 82.0       53
 78.0       32
 77.0       18
 80.0       12
 79.0        7
 83.0        6
 101.0       6
-54.0        3
 87.0        2
 88.0        2
 89.0        1
 90.0        1
 93.0        1
 84.0        1
 85.0        1
 97.0        1
 86.0        1
Name: Client Age at Entry, dtype: int64

In [18]:
# Preview the first rows of the enrollment table (head() defaults to five rows).
enrollment.head()

Unnamed: 0,Personal ID,Project Entry ID,Client Age at Entry,Last Permanent Zip,Project Entry Date Updated,Destination,Entry Date,Exit Date,Project ID,Housing Status @ Project Start,...,Continuously Homeless One Year,"If Yes for ""Client entering from streets, ES or SH"" Approximate date started:",Times Homeless Past Three Years,Months Homeless This Time,Chronic Homeless,In Permanent Housing,Residential Move In Date,Domestic Violence Victim,DV When Occurred,DV Currently Fleeing
0,173781,252608,34.0,93907.0,1/23/2015,Other (HUD),2/28/2014,2/28/2014,2104,At-risk of homelessness (HUD),...,,,,,No,,,Yes (HUD),More than a year ago (HUD),
1,173781,314080,35.0,93907.0,1/12/2015,"Emergency shelter, including hotel or motel pa...",5/6/2014,9/30/2014,2101,Category 1 - Homeless (HUD),...,No (HUD),,2.0,,No,,,Yes (HUD),From six to twelve months ago (HUD),
2,173782,122915,1.0,,12/30/2004,,11/29/2004,,2114,,...,,,,,No,,,,,
3,173782,314084,10.0,93905.0,1/7/2015,"Emergency shelter, including hotel or motel pa...",5/6/2014,9/30/2014,2101,Category 1 - Homeless (HUD),...,Yes (HUD),4/6/2014,2.0,,No,,,Yes (HUD),From six to twelve months ago (HUD),
4,173783,122916,2.0,,12/30/2004,,11/29/2004,,2114,,...,,,,,No,,,,,


In [19]:
# List every column name in the enrollment table, one per line, because the
# head() preview above elides the middle columns.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2 (the bare `print name` statement is a SyntaxError on Python 3).
for name in enrollment.columns:
    print(name)

Personal ID
Project Entry ID
Client Age at Entry
Last Permanent Zip
Project Entry Date Updated
Destination
Entry Date
Exit Date
Project ID
Housing Status @ Project Start
Living situation before program entry?
Client Location
Household ID
Relationship to HoH
Disabling Condition
Continuously Homeless One Year
If Yes for "Client entering from streets, ES or SH" Approximate date started:
Times Homeless Past Three Years
Months Homeless This Time
Chronic Homeless
In Permanent Housing
Residential Move In Date
Domestic Violence Victim
DV When Occurred
DV Currently Fleeing
