# Imports

In [24]:
# packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [25]:
# data
# Load the processed DMV dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "../../data/processed/DMV_dataset.csv",
    index_col=0,
)

# Descriptive Statistics

1. Data Types
2. Distributions
3. Outliers


## 1. Data Types

In [26]:
# List the dtype pandas assigned to each column when parsing the CSV.
df.dtypes

GEOID                                     int64
year                                      int64
units                                     int64
units_sf                                  int64
units_2_4                                 int64
units_mf                                  int64
name                                     object
state                                    object
exp_homelessness                        float64
bike_2021                               float64
bus_2021                                float64
ferry_2021                              float64
train_2021                              float64
air_2021                                float64
inequality_index                        float64
med_home_val                            float64
agg_interest_div_rental                 float64
med_home_income                         float64
civ_unemp                               float64
civ_emp                                 float64
poverty_rate                            

## 2. Distributions

In [27]:
# Housing & Homelessness
# Summary statistics for the housing-unit counts and the homelessness measure.
# Selecting columns by name already leaves out the identifier columns
# (GEOID, year, name, state), so the frame is indexed directly.
df[
    [
        "units",
        "units_sf",
        "units_2_4",
        "units_mf",
        "exp_homelessness",
    ]
].describe()

Unnamed: 0,units,units_sf,units_2_4,units_mf,exp_homelessness
count,4763.0,4763.0,4763.0,4763.0,1410.0
mean,412.470082,310.118833,313.143187,95.849255,147.02766
std,814.901639,594.264385,597.253777,335.876578,662.566254
min,0.0,0.0,0.0,0.0,0.858421
25%,45.0,40.0,41.0,0.0,7.953127
50%,116.0,99.0,100.0,0.0,16.69054
75%,348.5,274.0,277.0,30.0,64.987965
max,7898.0,6047.0,6047.0,5683.0,8350.0


Note: Multifamily homes are very uncommon, and there are many outliers for each feature.

In [28]:
# Transit
# Summary statistics for the 2021 transit-infrastructure columns.
# Selecting columns by name already leaves out the identifier columns
# (GEOID, year, name, state), so the frame is indexed directly.
df[
    [
        "bike_2021",
        "bus_2021",
        "ferry_2021",
        "train_2021",
        "air_2021",
    ]
].describe()

Unnamed: 0,bike_2021,bus_2021,ferry_2021,train_2021,air_2021
count,1590.0,1590.0,1590.0,1590.0,1590.0
mean,10.716981,11.358491,0.150943,3.679245,0.226415
std,41.171353,35.943681,0.451397,7.77461,0.418642
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,1.0,0.0
75%,0.0,5.0,0.0,2.0,0.0
max,278.0,242.0,2.0,40.0,1.0


Note: The median county has one bus stop and one train stop (the means are much higher because of a few large values). Airports and bike racks are highly concentrated, and ferries are practically non-existent.

In [33]:
# Economic
# Summary statistics for the inequality, housing-cost, income, employment
# and poverty columns.
# Selecting columns by name already leaves out the identifier columns
# (GEOID, year, name, state), so the frame is indexed directly.
df[
    [
        "inequality_index",
        "med_home_val",
        "med_monthly_home_cost",
        "agg_interest_div_rental",
        "med_home_income",
        "civ_unemp",
        "civ_emp",
        "poverty_rate",
    ]
].describe()


Unnamed: 0,inequality_index,med_home_val,med_monthly_home_cost,agg_interest_div_rental,med_home_income,civ_unemp,civ_emp,poverty_rate
count,1583.0,1583.0,1109.0,1583.0,1583.0,1424.0,1424.0,1583.0
mean,0.433917,215468.288061,1067.639315,146724800.0,57025.394188,3252.007725,46218.089888,0.137869
std,0.040926,109233.389148,512.323758,385245100.0,20987.984934,6835.194973,93010.323868,0.06661
min,0.3195,62000.0,298.0,1169300.0,22333.0,5.0,874.0,0.026865
25%,0.40305,137150.0,671.0,14734650.0,41231.0,480.75,6781.0,0.084353
50%,0.43,195400.0,940.0,35386400.0,51092.0,1001.0,13241.5,0.12582
75%,0.459,262650.0,1359.0,103634200.0,68139.0,2364.0,37524.0,0.184739
max,0.6002,789300.0,2815.0,3441312000.0,142299.0,49845.0,613908.0,0.396077


Note: The inequality index is tightly concentrated and roughly symmetric (mean 0.434, median 0.430). Every other metric has serious outliers, and aggregate interest, dividend, and rental income is difficult to interpret.

In [30]:
# Census - Internet, healthcare, transit, public assistance
# Summary statistics for the census-derived access and assistance columns.
# Selecting columns by name already leaves out the identifier columns
# (GEOID, year, name, state), so the frame is indexed directly.
df[
    [
        "internet_access",
        "healthcare_coverage",
        "pub_transit",
        "work_from_home",
        "percent_with_cash_pub_assist_or_snap",
        "medicare_coverage",
    ]
].describe()

Unnamed: 0,internet_access,healthcare_coverage,pub_transit,work_from_home,percent_with_cash_pub_assist_or_snap,medicare_coverage
count,474.0,1265.0,1583.0,1425.0,1583.0,1265.0
mean,0.790497,1.780609,3516.885028,2045.779649,0.122313,0.396058
std,0.09127,0.076689,14550.436755,4488.036643,0.063513,0.117748
min,0.565991,1.481949,0.0,0.0,0.017519,0.129046
25%,0.721658,1.730608,26.0,254.0,0.073851,0.30669
50%,0.782068,1.782425,77.0,520.0,0.111454,0.401061
75%,0.866933,1.837611,566.5,1529.0,0.160171,0.477035
max,0.973068,1.948806,129713.0,39253.0,0.378099,0.876588


In [40]:
# Show the healthcare and Medicare coverage values for every row where
# healthcare coverage is present, keyed by GEOID.
df.loc[
    df["healthcare_coverage"].notna(),
    ["GEOID", "healthcare_coverage", "medicare_coverage"],
]

Unnamed: 0,GEOID,healthcare_coverage,medicare_coverage
23,11001,1.864168,0.242972
24,11001,1.866694,0.244907
25,11001,1.871587,0.243507
26,11001,1.883725,0.243898
27,11001,1.896897,0.248482
...,...,...,...
4759,51840,1.653089,0.308630
4760,51840,1.669176,0.323002
4761,51840,1.691907,0.320481
4762,51840,1.689479,0.328945


Note: There seems to be something off with the healthcare coverage data: these should be proportions, yet every value lies between roughly 1.48 and 1.95. Otherwise, we see the typical set of county outliers here as with other features. These are likely due to not weighting by population yet. That said, some of the percentage statistics already have outliers.

In [31]:
# Hospitals
# Summary statistics for the hospital rating and hospital-count columns.
# Selecting columns by name already leaves out the identifier columns
# (GEOID, year, name, state), so the frame is indexed directly.
df[
    [
        "rating",
        "ems_hospitals",
        "acute_care_hospitals",
        "critical_access_hospitals",
        "childrens_hospitals",
        "public_hospitals",
        "private_hospitals",
        "non_profit_hospitals",
        "tribal_hospitals",
    ]
].describe()

Unnamed: 0,rating,ems_hospitals,acute_care_hospitals,critical_access_hospitals,childrens_hospitals,public_hospitals,private_hospitals,non_profit_hospitals,tribal_hospitals
count,1830.0,2580.0,2580.0,2580.0,2580.0,2580.0,2580.0,2580.0,2580.0
mean,2.972678,1.453488,1.395349,0.093023,0.05814,0.046512,0.186047,0.046512,0.0
std,0.690467,1.444249,1.527408,0.290521,0.279366,0.210631,0.38922,0.210631,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4.0,12.0,12.0,1.0,2.0,1.0,1.0,1.0,0.0


Note: Most hospital types are uncommon; acute care and EMS hospitals are the exception, each with a median of one. Tribal hospitals are non-existent in these data.

In [34]:
# Demographic
# Summary statistics for total population and the population-by-group columns.
# Selecting columns by name already leaves out the identifier columns
# (GEOID, year, name, state), so the frame is indexed directly.
df[
    [
        "pop",
        "white_pop",
        "black_pop",
        "nativeAm_pop",
    ]
].describe()

Unnamed: 0,pop,white_pop,black_pop,nativeAm_pop
count,1583.0,1583.0,1583.0,1583.0
mean,92928.86,58727.639924,22962.397979,266.584334
std,177158.4,98738.583448,65194.539191,559.458253
min,2204.0,1384.0,0.0,0.0
25%,15684.0,11520.5,1493.5,23.0
50%,30184.0,22760.0,4411.0,71.0
75%,78386.5,62631.5,10951.0,209.0
max,1145862.0,713926.0,572465.0,5123.0


Note: Some population variables are missing. Otherwise, note that there are extreme outliers in population, likely due to cities within counties or the independent cities of Virginia.