In [13]:
import os
# Expand "~" to the current user's home directory and show the Desktop path.
desktop_dir = os.path.expanduser("~/Desktop")
print(desktop_dir)

/Users/robinakashita/Desktop


In [17]:
# Data folder on the current user's Desktop. Expanding "~" instead of spelling
# out one user's home directory keeps this cell runnable on other machines.
hackathon_path = os.path.expanduser("~/Desktop/Hackathon")

# List the folder's contents to see which GeoPackage files are available.
print(os.listdir(hackathon_path))

['dc_demographics.gpkg', 'nyc_demographics.gpkg', 'washington_demographics.gpkg', 'demographics.gpkg']


In [4]:
import os

import geopandas as gpd
import pandas as pd

# Folder holding the per-region GeoPackage files, resolved from the current
# user's home directory so the notebook is not tied to one machine's
# absolute path.
DATA_DIR = os.path.expanduser("~/Desktop/Hackathon")

# NOTE(review): the directory listing printed earlier in this notebook shows
# "demographics.gpkg" but no "ma_demographics.gpkg" -- confirm the MA file
# name before a fresh Restart & Run All.
PATH_MA = os.path.join(DATA_DIR, "ma_demographics.gpkg")
PATH_WA = os.path.join(DATA_DIR, "washington_demographics.gpkg")
PATH_NYC = os.path.join(DATA_DIR, "nyc_demographics.gpkg")
PATH_DC = os.path.join(DATA_DIR, "dc_demographics.gpkg")

# Layer read from every GeoPackage.
LAYER_NAME = "blockgroup"

# The two columns compared across regions.
# NOTE(review): presumably both are 0-1 shares per block group -- verify
# against the data dictionary.
POV_COL = "income_100PercentPoverty_ratio"
COMMUTE_COL = "commute_time60min_ratio"

def load_region(path, region_name):
    """Load the two study metrics for one region.

    Reads the ``LAYER_NAME`` layer of the GeoPackage at ``path``, keeps only
    the ``POV_COL`` and ``COMMUTE_COL`` columns, and tags every row with
    ``region_name``.

    Parameters
    ----------
    path : path to the GeoPackage file, passed straight to ``gpd.read_file``.
    region_name : label stored in the new ``"region"`` column.

    Returns
    -------
    A new frame with columns ``POV_COL``, ``COMMUTE_COL`` and ``"region"``.
    """
    metric_cols = [POV_COL, COMMUTE_COL]
    layer_gdf = gpd.read_file(path, layer=LAYER_NAME)
    # .assign returns a new object, so the frame read from disk is not mutated.
    return layer_gdf[metric_cols].assign(region=region_name)

# Read the four regional extracts, in the order MA, WA, NYC, DC.
df_ma, df_wa, df_nyc, df_dc = (
    load_region(gpkg_path, label)
    for gpkg_path, label in (
        (PATH_MA, "MA"),
        (PATH_WA, "WA"),
        (PATH_NYC, "NYC"),
        (PATH_DC, "DC"),
    )
)

# Stack the regions into one frame (row labels renumbered by the concat), then
# discard only the rows in which BOTH metrics are missing (how="all"); rows
# missing just one of the two metrics are kept.
metric_cols = [POV_COL, COMMUTE_COL]
all_df = pd.concat([df_ma, df_wa, df_nyc, df_dc], ignore_index=True).dropna(
    subset=metric_cols, how="all"
)

# Preview the combined frame and the number of rows per region.
print(all_df.head())
print(all_df["region"].value_counts())


   income_100PercentPoverty_ratio  commute_time60min_ratio region
0                        0.052023                 0.166220     MA
1                        0.028986                 0.000000     MA
2                        0.091525                 0.108808     MA
3                        0.067210                 0.157576     MA
4                        0.269305                 0.015564     MA
region
NYC    15801
WA      5292
MA      5072
DC       571
Name: count, dtype: int64


In [27]:
# Summary statistics by region

def summarize_by_region(df, col):
    """Print and return per-region descriptive statistics for one column.

    Parameters
    ----------
    df : frame containing a ``"region"`` column and the column ``col``.
    col : name of the column to summarize.

    Returns
    -------
    The result of ``describe()`` on ``col`` grouped by ``"region"``: one row
    per region with count, mean, std, min, 25%, 50%, 75% and max.
    """
    print(f"\nSummary for {col} by region")
    per_region = df.groupby("region")[col]
    stats = per_region.describe()
    print(stats)
    return stats

# Per-region descriptive statistics for each metric, poverty first and then
# commute, so the printed order is unchanged.
stats_poverty, stats_commute = (
    summarize_by_region(all_df, metric) for metric in (POV_COL, COMMUTE_COL)
)


Summary for income_100PercentPoverty_ratio by region
          count      mean       std       min       25%       50%       75%  \
region                                                                        
DC        571.0  0.357172  0.289464  0.003914  0.119122  0.273810  0.516582   
MA       5060.0  0.107017  0.120983  0.000000  0.024865  0.063291  0.145301   
NYC     15801.0  0.339924  0.312940  0.002710  0.119658  0.263095  0.490956   
WA       5286.0  0.102109  0.103830  0.000000  0.030303  0.070475  0.140115   

              max  
region             
DC       1.823529  
MA       0.861635  
NYC     13.823529  
WA       0.866337  

Summary for commute_time60min_ratio by region
          count      mean       std       min       25%       50%       75%  \
region                                                                        
DC        571.0  0.380572  0.364782  0.053435  0.188242  0.269737  0.424476   
MA       5068.0  0.116114  0.100938  0.000000  0.040158  0.097016  

In [5]:
# Pearson correlation between POV_COL and COMMUTE_COL, one value per region

print("\nCorrelation (poverty vs commute60) by region")

# A correlation needs both metrics, so keep only rows where neither is missing
# (how="any" is dropna's default, spelled out here for contrast with the
# how="all" filter applied when all_df was built).
valid = all_df.dropna(subset=[POV_COL, COMMUTE_COL], how="any")

# One {"region", "corr"} record per region, in groupby (sorted) order.
corr_results = []
for region, sub in valid.groupby("region"):
    r = sub[POV_COL].corr(sub[COMMUTE_COL], method="pearson")
    corr_results.append(dict(region=region, corr=r))
    print(f"{region}: r = {r:.3f}")


Correlation (poverty vs commute60) by region
DC: r = 0.428
MA: r = -0.108
NYC: r = 0.251
WA: r = -0.060
