In [80]:
!pip install geopandas



In [81]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

In [82]:
# Load the raw GHS urban-center table into rawdf.
# NOTE(review): "unicode_escape" also interprets backslash sequences found in the data —
# confirm this is the intended encoding for this file.
rawdf = pd.read_csv(
    filepath_or_buffer="data/raw_data/GreenspaceDownload/GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_2.csv",
    low_memory=False,
    encoding="unicode_escape",
)

In [83]:
# Size of the raw table as (rows, columns).
rawdf.shape

(36902, 160)

In [84]:
# Summary statistics of the raw table.
rawdf.describe()

Unnamed: 0,ID_HDC_G0,QA2_1V,AREA,BBX_LATMN,BBX_LONMN,BBX_LATMX,BBX_LONMX,GCPNT_LAT,GCPNT_LON,XBRDR,...,EX_SS_AREA,EX_SS_B75,EX_SS_B90,EX_SS_B00,EX_SS_B15,EX_SS_P75,EX_SS_P90,EX_SS_P00,EX_SS_P15,SDG_A2G14
count,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,...,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0,13135.0
mean,6568.0,1.176323,50.26083,21.100424,51.795521,21.165498,51.882079,21.132864,51.838877,0.012181,...,5.875828,2.02606,2.28909,2.585256,2.735535,14281.22,19205.04,22274.1,26036.8,0.29079
std,3791.892228,0.429571,189.169066,17.966437,58.682289,17.974479,58.675668,17.97038,58.679035,0.109698,...,61.114603,31.431693,33.614654,36.48836,37.881205,181029.9,233726.9,267897.2,314979.1,0.291329
min,1.0,0.0,1.0,-54.82509,-158.043016,-54.785331,-157.730529,-54.803854,-157.893497,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.012064
25%,3284.5,1.0,8.0,10.760656,19.585115,10.82173,19.615632,10.79264,19.600961,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054903
50%,6568.0,1.0,18.0,24.368125,73.429889,24.418414,73.499483,24.390754,73.468653,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182939
75%,9851.5,1.0,37.0,32.654868,90.288306,32.732591,90.383252,32.702407,90.340966,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.461405
max,13135.0,2.0,6622.0,69.303014,178.382096,69.362302,178.526063,69.333682,178.461255,1.0,...,3487.0,1987.376221,2141.008545,2326.824463,2419.250488,10812890.0,12278890.0,12717570.0,15049410.0,1.003432


In [85]:
# Column dtypes; columns reported as object were not parsed as numbers.
rawdf.dtypes

ID_HDC_G0      float64
QA2_1V         float64
AREA           float64
BBX_LATMN      float64
BBX_LONMN      float64
                ...   
EX_EQ19_Q       object
EX_HW_IDX       object
SDG_LUE9015     object
SDG_A2G14      float64
SDG_OS15MX      object
Length: 160, dtype: object

In [112]:
# Raw GHS column name -> readable label. The keys double as the list of columns to keep,
# so the selection and the renaming cannot drift apart.
column_labels = {
    # general
    "AREA": "Urban Center Area",
    "GCPNT_LAT": "Latitude",
    "GCPNT_LON": "Longitude",
    "CTR_MN_NM": "Country",
    "UC_NM_MN": "Urban Center",
    "UC_NM_LST": "Cities in Urban Center",
    # geography
    "E_BM_NM_LST": "Biome",
    "E_SL_LST": "Soil Group",
    "EL_AV_ALS": "Avg Elevation",
    "E_WR_P_14": "Avg Precipitation",
    "E_WR_T_14": "Avg Temp",
    "E_GR_AV14": "Avg Greenness",
    "E_GR_AT14": "Total Green Area",
    # socio
    "P15": "Population",
    "B15": "Total Built-up Area",
    "BUCAP15": "Built-up Area per capita",
    "NTL_AV": "Avg Nighttime Light Emission",
    "GDP15_SM": "Sum of GDP",
    # emissions: TCNSCE = total co2 non short cycle emissions
    "E_EC2E_R15": "TCNSCE Residential",
    "E_EC2E_I15": "TCNSCE Industry",
    "E_EC2E_T15": "TCNSCE Transport",
    "E_EC2E_A15": "TCNSCE Agriculture",
    # emissions: TCSCOE = total co2 short cycle organic emissions
    "E_EC2O_R15": "TCSCOE Residential",
    "E_EC2O_I15": "TCSCOE Industry",
    "E_EC2O_A15": "TCSCOE Agriculture",
    # emissions: particulate matter
    "E_EPM2_R15": "Particulate Matter Emissions Residential",
    "E_EPM2_I15": "Particulate Matter Emissions Industry",
    "E_EPM2_T15": "Particulate Matter Emissions Transport",
    "E_EPM2_A15": "Particulate Matter Emissions Agriculture",
    # The spelling "Concertation" is kept on purpose: it is the column name written to the output.
    "E_CPM2_T14": "Total Concertation of Particulate Matter",
    # landuse
    "SDG_A2G14": "% of Pop in High Green Area",
    "SDG_OS15MX": "% of Open Spaces",
    "SDG_LUE9015": "Land Use Efficiency",
    "EX_HW_IDX": "Max Magnitude of Heatwaves",
}
cols_to_keep = list(column_labels)

# Keep US urban centers only. Income and development data are not included because they are
# country-level and every other country is filtered out here.
is_us = rawdf["CTR_MN_NM"] == "United States"

# Build df as a fresh frame (no inplace edits on a slice of rawdf, which is what triggers
# SettingWithCopyWarning). Original row labels are preserved; later cells address rows by label.
df = (
    rawdf.loc[is_us, cols_to_keep]
    # Treat the "?"-style placeholders and the literal "NAN" as missing values.
    .replace(["?", "??", "???", "NAN"], np.nan)
    .rename(columns=column_labels)
)
df.head()

Unnamed: 0,Urban Center Area,Latitude,Longitude,Country,Urban Center,Cities in Urban Center,Biome,Soil Group,Avg Elevation,Avg Precipitation,...,TCSCOE Agriculture,Particulate Matter Emissions Residential,Particulate Matter Emissions Industry,Particulate Matter Emissions Transport,Particulate Matter Emissions Agriculture,Total Concertation of Particulate Matter,% of Pop in High Green Area,% of Open Spaces,Land Use Efficiency,Max Magnitude of Heatwaves
0,185.0,21.340678,-157.893497,United States,Honolulu,Honolulu; Waipahu; Pearl City; Aiea,Tropical and Subtropical Dry Broadleaf Forests,Vertisols,52.29643514,741.6250153,...,3.078803527,40.14455643,174.221274,23.53676553,0.047533547,5.765125,0.226415,56.41,0.074385203,
2,55.0,34.923123,-120.434372,United States,Santa Maria,Santa Maria,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,87.12809223,245.1750011,...,8.266219641,8.503971075,35.140393,3.014212862,0.047214542,11.6309,0.040129,23.64,0.481144026,2.791739941
3,48.0,36.60772,-121.882378,United States,Monterey,Monterey,"Mediterranean Forests, Woodlands, and Scrub",Phaeozems,38.54796203,324.9250069,...,1.07850128,7.497389813,11.707452,1.972619542,0.008828798,10.93275,0.138683,42.17,0.444839872,
4,60.0,34.427664,-119.743693,United States,Santa Barbara,Santa Barbara,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,38.29809451,183.9250069,...,1.329586573,14.04887269,30.528288,3.263458168,0.013192485,13.8037,0.061348,36.5,0.55676245,4.255020142
5,57.0,36.971274,-121.978684,United States,Santa Cruz,Santa Cruz,"Mediterranean Forests, Woodlands, and Scrub",Phaeozems,24.08581144,324.9250069,...,0.071545386,11.32560598,24.377837,3.99150779,0.001443604,11.86825,0.109287,39.46,0.317594332,1.51970005


In [87]:
# Find urban-center names that still contain a literal "?" (e.g. "O?Fallon").
# na=False counts a missing name as "no match"; without it a NaN name puts NaN into the
# mask and the boolean filter raises.
checker = df[df["Urban Center"].str.contains("?", regex=False, na=False)]
checker

Unnamed: 0,Urban Center Area,Latitude,Longitude,Country,Urban Center,Cities in Urban Center,Biome,Soil Group,Avg Elevation,Avg Precipitation,...,TCSCOE Agriculture,Particulate Matter Emissions Residential,Particulate Matter Emissions Industry,Particulate Matter Emissions Transport,Particulate Matter Emissions Agriculture,Total Concertation of Particulate Matter,% of Pop in High Green Area,% of Open Spaces,Land Use Efficiency,Max Magnitude of Heatwaves
482,168.0,38.777312,-90.611861,United States,O?Fallon,O?Fallon; Saint Charles; Saint Peters; Cottlev...,"Temperate Grasslands, Savannas, and Shrublands",Phaeozems,163.0952796,1057.825027,...,1832.278462,114.1057715,281.217959,45.87761046,7.098453931,11.3387,0.790093,75.6,1.532844781,17.70470047


In [88]:
# Take the city list stored in row 482 and put an apostrophe wherever a "?" appears.
a1 = df.at[482, "Cities in Urban Center"]
a1replace = "'".join(a1.split("?"))

In [89]:
# Manual name fixes, addressed by row label.
# Row 482: restore the apostrophe in the urban-center name and in its city list.
df.at[482, "Urban Center"] = "O'Fallon"
df.at[482, "Cities in Urban Center"] = a1replace
# NOTE(review): row 553 is renamed to "Minneapolis"; its previous value is not shown in
# this notebook — confirm the reason for this override.
df.at[553, "Urban Center"] = "Minneapolis"

In [90]:
# Load the 500 Cities city-level table and list its columns.
mhdf = pd.read_csv(
    "data/raw_data/500_Cities__City-level_Data__GIS_Friendly_Format___2017_release_20240514.csv"
)
mhdf.columns

Index(['StateAbbr', 'PlaceName', 'PlaceFIPS', 'Population2010',
       'ACCESS2_CrudePrev', 'ACCESS2_Crude95CI', 'ACCESS2_AdjPrev',
       'ACCESS2_Adj95CI', 'ARTHRITIS_CrudePrev', 'ARTHRITIS_Crude95CI',
       ...
       'SLEEP_Adj95CI', 'STROKE_CrudePrev', 'STROKE_Crude95CI',
       'STROKE_AdjPrev', 'STROKE_Adj95CI', 'TEETHLOST_CrudePrev',
       'TEETHLOST_Crude95CI', 'TEETHLOST_AdjPrev', 'TEETHLOST_Adj95CI',
       'Geolocation'],
      dtype='object', length=117)

In [91]:
# Distinct place names from the 500 Cities table, as a plain Python list.
mh_cities = mhdf["PlaceName"].unique().tolist()
mh_cities

['Birmingham',
 'Hoover',
 'Huntsville',
 'Mobile',
 'Montgomery',
 'Tuscaloosa',
 'Anchorage',
 'Avondale',
 'Chandler',
 'Gilbert',
 'Glendale',
 'Mesa',
 'Peoria',
 'Phoenix',
 'Scottsdale',
 'Surprise',
 'Tempe',
 'Tucson',
 'Yuma',
 'Fayetteville',
 'Fort Smith',
 'Jonesboro',
 'Little Rock',
 'Springdale',
 'Alameda',
 'Alhambra',
 'Anaheim',
 'Antioch',
 'Apple Valley',
 'Bakersfield',
 'Baldwin Park',
 'Bellflower',
 'Berkeley',
 'Buena Park',
 'Burbank',
 'Carlsbad',
 'Carson',
 'Chico',
 'Chino',
 'Chino Hills',
 'Chula Vista',
 'Citrus Heights',
 'Clovis',
 'Compton',
 'Concord',
 'Corona',
 'Costa Mesa',
 'Daly City',
 'Downey',
 'El Cajon',
 'Elk Grove',
 'El Monte',
 'Escondido',
 'Fairfield',
 'Folsom',
 'Fontana',
 'Fremont',
 'Fresno',
 'Fullerton',
 'Garden Grove',
 'Hawthorne',
 'Hayward',
 'Hemet',
 'Hesperia',
 'Huntington Beach',
 'Indio',
 'Inglewood',
 'Irvine',
 'Lake Forest',
 'Lakewood',
 'Lancaster',
 'Livermore',
 'Long Beach',
 'Los Angeles',
 'Lynwood',
 

In [92]:
# Keep the original ";"-separated string in a "_copy" column, then give every listed city
# its own row. The pre-explode row label is kept as "UC Grouping".
df = (
    df.assign(
        **{
            "Cities in Urban Center_copy": df["Cities in Urban Center"],
            "Cities in Urban Center": df["Cities in Urban Center"].str.split(";"),
        }
    )
    .explode("Cities in Urban Center")
    .reset_index(drop=False)
    .rename(columns={"index": "UC Grouping"})
)
# Splitting on ";" leaves the separating spaces attached to the names; trim them.
df["Cities in Urban Center"] = df["Cities in Urban Center"].str.strip()

In [93]:
# filtered_df = df[df["Cities in Urban Center_copy"].str.contains("San Francisco")]
# filtered_df

In [94]:
# t = df[df['Cities in Urban Center'] == 'San Francisco']
# t

In [95]:
# Rows whose city name also appears in the 500 Cities list, and the labels of those rows.
in_mh_cities = df["Cities in Urban Center"].isin(mh_cities)
ucgroup = df.loc[in_mh_cities]
ucgrouplist = ucgroup.index.tolist()

In [96]:
# Keep only the rows whose label is in ucgrouplist. The explicit .copy() makes df an
# independent frame, so column assignments in later cells do not act on a slice of the
# previous frame (which is what triggers SettingWithCopyWarning).
df = df[df.index.isin(ucgrouplist)].copy()
df.head()

Unnamed: 0,UC Grouping,Urban Center Area,Latitude,Longitude,Country,Urban Center,Cities in Urban Center,Biome,Soil Group,Avg Elevation,...,Particulate Matter Emissions Residential,Particulate Matter Emissions Industry,Particulate Matter Emissions Transport,Particulate Matter Emissions Agriculture,Total Concertation of Particulate Matter,% of Pop in High Green Area,% of Open Spaces,Land Use Efficiency,Max Magnitude of Heatwaves,Cities in Urban Center_copy
0,0,185.0,21.340678,-157.893497,United States,Honolulu,Honolulu,Tropical and Subtropical Dry Broadleaf Forests,Vertisols,52.29643514,...,40.14455643,174.221274,23.53676553,0.047533547,5.765125,0.226415,56.41,0.074385203,,Honolulu; Waipahu; Pearl City; Aiea
4,2,55.0,34.923123,-120.434372,United States,Santa Maria,Santa Maria,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,87.12809223,...,8.503971075,35.140393,3.014212862,0.047214542,11.6309,0.040129,23.64,0.481144026,2.791739941,Santa Maria
6,4,60.0,34.427664,-119.743693,United States,Santa Barbara,Santa Barbara,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,38.29809451,...,14.04887269,30.528288,3.263458168,0.013192485,13.8037,0.061348,36.5,0.55676245,4.255020142,Santa Barbara
8,6,54.0,36.688991,-121.640831,United States,Salinas,Salinas,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,21.30891932,...,16.56970821,65.973825,6.895040733,0.059791583,12.3763,0.076114,24.61,0.843804111,11.39150047,Salinas
9,7,136.0,34.217486,-119.209132,United States,Oxnard,Oxnard,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,21.89917854,...,30.23359415,118.460613,16.89482998,0.072824232,17.0109,0.036199,28.65,0.43558362,,Oxnard; Ventura


In [97]:
# Load the state boundary shapefile; statefinder() tests points against these geometries.
stateboundaries = gpd.read_file("data/raw_data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp")
stateboundaries.head()

Unnamed: 0,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry
0,28,1779790,0400000US28,28,MS,Mississippi,0,121533519481,3926919758,"MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ..."
1,37,1027616,0400000US37,37,NC,North Carolina,0,125923656064,13466071395,"MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ..."
2,40,1102857,0400000US40,40,OK,Oklahoma,0,177662925723,3374587997,"POLYGON ((-103.00257 36.52659, -103.00219 36.6..."
3,51,1779803,0400000US51,51,VA,Virginia,0,102257717110,8528531774,"MULTIPOLYGON (((-75.74241 37.80835, -75.74151 ..."
4,54,1779805,0400000US54,54,WV,West Virginia,0,62266474513,489028543,"POLYGON ((-82.64320 38.16909, -82.64300 38.169..."


In [98]:
def statefinder(row):
    """
    Find the state whose boundary contains a row's coordinates.
    Input a row with "Longitude" and "Latitude" entries.
    Returns the "STUSPS" value of the first matching row of the module-level
    stateboundaries table, or np.nan when no boundary contains the point.
    """
    location = Point(row["Longitude"], row["Latitude"])
    hits = stateboundaries[stateboundaries.contains(location)]

    # No boundary contains the point: report a missing state.
    if hits.empty:
        return np.nan
    return hits.iloc[0]["STUSPS"]

In [99]:
# Look up a state for every row from its coordinates (one statefinder call per row).
df["State"] = df.apply(statefinder, axis=1)

In [100]:
# states = pd.read_csv('uscities.csv')
# states.sort_values(by='population', ascending=False, inplace=True)
# states = states[['city', 'state_id', 'lat', 'lng']]
# states.rename(columns={'city': 'Urban Center', 'state_id': 'State'}, inplace=True)
# states.drop_duplicates(subset='Urban Center', keep='first', inplace=True)

In [101]:
# dfj = df.join(states.set_index('Urban Center'), on='Cities in Urban Center', how='left')
# dfj

In [102]:
# dfj['lat_diff'] = np.abs(dfj['Latitude'] - dfj['lat'])
# dfj['lng_diff'] = np.abs(dfj['Longitude'] - dfj['lng'])

In [103]:
# check_df = dfj[(dfj['lat_diff'] > 1) | (dfj['lng_diff'] > 1)]
# print(check_df.shape)
# check_df.tail(20)

In [104]:
# dfj.at[21, 'State'] = 'CA'
# dfj.at[37, 'State'] = 'CA'
# dfj.at[47, 'State'] = 'CA'
# dfj.at[77, 'State'] = 'CA'
# dfj.at[116, 'State'] = 'AZ'
# dfj.at[162, 'State'] = 'UT'
# dfj.at[235, 'State'] = 'KS'
# dfj.at[285, 'State'] = 'MO'
# dfj.at[323, 'State'] = 'MN'
# dfj.at[369, 'State'] = 'IL'
# dfj.at[391, 'State'] = 'IL'
# dfj.at[400, 'State'] = 'IL'
# dfj.at[459, 'State'] = 'PA'
# dfj.at[473, 'State'] = 'VA'
# dfj.at[505, 'State'] = 'PA'

# dfj.at[384, 'State'] = 'IN'
# dfj.at[303, 'State'] = 'GA'
# dfj.at[550, 'State'] = 'CT'
# dfj.at[135, 'State'] = 'TX'
# dfj.at[493, 'State'] = 'MD'
# dfj.at[262, 'State'] = 'MO'
# dfj.at[579, 'State'] = 'MA'
# dfj.at[309, 'State'] = 'IL'
# dfj.at[584, 'State'] = 'ME'
# dfj.at[366, 'State'] = 'IN'
# dfj.at[433, 'State'] = 'NC'
# dfj.at[340, 'State'] = 'MN'
# dfj.at[453, 'State'] = 'NC'
# dfj.at[535, 'State'] = 'NJ'
# dfj.at[557, 'State'] = 'NY'


In [105]:
def us_division():
    """
    Build the lookup of US divisions.
    Returns a dictionary that maps each division name to the list of
    state abbreviations belonging to it.
    """
    # (division name, space-separated state abbreviations)
    division_states = (
        ("New England", "CT ME MA NH RI VT"),
        ("Middle Atlantic", "NJ NY PA"),
        ("East North Central", "IL IN MI OH WI"),
        ("West North Central", "IA KS MN MO NE ND SD"),
        ("South Atlantic", "DE FL GA MD NC SC VA WV DC"),
        ("East South Central", "AL KY MS TN"),
        ("West South Central", "AR LA OK TX"),
        ("Mountain", "AZ CO ID MT NV NM UT WY"),
        ("Pacific", "AK CA HI OR WA"),
    )
    return {name: codes.split() for name, codes in division_states}

In [106]:
def us_region():
    """
    Returns a dictionary of US regions and their respective states.

    The four regions follow the US Census Bureau grouping, so each state sits
    in the region that contains its division from us_division(). In particular
    "DE" and "MD" (South Atlantic division) belong to "South", not "Northeast".
    """

    us_regions = {
        "West": [
            "AK",
            "AZ",
            "CA",
            "CO",
            "HI",
            "ID",
            "MT",
            "NV",
            "NM",
            "OR",
            "UT",
            "WA",
            "WY",
        ],
        "Midwest": [
            "IL",
            "IN",
            "IA",
            "KS",
            "MI",
            "MN",
            "MO",
            "NE",
            "ND",
            "OH",
            "SD",
            "WI",
        ],
        "Northeast": ["CT", "ME", "MA", "NH", "NJ", "NY", "PA", "RI", "VT"],
        "South": [
            "AL",
            "AR",
            "DE",
            "FL",
            "GA",
            "KY",
            "LA",
            "MD",
            "MS",
            "NC",
            "OK",
            "SC",
            "TN",
            "TX",
            "VA",
            "WV",
            "DC",
        ],
    }
    return us_regions

In [107]:
def apply_geo_labels(df, label_col_name, label_dict, base_col):
    """
    Label every row according to the value in an existing column.
    Input df, the name of the column to create, a dictionary mapping each
    label to the list of base-column values it covers, and the base column.
    Returns a copy of df with the new column; rows whose base value is in
    none of the lists get the string "None". The input df is not modified.
    """
    labeled = df.copy()
    # Start every row at the "None" placeholder, then overwrite the rows that match.
    labeled[label_col_name] = "None"
    for label, members in label_dict.items():
        is_member = labeled[base_col].isin(members)
        labeled.loc[is_member, label_col_name] = label
    return labeled

In [108]:
# Build both lookup tables, then add a "Region" and a "Division" column keyed on "State".
region_dic = us_region()
div_dic = us_division()

for label_col, lookup in (("Region", region_dic), ("Division", div_dic)):
    df = apply_geo_labels(df, label_col, lookup, "State")

In [109]:
# Sanity check: rows for which no state boundary contained the point (State is NaN).
df[df["State"].isna()].head()

Unnamed: 0,UC Grouping,Urban Center Area,Latitude,Longitude,Country,Urban Center,Cities in Urban Center,Biome,Soil Group,Avg Elevation,...,Particulate Matter Emissions Agriculture,Total Concertation of Particulate Matter,% of Pop in High Green Area,% of Open Spaces,Land Use Efficiency,Max Magnitude of Heatwaves,Cities in Urban Center_copy,State,Region,Division


In [110]:
# Preview the table with the State, Region and Division columns added.
df.head()

Unnamed: 0,UC Grouping,Urban Center Area,Latitude,Longitude,Country,Urban Center,Cities in Urban Center,Biome,Soil Group,Avg Elevation,...,Particulate Matter Emissions Agriculture,Total Concertation of Particulate Matter,% of Pop in High Green Area,% of Open Spaces,Land Use Efficiency,Max Magnitude of Heatwaves,Cities in Urban Center_copy,State,Region,Division
0,0,185.0,21.340678,-157.893497,United States,Honolulu,Honolulu,Tropical and Subtropical Dry Broadleaf Forests,Vertisols,52.29643514,...,0.047533547,5.765125,0.226415,56.41,0.074385203,,Honolulu; Waipahu; Pearl City; Aiea,HI,West,Pacific
4,2,55.0,34.923123,-120.434372,United States,Santa Maria,Santa Maria,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,87.12809223,...,0.047214542,11.6309,0.040129,23.64,0.481144026,2.791739941,Santa Maria,CA,West,Pacific
6,4,60.0,34.427664,-119.743693,United States,Santa Barbara,Santa Barbara,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,38.29809451,...,0.013192485,13.8037,0.061348,36.5,0.55676245,4.255020142,Santa Barbara,CA,West,Pacific
8,6,54.0,36.688991,-121.640831,United States,Salinas,Salinas,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,21.30891932,...,0.059791583,12.3763,0.076114,24.61,0.843804111,11.39150047,Salinas,CA,West,Pacific
9,7,136.0,34.217486,-119.209132,United States,Oxnard,Oxnard,"Mediterranean Forests, Woodlands, and Scrub",Luvisols,21.89917854,...,0.072824232,17.0109,0.036199,28.65,0.43558362,,Oxnard; Ventura,CA,West,Pacific


In [111]:
# Write the cleaned table to disk. With to_csv defaults the row index is written as the
# first, unnamed column.
df.to_csv("data/cleaned_data/greenspace_cleaned.csv")