In [39]:
import numpy as np
import pandas as pd

## Zillow data

In [56]:
# ZIP codes treated as "Chicago" when filtering the Zillow files below
chicago_zipcodes = [
    60601, 60602, 60603, 60604, 60605, 60606, 60607, 60608, 60609, 60610,
    60611, 60612, 60613, 60614, 60615, 60616, 60617, 60618, 60619, 60620,
    60621, 60622, 60623, 60624, 60625, 60626, 60628, 60629, 60630, 60631,
    60632, 60633, 60634, 60636, 60637, 60638, 60639, 60640, 60641, 60642,
    60643, 60644, 60645, 60646, 60647, 60649, 60651, 60652, 60653, 60654,
    60655, 60656, 60657, 60659, 60660, 60661, 60707, 60827
]

# ZHVI (Home Value Index): read the full CSV, then keep rows whose RegionName
# is one of the Chicago ZIP codes. `.copy()` detaches the subset from the source frame.
print("Loading ZHVI data...")
zhvi_data = pd.read_csv('data/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
zhvi_in_chicago = zhvi_data['RegionName'].isin(chicago_zipcodes)
chicago_zhvi = zhvi_data.loc[zhvi_in_chicago].copy()
print(f"Filtered ZHVI data: {len(chicago_zhvi)} rows")

# ZORI (Rental Index): same read-then-filter steps
print("Loading ZORI data...")
zori_data = pd.read_csv('data/Zip_zori_uc_sfrcondomfr_sm_month.csv')
zori_in_chicago = zori_data['RegionName'].isin(chicago_zipcodes)
chicago_zori = zori_data.loc[zori_in_chicago].copy()
print(f"Filtered ZORI data: {len(chicago_zori)} rows")

Loading ZHVI data...
Filtered ZHVI data: 58 rows
Loading ZORI data...
Filtered ZORI data: 54 rows


In [57]:
# Row counts and distinct ZIP counts for the two filtered frames
zhvi_rows, zori_rows = len(chicago_zhvi), len(chicago_zori)
zhvi_zip_count = chicago_zhvi['RegionName'].nunique()
zori_zip_count = chicago_zori['RegionName'].nunique()

print(f"Chicago ZHVI records: {zhvi_rows}")
print(f"Chicago ZORI records: {zori_rows}")
print(f"\nUnique zipcodes in ZHVI: {zhvi_zip_count}")
print(f"Unique zipcodes in ZORI: {zori_zip_count}")

Chicago ZHVI records: 58
Chicago ZORI records: 54

Unique zipcodes in ZHVI: 58
Unique zipcodes in ZORI: 54


In [58]:
# Show the first ten column names of each filtered frame
zhvi_leading_cols = list(chicago_zhvi.columns[:10])
zori_leading_cols = list(chicago_zori.columns[:10])
print("ZHVI columns:", zhvi_leading_cols, "...")
print("\nZORI columns:", zori_leading_cols, "...")

ZHVI columns: ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName', 'State', 'City', 'Metro', 'CountyName', '2000-01-31'] ...

ZORI columns: ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName', 'State', 'City', 'Metro', 'CountyName', '2015-01-31'] ...


In [59]:
# Reshape both frames from wide (one column per month) to long
# (one row per region and month). Every column not listed in `id_vals`
# is treated as a month column and becomes a value of 'Date'.
id_vals = [
    'RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName',
    'State', 'City', 'Metro', 'CountyName',
]

chicago_zhvi_melted = pd.melt(
    chicago_zhvi, id_vars=id_vals, var_name='Date', value_name='Zhvi'
)
chicago_zori_melted = pd.melt(
    chicago_zori, id_vars=id_vals, var_name='Date', value_name='Zori'
)

print("ZHVI melted shape:", chicago_zhvi_melted.shape)
print("ZORI melted shape:", chicago_zori_melted.shape)
chicago_zhvi_melted.head()

ZHVI melted shape: (17980, 11)
ZORI melted shape: (7020, 11)


Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,Date,Zhvi
0,84630,12,60629,zip,IL,IL,Chicago,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,2000-01-31,93989.740732
1,84620,33,60618,zip,IL,IL,Chicago,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,2000-01-31,212178.091419
2,84639,41,60639,zip,IL,IL,Chicago,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,2000-01-31,95498.839634
3,84646,58,60647,zip,IL,IL,Chicago,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,2000-01-31,211853.469994
4,84633,74,60632,zip,IL,IL,Chicago,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,2000-01-31,102388.182521


In [60]:
# Drop the region metadata columns; keep the ZIP, the month label and the index value
chicago_zhvi_melted = chicago_zhvi_melted.loc[:, ['RegionName', 'Date', 'Zhvi']]
chicago_zori_melted = chicago_zori_melted.loc[:, ['RegionName', 'Date', 'Zori']]

In [61]:
# Earliest and latest 'Date' label in each long-format frame (ZHVI first, then ZORI)
for melted_frame in (chicago_zhvi_melted, chicago_zori_melted):
    month_labels = melted_frame['Date']
    print(month_labels.min(), month_labels.max())

2000-01-31 2025-10-31
2015-01-31 2025-10-31


In [62]:
# Keep ZHVI rows dated 2001-01-01 or later.
# 'Date' holds 'YYYY-MM-DD' strings at this point, so a plain string
# comparison orders the values chronologically.
from_2001 = chicago_zhvi_melted['Date'] >= '2001-01-01'
chicago_zhvi_melted = chicago_zhvi_melted.loc[from_2001].copy()

In [63]:
# ZIP codes that have ZHVI rows but no ZORI rows
zhvi_zipcodes = set(chicago_zhvi_melted['RegionName'].unique())
zori_zipcodes = set(chicago_zori_melted['RegionName'].unique())
missing_in_zori = zhvi_zipcodes.difference(zori_zipcodes)
missing_in_zori

{60602, 60604, 60633, 60655}

In [64]:
# Add placeholder ZORI rows (Zori = NaN) for every ZIP that has ZHVI data but
# no ZORI data, one row per date.
#
# The rows are built in one pass and appended with a single concat. Calling
# pd.concat once per (zip, date) copies the whole frame on every iteration,
# which is quadratic in the number of rows added.
#
# NOTE(review): `all_dates` comes from the ZHVI frame (already filtered to
# 2001-01-01 onward), so the placeholder rows span the ZHVI date range rather
# than the ZORI one - confirm this is intended.
all_dates = chicago_zhvi_melted['Date'].unique()

# Same ordering as nested loops: ZIP outermost, date innermost
placeholder_records = [
    (zipcode, date)
    for zipcode in missing_in_zori
    for date in all_dates
]

# Skip the concat entirely when there is nothing to add, so the frame
# (and its dtypes) is left untouched in that case.
if placeholder_records:
    zori_placeholders = pd.DataFrame(
        placeholder_records, columns=['RegionName', 'Date']
    ).assign(Zori=np.nan)
    chicago_zori_melted = pd.concat(
        [chicago_zori_melted, zori_placeholders],
        ignore_index=True,
    )

In [65]:
# Derive calendar Year and Month columns; each Date column is parsed once
zhvi_dates = pd.to_datetime(chicago_zhvi_melted['Date'])
chicago_zhvi_melted['Year'] = zhvi_dates.dt.year
chicago_zhvi_melted['Month'] = zhvi_dates.dt.month

zori_dates = pd.to_datetime(chicago_zori_melted['Date'])
chicago_zori_melted['Year'] = zori_dates.dt.year
chicago_zori_melted['Month'] = zori_dates.dt.month

In [66]:
# Show the first five ZHVI rows, now with the Year and Month columns
display(chicago_zhvi_melted.head(5))

Unnamed: 0,RegionName,Date,Zhvi,Year,Month
696,60629,2001-01-31,103440.022854,2001,1
697,60618,2001-01-31,250946.421308,2001,1
698,60639,2001-01-31,109182.296876,2001,1
699,60647,2001-01-31,248647.761698,2001,1
700,60632,2001-01-31,112927.45277,2001,1


In [67]:
# Count the missing ZHVI values
zhvi_is_missing = chicago_zhvi_melted['Zhvi'].isna()
print("Missing values in ZHVI:", zhvi_is_missing.sum())

# Extract the rows whose ZHVI value is missing
missing_zhvi = chicago_zhvi_melted[zhvi_is_missing]
print("Rows with missing ZHVI values:", len(missing_zhvi))
missing_zhvi.head()

Missing values in ZHVI: 357
Rows with missing ZHVI values: 357


Unnamed: 0,RegionName,Date,Zhvi,Year,Month
752,60603,2001-01-31,,2001,1
753,60604,2001-01-31,,2001,1
810,60603,2001-02-28,,2001,2
811,60604,2001-02-28,,2001,2
868,60603,2001-03-31,,2001,3


In [68]:
# impute missing zhvi using average of neighboring months
def impute_zhvi(row):
    """Return the ZHVI value for ``row``, filling a gap from adjacent months.

    A non-missing ``row['Zhvi']`` is returned unchanged. For a missing value,
    the previous and next calendar month of the same ``RegionName`` are looked
    up in the module-level ``chicago_zhvi_melted`` frame, and the mean of
    whichever of the two is present and non-missing is returned. If neither
    neighbour is usable, the original (missing) value is returned.

    Parameters
    ----------
    row : mapping-like with keys 'Zhvi', 'RegionName', 'Year', 'Month'
        Typically one row of ``chicago_zhvi_melted`` passed by ``apply(axis=1)``.

    Returns
    -------
    The original value, or the mean of the usable neighbouring values.
    """
    current = row['Zhvi']
    if not pd.isna(current):
        return current

    zipcode = row['RegionName']
    year = row['Year']
    month = row['Month']

    # Step one month back / forward, rolling over the year boundary
    if month > 1:
        before = (year, month - 1)
    else:
        before = (year - 1, 12)
    if month < 12:
        after = (year, month + 1)
    else:
        after = (year + 1, 1)

    def first_known(period):
        """First Zhvi recorded for this ZIP in ``period``, or None if absent/missing."""
        target_year, target_month = period
        same_zip = chicago_zhvi_melted['RegionName'] == zipcode
        same_period = (
            (chicago_zhvi_melted['Year'] == target_year)
            & (chicago_zhvi_melted['Month'] == target_month)
        )
        hits = chicago_zhvi_melted.loc[same_zip & same_period, 'Zhvi'].values
        if len(hits) == 0 or pd.isna(hits[0]):
            return None
        return hits[0]

    neighbours = [
        value
        for value in (first_known(before), first_known(after))
        if value is not None
    ]
    if not neighbours:
        return current
    return np.mean(neighbours)

# Run the imputation row by row and write the result back to the Zhvi column
zhvi_imputed = chicago_zhvi_melted.apply(impute_zhvi, axis=1)
chicago_zhvi_melted['Zhvi'] = zhvi_imputed

In [69]:
# Re-count the missing ZHVI values now that the imputation has run
remaining_missing = chicago_zhvi_melted['Zhvi'].isna().sum()
print("Missing values in ZHVI after imputation:", remaining_missing)

Missing values in ZHVI after imputation: 354


In [71]:
# Write both preprocessed Zillow frames to CSV (row index omitted)
zillow_outputs = {
    'data/chicago_zhvi_preprocessed.csv': chicago_zhvi_melted,
    'data/chicago_zori_preprocessed.csv': chicago_zori_melted,
}
for csv_path, output_frame in zillow_outputs.items():
    output_frame.to_csv(csv_path, index=False)

## Crime data

In [12]:
# Load the crime CSV and report how many rows it contains
chicago_crime = pd.read_csv('data/crime.csv')

chicago_crime.shape[0]

8429442

In [13]:
# List the column names of the crime table
chicago_crime.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')

In [15]:
# Parse the 'Date' column (strings such as 10/19/2025 12:00:00 AM) into
# datetimes, stored in a new 'Date_mod' column
crime_timestamp_format = '%m/%d/%Y %I:%M:%S %p'
chicago_crime['Date_mod'] = pd.to_datetime(
    chicago_crime['Date'], format=crime_timestamp_format
)
chicago_crime.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Date_mod
0,14002298,JJ457143,10/19/2025 12:00:00 AM,003XX N LA SALLE DR,1330,CRIMINAL TRESPASS,TO LAND,RESIDENCE,True,False,...,8.0,26,1175074.0,1902840.0,2025,10/26/2025 03:54:36 PM,41.888778,-87.632512,"(41.888778301, -87.632512191)",2025-10-19
1,14002537,JJ457392,10/19/2025 12:00:00 AM,035XX W SHAKESPEARE AVE,710,THEFT,THEFT FROM MOTOR VEHICLE,STREET,False,False,...,22.0,6,1152802.0,1914069.0,2025,10/26/2025 03:54:36 PM,41.920062,-87.714006,"(41.920061753, -87.714005807)",2025-10-19
2,14004359,JJ457391,10/19/2025 12:00:00 AM,012XX W 79TH ST,810,THEFT,OVER $500,CONVENIENCE STORE,False,False,...,71.0,6,1169539.0,1852401.0,2025,10/26/2025 03:54:36 PM,41.750491,-87.654302,"(41.750490827, -87.654302474)",2025-10-19
3,14002796,JJ457673,10/19/2025 12:00:00 AM,033XX N LAKE SHORE DR,266,CRIMINAL SEXUAL ASSAULT,PREDATORY,APARTMENT,False,False,...,6.0,2,1172831.0,1922482.0,2025,10/26/2025 03:54:36 PM,41.942727,-87.640166,"(41.942726859, -87.640166261)",2025-10-19
4,14002374,JJ457223,10/19/2025 12:00:00 AM,066XX N ASHLAND AVE,820,THEFT,$500 AND UNDER,ALLEY,False,False,...,1.0,6,1164402.0,1944361.0,2025,10/26/2025 03:54:36 PM,42.002947,-87.670525,"(42.002946514, -87.670524517)",2025-10-19


In [16]:
# Calendar month and year of each record, taken from the parsed timestamp
incident_time = chicago_crime['Date_mod'].dt
chicago_crime['month'] = incident_time.month
chicago_crime['year'] = incident_time.year

In [54]:
import geopandas as gpd
import pandas as pd
from shapely import wkt

# --- ZIP boundary polygons ---
zip_df = pd.read_csv("./data/Boundaries_-_ZIP_Codes_20251123.csv")

# Parse the WKT text in 'the_geom' into geometry objects
zip_df["geometry"] = zip_df["the_geom"].map(wkt.loads)

# Wrap the table as a GeoDataFrame, declaring the coordinates as EPSG:4326
zip_gdf = gpd.GeoDataFrame(zip_df, geometry="geometry", crs="EPSG:4326")

# Descriptive neighborhood label for each ZIP code, keyed by the integer ZIP.
# Used below to add a 'ZipName' column to zip_gdf.
zipcode_names = {
    60601: "Lakeshore East / New Eastside",
    60602: "Central Loop",
    60603: "Financial District",
    60604: "South Loop (Historic Core)",
    60605: "South Loop / Museum Campus",
    60606: "West Loop (Financial Annex)",
    60607: "West Loop / UIC",
    60608: "Pilsen / Lower West Side",
    60609: "Back of the Yards / Fuller Park",
    60610: "Old Town / Near North Side",
    60611: "Streeterville / Magnificent Mile",
    60612: "Near West Side / Illinois Medical District",
    60613: "Lakeview / Wrigleyville",
    60614: "Lincoln Park",
    60615: "Hyde Park (North)",
    60616: "Chinatown / Near South Side",
    60617: "South Chicago / East Side",
    60618: "Avondale / Irving Park",
    60619: "Chatham / Avalon Park",
    60620: "Auburn Gresham",
    60621: "Englewood",
    60622: "Wicker Park / Ukrainian Village",
    60623: "Little Village",
    60624: "East Garfield Park",
    60625: "Lincoln Square / Ravenswood",
    60626: "Rogers Park",
    60628: "Roseland / Pullman",
    60629: "West Lawn / Chicago Lawn",
    60630: "Jefferson Park",
    60631: "Edison Park / Norwood Park",
    60632: "Brighton Park",
    60633: "Hegewisch",
    60634: "Belmont Cragin",
    60636: "West Englewood",
    60637: "Hyde Park (South) / Woodlawn",
    60638: "Garfield Ridge",
    60639: "Hermosa / Belmont Cragin (West)",
    60640: "Uptown",
    60641: "Irving Park / Portage Park",
    60642: "River West / Noble Square",
    60643: "Morgan Park / Beverly (North)",
    60644: "Austin (West)",
    60645: "West Ridge",
    60646: "Forest Glen / Edgebrook",
    60647: "Logan Square",
    60649: "South Shore",
    60651: "Humboldt Park / West Humboldt Park",
    60652: "Ashburn",
    60653: "Bronzeville",
    60654: "River North",
    60655: "Mount Greenwood",
    60656: "O'Hare / Norwood Park East",
    60657: "Lakeview East / Boystown",
    60659: "North Park / West Ridge (North)",
    60660: "Edgewater",
    60661: "Fulton River District",
    60707: "Elmwood Park / Galewood",
    60827: "Riverdale / Calumet Park"

}

# Look up each boundary row's ZIP in the dict; ZIPs with no entry get NaN.
# NOTE(review): assumes the 'ZIP' column holds integers matching the dict keys -
# if it were read as strings, every lookup would miss. TODO confirm.
zip_gdf['ZipName'] = zip_gdf['ZIP'].map(zipcode_names)


In [20]:
# Attach a ZIP code to each crime record with a point-in-polygon spatial join.
#
# This cell must actually run: the aggregation cell further down reads
# `crime_with_zip_nona`, so leaving this code commented out makes the notebook
# fail with a NameError on Restart & Run All.
# NOTE: the join runs over every row of `chicago_crime`, so it may be slow.

# Build one point per record from its Longitude / Latitude
crime_gdf = gpd.GeoDataFrame(
    chicago_crime,
    geometry=gpd.points_from_xy(chicago_crime.Longitude, chicago_crime.Latitude),
    crs="EPSG:4326"
)

# Left join keeps every crime row; rows whose point lies within no ZIP polygon
# get a missing ZIP ...
crime_with_zip = gpd.sjoin(crime_gdf, zip_gdf[["ZIP", "geometry"]], how="left", predicate="within")
# ... and those unmatched rows are dropped here.
crime_with_zip_nona = crime_with_zip.dropna(subset=['ZIP'])

In [55]:
# Export the ZIP boundary GeoDataFrame as a GeoJSON file
zip_geojson_out = "data/chicago_zipcodes.geojson"
zip_gdf.to_file(zip_geojson_out, driver="GeoJSON")

In [40]:
# Count crime records per year, month, ZIP, primary type and arrest flag
crime_group_keys = ['year', 'month', 'ZIP', 'Primary Type', 'Arrest']
chicago_crime_grouped = (
    crime_with_zip_nona
    .groupby(crime_group_keys)
    .size()
    .reset_index(name='crime_count')
)

# Convert ZIP to int and then to str, so it is stored as a plain digit string
zip_as_text = chicago_crime_grouped['ZIP'].astype(int).astype(str)
chicago_crime_grouped['ZIP'] = zip_as_text
chicago_crime_grouped.head()

Unnamed: 0,year,month,ZIP,Primary Type,Arrest,crime_count
0,2001,1,60601,ASSAULT,False,8
1,2001,1,60601,ASSAULT,True,4
2,2001,1,60601,BATTERY,False,10
3,2001,1,60601,BATTERY,True,9
4,2001,1,60601,BURGLARY,False,2


In [41]:
# Write all three preprocessed frames to CSV (row index omitted)
preprocessed_outputs = {
    'data/chicago_zhvi_preprocessed.csv': chicago_zhvi_melted,
    'data/chicago_zori_preprocessed.csv': chicago_zori_melted,
    'data/chicago_crime_preprocessed.csv': chicago_crime_grouped,
}
for csv_path, output_frame in preprocessed_outputs.items():
    output_frame.to_csv(csv_path, index=False)


In [1]:
# Read the exported GeoJSON file back in and look at its first rows.
# (geopandas is also imported in an earlier cell; the import is repeated here,
# presumably so this check can run on its own in a fresh kernel.)
import geopandas as gpd
geojson_path = "data/chicago_zipcodes.geojson"
gdf = gpd.read_file(geojson_path)
gdf.head()

Unnamed: 0,the_geom,OBJECTID,ZIP,SHAPE_AREA,SHAPE_LEN,geometry
0,MULTIPOLYGON (((-87.64740454306485 41.88314614...,1,60607,64664294.2344,39143.6400726,"MULTIPOLYGON (((-87.6474 41.88315, -87.6474 41..."
1,MULTIPOLYGON (((-87.64081267674514 41.88907723...,2,60661,9357755.8171,13132.5659184,"MULTIPOLYGON (((-87.64081 41.88908, -87.64068 ..."
2,MULTIPOLYGON (((-87.63696184579966 41.88905665...,3,60606,6766410.78996,12040.4399607,"MULTIPOLYGON (((-87.63696 41.88906, -87.63696 ..."
3,MULTIPOLYGON (((-87.58103206721337 41.80421129...,4,60615,66565454.6108,38321.3132699,"MULTIPOLYGON (((-87.58103 41.80421, -87.58089 ..."
4,MULTIPOLYGON (((-87.66326342055945 41.72120432...,5,60643,207706232.893,75254.7301243,"MULTIPOLYGON (((-87.66326 41.7212, -87.66326 4..."


In [22]:
# Number of rows (boundary features) read back from the GeoJSON file
len(gdf)

59

## Population

In [4]:
# Load the population counts table and preview its first rows
pop = pd.read_csv('data/Chicago_Population_Counts.csv')
pop.head()

Unnamed: 0,Geography Type,Year,Geography,Population - Total,Population - Age 0-17,Population - Age 18-29,Population - Age 30-39,Population - Age 40-49,Population - Age 50-59,Population - Age 60-69,...,Population - Age 18+,Population - Age 65+,Population - Female,Population - Male,Population - Latinx,Population - Asian Non-Latinx,Population - Black Non-Latinx,Population - White Non-Latinx,Population - Other Race Non-Latinx,Record ID
0,Citywide,2018,Chicago,2705988,548999,552935,456321,336457,312965,262991,...,2156989,349712,1386113,1319875,776661,179841.0,784266.0,899980,119467.0,Citywide-Chicago-2018
1,ZIP Code,2018,60601,14675,820,4606,2792,2190,1333,1340,...,13855,2075,7484,7191,1274,,,9677,,ZIP_Code-60601-2018
2,ZIP Code,2018,60602,1244,149,435,462,135,53,10,...,1095,5,551,693,81,,,788,,ZIP_Code-60602-2018
3,ZIP Code,2018,60603,1174,56,561,101,97,197,97,...,1118,112,601,573,115,,,707,,ZIP_Code-60603-2018
4,ZIP Code,2018,60604,782,38,303,104,51,101,130,...,744,93,413,369,34,,,479,,ZIP_Code-60604-2018


In [5]:
# Keep only the rows whose 'Geography' value is exactly 'Chicago'
is_chicago_row = pop['Geography'].eq('Chicago')
pop_chicago = pop.loc[is_chicago_row]
pop_chicago.head()

Unnamed: 0,Geography Type,Year,Geography,Population - Total,Population - Age 0-17,Population - Age 18-29,Population - Age 30-39,Population - Age 40-49,Population - Age 50-59,Population - Age 60-69,...,Population - Age 18+,Population - Age 65+,Population - Female,Population - Male,Population - Latinx,Population - Asian Non-Latinx,Population - Black Non-Latinx,Population - White Non-Latinx,Population - Other Race Non-Latinx,Record ID
0,Citywide,2018,Chicago,2705988,548999,552935,456321,336457,312965,262991,...,2156989,349712,1386113,1319875,776661,179841.0,784266.0,899980,119467.0,Citywide-Chicago-2018
60,Citywide,2019,Chicago,2693959,555424,526788,469300,340219,294864,261009,...,2138535,362629,1382528,1311431,776290,184768.0,768524.0,901941,62436.0,Citywide-Chicago-2019
120,Citywide,2020,Chicago,2699347,552668,548747,463143,336591,313865,255435,...,2146679,342174,1388469,1310878,772791,182251.0,776470.0,900055,67780.0,Citywide-Chicago-2020
175,Citywide,2021,Chicago,2696561,540255,519276,464574,338452,314035,269647,...,2156306,373166,1387010,1309551,777744,184263.0,757971.0,887130,89453.0,Citywide-Chicago-2021
