In [17]:
import pandas as pd
import kagglehub
from sklearn.linear_model import LinearRegression
from shapely.geometry import Point
import geopandas as gpd
from datetime import timedelta

#Philadelphia 2025 Crime Statistics, from our github
url = "data/philadelphia_crime_data_2025.csv"
phil_test = pd.read_csv(url)

#this downloads the kaggle nfl dataset
path_nfl = kagglehub.dataset_download("keonim/nfl-game-scores-dataset-2017-2023")
game_dates = pd.read_csv(f"{path_nfl}/Season_Scores/2024_scores.csv")


In [6]:
def nfl_game_date(year, team):
    # load the specific year
    path_nfl = kagglehub.dataset_download("keonim/nfl-game-scores-dataset-2017-2023")
    df = pd.read_csv(f"{path_nfl}/Season_Scores/{year}_scores.csv")
    df = df.dropna(subset=['Date'])

    # subsets the df to games where the team is playing
    df_away = df[(df["AwayTeam"] == team)]
    df_home = df[(df["HomeTeam"] == team)]

    home_dates_with_year = df_home['Date'].apply(
        lambda x: f"{x}/{year+1}" if int(str(x).split('/')[0]) <= 6 
        else f"{x}/{year}")    
    away_dates_with_year = df_away['Date'].apply(
        lambda x: f"{x}/{year+1}" if int(str(x).split('/')[0]) <= 6 
        else f"{x}/{year}")

    # Now convert to datetime
    home_game_dates = pd.to_datetime(home_dates_with_year)
    away_game_dates = pd.to_datetime(away_dates_with_year)

    # Gets the result of the game
    home_game_win = df_home['HomeWin'].to_list()
    away_game_win = df_away['AwayWin'].to_list()

    # Convert to "Win" or "Loss"
    home_game_win = ["Win" if bool(x) else "Loss" for x in home_game_win]
    away_game_win = ["Win" if bool(x) else "Loss" for x in away_game_win]

    # Creates an away, home df, with the pairs for results and tags the Location
    home = pd.DataFrame(list(zip(home_game_dates, home_game_win)))
    away = pd.DataFrame(list(zip(away_game_dates, away_game_win)))
    home["Location"] = "Home"
    away["Location"] = "Away"

    # Puts the two together and sorts them by date
    season = pd.concat([home, away], ignore_index=True) 
    season.columns = ["Date", "Result", "Location"]
    season = season.sort_values('Date').reset_index(drop=True)    

    return season


In [9]:
chicago_crime=pd.read_csv("data/chicago_crime_data.csv")
# Load Chicago crime data
chicago_crime["Latitude"] = pd.to_numeric(chicago_crime["Latitude"], errors='coerce')
chicago_crime["Longitude"] = pd.to_numeric(chicago_crime["Longitude"], errors='coerce')

# Drop rows with missing coordinates
chicago_crime = chicago_crime.dropna(subset=["Latitude", "Longitude"])

# Convert to GeoDataFrame
crime_gdf = gpd.GeoDataFrame(
    chicago_crime,
    geometry=gpd.points_from_xy(chicago_crime["Longitude"], chicago_crime["Latitude"]),
    crs="EPSG:4326"
)

# Load ZIP code shapefile
zip_shapes = gpd.read_file("data/Boundaries - ZIP Codes_20250601/geo_export_d8622bad-255e-4041-b5c2-003ea4be6c96.shp") 

# Spatial join: assign ZIP codes
zip_shapes = zip_shapes.rename(columns={"zip": "zip_code"})

# Assign ZIP code to each crime
chicago_crime_zip = gpd.sjoin(
    crime_gdf,
    zip_shapes[["geometry", "zip_code"]],
    how="left",
    predicate="within"
)


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: GEOGCS["WGS84(DD)",DATUM["WGS84",SPHEROID["WGS84", ...

  chicago_crime_zip = gpd.sjoin(


In [13]:
chicago_crime_zip.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,geometry,index_right,zip_code
0,13851043,JJ273548,05/24/2025 12:00:00 AM,003XX N ASHLAND AVE,620,BURGLARY,UNLAWFUL ENTRY,FACTORY / MANUFACTURING BUILDING,False,False,...,1165679.0,1902193.0,2025,05/31/2025 03:43:34 PM,41.887208,-87.667032,"(41.887208258, -87.667032452)",POINT (-87.66703 41.88721),0.0,60607
1,13848585,JJ270563,05/24/2025 12:00:00 AM,016XX W IRVING PARK RD,620,BURGLARY,UNLAWFUL ENTRY,OTHER (SPECIFY),False,False,...,1164434.0,1926620.0,2025,05/31/2025 03:43:34 PM,41.954264,-87.670912,"(41.954263915, -87.670911524)",POINT (-87.67091 41.95426),20.0,60613
2,13846875,JJ267802,05/24/2025 12:00:00 AM,081XX S THROOP ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,1169073.0,1850874.0,2025,05/31/2025 03:43:34 PM,41.746311,-87.656054,"(41.746310609, -87.65605416)",POINT (-87.65605 41.74631),40.0,60620
3,13845573,JJ267002,05/24/2025 12:00:00 AM,0000X E 44TH ST,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,1177289.0,1875862.0,2025,05/31/2025 03:43:34 PM,41.814699,-87.625195,"(41.814698873, -87.625194981)",POINT (-87.62519 41.8147),39.0,60653
4,13848461,JJ270404,05/24/2025 12:00:00 AM,050XX S SEELEY AVE,890,THEFT,FROM BUILDING,OTHER (SPECIFY),False,False,...,1163541.0,1871291.0,2025,05/31/2025 03:43:34 PM,41.802455,-87.675753,"(41.802455323, -87.675752688)",POINT (-87.67575 41.80246),22.0,60609


In [35]:
# Convert 'date' column to datetime
chicago_crime_zip['date'] = pd.to_datetime(chicago_crime_zip['date'])

# Game details
game_date = pd.to_datetime('2024-11-24')
game_end_time = pd.to_datetime('2024-11-24 19:00')  # approx 7PM
window_end_time = game_end_time + timedelta(hours=6)

# Soldier Field coordinates
soldier_field_coords = (-87.6195, 41.8623)
soldier_field_point = Point(soldier_field_coords)

# Convert to metric CRS for distance
crime_gdf = chicago_crime_zip.to_crs(epsg=3857)
soldier_field_point_proj = gpd.GeoSeries([soldier_field_point], crs="EPSG:4326").to_crs(epsg=3857).iloc[0]

# Distance to stadium
crime_gdf['distance_to_stadium_m'] = crime_gdf.geometry.distance(soldier_field_point_proj)

# Crimes in 6-hour post-game window within 3.2 km
crimes_near_stadium = crime_gdf[
    (crime_gdf['date'] >= game_end_time) &
    (crime_gdf['date'] <= window_end_time) &
    (crime_gdf['distance_to_stadium_m'] <= 3200)
]
num_game_crimes = len(crimes_near_stadium)

# Baseline: average crimes per 7PM–1AM window on prior days within 3.2 km
prior_crimes = crime_gdf[
    (crime_gdf['date'] < game_date) &
    (crime_gdf['distance_to_stadium_m'] <= 3200)
].copy()
prior_crimes['hour'] = prior_crimes['date'].dt.hour
prior_crimes = prior_crimes[prior_crimes['hour'].isin([19, 20, 21, 22, 23, 0, 1])]
prior_crimes['day'] = prior_crimes['date'].dt.date

# Average crimes per 6-hour window
avg_crimes_per_window = prior_crimes.groupby('day').size().mean()
delta_crimes = num_game_crimes - avg_crimes_per_window

# Prepare output
summary_df = pd.DataFrame({
    "Metric": [
        "Crimes near Soldier Field (7PM–1AM on Game Day)",
        "Average crimes (7PM–1AM) on prior days",
        "Δ Crime (Game Day – Average)"
    ],
    "Value": [
        num_game_crimes,
        round(avg_crimes_per_window, 2),
        round(delta_crimes, 2)
    ]
})

summary_df

Unnamed: 0,Metric,Value
0,Crimes near Soldier Field (7PM–1AM on Game Day),7.0
1,Average crimes (7PM–1AM) on prior days,7.63
2,Δ Crime (Game Day – Average),-0.63
