This Python file contains the functions for our final project.

In [2]:
import pandas as pd
import kagglehub
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from shapely.ops import transform
import pyproj
import matplotlib.pyplot as plt

In [3]:
def load_philly_crime_data(years, base_path="data", filename_template="philadelphia_crime_data_{}.csv"):
    """
    Read one Philadelphia crime CSV per year and stack them into a single frame.

    Parameters:
        years (list): Years to load (e.g. [2021, 2022, 2023])
        base_path (str): Folder that holds the CSV files
        filename_template (str): File name pattern with one {} placeholder for the year

    Returns:
        pd.DataFrame: All years concatenated (fresh 0..n-1 index), with
        'dispatch_date_time' parsed to datetime, 'date' holding that timestamp
        floored to the day, and 'year' holding the year the row was loaded for
    """
    def _read_year(yr):
        # One file per year; the year tag comes from the file, not from the timestamps.
        frame = pd.read_csv(f"{base_path}/{filename_template.format(yr)}")
        frame["year"] = yr
        parsed = pd.to_datetime(frame["dispatch_date_time"])
        frame["dispatch_date_time"] = parsed
        frame["date"] = parsed.dt.floor("D")
        return frame

    return pd.concat([_read_year(yr) for yr in years], ignore_index=True)


In [4]:
def load_chicago_crime_data(years, base_path="data", filename_template="chicago_crime_data_{}.csv"):
    """
    Read one Chicago crime CSV per year and stack them into a single frame.

    The source columns 'Date', 'Latitude' and 'Longitude' are mirrored into
    'dispatch_date_time' (parsed), 'lat' and 'lng'; the original columns are kept.

    Parameters:
        years (list): Years to load (e.g. [2021, 2022, 2023])
        base_path (str): Folder that holds the CSV files
        filename_template (str): File name pattern with one {} placeholder for the year

    Returns:
        pd.DataFrame: All years concatenated (fresh 0..n-1 index) with added
        'year', 'dispatch_date_time', 'date' (timestamp floored to the day),
        'lat' and 'lng' columns
    """
    yearly_frames = []
    for yr in years:
        csv_path = f"{base_path}/{filename_template.format(yr)}"
        # assign() evaluates keyword arguments in order, so 'date' can build on
        # the freshly created 'dispatch_date_time'.
        yearly_frames.append(
            pd.read_csv(csv_path).assign(
                year=yr,
                dispatch_date_time=lambda f: pd.to_datetime(f["Date"]),
                date=lambda f: f["dispatch_date_time"].dt.floor("D"),
                lat=lambda f: f["Latitude"],
                lng=lambda f: f["Longitude"],
            )
        )

    return pd.concat(yearly_frames, ignore_index=True)


In [5]:
def nfl_game_date(year, team):
    """
    Build a per-game schedule (date, result, home/away) for one NFL team and season.

    Downloads the Kaggle dataset "keonim/nfl-game-scores-dataset-2017-2023" via
    kagglehub and reads ``Season_Scores/{year}_scores.csv`` from it.

    Parameters:
        year (int): Season start year; also selects the CSV file to read.
        team (str): Team name as it appears in the 'HomeTeam' / 'AwayTeam' columns.

    Returns:
        pd.DataFrame: Columns 'Date' (datetime), 'Result' ("Win" / "Loss") and
        'Location' ("Home" / "Away"), sorted by date with a fresh index. The
        frame is empty (but still has these three columns) when the team does
        not appear in the season file.
    """
    # load the specific year
    path_nfl = kagglehub.dataset_download("keonim/nfl-game-scores-dataset-2017-2023")
    df = pd.read_csv(f"{path_nfl}/Season_Scores/{year}_scores.csv")
    df = df.dropna(subset=['Date'])

    def _with_year(month_day):
        # 'Date' carries no year (presumably "M/D" -- TODO confirm against the CSV).
        # Months 1-6 are treated as the tail of the season and get year + 1.
        month = int(str(month_day).split('/')[0])
        return f"{month_day}/{year + 1}" if month <= 6 else f"{month_day}/{year}"

    def _games_for(team_col, win_col, location):
        # Name the columns up front: building positional columns and renaming
        # after concat mislabels them when one side has no games.
        games = df[df[team_col] == team]
        frame = pd.DataFrame({
            "Date": pd.to_datetime(games['Date'].apply(_with_year)).reset_index(drop=True),
            "Result": ["Win" if bool(flag) else "Loss" for flag in games[win_col]],
        })
        frame["Location"] = location
        return frame

    home = _games_for("HomeTeam", "HomeWin", "Home")
    away = _games_for("AwayTeam", "AwayWin", "Away")

    # Puts the two together and sorts them by date
    season = pd.concat([home, away], ignore_index=True)
    season = season.sort_values('Date').reset_index(drop=True)

    return season

In [6]:
def nba_game_date(team,year):
    """
    Build a per-game schedule (date, result, home/away) for one NBA team and season.

    Downloads the Kaggle dataset
    "eoinamoore/historical-nba-data-and-player-box-scores" via kagglehub, looks
    the team up in ``TeamHistories.csv`` and filters ``Games.csv``.

    Parameters:
        team (str): Team name as it appears in the 'teamName' column.
        year (int): Season start year; games from Oct 1 of ``year`` through
            Jun 30 of ``year + 1`` (inclusive) are kept.

    Returns:
        pd.DataFrame or None: Columns 'Date' (datetime), 'Result' (bool, True
        when the 'winner' id equals the team's id) and 'Location'
        ("Home" / "Away"), sorted by date with a fresh index. Returns None
        (after printing a message) when the team name is not found.
    """
    #first get teamId for this dataset
    path_nba = kagglehub.dataset_download("eoinamoore/historical-nba-data-and-player-box-scores")
    team_df = pd.read_csv(f"{path_nba}/TeamHistories.csv")

    #searches the dataset for the ID, returns nothing if not found
    team_id = team_df[team_df['teamName'] == team]['teamId']
    if team_id.empty:
        print("Team not found in dataset, look at the documentation.")
        return
    team_id = team_id.iloc[0]

    #use team_id and year to sort games wanted
    df = pd.read_csv(f"{path_nba}/Games.csv", low_memory=False)
    df["gameDate"] = pd.to_datetime(df["gameDate"])
    start = pd.to_datetime(f"{year}-10-01")
    end = pd.to_datetime(f"{year+1}-06-30")
    in_season = (df['gameDate'] >= start) & (df['gameDate'] <= end)

    def _games_for(id_col, location):
        # Name the columns up front: building positional columns and renaming
        # after concat mislabels them when one side has no games.
        games = df[(df[id_col] == team_id) & in_season]
        frame = pd.DataFrame({
            "Date": games["gameDate"].reset_index(drop=True),
            "Result": (games["winner"] == team_id).reset_index(drop=True),
        })
        frame["Location"] = location
        return frame

    home = _games_for("hometeamId", "Home")
    away = _games_for("awayteamId", "Away")

    season = pd.concat([home, away], ignore_index=True)
    season = season.sort_values('Date').reset_index(drop=True)

    return season

In [7]:
def load_all_sixers_games(start_year=2021, end_year=2024):
    """
    Collect 76ers schedules for every season in a year range into one frame.

    Each season comes from nba_game_date("76ers", year); a plain-date 'date'
    column and a 'year' column (the season start year) are added to it.

    Parameters:
        start_year (int): First season start year (inclusive)
        end_year (int): Last season start year (inclusive)

    Returns:
        pd.DataFrame: All seasons concatenated with a fresh index, including
        'Date', 'Result', 'Location', 'date' and 'year'
    """
    def _season(season_year):
        schedule = nba_game_date("76ers", season_year).copy()
        schedule["date"] = pd.to_datetime(schedule["Date"]).dt.date
        schedule["year"] = season_year
        return schedule

    seasons = [_season(season_year) for season_year in range(start_year, end_year + 1)]
    return pd.concat(seasons, ignore_index=True)


In [8]:
def load_all_eagles_games(start_year=2021, end_year=2024):
    """
    Collect Eagles schedules for every season in a year range into one frame.

    Each season comes from nfl_game_date(year, "Eagles"); a plain-date 'date'
    column and a 'year' column (the season start year) are added to it.

    Parameters:
        start_year (int): First season start year (inclusive)
        end_year (int): Last season start year (inclusive)

    Returns:
        pd.DataFrame: All seasons concatenated with a fresh index, including
        'Date', 'Result', 'Location', 'date' and 'year'
    """
    seasons = []
    season_years = range(start_year, end_year + 1)
    for season_year in season_years:
        schedule = nfl_game_date(season_year, "Eagles").copy()
        # Keep the full timestamp in 'Date'; 'date' is the calendar day only.
        calendar_days = pd.to_datetime(schedule["Date"]).dt.date
        schedule["date"] = calendar_days
        schedule["year"] = season_year
        seasons.append(schedule)

    combined = pd.concat(seasons, ignore_index=True)
    return combined

In [9]:
def load_all_bears_games(start_year=2021, end_year=2024):
    """
    Collect Bears schedules for every season in a year range into one frame.

    Each season comes from nfl_game_date(year, "Bears"); a plain-date 'date'
    column and a 'year' column (the season start year) are added to it.

    Parameters:
        start_year (int): First season start year (inclusive)
        end_year (int): Last season start year (inclusive)

    Returns:
        pd.DataFrame: All seasons concatenated with a fresh index, including
        'Date', 'Result', 'Location', 'date' and 'year'
    """
    # assign() returns a new frame, so the schedule returned by nfl_game_date
    # is never modified in place.
    seasons = [
        nfl_game_date(season_year, "Bears").assign(
            date=lambda games: pd.to_datetime(games["Date"]).dt.date,
            year=season_year,
        )
        for season_year in range(start_year, end_year + 1)
    ]
    return pd.concat(seasons, ignore_index=True)


In [10]:
def preprocess_crime_data(df, stadium_coords, date_col,zip_shapes=None,lat_col='lat',lng_col='lng'):
    """
    Preprocess crime data:
    - Parses the date column into 'date'
    - Coerces coordinates into numeric 'lat' / 'lng' and drops unusable rows
    - Converts to a GeoDataFrame projected to EPSG:3857
    - Computes distance to the stadium
    - Optionally joins ZIP code polygons

    Parameters:
        df (pd.DataFrame): Raw crime data; not modified (a copy is used).
        stadium_coords (tuple): (longitude, latitude) of stadium, in EPSG:4326
        date_col (str): Column to parse into the 'date' column. Any existing
            'date' column is overwritten.
        zip_shapes (gpd.GeoDataFrame, optional): ZIP code polygons with a
            'geometry' column and either a 'zip_code' or a 'zip' column. If its
            CRS is set and differs from EPSG:3857 it is reprojected before the
            join.
        lat_col (str): Column holding latitude; copied into 'lat'.
        lng_col (str): Column holding longitude; copied into 'lng'.

    Returns:
        gpd.GeoDataFrame: Crime points in EPSG:3857 with
        'distance_to_stadium_m' and, when zip_shapes is given, the columns
        added by the spatial join (including 'zip_code').
    """
    # Clean and parse; unparseable values become NaN/NaT and are dropped below
    df = df.copy()
    df['date'] = pd.to_datetime(df[date_col], errors='coerce')
    df['lat'] = pd.to_numeric(df[lat_col], errors='coerce')
    df['lng'] = pd.to_numeric(df[lng_col], errors='coerce')
    df = df.dropna(subset=['lat', 'lng', 'date'])

    # Geo conversion
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lng'], df['lat']), crs="EPSG:4326")
    gdf = gdf.to_crs(epsg=3857)

    # Project stadium point into the same CRS as the crime points
    stadium_point = Point(stadium_coords)
    stadium_proj = gpd.GeoSeries([stadium_point], crs="EPSG:4326").to_crs(epsg=3857).iloc[0]

    # Distance in EPSG:3857 units.
    # NOTE(review): Web Mercator stretches lengths by roughly 1/cos(latitude),
    # so these "meters" overstate true ground distance away from the equator.
    gdf['distance_to_stadium_m'] = gdf.geometry.distance(stadium_proj)

    # Optional ZIP join
    if zip_shapes is not None:
        if 'zip' in zip_shapes.columns:
            zip_shapes = zip_shapes.rename(columns={"zip": "zip_code"})
        zip_polys = zip_shapes[['geometry', 'zip_code']]
        # The points are in EPSG:3857 by now; a 'within' test against polygons
        # in another CRS compares unrelated coordinates and matches nothing.
        if zip_polys.crs is not None and zip_polys.crs != gdf.crs:
            zip_polys = zip_polys.to_crs(gdf.crs)
        gdf = gpd.sjoin(gdf, zip_polys, how="left", predicate="within")

    return gdf



In [11]:
def tag_game_windows(df, games_df, team_name=None, date_col='date'):
    """
    Flag every crime whose calendar day matches the calendar day of a game.

    Parameters:
        df (pd.DataFrame): Crime data with a datetime-like column (default 'date').
        games_df (pd.DataFrame): Game schedule with a datetime-like 'Date' column.
        team_name (str, optional): Accepted for labeling; not used by the function.
        date_col (str): Column in df to compare with game dates.

    Returns:
        pd.DataFrame: Copy of the crime data with date_col parsed (timezone
        removed), rows with unparseable dates dropped, and a boolean
        'is_game_window' column added.

    Raises:
        ValueError: If date_col is not a column of df.
    """
    if date_col not in df.columns:
        raise ValueError(f"'{date_col}' not found in crime DataFrame.")

    # Work on a copy so the caller's frame is left untouched.
    crimes = df.copy()
    crime_stamps = pd.to_datetime(crimes[date_col], errors='coerce').dt.tz_localize(None)
    crimes[date_col] = crime_stamps
    crimes = crimes.dropna(subset=[date_col])

    # Only the calendar day of each game matters; unparseable game dates are ignored.
    game_stamps = pd.to_datetime(games_df['Date'], errors='coerce').dt.tz_localize(None)
    game_days = set(game_stamps.dropna().dt.date)

    crimes['is_game_window'] = crimes[date_col].dt.date.isin(game_days)
    return crimes


In [12]:
def compute_distance_crime_delta(
    crime_df,
    distance_col='distance_to_stadium_m',
    is_game_col='is_game_window',
    bin_size_m=1000,
    max_dist_m=5000,
    bins=None,
    date_col='date'
):
    """
    Compute change in average daily crime counts by distance band from the stadium.

    Crimes are bucketed into distance bands, counted separately for game and
    non-game rows, divided by the number of distinct game / non-game days, and
    the two per-day averages are subtracted. Progress is printed along the way.

    Parameters:
        crime_df (pd.DataFrame): Crime data with distance, game-window and date columns.
        distance_col (str): Column name for distance from stadium in meters.
        is_game_col (str): Boolean column indicating game-related crime windows.
        bin_size_m (int): Width of distance bins in meters (used only if bins is None).
        max_dist_m (int): Maximum distance to consider (used only if bins is None).
        bins (sequence, optional): Custom bin edges in meters. The sequence is
            copied, never modified. A final catch-all edge of +inf is added
            unless the last edge is already +inf.
        date_col (str): Datetime-like column used to count distinct days.

    Returns:
        pd.DataFrame: One row per distance band with 'distance_band_km',
        'non_game_crime', 'game_crime', 'avg_game_crime',
        'avg_non_game_crime' and 'delta_crime' (game minus non-game).

    Raises:
        ValueError: If there are no game days or no non-game days.
    """
    df = crime_df.copy()
    print(f"Initial rows: {len(df)}")
    df = df[df[distance_col].notnull()]
    print(f"After dropping null {distance_col}: {len(df)}")

    # Use custom bins if provided, otherwise generate default bins.
    # Always work on our own list so the caller's bins are not mutated.
    if bins is None:
        bins = np.arange(0, max_dist_m + bin_size_m, bin_size_m).tolist()
    else:
        bins = list(bins)
    if bins[-1] != float('inf'):
        bins.append(float('inf'))  # Add catch-all bin

    print(f"Using bins (meters): {bins}")

    def _km(edge_m):
        # %g keeps whole kilometres as "1", "5" but shows "0.25" for sub-km
        # edges, so labels stay unique (pd.cut rejects duplicate labels).
        return f"{edge_m / 1000:g}"

    # Create bin labels: one per finite band plus the open-ended last band
    labels = [f"{_km(lo)}–{_km(hi)}km" for lo, hi in zip(bins[:-2], bins[1:-1])]
    labels.append(f">{_km(bins[-2])}km")
    df["distance_band_km"] = pd.cut(df[distance_col], bins=bins, labels=labels, include_lowest=True)

    print("Distance band distribution:")
    print(df["distance_band_km"].value_counts().sort_index())

    # Group by game window and distance band. observed=False keeps bands with
    # zero crimes regardless of the pandas version's default.
    grouped = (
        df.groupby([is_game_col, "distance_band_km"], observed=False)
        .size()
        .reset_index(name="crime_count")
    )
    print("Grouped counts:")
    print(grouped)

    # Pivot table
    pivot = grouped.pivot(index="distance_band_km", columns=is_game_col, values="crime_count").fillna(0)
    print("Pivot table:")
    print(pivot)

    # Standardize column names; a missing side becomes a column of zeros
    pivot = pivot.reindex(columns=[False, True], fill_value=0)
    pivot.columns = ['non_game_crime', 'game_crime']

    # Normalize by number of distinct days in each category
    crime_days = pd.to_datetime(df[date_col]).dt.date
    game_mask = df[is_game_col].astype(bool)
    n_game_days = crime_days[game_mask].nunique()
    n_non_game_days = crime_days[~game_mask].nunique()
    print(f"Number of game days: {n_game_days}")
    print(f"Number of non-game days: {n_non_game_days}")

    if n_game_days == 0 or n_non_game_days == 0:
        raise ValueError("No game or non-game days found — cannot compute delta.")

    # Compute averages and delta
    pivot["avg_game_crime"] = pivot["game_crime"] / n_game_days
    pivot["avg_non_game_crime"] = pivot["non_game_crime"] / n_non_game_days
    pivot["delta_crime"] = pivot["avg_game_crime"] - pivot["avg_non_game_crime"]

    print("Final delta stats:")
    print(pivot)

    return pivot.reset_index()


In [13]:
def plot_stadium_radii_map_with_bar_chart(
    stadium_coords,
    crime_df,
    delta_df,
    radii_meters=[1000, 5000, 10000, 20000],
    zoom=15,
    title="Stadium Radius Map + Δ Crime by Distance Band",
    lat_col="lat",
    lng_col="lng"
):
    """
    Show two figures: a map with circular bands around a stadium, and a bar
    chart of precomputed Δ crimes per distance band.

    Parameters:
        stadium_coords (tuple): (longitude, latitude) of the stadium
        crime_df (pd.DataFrame): Currently unused by the function body
        delta_df (pd.DataFrame): Precomputed Δ crime data with 'distance_band_km' and 'delta_crime'
        radii_meters (list): Radii to draw on map (buffer distances in EPSG:3857 units)
        zoom (int): Map zoom level
        title (str): Title of the map
        lat_col (str): Currently unused by the function body
        lng_col (str): Currently unused by the function body

    Returns:
        None. Both figures are displayed with .show().
    """
    center_lon, center_lat = stadium_coords

    # Buffering is done in a projected CRS, then converted back to lon/lat for plotting.
    crs_pair = ("EPSG:4326", "EPSG:3857")
    to_projected = pyproj.Transformer.from_crs(*crs_pair, always_xy=True).transform
    to_lonlat = pyproj.Transformer.from_crs(*reversed(crs_pair), always_xy=True).transform
    center_projected = transform(to_projected, Point(center_lon, center_lat))

    # One single-row GeoDataFrame per radius
    rings = [
        gpd.GeoDataFrame(
            {'radius_m': [radius_m]},
            geometry=[transform(to_lonlat, center_projected.buffer(radius_m))],
            crs="EPSG:4326",
        )
        for radius_m in radii_meters
    ]

    # Map figure: base layout first, then the rings, then the stadium marker
    fig_map = go.Figure()
    fig_map.update_layout(
        mapbox_style="carto-positron",
        mapbox_zoom=zoom,
        mapbox_center={"lat": center_lat, "lon": center_lon},
        title=title,
    )

    for ring in rings:
        ring_radius = ring['radius_m'].iloc[0]
        ring_layer = go.Choroplethmapbox(
            geojson=ring.__geo_interface__,
            locations=[0],
            z=[ring_radius],
            colorscale='Blues',
            showscale=False,
            name=f"{ring_radius / 1000:.1f} km radius",
            marker_opacity=0.3,
            marker_line_width=1,
        )
        fig_map.add_trace(ring_layer)

    stadium_marker = go.Scattermapbox(
        lat=[center_lat],
        lon=[center_lon],
        mode='markers+text',
        marker=go.scattermapbox.Marker(size=10, color='red'),
        text=["Stadium"],
        textposition="top right",
    )
    fig_map.add_trace(stadium_marker)

    # Crime points are intentionally not drawn; crime_df, lat_col and lng_col
    # are kept in the signature but not read here.

    # Bar chart: Δ crime by distance band
    axis_labels = {
        "distance_band_km": "Distance Band",
        "delta_crime": "Δ Crimes per Day (Game - Non-game)",
    }
    fig_bar = px.bar(
        delta_df,
        x="distance_band_km",
        y="delta_crime",
        title="Δ Crime per Day by Distance Band",
        labels=axis_labels,
        text_auto=True,
    )

    fig_map.show()
    fig_bar.show()


In [14]:
def compute_event_and_followup_crime_delta(crime_df, event_date, date_col='date'):
    """
    Compute crime delta on an event day and the day after, compared to average daily crime
    in the prior year.

    All comparisons are made on calendar days: any time-of-day component in
    date_col or event_date is ignored, and timezone information is removed.
    The baseline is the mean crime count over the days in the 365 days before
    the event that have at least one recorded crime.

    Parameters:
        crime_df (pd.DataFrame): Crime data with a datetime-like column. Not modified.
        event_date (str or datetime): Date of the major event (e.g. championship game).
        date_col (str): Name of the date column in the DataFrame.

    Returns:
        dict: {
            'event_date': date,
            'next_day': date,
            'event_day_crimes': int,
            'next_day_crimes': int,
            'average_crimes_per_day': float (NaN when the prior year has no crimes),
            'delta_event_day': float,
            'delta_next_day': float
        }
    """
    # Reduce everything to tz-naive midnight timestamps so that crimes logged
    # at 13:45 on the event day still count as event-day crimes.
    event_day = pd.to_datetime(event_date).tz_localize(None).normalize()
    next_day = event_day + pd.Timedelta(days=1)
    one_year_prior = event_day - pd.Timedelta(days=365)

    crime_days = pd.to_datetime(crime_df[date_col]).dt.tz_localize(None).dt.normalize()

    # Crimes on event day and next day
    event_day_crimes = int((crime_days == event_day).sum())
    next_day_crimes = int((crime_days == next_day).sum())

    # Prior year crimes (exclude event + next day)
    prior_year_days = crime_days[(crime_days >= one_year_prior) & (crime_days < event_day)]

    # Mean of per-day counts; days without any crime do not enter the average
    avg_crimes_per_day = prior_year_days.value_counts().mean()

    # Compute deltas
    delta_event = event_day_crimes - avg_crimes_per_day
    delta_next = next_day_crimes - avg_crimes_per_day

    return {
        "event_date": event_day.date(),
        "next_day": next_day.date(),
        "event_day_crimes": event_day_crimes,
        "next_day_crimes": next_day_crimes,
        "average_crimes_per_day": round(avg_crimes_per_day, 2),
        "delta_event_day": round(delta_event, 2),
        "delta_next_day": round(delta_next, 2)
    }


In [15]:
def plot_event_comparison_map(
    crime_df,
    event1_date,
    event2_date,
    event1_label="Event 1",
    event2_label="Event 2",
    lat_col="lat",
    lng_col="lng",
    date_col="date",
    crime_type_col="Primary Type",
    zoom=11,
    title="Crime Locations on Key Event Days"
):
    """
    Plot crimes from two specified dates on one map, colored by event.

    Crimes are matched to each event by calendar date, so time-of-day in
    date_col or in the event dates does not matter. For a timezone-aware
    date_col the calendar date is taken in that column's own timezone.

    Parameters:
        crime_df (pd.DataFrame): Full crime dataset with coordinates and a datetime column.
            Not modified.
        event1_date (str or datetime): First date (e.g. "2025-02-09").
        event2_date (str or datetime): Second date (e.g. "2025-02-10").
        event1_label (str): Label to show for first event.
        event2_label (str): Label to show for second event.
        lat_col (str): Name of the latitude column.
        lng_col (str): Name of the longitude column.
        date_col (str): Name of the date column.
        crime_type_col (str): Optional column to show as hover title; ignored if absent.
        zoom (int): Map zoom level.
        title (str): Title of the map.

    Returns:
        None. The figure is displayed with .show().
    """
    df = crime_df.copy()
    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce")
    df[lng_col] = pd.to_numeric(df[lng_col], errors="coerce")
    df[date_col] = pd.to_datetime(df[date_col])

    # Compare plain calendar dates on both sides. This avoids exact-timestamp
    # matching (which misses every crime not logged at midnight) and needs no
    # timezone juggling between the column and the event dates.
    crime_days = df[date_col].dt.date

    def _crimes_on(event_date, label):
        event_day = pd.to_datetime(event_date).date()
        # Rows without usable coordinates cannot be plotted
        on_day = df[crime_days == event_day].dropna(subset=[lat_col, lng_col]).copy()
        on_day["Event"] = label
        return on_day

    combined = pd.concat(
        [_crimes_on(event1_date, event1_label), _crimes_on(event2_date, event2_label)],
        axis=0,
    )

    # Plot
    fig = px.scatter_mapbox(
        combined,
        lat=lat_col,
        lon=lng_col,
        color="Event",
        hover_name=crime_type_col if crime_type_col in combined.columns else None,
        hover_data=[date_col] if date_col in combined.columns else None,
        zoom=zoom,
        height=600,
        title=title
    )
    fig.update_layout(mapbox_style="open-street-map")
    fig.update_layout(margin={"r": 0, "t": 40, "l": 0, "b": 0})
    fig.show()
