## Combine Actual and Shooting Data

In [0]:
from pyspark.sql.functions import col, trim, when, lit, initcap, count, countDistinct, lag, sum, collect_set, explode, mean, coalesce, sqrt, round, avg, flatten
from pyspark.sql.functions import max as f_max, min as f_min
from pyspark.sql.window import Window
from pyspark.sql.functions import col, collect_list, size, expr


In [0]:
# Source tables (names as registered in the `default` schema): per-county race
# percentages for population (yearly) and shootings (yearly and monthly).
population_race_county_df = spark.read.table("default.percentage_population_race_per_county_by_year")
shooting_race_county_df = spark.read.table("default.percentage_shooting_race_per_county_by_year")
shooting_race_county_by_month_df = spark.read.table("default.percentage_shooting_race_per_county_by_month_year")



### Pre-Processing for Join

In [0]:

def _normalize_join_keys(df):
    """Return `df` with the join keys (`county`, `year`, `race`) normalized.

    - `county`: whitespace trimmed, then converted with `initcap`
    - `year`: cast to int
    - `race`: whitespace trimmed

    All other columns are left untouched.
    """
    return (
        df
        .withColumn("county", initcap(trim(col("county"))))
        .withColumn("year", col("year").cast("int"))
        .withColumn("race", trim(col("race")))
    )


# Both sides of the join get exactly the same key normalization, so keys that
# differ only by surrounding whitespace or letter case still match.
population_race_county_df = _normalize_join_keys(population_race_county_df)
shooting_race_county_df = _normalize_join_keys(shooting_race_county_df)

### Combine both population and shooting races

In [0]:
# Right join: every row of the shooting table is kept; the population columns
# are null where no population row shares the same (county, year, race) key.
join_keys = ["county", "year", "race"]
overall_analysis_df = population_race_county_df.join(
    shooting_race_county_df, on=join_keys, how="right"
)

# County-level aggregates, repeated on every row of the county:
# the sum of shooting_count and the mean of total_shooting_count.
mean_window = Window.partitionBy("county")
county_total_expr = round(sum(col("shooting_count")).over(mean_window), 2)
county_mean_expr = round(mean(col("total_shooting_count")).over(mean_window), 2)
overall_analysis_df = (
    overall_analysis_df
    .withColumn("county_total_shooting", county_total_expr)
    .withColumn("county_mean_shooting", county_mean_expr)
)

# (county, race)-level aggregates.
race_mean_window = Window.partitionBy("county", "race")

# Root of the average squared gap between each row's shooting_count and the
# county mean computed above, taken over the rows of one (county, race).
squared_gap_expr = (col("shooting_count") - col("county_mean_shooting")) ** 2
race_deviation_expr = sqrt(avg(squared_gap_expr).over(race_mean_window))
race_mean_expr = round(mean(col("shooting_count")).over(race_mean_window), 2)
overall_analysis_df = (
    overall_analysis_df
    .withColumn("county_race_standard_deviation", race_deviation_expr)
    .withColumn("race_mean_shooting", race_mean_expr)
)

# Shooting rank minus population rank for the same (county, year, race).
overall_analysis_df = overall_analysis_df.withColumn(
    "rank_diff", col("race_shooting_rank") - col("race_population_rank")
)


In [0]:

# Spot check: population rank vs shooting rank for a single county.
los_angeles_ranks_df = (
    overall_analysis_df
    .filter(col("county") == "Los Angeles")
    .select("county", "year", "race", "race_population_rank", "race_shooting_rank", "rank_diff")
    .orderBy("county", "race", "year")
)
display(los_angeles_ranks_df)


county,year,race,race_population_rank,race_shooting_rank,rank_diff
Los Angeles,2015,A,4.0,4.0,0.0
Los Angeles,2016,A,4.0,4.0,0.0
Los Angeles,2017,A,4.0,4.0,0.0
Los Angeles,2018,A,4.0,4.0,0.0
Los Angeles,2019,A,4.0,4.0,0.0
Los Angeles,2020,A,4.0,4.0,0.0
Los Angeles,2021,A,4.0,,
Los Angeles,2022,A,3.0,4.0,1.0
Los Angeles,2023,A,3.0,5.0,2.0
Los Angeles,2024,A,,5.0,


In [0]:
# Render the combined population/shooting DataFrame for inspection.
display(overall_analysis_df)


county,year,race,population,total_population,%population_race,race_population_rank,shooting_count,total_shooting_count,mean_shooting,%shooting_race,race_shooting_rank,county_total_shooting,county_mean_shooting,county_race_standard_deviation,race_mean_shooting,rank_diff
Abbeville,2021,,,,,,1.0,1.0,1.0,100.0,1.0,1,1.0,0.0,1.0,
Abbeville,2024,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2017,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2016,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2019,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2015,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2018,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2022,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2023,,,,,,,,,,,1,1.0,0.0,1.0,
Abbeville,2020,,,,,,,,,,,1,1.0,0.0,1.0,


### Shooting crossing the mean

In [0]:
# fluctuation_years= 0
# above_mean_disparity = overall_analysis_df.withColumn("mean_flag", when((col("shooting_count").isNotNull()) & (col("shooting_count") >= col("mean_shooting")), 0).otherwise(1))

# # how many years of data we have for dispirated races
# disparity_window = Window.partitionBy("county", "race")
# disparity_races_df = above_mean_disparity.withColumn("disparity_years_count", sum("mean_flag").over(disparity_window))
# disparity_races_df = disparity_races_df.filter(col("disparity_years_count")==fluctuation_years)
# display(disparity_races_df)


## Disparity in a Certain Period, i.e., 2015-2023

In [0]:
# disparity_period_start, disparity_period_end = 2015, 2023
# disparity_years = disparity_period_end - disparity_period_start + 1

# # a window partitioned by county and race, ordered by year, get previous year's rank_diff
# # window_spec = Window.partitionBy("county", "race").orderBy("year")
# # disparity_rise_df = overall_analysis_df.withColumn("prev_rank_diff",coalesce(lag("rank_diff", 1).over(window_spec), col("rank_diff")))

# # display(overall_analysis_df)

# # Compute change in rank difference
# disparity_rise_df = overall_analysis_df.withColumn(
#     'is_rising_disparity',
#     when((col("%shooting_race")>=col("%population_race")) , 1).otherwise(0)
# )


# # Filter only years 2020-2023
# disparity_rise_df = disparity_rise_df.filter(col("year").between(disparity_period_start, disparity_period_end))

# # Aggregate per (county, race) and check if all values in 2020-2023 are True
# disparity_rise_df = disparity_rise_df.groupBy("county", "race").agg(
#     count("*").alias("total_years"),  # Count how many years exist (should be 4)
#     sum(col("is_rising_disparity").cast("int")).alias("true_count")
# ).withColumn(
#     "is_persistent_rising_disparity", 
#     (col("total_years") == disparity_years) & (col("true_count") == disparity_years)  # Ensure all years are present & True
# ).filter(col("is_persistent_rising_disparity")==True)
# disparity_rise_df = disparity_rise_df.select(col("county"), col("race"))

# # show the county and race having disparity
# display(disparity_rise_df)


# continuous_disparity_county_df = overall_analysis_df.join(disparity_rise_df, on=['county', 'race'], how='inner')
# change_continuous_disparity_county_df = continuous_disparity_county_df.filter(col("population").isNotNull())

# # Show results
# # display(continuous_disparity_county_df)

In [0]:
# disparity_period_start, disparity_period_end = 2015, 2023
# disparity_years = disparity_period_end - disparity_period_start + 1

# # a window partitioned by county and race, ordered by year, get previous year's rank_diff
# # window_spec = Window.partitionBy("county", "race").orderBy("year")
# # disparity_rise_df = overall_analysis_df.withColumn("prev_rank_diff",coalesce(lag("rank_diff", 1).over(window_spec), col("rank_diff")))

# # display(overall_analysis_df)

# # Compute change in rank difference
# disparity_rise_df = overall_analysis_df.withColumn(
#     'is_rising_disparity',
#     when((col("rank_diff")<0) & (col("%shooting_race")>=col("%population_race")), 1).otherwise(0)
# )

# display(disparity_rise_df.filter(col("county") == "Los Angeles"))

# # Filter only years 2020-2023
# disparity_rise_df = disparity_rise_df.filter(col("year").between(disparity_period_start, disparity_period_end))

# # Aggregate per (county, race) and check if all values in 2020-2023 are True
# disparity_rise_df = disparity_rise_df.groupBy("county", "race").agg(
#     count("*").alias("total_years"),  # Count how many years exist (should be 4)
#     sum(col("is_rising_disparity").cast("int")).alias("true_count")
# ).withColumn(
#     "is_persistent_rising_disparity", 
#     (col("total_years") == disparity_years) & (col("true_count") == disparity_years)  # Ensure all years are present & True
# ).filter(col("is_persistent_rising_disparity")==True)
# disparity_rise_df = disparity_rise_df.select(col("county"), col("race"))

# # show the county and race having disparity
# display(disparity_rise_df)


# continuous_disparity_county_df = overall_analysis_df.join(disparity_rise_df, on=['county', 'race'], how='inner')
# continuous_disparity_county_df = continuous_disparity_county_df.filter(col("population").isNotNull())

# # Show results
# display(continuous_disparity_county_df.select(col("county"), col("race"), col("county_race_standard_deviation"), col("race_mean_shooting"), col("county_mean_shooting")).distinct())



In [0]:
# Inclusive analysis period and the number of calendar years it spans.
disparity_period_start, disparity_period_end = 2015, 2023
disparity_years = disparity_period_end - disparity_period_start + 1

# Shooting rank minus population rank for the same (county, year, race).
overall_analysis_df = overall_analysis_df.withColumn(
    "rank_diff", col("race_shooting_rank") - col("race_population_rank")
)

# Previous row's rank_diff within each (county, race), ordered by year. The
# first row of a partition has no predecessor and falls back to its own value.
window_spec = Window.partitionBy("county", "race").orderBy("year")
previous_rank_diff = coalesce(lag("rank_diff", 1).over(window_spec), col("rank_diff"))
disparity_rise_df = overall_analysis_df.withColumn("prev_rank_diff", previous_rank_diff)

# Flag a row when rank_diff is negative and not larger than the previous one.
negative_and_not_recovering = (col("rank_diff") < 0) & (col("rank_diff") <= col("prev_rank_diff"))
disparity_rise_df = disparity_rise_df.withColumn(
    "is_rising_disparity", when(negative_and_not_recovering, 1).otherwise(0)
)

# Keep only the rows inside the analysis period (both bounds inclusive).
in_period = col("year").between(disparity_period_start, disparity_period_end)
disparity_rise_df = disparity_rise_df.filter(in_period)

# Per (county, race): the pair qualifies only when it has exactly
# `disparity_years` rows in the period and every one of them is flagged.
disparity_rise_df = (
    disparity_rise_df
    .groupBy("county", "race")
    .agg(
        count("*").alias("total_years"),
        sum(col("is_rising_disparity").cast("int")).alias("true_count"),
    )
    .withColumn(
        "is_persistent_rising_disparity",
        (col("total_years") == disparity_years) & (col("true_count") == disparity_years),
    )
    .filter(col("is_persistent_rising_disparity"))
    .select(col("county"), col("race"))
)

# Qualifying (county, race) pairs.
display(disparity_rise_df)

# All rows of the qualifying pairs that also have population data.
continuous_disparity_county_df = (
    overall_analysis_df
    .join(disparity_rise_df, on=["county", "race"], how="inner")
    .filter(col("population").isNotNull())
)

# One summary line per qualifying pair.
summary_columns = ["county", "race", "county_race_standard_deviation", "race_mean_shooting", "county_mean_shooting"]
display(continuous_disparity_county_df.select(*summary_columns).distinct())



county,race
Baltimore,B
Cook,B
Fulton,B


county,race,county_race_standard_deviation,race_mean_shooting,county_mean_shooting
Baltimore,B,2.3,3.1,4.1
Cook,B,3.7483329627982624,6.2,8.5
Fulton,B,2.3000000000000003,4.0,5.7


In [0]:
# Render every row kept for the (county, race) pairs in continuous_disparity_county_df.
display(continuous_disparity_county_df)

county,race,year,population,total_population,%population_race,race_population_rank,shooting_count,total_shooting_count,mean_shooting,%shooting_race,race_shooting_rank,county_total_shooting,county_mean_shooting,county_race_standard_deviation,race_mean_shooting,rank_diff
Baltimore,B,2023,253564,849586,29.85,2,1,2,1.0,50.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2019,235778,828018,28.47,2,7,10,3.333333333333333,70.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2022,252089,850737,29.63,2,3,4,2.0,75.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2015,219731,822959,26.7,2,3,5,2.5,60.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2020,239709,828193,28.94,2,3,4,2.0,75.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2017,228044,828637,27.52,2,2,4,2.0,50.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2018,231318,827625,27.95,2,2,2,2.0,100.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2021,247318,850702,29.07,2,7,7,7.0,100.0,1,41,4.1,2.3,3.1,-1
Baltimore,B,2016,223862,825666,27.11,2,2,2,2.0,100.0,1,41,4.1,2.3,3.1,-1
Cook,B,2020,1189074,5192136,22.9,3,5,8,4.0,62.5,1,85,8.5,3.7483329627982624,6.2,-2


### Line Charts for Disparity and Mean Shootings

In [0]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import plotly.io as pio
import tempfile
import shutil

def continuous_disparity_line_plot(county_df, county_name):
    """Plot three side-by-side yearly line charts for the races of one county.

    Panels (left to right):
      1. ``%population_race`` vs ``%shooting_race``
      2. ``shooting_count`` vs ``mean_shooting``
      3. ``shooting_count`` vs ``county_mean_shooting``

    Args:
        county_df: pandas DataFrame holding the rows of a single county, with
            the columns ``race``, ``year`` and the five plotted columns above.
        county_name: County label used in the figure title and in messages.

    Returns:
        None. The figure is rendered with ``fig.show()``. Nothing is drawn
        when ``county_df`` contains no recognised race code.
    """
    race_mapping = {
        "W": "White", "B": "Black", "A": "Asian", "N": "Native American",
        "H": "Hispanic", "N;H": "Native American: Hispanic", "W;H": "White: Hispanic",
        "B;H": "Black: Hispanic", "ToM": "Two or More Races", "O": "Other", "NA": "Not Recorded",
        # The other plotting cells of this notebook use "UN" for the same
        # category, so accept that code here as well.
        "UN": "Not Recorded",
    }
    panel_titles = ("Population % vs Shooting %", "Count vs Yearly Mean Shooting", "Count vs Overall Mean")

    # Decide up front which races can be labelled; the title is built from this
    # list, so an unknown or missing code can no longer break it after the loop.
    known_races = []
    for race_code in county_df["race"].unique().tolist():
        if race_code in race_mapping:
            known_races.append(race_code)
        else:
            print(f"Invalid race code: {race_code}")

    if not known_races:
        print(f"No data found for {county_name}")
        return

    # All races share the same three panels; only disambiguate the legend
    # entries when more than one race is actually drawn.
    several_races = len(known_races) > 1

    fig = make_subplots(rows=1, cols=len(panel_titles), subplot_titles=panel_titles)

    # (panel column, y column, legend name, line style), in drawing order.
    series_specs = [
        (1, "%population_race", "Population%", dict(color="blue")),
        (1, "%shooting_race", "Shooting%", dict(color="red", dash="dash")),
        (2, "shooting_count", "Shooting Count", dict(color="orange", dash="dash")),
        (2, "mean_shooting", "Mean Shooting", dict(color="green")),
        (3, "shooting_count", "Shooting Count", dict(color="orange", dash="dash")),
        (3, "county_mean_shooting", "County Mean", dict(color="green")),
    ]

    for race_code in known_races:
        race_data = county_df[county_df["race"] == race_code].sort_values("year")
        legend_suffix = f" ({race_mapping[race_code]})" if several_races else ""

        for panel_col, y_column, label, line_style in series_specs:
            fig.add_trace(
                go.Scatter(
                    x=race_data["year"], y=race_data[y_column],
                    mode="lines+markers", name=f"{label}{legend_suffix}", line=line_style,
                ),
                row=1, col=panel_col,
            )

    # Name every plotted race in the title, not just the last one iterated.
    race_labels = ", ".join(race_mapping[code] for code in known_races)
    fig.update_layout(
        title=f"{county_name} - {race_labels} Racial Disparity",
        width=1100, height=320,
        legend_title="Category",
        xaxis_title="Year",   # first panel
        xaxis2_title="Year",  # second panel
        xaxis3_title="Year",  # third panel
        yaxis_title="Percentage (%)",  # first panel: population % and shooting %
        yaxis2_title="Shooting",       # second panel: count vs yearly mean
        yaxis3_title="Mean",           # third panel: count vs county mean
    )
    fig.show()

    # # Save the figure to a temporary file in Databricks
    # with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_file:
    #     file_path = temp_file.name
    #     pio.write_html(fig, file_path)

    # # Move the file to DBFS so you can download it
    # dbfs_path = f"dbfs:/FileStore/tables"
    # shutil.move(file_path, dbfs_path)
    
    # return dbfs_path

# Bring the Spark result to pandas (ordered by county) and draw one figure per
# county, in the order the counties first appear.
continuous_disparity_county_df = continuous_disparity_county_df.orderBy("county")
pandas_disparity_df = continuous_disparity_county_df.toPandas()
county_list = pandas_disparity_df["county"].unique().tolist()

for county_name in county_list:
    is_this_county = pandas_disparity_df["county"] == county_name
    # Rows are put in year order and missing values are replaced with 0 before plotting.
    county_df = pandas_disparity_df.loc[is_this_county].sort_values("year").fillna(0)
    continuous_disparity_line_plot(county_df, county_name=county_name)



### Mean Shooting of Each Race Per Year

In [0]:
import warnings
# Ignore warnings
warnings.simplefilter("ignore")

# First import plotly express as px
import plotly.express as px

def continuous_disparity_line_plot(df, county_name):
    """Plot monthly shooting counts per race for one county as a line chart.

    Args:
        df: pandas DataFrame with at least the columns ``county``, ``race``,
            ``shooting_count`` and ``month_year``. ``month_year`` is parsed as
            ``"<month>-<year>"``; both parts must be convertible to int.
        county_name: County to plot; rows of other counties are ignored.

    Returns:
        None. The figure is rendered with ``fig.show()``. Nothing is drawn
        when ``df`` has no rows for ``county_name``.
    """
    # Line colour per race code. Codes missing from this map are coloured from
    # plotly's default colour sequence.
    color_map = {
        "W": "blue",    # White
        "B": "red",     # Black
        "A": "green",   # Asian
        "H": "orange",  # Hispanic
        "O": "black",   # Other
        "UN": "gray",   # Not Recorded
    }

    # Take an explicit copy of the county's rows: new columns are added below,
    # and writing to a boolean-mask selection of the caller's frame is a
    # chained assignment.
    df = df[df["county"] == county_name].copy()
    if df.empty:
        print(f"No data found for {county_name}")
        return

    # Split "<month>-<year>" so the rows can be sorted chronologically.
    month_year_parts = df["month_year"].str.split("-")
    df["year"] = month_year_parts.str[1].astype("int")
    df["month"] = month_year_parts.str[0].astype("int")

    df = df.sort_values(["year", "month"], ascending=[True, True])
    df = df.fillna(0)  # or use method="ffill" to forward-fill

    # month_year is a string, so the x axis is categorical and would otherwise
    # follow first appearance across the per-race traces. A race with missing
    # months would then push later months ahead of earlier ones. Pin the axis
    # to the chronological order established by the sort above.
    month_order = df["month_year"].drop_duplicates().tolist()

    fig = px.line(
        df, x="month_year", y=["shooting_count"], color="race",
        color_discrete_map=color_map,
        category_orders={"month_year": month_order},
        markers=True, title=f"{county_name} - Racial Disparity",
    )

    # Customize axis labels
    fig.update_layout(
        xaxis_title="Month-Year",
        yaxis_title="Shooting",
        legend_title="Races"  # Legend title
    )
    fig.show()


# Monthly shooting data, restricted to the counties present in disparity_rise_df.
shooting_race_county_by_month_df = spark.table("default.percentage_shooting_race_per_county_by_month_year")

# disparity_rise_df carries county names normalized with initcap(trim(...)).
# Apply the same normalization to the monthly table before joining; otherwise
# any county whose raw spelling differs (surrounding whitespace, different
# capitalisation) silently drops out of the inner join below.
shooting_race_county_by_month_df = shooting_race_county_by_month_df.withColumn(
    "county", initcap(trim(col("county")))
)

disparity_rise_counties = disparity_rise_df.select(col("county")).distinct()
disparity_counties = shooting_race_county_by_month_df.join(disparity_rise_counties, on="county", how="inner")
disparity_counties = disparity_counties.orderBy("county")

# One figure per county, in the order the counties first appear.
pandas_disparity_df = disparity_counties.toPandas()
county_list = pandas_disparity_df["county"].unique().tolist()
for county_name in county_list:
    continuous_disparity_line_plot(pandas_disparity_df, county_name=county_name)

## Detect Disparity County with Minimum Year and Races

- min_year_disparity = 6
- min_races_disparity = 2

In [0]:

# Thresholds: a race qualifies in a county once it has at least
# `min_year_disparity` matching rows; a county is kept when at least
# `min_races_disparity` of its races qualify.
min_year_disparity = 6
min_races_disparity = 2

# Rows with a recorded shooting count and a negative rank_diff.
has_negative_rank_diff = col("rank_diff") < 0
has_shooting_count = col("shooting_count").isNotNull()
disparity_races_df = overall_analysis_df.filter(has_negative_rank_diff & has_shooting_count)

# Number of such rows (counting non-null `year` values) per (county, race).
disparity_window = Window.partitionBy("county", "race")
disparity_races_df = disparity_races_df.withColumn(
    "disparity_years_count", count("year").over(disparity_window)
)

# (county, race) pairs that reach the minimum number of rows.
county_of_races_disparity = (
    disparity_races_df
    .filter(col("disparity_years_count") >= min_year_disparity)
    .select(col("county"), col("year"), col("race"), col("disparity_years_count"))
)
display(county_of_races_disparity)

# Per county: how many distinct races qualified, and which ones.
county_with_count_races_disparity = (
    county_of_races_disparity
    .groupBy("county")
    .agg(
        countDistinct("race").alias("count_of_races_disparity"),
        collect_set("race").alias("distinct_races"),
    )
    .orderBy(col("count_of_races_disparity").desc())
)

# Back to one row per (county, race), keeping the per-county race count.
disparity_county_race_df = county_with_count_races_disparity.select(
    "county",
    "count_of_races_disparity",
    explode("distinct_races").alias("race"),
)

# Counties where at least `min_races_disparity` races qualified.
county_with_particular_disparity_races = disparity_county_race_df.filter(
    col("count_of_races_disparity") >= min_races_disparity
)

# All rows of the selected (county, race) pairs that have population data.
county_disparity_races_count_df = (
    overall_analysis_df
    .join(county_with_particular_disparity_races, on=["county", "race"], how="inner")
    .filter(col("total_population").isNotNull())
)

# Summary per (county, race): mean race and county shooting figures, mean
# population share, and the race mean as a percentage of the county mean.
race_mean_expr = mean(col("race_mean_shooting"))
county_mean_expr = mean(col("county_mean_shooting"))
county_race_summary_df = (
    county_disparity_races_count_df
    .groupBy(col("county"), col("race"))
    .agg(
        round(race_mean_expr, 2).alias("race_mean_shooting"),
        round(county_mean_expr, 2).alias("county_mean_shooting"),
        round(mean(col("%population_race")), 2).alias("mean_population_race"),
        round((race_mean_expr / county_mean_expr * 100), 2).alias("%race_mean_shooting"),
    )
    .distinct()
)
display(county_race_summary_df)



county,year,race,disparity_years_count
Adams,2019,B,6
Adams,2020,B,6
Adams,2022,B,6
Adams,2015,B,6
Adams,2017,B,6
Adams,2023,B,6
Alameda,2018,B,7
Alameda,2021,B,7
Alameda,2022,B,7
Alameda,2015,B,7


county,race,race_mean_shooting,county_mean_shooting,mean_population_race,%race_mean_shooting
Alameda,B,2.0,4.7,10.38,42.55
Alameda,H,1.86,4.7,22.52,39.57
El Paso,B,1.14,7.4,4.24,15.41
El Paso,W,3.2,7.4,37.7,43.24
Maricopa,B,3.2,27.6,5.24,11.59
Maricopa,N,1.29,27.6,0.19,4.67
Miami-dade,B,3.0,8.4,15.77,35.71
Miami-dade,W,2.11,8.4,13.63,25.12
Orange,B,3.2,13.6,7.31,23.53
Orange,H,4.44,13.6,30.74,32.65


In [0]:
# Render the full rows behind the per-county, per-race summary.
display(county_disparity_races_count_df)

county,race,year,population,total_population,%population_race,race_population_rank,shooting_count,total_shooting_count,mean_shooting,%shooting_race,race_shooting_rank,county_total_shooting,county_mean_shooting,county_race_standard_deviation,race_mean_shooting,rank_diff,count_of_races_disparity
Alameda,B,2016,176819,1605217,11.02,5,,4,1.3333333333333333,,,47,4.7,3.1854580294116035,2.0,,2
Alameda,B,2022,165390,1663823,9.94,4,1.0,2,1.0,50.0,1.0,47,4.7,3.1854580294116035,2.0,-3.0,2
Alameda,B,2021,166017,1673133,9.92,4,2.0,7,2.333333333333333,28.57,2.0,47,4.7,3.1854580294116035,2.0,-2.0,2
Alameda,B,2020,167316,1661584,10.07,4,1.0,6,1.5,16.67,2.0,47,4.7,3.1854580294116035,2.0,-2.0,2
Alameda,B,2019,171168,1656754,10.33,4,,2,1.0,,,47,4.7,3.1854580294116035,2.0,,2
Alameda,B,2023,159042,1651949,9.63,4,1.0,5,1.6666666666666667,20.0,2.0,47,4.7,3.1854580294116035,2.0,-2.0,2
Alameda,B,2018,172329,1643700,10.48,4,1.0,4,1.0,25.0,1.0,47,4.7,3.1854580294116035,2.0,-3.0,2
Alameda,B,2015,179400,1584983,11.32,5,6.0,9,3.0,66.67,1.0,47,4.7,3.1854580294116035,2.0,-4.0,2
Alameda,B,2017,175063,1629615,10.74,4,2.0,7,2.333333333333333,28.57,2.0,47,4.7,3.1854580294116035,2.0,-2.0,2
Alameda,H,2016,362070,1605217,22.56,3,2.0,4,1.3333333333333333,50.0,1.0,47,4.7,2.962383210476708,1.86,-2.0,2


### County with Multiple Disparities in History

In [0]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

def plot_disparity_for_county_race(county_df, county_name):
    """Draw population share vs shooting share per year, one subplot per race.

    Args:
        county_df: pandas DataFrame for one county with the columns ``race``,
            ``year``, ``%population_race`` and ``%shooting_race``.
        county_name: County label used in the figure title.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    race_mapping = {
        "W": "White",
        "B": "Black",
        "A": "Asian",
        "N": "Native American",
        "H": "Hispanic",
        "N;H": "Native American: Hispanic",
        "W;H": "White: Hispanic",
        "B;H": "Black: Hispanic",
        "ToM": "Two or More Races",
        "O": "Other",
        "UN": "Unknown"
    }

    # Grid: two subplots per row, one cell per race code found in the data.
    races_list = county_df["race"].unique().tolist()
    num_cols = 2
    num_rows = math.ceil(len(races_list) / num_cols)
    panel_titles = [race_mapping.get(code, code) for code in races_list]
    fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=panel_titles)

    # (y column, legend name, line style), in drawing order.
    series_specs = (
        ("%population_race", "Population%", dict(color="blue")),
        ("%shooting_race", "Shooting%", dict(color="red", dash="dash")),
    )

    county_df = county_df.sort_values("race")
    for position, race_code in enumerate(races_list):
        # Codes without a label keep their (empty) grid cell but get no traces.
        if race_code not in race_mapping:
            continue

        race_data = county_df[county_df["race"] == race_code].sort_values("year")
        if race_data.empty:
            continue

        # Zero-based position -> one-based (row, column) in the subplot grid.
        zero_row, zero_col = divmod(position, num_cols)
        grid_row, grid_col = zero_row + 1, zero_col + 1

        for y_column, label, line_style in series_specs:
            trace = go.Scatter(
                x=race_data["year"], y=race_data[y_column],
                mode="lines+markers", name=label,
                line=line_style, legendgroup=race_code,
            )
            fig.add_trace(trace, row=grid_row, col=grid_col)

        fig.update_xaxes(title_text="Year", row=grid_row, col=grid_col)
        fig.update_yaxes(title_text="Percentage(%)", row=grid_row, col=grid_col)

    fig.update_layout(
        title=f"{county_name} - Population% vs Shooting%",
        height=num_rows * 300, width=1000,
        showlegend=True
    )

    fig.show()


# all_disparity_df
# Bring the selected rows to pandas (ordered by county) and keep only the
# columns the percentage plot needs.
county_disparity_races_count_df = county_disparity_races_count_df.orderBy("county")
pandas_disparity_df = county_disparity_races_count_df.toPandas()
pandas_disparity_df = pandas_disparity_df[["county", "year", "race", "%population_race", "%shooting_race"]]

counties_list = pandas_disparity_df["county"].unique().tolist()
for county in counties_list:
    # Explicit copy: the column is overwritten on the next line, and assigning
    # into a boolean-mask selection is a chained assignment.
    county_disparity_df = pandas_disparity_df[pandas_disparity_df["county"] == county].copy()
    # Years without a shooting percentage are drawn as 0.
    county_disparity_df["%shooting_race"] = county_disparity_df["%shooting_race"].fillna(0)
    plot_disparity_for_county_race(county_disparity_df, county)



In [0]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

def plot_disparity_for_county_race(county_df, county_name):
    """Draw yearly shooting count vs mean shooting, one stacked subplot per race.

    Args:
        county_df: pandas DataFrame for one county with the columns ``race``,
            ``year``, ``shooting_count`` and ``mean_shooting``.
        county_name: County label used in the figure title and in messages.

    Returns:
        None. The figure is rendered with ``fig.show()``. Nothing is drawn
        when ``county_df`` contains no recognised race code.
    """
    race_mapping = {
        "W": "White",
        "B": "Black",
        "A": "Asian",
        "N": "Native American",
        "H": "Hispanic",
        "N;H": "Native American: Hispanic",
        "W;H": "White: Hispanic",
        "B;H": "Black: Hispanic",
        "ToM": "Two or More Races",
        "O": "Other",
        "UN": "Unknown"
    }

    # Only races with a known label are plotted. Building the list first means
    # the grid has no empty cells and the title never indexes race_mapping with
    # an unknown code or a loop variable that was never bound.
    races_list = [code for code in county_df["race"].unique().tolist() if code in race_mapping]
    if not races_list:
        print(f"No recognised race codes for {county_name}; nothing to plot")
        return

    race_labels = [race_mapping[code] for code in races_list]
    num_cols = 1  # one plot per row
    num_rows = math.ceil(len(races_list) / num_cols)

    # One title per subplot, so every race panel is identified, not just the first.
    fig = make_subplots(
        rows=num_rows, cols=num_cols,
        subplot_titles=[f"{label} - Count vs County Mean" for label in race_labels],
    )

    for idx, race_code in enumerate(races_list):
        race_data = county_df[county_df["race"] == race_code].sort_values("year")

        # Zero-based position -> one-based (row, column) in the subplot grid.
        row = (idx // num_cols) + 1
        col = (idx % num_cols) + 1

        # Shooting count line
        fig.add_trace(go.Scatter(
            x=race_data["year"], y=race_data["shooting_count"],
            mode="lines+markers", name="Shooting Count",
            line=dict(color="orange", dash="dash"), legendgroup=race_code
        ), row=row, col=col)

        # Mean line.
        # NOTE(review): the legend and panel caption say "County Mean" but the
        # plotted column is `mean_shooting`, not `county_mean_shooting`.
        # Confirm which of the two is intended.
        fig.add_trace(go.Scatter(
            x=race_data["year"], y=race_data["mean_shooting"],
            mode="lines+markers", name="County Mean",
            line=dict(color="green"), legendgroup=race_code
        ), row=row, col=col)

        # Axis titles for this subplot
        fig.update_xaxes(title_text="Year", row=row, col=col)
        fig.update_yaxes(title_text="Shooting", row=row, col=col)

    # The title names every plotted race rather than only the last one iterated.
    fig.update_layout(
        title=f"{county_name} - {', '.join(race_labels)} Racial Disparity",
        height=num_rows * 400, width=1000,
        showlegend=True
    )

    fig.show()

# shooting_count, mean_shooting, county_mean_shooting, 

# all_disparity_df
# Bring the selected rows to pandas (ordered by county); all columns are kept
# because the count plot reads `shooting_count` and `mean_shooting`.
county_disparity_races_count_df = county_disparity_races_count_df.orderBy("county")
pandas_disparity_df = county_disparity_races_count_df.toPandas()
# pandas_disparity_df = pandas_disparity_df[["county", "year", "race", "%population_race", "%shooting_race"]]

counties_list = pandas_disparity_df["county"].unique().tolist()
for county in counties_list:
    # Explicit copy: the column is overwritten on the next line, and assigning
    # into a boolean-mask selection is a chained assignment.
    county_disparity_df = pandas_disparity_df[pandas_disparity_df["county"] == county].copy()
    county_disparity_df["%shooting_race"] = county_disparity_df["%shooting_race"].fillna(0)
    plot_disparity_for_county_race(county_disparity_df, county)

