In [0]:
from pyspark.sql.functions import col, regexp_replace, when, lit, concat, mean


In [0]:
# Load the raw fatal-police-shootings table and show its row count as the
# cell output (used further down as the reference total).
raw_table_name = "fatal_police_shootings_data_csv"
raw_police_shooting_df = spark.table(raw_table_name)
raw_police_shooting_df.count()


10429

In [0]:
# Load the three county-imputed source tables that are unioned later on.
filled_county_default_df = spark.table("default.police_shooting_data_filled_county_default_data")
city_state_imputed_df = spark.table("default.all_filled_county_by_city_state_df_default_data")
mis_spelled_imputed_df = spark.table("default.all_filled_misspelled_city_counties_df_default_data")

# Louisiana rows whose county value does not already contain "Parish" get
# " Parish" appended; every other row keeps its county unchanged.
# Only `filled_county_default_df` is normalised here.
needs_parish_suffix = (col("state") == "LA") & ~col("county").contains("Parish")
county_with_parish = when(
    needs_parish_suffix,
    concat(col("county"), lit(" Parish")),
).otherwise(col("county"))

filled_county_default_df = filled_county_default_df.withColumn("county", county_with_parish)



In [0]:
# Spot-check: rows recorded for St. Louis county in Missouri.
is_st_louis_mo = (col("state") == "MO") & (col("county") == "St. Louis")
display(filled_county_default_df.where(is_st_louis_mo))

id,date,threat_type,flee_status,armed_with,city,county,state,latitude,longitude,location_precision,name,age,gender,race,race_source,was_mental_illness_related,body_camera,agency_ids,year
117,3/2/2015,point,foot,gun,St. Louis,St. Louis,MO,38.74984,-90.20953,not_available,Ledarius D. Williams,23,male,B,not_available,False,False,134,2015
382,17/4/2015,threat,not,knife,Jennings,St. Louis,MO,38.724247,-90.24393,not_available,Thaddeus McCarroll,23,male,B,not_available,True,True,268,2015
985,5/11/2015,attack,not,blunt_object,Lakeshire,St. Louis,MO,38.538166,-90.34138,not_available,Jacob Hohman,30,male,W,not_available,False,False,906;268,2015
1061,5/12/2015,shoot,not,gun,St. Louis,St. Louis,MO,38.811077,-90.23915,not_available,Sheilah Huck,61,female,W,not_available,True,False,268,2015
2241,19/1/2017,attack,not,knife,Florissant,St. Louis,MO,38.79717,-90.299416,not_available,Elijah Smith,25,male,W,not_available,True,False,1620,2017
2441,19/3/2017,point,other,gun,Affton,St. Louis,MO,38.543407,-90.29839,not_available,Clifton Knickmeyer,59,male,W,not_available,False,False,268,2017
2798,24/7/2017,shoot,car,gun,St. Louis,St. Louis,MO,38.511242,-90.33596,not_available,Jerrod Kershaw,31,male,W,not_available,True,False,1817,2017
3022,1/10/2017,threat,not,gun;knife,St. Louis,St. Louis,MO,38.5694,-90.31096,not_available,Joshua Lanflisi,38,male,W,not_available,True,False,268,2017
3446,26/2/2018,shoot,not,gun,St. Louis,St. Louis,MO,38.7522,-90.2083,not_available,William Watson Jr.,40,male,B,not_available,False,False,2080,2018
8724,7/12/2022,threat,not,knife,Maryland Heights,St. Louis,MO,38.753838,-90.47093,address,Mark David Davenport,48,male,W,photo,False,False,28800,2022


In [0]:
# Row count of each imputed source table, printed one per line.
for source_df in (filled_county_default_df, city_state_imputed_df, mis_spelled_imputed_df):
    print(source_df.count())

5737
4677
15


In [0]:
# Combined row count of the three imputed sources, for comparison with the
# raw table's row count.
default_rows = filled_county_default_df.count()
city_state_rows = city_state_imputed_df.count()
misspelled_rows = mis_spelled_imputed_df.count()
print(default_rows + city_state_rows + misspelled_rows)

10429


In [0]:
# Stack the three imputed sources (columns matched by name), then rewrite a
# leading "Saint" in the county name as "St.".
# NOTE(review): the " Parish" suffix for LA was applied to
# `filled_county_default_df` only — confirm the other two sources already
# carry it.
final_imputed_df = (
    filled_county_default_df
    .unionByName(city_state_imputed_df)
    .unionByName(mis_spelled_imputed_df)
    .withColumn("county", regexp_replace(col("county"), r"^Saint\b", "St."))
)


In [0]:
# Number of rows that ended up with a county value after imputation.
has_county = col("county").isNotNull()
final_imputed_df.where(has_county).count()

10429

In [0]:
# Persist the unioned, county-imputed data as a Delta table, replacing any
# previous contents.
(
    final_imputed_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("default.imputed_county_washington_police_shootings_data")
)

## Yearly Shooting Calculations

In [0]:
from pyspark.sql.functions import col, regexp_replace

# display(final_imputed_df)

## Distribution Of Race Shootings Per County By Year

In [0]:
from pyspark.sql.functions import to_date, col, trim, regexp_replace, count, sum, round, rank, initcap, split, dense_rank
from pyspark.sql.functions import col, when, to_date, split, length
from pyspark.sql.functions import col, lpad, concat_ws, split, to_date
from pyspark.sql.window import Window


### Count shootings for each race by county and year

In [0]:
# Race codes that list several races are collapsed into one "ToM" bucket
# (presumably "two or more" — confirm). Only these three combinations are
# remapped; any other combined code is kept as-is.
multi_race_codes = ["W;A", "W;B", "W;B;N"]
race_bucket = when(col("race").isin(multi_race_codes), "ToM").otherwise(col("race"))

imputed_county_shooting_df = (
    spark.table("default.imputed_county_washington_police_shootings_data")
    .withColumn("race", race_bucket)
    # Replace null values in the 'race' column (and only that column) with "NA"
    .fillna({"race": "NA"})
)

# One row per (county, year, race) with the number of non-null ids.
# NOTE(review): `state` is not part of the grouping keys, so identically
# named counties in different states are counted together — confirm intended.
shooting_per_races_df = (
    imputed_county_shooting_df
    .groupBy("county", "year", "race")
    .agg(count("id").alias("shooting_count"))
)


In [0]:
# Spot-check: per-year counts for race "B" in a hand-picked set of counties.
counties_of_interest = [
    "Richland", "Fulton", "Cook", "San Joaquin", "El Paso",
    "San Bernardino", "Baltimore", "New York", "Hennepin", "Cuyahoga",
]
in_selected_counties = col("county").isin(counties_of_interest)
display(shooting_per_races_df.filter(in_selected_counties & (col("race") == "B")))

county,year,race,shooting_count
Baltimore,2023,B,1
Fulton,2021,B,5
Cook,2020,B,5
Baltimore,2024,B,1
Fulton,2024,B,3
Baltimore,2019,B,7
Baltimore,2022,B,3
Richland,2022,B,1
Fulton,2022,B,5
New York,2015,B,1


### Add rows for missing years for each race (shooting count left as null)

In [0]:
from pyspark.sql.functions import expr

# Years covered by the analysis, as strings because they double as the
# pivoted column names and as the literal year values in the stack expression.
years = [str(y) for y in range(2015, 2025)]

# Pivot to one row per (county, race) with one column per year.
# The year list is passed to pivot() explicitly so that every listed year is
# guaranteed to exist as a column (all-null when the data has no rows for it).
# Without it, a year absent from the data would make the stack expression
# below fail on an unresolved column, and Spark would also need an extra pass
# over the data to discover the distinct years.
pivot_df = (
    shooting_per_races_df
    .groupBy("county", "race")
    .pivot("year", years)
    .agg({"shooting_count": "sum"})
)

# Build "stack(N, '2015', `2015`, ...) as (year, shooting_count)" to turn the
# year columns back into rows.
stack_expr = f"stack({len(years)}, " + ", ".join([f"'{year}', `{year}`" for year in years]) + ") as (year, shooting_count)"

# Unpivot: every (county, race) now has one row per year; years with no
# recorded shootings carry a null shooting_count.
unpivot_shooting_race_df = pivot_df.selectExpr("county", "race", stack_expr)
# unpivot_shooting_race_df = unpivot_shooting_race_df.fillna(subset=["shooting_count"], value=0)


In [0]:
# Window definitions used for the per-county metrics below.
# NOTE(review): none of these partitions include a state column, so counties
# sharing a name across states fall into the same partition — confirm intended.
window_spec = Window.partitionBy("county", "year")       # all races of a county in one year
mean_window = Window.partitionBy("county")               # every row of a county
race_mean_window = Window.partitionBy("county", "race")  # one race of a county across years

# `sum` and `round` here are the pyspark.sql.functions versions imported in
# an earlier cell, not the Python builtins.
count_col = col("shooting_count")
total_col = col("total_shooting_count")

# Percentage is only produced when both the race count and the county/year
# total are present; otherwise it stays null.
has_count_and_total = total_col.isNotNull() & count_col.isNotNull()
pct_of_year_total = when(
    has_count_and_total,
    round((count_col / total_col) * 100, 2),
).otherwise(lit(None))

unpivot_shooting_race_df = (
    unpivot_shooting_race_df
    # Total and mean shootings per county and year (across races)
    .withColumn("total_shooting_count", sum("shooting_count").over(window_spec))
    .withColumn("mean_shooting", mean("shooting_count").over(window_spec))
    # County-wide total of the race counts, and mean of the yearly totals
    .withColumn("county_total_shooting", round(sum(count_col).over(mean_window), 2))
    .withColumn("county_mean_shooting", round(mean(total_col).over(mean_window), 2))
    # Mean yearly count for each race within a county
    .withColumn("race_mean_shooting", round(mean(count_col).over(race_mean_window), 2))
    # Share of the county/year total attributed to each race, in percent
    .withColumn("%shooting_race", pct_of_year_total)
)


# display(unpivot_shooting_race_df)


In [0]:

# Rank races within each county and year by their share of shootings,
# highest share first; null shares sort last in the window.
pct_col = col("%shooting_race")
window_race = Window.partitionBy("county", "year").orderBy(pct_col.desc_nulls_last())

# Rows with a null share get no rank (the when() has no otherwise branch,
# so the rank column is null for them).
rank_when_share_present = when(pct_col.isNotNull(), dense_rank().over(window_race))

shooting_per_races_df = unpivot_shooting_race_df.withColumn(
    "race_shooting_rank", rank_when_share_present
)

In [0]:


# Spot-check: computed metrics for Cook county, ordered by race and year.
cook_columns = [
    "county", "year", "race", "shooting_count", "total_shooting_count",
    "%shooting_race", "mean_shooting", "county_mean_shooting", "race_mean_shooting",
]
cook_rows = shooting_per_races_df.where(col("county") == "Cook").select(*cook_columns)
display(cook_rows.orderBy("county", "race", "year"))


county,year,race,shooting_count,total_shooting_count,%shooting_race,mean_shooting,county_mean_shooting,race_mean_shooting
Cook,2015,B,9.0,11,81.82,3.6666666666666665,8.5,6.2
Cook,2016,B,12.0,15,80.0,5.0,8.5,6.2
Cook,2017,B,5.0,10,50.0,3.333333333333333,8.5,6.2
Cook,2018,B,6.0,6,100.0,6.0,8.5,6.2
Cook,2019,B,5.0,6,83.33,3.0,8.5,6.2
Cook,2020,B,5.0,8,62.5,4.0,8.5,6.2
Cook,2021,B,10.0,13,76.92,4.333333333333333,8.5,6.2
Cook,2022,B,4.0,4,100.0,4.0,8.5,6.2
Cook,2023,B,4.0,8,50.0,2.0,8.5,6.2
Cook,2024,B,2.0,4,50.0,1.3333333333333333,8.5,6.2


In [0]:
# Persist the per-county, per-year race metrics as a Delta table, replacing
# both the data and the schema of any previous version.
(
    shooting_per_races_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("default.percentage_shooting_race_per_county_by_year")
)