### Install Geopy Library

In [0]:
!pip install geopy

Collecting geopy
  Obtaining dependency information for geopy from https://files.pythonhosted.org/packages/e5/15/cf2a69ade4b194aa524ac75112d5caac37414b20a3a03e6865dfe0bd1539/geopy-2.4.1-py3-none-any.whl.metadata
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Obtaining dependency information for geographiclib<3,>=1.52 from https://files.pythonhosted.org/packages/9f/5a/a26132406f1f40cf51ea349a5f11b0a46cec02a2031ff82e391c2537247a/geographiclib-2.0-py3-none-any.whl.metadata
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/125.4 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading geographiclib-2.0-py3-none-any.whl (40 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [0]:
# Restart the notebook's Python process via the Databricks utilities.
# NOTE(review): presumably required so the geopy package installed in the
# previous cell becomes importable — confirm against the runtime in use.
dbutils.library.restartPython()

### Methods to get county by coordinates or by city/state combination

In [0]:
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from pyspark.sql.functions import udf, pandas_udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, pandas_udf, col, trim, regexp_replace
import pandas as pd
import time

def get_county_by_lati_longi(latitude, longitude):
    """Reverse-geocode a coordinate pair to a county name using Nominatim.

    Args:
        latitude: Latitude of the point (numeric); may be None/NaN.
        longitude: Longitude of the point (numeric); may be None/NaN.

    Returns:
        The ``county`` entry of the reverse-geocoded address with the word
        "County" removed and surrounding whitespace stripped. If there is no
        ``county`` entry, the ``state_district`` entry is returned instead.
        Returns ``None`` when a coordinate is missing, when nothing is found,
        or when geocoding fails (failures are printed, not raised).
    """
    # A missing coordinate can never resolve; skip the network round-trip.
    if pd.isna(latitude) or pd.isna(longitude):
        return None

    geolocator = Nominatim(user_agent="shooting_data_geocoding", timeout=10)
    # NOTE(review): a fresh RateLimiter is built on every call, so the
    # 1-second spacing likely does not carry over between calls — confirm.
    geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)  # Delay of 1 second per request

    try:
        location = geocode((latitude, longitude))
        if not location:
            return None

        # .get() mirrors get_county_by_city_state and tolerates a response
        # that carries no address details.
        address = location.raw.get('address', {})
        if 'county' in address:
            # Strip so "Pike County" becomes "Pike" rather than "Pike ".
            return address['county'].replace("County", "").strip()
        if 'state_district' in address:
            return address['state_district']
        return None
    except GeocoderTimedOut:
        print(f"Timeout for coordinates: {latitude}, {longitude}")
        return None
    except Exception as e:
        # Best-effort lookup: report the failure and leave the county empty.
        print(f"Error geocoding {latitude}, {longitude}: {e}")
        return None


def get_county_by_city_state(city, state):
    """Resolve the county for a US city/state pair using Nominatim.

    The city is first forward-geocoded to coordinates, which are then
    reverse-geocoded to obtain structured address details.

    Args:
        city: City name; may be None/NaN.
        state: State name or abbreviation; may be None/NaN.

    Returns:
        The ``county`` entry of the reverse-geocoded address, falling back to
        its ``city`` entry when no county is present. The value is returned as
        provided by the geocoder (any "County" suffix is not removed here).
        Returns ``None`` when city or state is missing, when either lookup
        finds nothing, or when geocoding fails (failures are printed, not
        raised).
    """
    # Without both parts the query would contain the literal text "None"/"nan".
    if pd.isna(city) or pd.isna(state):
        return None

    geolocator = Nominatim(user_agent="shooting_data_geocoding", timeout=10)
    # NOTE(review): fresh RateLimiters are built on every call, so the
    # 1-second spacing likely does not carry over between calls — confirm.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)  # Delay of 1 second per request
    reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)  # For reverse geocoding

    try:
        # First, get the coordinates of the location
        location = geocode(f"{city}, {state}, United States")
        if not location:
            return None
        print(f"Coordinates for {city}, {state}: {location.latitude}, {location.longitude}")

        # Perform reverse geocoding using coordinates
        reverse_location = reverse((location.latitude, location.longitude))
        if not reverse_location:
            return None

        address = reverse_location.raw.get('address', {})
        return address.get('county') or address.get('city')
    except GeocoderTimedOut:
        print(f"Timeout for location: {city}, {state}")
        return None
    except Exception as e:
        # Same best-effort policy as get_county_by_lati_longi: a single bad
        # row must not abort the whole job.
        print(f"Error geocoding {city}, {state}: {e}")
        return None

@pandas_udf(StringType())
def get_county_by_lati_longi_udf(latitude_series: pd.Series, longitude_series: pd.Series) -> pd.Series:
    """Pandas UDF that applies get_county_by_lati_longi pairwise.

    Args:
        latitude_series: Batch of latitude values.
        longitude_series: Batch of longitude values, aligned with the latitudes.

    Returns:
        A Series holding the result of get_county_by_lati_longi for each
        (latitude, longitude) pair; declared to Spark as a string column.
    """
    # Series -> Series type hints let Spark infer a scalar pandas UDF
    # without relying on the legacy untyped form.
    return latitude_series.combine(longitude_series, get_county_by_lati_longi)


@pandas_udf(StringType())
def get_county_by_city_state_udf(city_series: pd.Series, state_series: pd.Series) -> pd.Series:
    """Pandas UDF that applies get_county_by_city_state pairwise.

    Args:
        city_series: Batch of city names.
        state_series: Batch of state values, aligned with the cities.

    Returns:
        A Series holding the result of get_county_by_city_state for each
        (city, state) pair; declared to Spark as a string column.
    """
    # Series -> Series type hints let Spark infer a scalar pandas UDF
    # without relying on the legacy untyped form.
    return city_series.combine(state_series, get_county_by_city_state)



### Default Missing Counties

In [0]:
from pyspark.sql.functions import to_date, col, trim, regexp_replace, count, sum, round, rank, initcap, split, collect_list
from pyspark.sql.functions import col, when, to_date, split, length
from pyspark.sql.functions import col, lpad, concat_ws, split, to_date
from pyspark.sql.window import Window


In [0]:
# Load the source table, derive "year" as the third "/"-separated part of
# "date", and keep only rows where that part exists.
raw_police_shooting_df = (
    spark.table("fatal_police_shootings_data_csv")
    .withColumn("year", split(col("date"), "/")[2])
    .filter(col("year").isNotNull())
)

display(raw_police_shooting_df)


In [0]:
# Cast the identifier, age, year and coordinate columns to numeric types.
# Every cast is chained from the previous result so none is lost; the
# original code restarted from raw_police_shooting_df at the "year" cast,
# which silently discarded the "id" and "age" casts.
# NOTE(review): Delta tables written earlier with the un-cast "id"/"age"
# columns may need overwriteSchema when this notebook is re-run — confirm.
police_shooting_df = (
    raw_police_shooting_df
    .withColumn("id", col("id").cast("integer"))
    .withColumn("age", col("age").cast("integer"))
    .withColumn("year", col("year").cast("integer"))
    .withColumn("longitude", col("longitude").cast("float"))
    .withColumn("latitude", col("latitude").cast("float"))
)

display(police_shooting_df)


In [0]:
# Partition the records by whether the "county" column is populated.
missed_county_df = police_shooting_df.where(col("county").isNull())
filled_county_df = police_shooting_df.where(col("county").isNotNull())


In [0]:


# Apply the UDF to populate a new 'county' column using city and state
# filled_county_by_city_state_df = police_shooting_missing_county_data.withColumn("county", get_county_by_city_state_udf(col("city"), col("state")))
# filled_county_by_city_state_df = filled_county_by_city_state_df.withColumn("county", trim(regexp_replace(col("county"), "County", "")))
# filled_county_by_city_state_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable("default.filled_county_by_city_state_df_default_data")


In [0]:
# Persist both halves of the county split as Delta tables, replacing any
# previous contents and schema.
for frame, table_name in (
    (filled_county_df, "default.police_shooting_data_filled_county_default_data"),
    (missed_county_df, "default.police_shooting_missing_county_default_data"),
):
    (
        frame.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )


In [0]:

police_shooting_missing_county_data = spark.table("default.police_shooting_missing_county_default_data")

# Fill "county" from the city/state geocoding UDF, then remove the word
# "County" and trim surrounding whitespace.
geocoded_county = get_county_by_city_state_udf(col("city"), col("state"))
filled_county_by_city_state_df = (
    police_shooting_missing_county_data
    .withColumn("county", geocoded_county)
    .withColumn("county", trim(regexp_replace(col("county"), "County", "")))
)

# Persist the geocoded rows so later cells can read them back from the table.
(
    filled_county_by_city_state_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("default.filled_county_by_city_state_df_default_data")
)



In [0]:
# Read the geocoded rows back and split them by whether a county was found.
filled_by_city_sate_df = spark.table("default.filled_county_by_city_state_df_default_data")
county_resolved = col("county").isNotNull()
filled_county_by_city_state_df = filled_by_city_sate_df.filter(county_resolved)
missing_county_by_city_state_df = filled_by_city_sate_df.filter(~county_resolved)


In [0]:
# Save the rows whose county was resolved from city/state as a Delta table.
(
    filled_county_by_city_state_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("default.all_filled_county_by_city_state_df_default_data")
)

In [0]:
# Number of rows read from the missing-county table (input to the city/state geocoding).
police_shooting_missing_county_data.count()

4692

In [0]:
# Number of geocoded rows whose county is now non-null.
filled_county_by_city_state_df.count()

4677

In [0]:
# Number of geocoded rows whose county is still null.
missing_county_by_city_state_df.count()


15

In [0]:
# Show the rows that still have no county after the city/state lookup.
display(missing_county_by_city_state_df)

id,date,threat_type,flee_status,armed_with,city,county,state,latitude,longitude,location_precision,name,age,gender,race,race_source,was_mental_illness_related,body_camera,agency_ids,year
3956,12/8/2018,shoot,car,gun,Waynesboro,,VA,38.06847,-78.889465,not_available,Anthony Makai Hutchinson,40.0,male,B,not_available,False,False,3547,2018
4153,4/10/2018,point,not,gun,Scarbo,,WV,37.979633,-81.27061,not_available,William Cox,45.0,male,,,False,False,360,2018
5410,6/2/2019,attack,not,knife,Joilet,,IL,41.51732,-88.08554,not_available,Bruce Carter Jr.,38.0,male,B,not_available,False,False,244,2019
4469,18/2/2019,shoot,not,gun,Pelehatchie,,MS,32.32031,-89.7889,not_available,Pierre Woods,31.0,male,B,not_available,True,False,2401,2019
4777,11/6/2019,attack,not,knife,Porteau,,OK,35.077915,-94.628914,not_available,Jaquan Derrick Diijon Thompson,27.0,male,B,not_available,False,False,2496;2497,2019
4895,28/7/2019,point,not,gun,Boyton Beach,,FL,26.548351,-80.090576,not_available,Adalberto Wolmar Rodriguez,62.0,male,H,not_available,False,False,2447,2019
5038,16/9/2019,shoot,foot,gun,Elizabethon,,TN,36.348473,-82.211784,not_available,Jeffrey Michael Gibble,33.0,male,W,not_available,False,False,1118,2019
5442,28/1/2020,point,not,gun,Tuscson,,AZ,32.17786,-110.9984,not_available,Robert Cocio,39.0,male,H,not_available,False,False,964;397;215,2020
5569,28/1/2020,attack,not,vehicle,Shelby Gap,,KY,37.232872,-82.56576,not_available,Jonathan Bentley,37.0,male,W,not_available,False,False,130,2020
5665,13/3/2020,move,not,knife,Mufreesboro,,TN,35.80718,-86.31118,not_available,Christopher Mullins,,male,W,not_available,True,False,2404,2020


### Correct City Names

In [0]:
from pyspark.sql.functions import when, col, create_map, lit

# Misspelled city name -> corrected spelling.
city_misspelled_dict = {
    "Boyton Beach": "Boynton Beach",
    "Citrus Heighs": "Citrus Heights",
    "Elizabethon": "Elizabethton",
    "Joilet": "Joliet",
    "Mufreesboro": "Murfreesboro",
    "Pelehatchie": "Pelahatchie",
    "Porteau": "Poteau",
    "Scarbro": "Scarbro",
    "Scarbo": "Scarbro",
    "Tuscson": "Tucson",
}

# Apply every correction to the "city" column in one pass. Values not in the
# mapping (including nulls) are left unchanged. Using DataFrame.replace with
# the lowercase column name keeps the column called "city"; the previous
# withColumn("City", ...) loop renamed it to "City", unlike the other tables.
missing_county_by_city_state_df = missing_county_by_city_state_df.replace(
    city_misspelled_dict, subset=["city"]
)

display(missing_county_by_city_state_df)

id,date,threat_type,flee_status,armed_with,City,county,state,latitude,longitude,location_precision,name,age,gender,race,race_source,was_mental_illness_related,body_camera,agency_ids,year
3956,12/8/2018,shoot,car,gun,Waynesboro,,VA,38.06847,-78.889465,not_available,Anthony Makai Hutchinson,40.0,male,B,not_available,False,False,3547,2018
4153,4/10/2018,point,not,gun,Scarbro,,WV,37.979633,-81.27061,not_available,William Cox,45.0,male,,,False,False,360,2018
5410,6/2/2019,attack,not,knife,Joliet,,IL,41.51732,-88.08554,not_available,Bruce Carter Jr.,38.0,male,B,not_available,False,False,244,2019
4469,18/2/2019,shoot,not,gun,Pelahatchie,,MS,32.32031,-89.7889,not_available,Pierre Woods,31.0,male,B,not_available,True,False,2401,2019
4777,11/6/2019,attack,not,knife,Poteau,,OK,35.077915,-94.628914,not_available,Jaquan Derrick Diijon Thompson,27.0,male,B,not_available,False,False,2496;2497,2019
4895,28/7/2019,point,not,gun,Boynton Beach,,FL,26.548351,-80.090576,not_available,Adalberto Wolmar Rodriguez,62.0,male,H,not_available,False,False,2447,2019
5038,16/9/2019,shoot,foot,gun,Elizabethton,,TN,36.348473,-82.211784,not_available,Jeffrey Michael Gibble,33.0,male,W,not_available,False,False,1118,2019
5442,28/1/2020,point,not,gun,Tucson,,AZ,32.17786,-110.9984,not_available,Robert Cocio,39.0,male,H,not_available,False,False,964;397;215,2020
5569,28/1/2020,attack,not,vehicle,Shelby Gap,,KY,37.232872,-82.56576,not_available,Jonathan Bentley,37.0,male,W,not_available,False,False,130,2020
5665,13/3/2020,move,not,knife,Murfreesboro,,TN,35.80718,-86.31118,not_available,Christopher Mullins,,male,W,not_available,True,False,2404,2020


### Correctly Spelled Counties

In [0]:
# Manual lookup rows: (city, state, county to assign).
miss_spelled_data = [
    ("Waynesboro", "VA", "Waynesboro"),
    ("Scarbro", "WV", "Fayette"),
    ("Joliet", "IL", "Will"),
    ("Pelahatchie", "MS", "Rankin"),
    ("Poteau", "OK", "Le Flore"),
    ("Boynton Beach", "FL", "Palm Beach"),
    ("Elizabethton", "TN", "Carter"),
    ("Tucson", "AZ", "Pima"),
    ("Shelby Gap", "KY", "Pike"),
    ("Murfreesboro", "TN", "Rutherford"),
    ("Colonial Heights", "VA", "Colonial Heights"),
    ("Citrus Heights", "CA", "Sacramento"),
    ("Martinsville", "VA", "Martinsville"),
    ("Covington", "VA", "Covington"),
    ("Tohono O'odham Nation Reservation", "AZ", "Pima"),
]

# Column names for the lookup; "us_county" is kept distinct from the existing
# "county" column so the two do not collide when the frames are joined.
columns = ["city", "state", "us_county"]
missed_spelled_df = spark.createDataFrame(data=miss_spelled_data, schema=columns)
display(missed_spelled_df)


city,state,us_county
Waynesboro,VA,Waynesboro
Scarbro,WV,Fayette
Joliet,IL,Will
Pelahatchie,MS,Rankin
Poteau,OK,Le Flore
Boynton Beach,FL,Palm Beach
Elizabethton,TN,Carter
Tucson,AZ,Pima
Shelby Gap,KY,Pike
Murfreesboro,TN,Rutherford


In [0]:
# Attach the manually curated county to each still-unresolved row:
# inner-join on city/state, discard the old "county" column, and promote
# "us_county" to take its place.
filled_missing_county_by_city_state_df = (
    missing_county_by_city_state_df
    .join(missed_spelled_df, on=["city", "state"], how="inner")
    .drop("county")
    .withColumnRenamed("us_county", "county")
)
display(filled_missing_county_by_city_state_df)


City,state,id,date,threat_type,flee_status,armed_with,latitude,longitude,location_precision,name,age,gender,race,race_source,was_mental_illness_related,body_camera,agency_ids,year,county
Waynesboro,VA,3956,12/8/2018,shoot,car,gun,38.06847,-78.889465,not_available,Anthony Makai Hutchinson,40.0,male,B,not_available,False,False,3547,2018,Waynesboro
Scarbro,WV,4153,4/10/2018,point,not,gun,37.979633,-81.27061,not_available,William Cox,45.0,male,,,False,False,360,2018,Fayette
Joliet,IL,5410,6/2/2019,attack,not,knife,41.51732,-88.08554,not_available,Bruce Carter Jr.,38.0,male,B,not_available,False,False,244,2019,Will
Pelahatchie,MS,4469,18/2/2019,shoot,not,gun,32.32031,-89.7889,not_available,Pierre Woods,31.0,male,B,not_available,True,False,2401,2019,Rankin
Poteau,OK,4777,11/6/2019,attack,not,knife,35.077915,-94.628914,not_available,Jaquan Derrick Diijon Thompson,27.0,male,B,not_available,False,False,2496;2497,2019,Le Flore
Boynton Beach,FL,4895,28/7/2019,point,not,gun,26.548351,-80.090576,not_available,Adalberto Wolmar Rodriguez,62.0,male,H,not_available,False,False,2447,2019,Palm Beach
Elizabethton,TN,5038,16/9/2019,shoot,foot,gun,36.348473,-82.211784,not_available,Jeffrey Michael Gibble,33.0,male,W,not_available,False,False,1118,2019,Carter
Tucson,AZ,5442,28/1/2020,point,not,gun,32.17786,-110.9984,not_available,Robert Cocio,39.0,male,H,not_available,False,False,964;397;215,2020,Pima
Shelby Gap,KY,5569,28/1/2020,attack,not,vehicle,37.232872,-82.56576,not_available,Jonathan Bentley,37.0,male,W,not_available,False,False,130,2020,Pike
Murfreesboro,TN,5665,13/3/2020,move,not,knife,35.80718,-86.31118,not_available,Christopher Mullins,,male,W,not_available,True,False,2404,2020,Rutherford


In [0]:
# Save the manually resolved rows as a Delta table.
(
    filled_missing_county_by_city_state_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("default.all_filled_misspelled_city_counties_df_default_data")
)