```
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import os
from fake_useragent import UserAgent
# Create the fake_useragent generator that supplies User-Agent strings
ua = UserAgent()

# Generate a random user-agent
user_agent = ua.random
# change working directory (relative to wherever the process currently is)
os.chdir('../')

# Load the input table; the address builder below reads its Address, City,
# County, State, Zip and Country columns
data = pd.read_csv(r"flint_multifields.csv")
# Bare expression: shows the column names when run as a notebook cell,
# has no effect when run as a script
data.columns

# Nominatim geocoder identified by the random user-agent picked above
# NOTE(review): the Nominatim usage policy asks for a User-Agent that
# identifies the application - confirm a random browser string is acceptable
geolocator = Nominatim(user_agent=user_agent)

# Increase min_delay_seconds if needed and set a higher timeout for geocode requests
# Wrap geolocator.geocode so calls are spaced out (min_delay_seconds) and
# retried after a pause on errors (error_wait_seconds)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, error_wait_seconds=10)

def full_address(row):
    """Build the comma-separated address string used as the geocoding query.

    Args:
        row: A mapping (for example a DataFrame row) with the keys
            'Address', 'City', 'County', 'State', 'Zip' and 'Country'.

    Returns:
        The present fields joined with ", " in the order listed above.
        Missing values (None / NaN) are skipped so the query never contains
        a literal "nan" or "None" component.

    Raises:
        KeyError: If one of the six fields is absent from ``row``.
    """
    fields = ('Address', 'City', 'County', 'State', 'Zip', 'Country')
    parts = []
    for field in fields:
        value = row[field]
        # Empty CSV cells come back from pandas as NaN; leave them out.
        if pd.isna(value):
            continue
        parts.append(f"{value}")
    return ", ".join(parts)

# Geocode every row of 'data' with the rate-limited Nominatim lookup and
# derive a (latitude, longitude) tuple from each result.
def _lookup_location(row):
    # Each request is given a 10-second timeout.
    return geocode(full_address(row), timeout=10)


def _location_to_point(loc):
    # A falsy result (no match) is recorded as None.
    if loc:
        return (loc.latitude, loc.longitude)
    return None


data['location'] = data.apply(_lookup_location, axis=1)
data['point'] = data['location'].apply(_location_to_point)

print(data[['Address', 'point']])

# Persist the table, including the new columns, without the index
data.to_csv('flint_geocoded_revised.csv', index=False)
```

The code above geocodes the addresses with the free Nominatim (OpenStreetMap) service. The default OSM geocoder does not work well here: it fails to return accurate results for most addresses, and about half of the results are empty. We therefore use the OpenCage geocoder to obtain the coordinates instead.

Because I am using the free version, I send only one request per second.

In [4]:
import pandas as pd
import os
from opencage.geocoder import OpenCageGeocode
import time

# change working directory
# NOTE: the path is relative, so every execution moves one more level up
os.chdir('../')

# using free version
# Prefer the OPENCAGE_API_KEY environment variable so the real key does not
# have to be written into the notebook; the placeholder remains the fallback.
api_key = os.environ.get("OPENCAGE_API_KEY", "your key here")
geocoder = OpenCageGeocode(api_key)

In [6]:

def geocode_address(address):
    """Geocode one address string with the module-level OpenCage client.

    Args:
        address: The free-form address text to look up.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first result's
        geometry, or ``None`` when there is no result or the lookup raised.

    The function pauses for one second after every attempt, whether it
    succeeded or not, before handing back its result.
    """
    coords = None
    try:
        hits = geocoder.geocode(address, no_annotations='1')
        if hits and len(hits) > 0:
            lat = hits[0]['geometry']['lat']
            lng = hits[0]['geometry']['lng']
            print(f"Geocoded {address} to ({lat}, {lng})")
            coords = (lat, lng)
    except Exception as e:
        # Best effort: report the failure and fall through with coords = None
        print(f"Error geocoding address: {address} - {e}")
    finally:
        time.sleep(1)  # Wait for 1 second before making the next request
    return coords

In [6]:
# Reload the raw Flint address table (path is relative to the working directory)
data = pd.read_csv(r"flint_multifields.csv")

def full_address(row):
    """Join a row's address fields into one geocoding query string.

    Args:
        row: A mapping (for example a DataFrame row) providing 'Address',
            'City', 'County', 'State', 'Zip' and 'Country'.

    Returns:
        The fields joined with ", " in that order. Fields whose value is
        missing (None / NaN) are left out instead of appearing as the text
        "nan" or "None" in the query.

    Raises:
        KeyError: If ``row`` lacks one of the six fields.
    """
    parts = []
    for field in ('Address', 'City', 'County', 'State', 'Zip', 'Country'):
        value = row[field]
        # pandas represents empty CSV cells as NaN; drop those components.
        if not pd.isna(value):
            parts.append(f"{value}")
    return ", ".join(parts)


# Geocode each row's full address through OpenCage, one row at a time
def _geocode_row(row):
    return geocode_address(full_address(row))


data['point'] = data.apply(_geocode_row, axis=1)

print(data[['Address', 'point']])
data.to_csv('flint_geocoded_revised_opencage.csv', index=False)

# Count the null values in every column; the 'point' entry tells how many
# addresses could not be geocoded
null_counts = data.isna().sum()
print(null_counts)


Geocoded 911 Wickes Park Dr, Saginaw, Saginaw County, Michigan, 48601, USA to (43.39628, -83.963774)
Geocoded 812 N Harrison St, Saginaw, Saginaw County, Michigan, 48602, USA to (43.4234228, -83.9588168)
Geocoded 9102 Copper Ridge Dr, Davison, Genesee County, Michigan, 48423, USA to (43.037347, -83.519247)
Geocoded 4713 N State Rd, Davison, Genesee County, Michigan, 48423, USA to (43.0759952, -83.5181921)
Geocoded 4252 McCormick Dr, Linden, Genesee County, Michigan, 48451, USA to (42.816845, -83.759468)
Geocoded 611 Taylor Rd, Brighton, Livingston County, Michigan, 48114, USA to (42.594851, -83.7480478)
Geocoded 1927 W Murphy Lake Rd, Millington, Tuscola County, Michigan, 48746, USA to (43.208573, -83.601103)
Geocoded 3901 Payne Rd, Attica, Lapeer County, Michigan, 48412, USA to (43.0246753, -83.1895966)
Geocoded 1306 Glenwood Ct, Milford, Oakland County, Michigan, 48381, USA to (41.80621, -72.51786)
Geocoded 998 Eagle Dr, Fenton, Genesee County, Michigan, 48430, USA to (42.791953, -83

You can see that some points are geocoded incorrectly (for example, 1306 Glenwood Ct, Milford is placed at longitude -72.5, far outside Michigan), so we write another piece of code to deal with these points specifically.

In [7]:
# Reload the OpenCage results saved by the previous cell; after the CSV
# round-trip, non-empty 'point' cells are plain text, not tuples
data = pd.read_csv(r"flint_geocoded_revised_opencage.csv")

def simplified_address(row):
    """Return a shortened query made of the row's 'Address' and 'City' only."""
    street = row['Address']
    city = row['City']
    return f"{street}, {city}"

# Checked several times: the flagged points are wrong regardless of the exact
# range of bounds specified here.
def is_within_bounds(point, lat_bounds=(41, 45), lon_bounds=(-85, -81)):
    """Tell whether a point lies inside a latitude/longitude box.

    Args:
        point: A ``(latitude, longitude)`` pair. A missing point (``None``
            or a NaN read back from an empty CSV cell) is accepted and
            counts as outside the box.
        lat_bounds: ``(min, max)`` latitude, both ends inclusive.
        lon_bounds: ``(min, max)`` longitude, both ends inclusive.

    Returns:
        True when both coordinates fall within their bounds, else False.
    """
    try:
        lat, lon = point
    except TypeError:
        # None or a bare float cannot be unpacked: there is no point to test.
        return False
    return lat_bounds[0] <= lat <= lat_bounds[1] and lon_bounds[0] <= lon <= lon_bounds[1]

# Re-geocode, with the simplified address, every row whose stored point is
# missing or falls outside the bounding box, then save the repaired table.
for index, row in data.iterrows():
    point = row['point']
    # Assuming 'point' is stored as a string, convert it back to tuple (float, float)
    if isinstance(point, str):
        point = tuple(map(float, point.strip("()").split(',')))

    if not isinstance(point, tuple):
        # A failed lookup is saved as an empty cell and read back as NaN
        # (a float); the bounds check cannot unpack it, so handle it here.
        print(f"Address {simplified_address(row)} has no geocoded point")
    elif is_within_bounds(point):
        continue
    else:
        print(point)
        print(f"Address {simplified_address(row)} is outside the bounds of Michigan")

    new_point = geocode_address(simplified_address(row))
    if new_point:
        data.at[index, 'point'] = new_point
        print(f"Updated point to {new_point}")
    else:
        # Nothing better was found: keep the stored value untouched.
        print("No replacement found; point left unchanged")

data.to_csv('flint_geocoded_revised_opencage_updated.csv', index=False)

(41.80621, -72.51786)
Address 1306 Glenwood Ct, Milford is outside the bounds of Michigan
Geocoded 1306 Glenwood Ct, Milford to (42.5761932, -83.6207763)
Updated point to (42.5761932, -83.6207763)
(41.66704, -72.66648)
Address 2929 Lanier Ct, Howell is outside the bounds of Michigan
Geocoded 2929 Lanier Ct, Howell to (42.5913969, -83.8958804)
Updated point to (42.5913969, -83.8958804)
(41.66704, -72.66648)
Address 2603 Brunkow Ct, Saginaw is outside the bounds of Michigan
Geocoded 2603 Brunkow Ct, Saginaw to (43.3971579, -83.90536)
Updated point to (43.3971579, -83.90536)
(41.66704, -72.66648)
Address 8323 Winnesk Ct, Brighton is outside the bounds of Michigan
Geocoded 8323 Winnesk Ct, Brighton to (42.5763072, -83.7883575)
Updated point to (42.5763072, -83.7883575)
(41.66704, -72.66648)
Address 4672 McDonald Ct, Brighton is outside the bounds of Michigan
Geocoded 4672 McDonald Ct, Brighton to (42.52948, -83.78022)
Updated point to (42.52948, -83.78022)
(41.66704, -72.66648)
Address 124

After the above operations, we have finally cleaned the Flint dataframe, and we can process the next dataframe (Grand Rapids) in the same way.

In [8]:
# Load the second raw address table (path is relative to the working directory)
data = pd.read_csv(r"grapids_multifields.csv")

def full_address(row):
    """Compose the full geocoding query for one row.

    Args:
        row: A mapping (for example a DataFrame row) holding 'Address',
            'City', 'County', 'State', 'Zip' and 'Country'.

    Returns:
        Those fields, in that order, joined with ", ". Missing values
        (None / NaN) are omitted so they do not show up as the words
        "nan" or "None" inside the query.

    Raises:
        KeyError: If any of the six fields is not present in ``row``.
    """
    columns = ('Address', 'City', 'County', 'State', 'Zip', 'Country')
    values = (row[column] for column in columns)
    # pd.isna is True for both None and NaN (how pandas marks empty cells).
    return ", ".join(f"{value}" for value in values if not pd.isna(value))

# Run the OpenCage lookup on the full address of every row
def _row_to_point(row):
    query = full_address(row)
    return geocode_address(query)


data['point'] = data.apply(_row_to_point, axis=1)

print(data[['Address', 'point']])
data.to_csv('grapids_multifields_revised.csv', index=False)

# Null count for every column; the 'point' figure is the number of
# addresses that weren't geocoded
null_counts = data.isna().sum()
print(null_counts)


Geocoded 754 Ridgefield Dr, Coopersville, Ottawa County, Michigan, 49404, USA to (43.0737709, -85.9446474)
Geocoded 3746 Knapp St NE, Grand Rapids, Kent County, Michigan, 49525, USA to (41.78334, -99.27538)
Geocoded 4115 64th St SW, Grandville, Kent County, Michigan, 49418, USA to (42.848666, -85.768765)
Geocoded 2017 Harding St, Conklin, Ottawa County, Michigan, 49403, USA to (43.1330166, -85.8396773)
Geocoded 1410 117th Ave, Otsego, Allegan County, Michigan, 49078, USA to (42.544991, -85.681864)
Geocoded 2420 Valentine Blvd NE, Grand Rapids, Kent County, Michigan, 49525, USA to (41.78334, -99.27538)
Geocoded 315 Wilson Ave NW, Grand Rapids, Kent County, Michigan, 49534, USA to (43.020777, -85.782079)
Geocoded 293 Dratz St, Muskegon, Muskegon County, Michigan, 49442, USA to (43.2471012, -86.2152248)
Geocoded 3232 Lake Dr SE, Grand Rapids, Kent County, Michigan, 49546, USA to (42.937718, -85.5886066)
Geocoded 23149 16th Ave, Conklin, Ottawa County, Michigan, 49403, USA to (43.187681, -

In [15]:
# Reload the first-pass results written by the previous cell
data = pd.read_csv(r"grapids_multifields_revised.csv")

def simplified_address(row):
    """Build a reduced query containing just the 'Address' and 'City' fields."""
    return "{}, {}".format(row['Address'], row['City'])

# Checked several times: the flagged points are wrong regardless of the exact
# range of bounds specified here.
def is_within_bounds(point, lat_bounds=(41, 45), lon_bounds=(-87, -84)):
    """Report whether a point falls inside a latitude/longitude box.

    Args:
        point: A ``(latitude, longitude)`` pair. ``None`` or a NaN float
            (an empty CSV cell) is tolerated and treated as out of bounds.
        lat_bounds: Inclusive ``(min, max)`` latitude range.
        lon_bounds: Inclusive ``(min, max)`` longitude range.

    Returns:
        True if both coordinates are within range, otherwise False.
    """
    try:
        lat, lon = point
    except TypeError:
        # A missing point cannot be unpacked, so it is never "inside".
        return False
    lat_ok = lat_bounds[0] <= lat <= lat_bounds[1]
    lon_ok = lon_bounds[0] <= lon <= lon_bounds[1]
    return lat_ok and lon_ok

# Retry, with the "Address, City" query, each row whose stored point is
# missing or lies outside the bounding box, then write the result.
for index, row in data.iterrows():
    point = row['point']
    # Assuming 'point' is stored as a string, convert it back to tuple (float, float)
    if isinstance(point, str):
        point = tuple(map(float, point.strip("()").split(',')))

    if not isinstance(point, tuple):
        # Empty cells come back from read_csv as NaN (a float); unpacking
        # that in the bounds check would raise, so treat it as "needs retry".
        print(f"Address {simplified_address(row)} has no geocoded point")
    elif is_within_bounds(point):
        continue
    else:
        print(point)
        print(f"Address {simplified_address(row)} is outside the bounds of Michigan")

    new_point = geocode_address(simplified_address(row))
    if new_point:
        data.at[index, 'point'] = new_point
        print(f"Updated point to {new_point}")
    else:
        # The lookup gave nothing usable; the stored value stays as it was.
        print("No replacement found; point left unchanged")

data.to_csv('grapids_multifields_revised_update.csv', index=False)

(41.78334, -99.27538)
Address 3746 Knapp St NE, Grand Rapids is outside the bounds of Michigan
Geocoded 3746 Knapp St NE, Grand Rapids to (42.9988157, -85.5768828)
Updated point to (42.9988157, -85.5768828)
(41.78334, -99.27538)
Address 2420 Valentine Blvd NE, Grand Rapids is outside the bounds of Michigan
Geocoded 2420 Valentine Blvd NE, Grand Rapids to (43.0203444, -85.6106791)
Updated point to (43.0203444, -85.6106791)
(41.78334, -99.27538)
Address 15460 Larsen Ave NE, Gowen is outside the bounds of Michigan
Updated point to None
(41.78334, -99.27538)
Address 33 Pond Ridge Dr NE, Grand Rapids is outside the bounds of Michigan
Geocoded 33 Pond Ridge Dr NE, Grand Rapids to (42.9624644, -85.5624006)
Updated point to (42.9624644, -85.5624006)
(41.78334, -99.27538)
Address 300 Rockford Park Dr NE, Rockford is outside the bounds of Michigan
Geocoded 300 Rockford Park Dr NE, Rockford to (43.1410608, -85.5521781)
Updated point to (43.1410608, -85.5521781)
(41.78334, -99.27538)
Address 14501

In [17]:
# Reload the table produced by the "Address, City" repair pass
data = pd.read_csv(r"grapids_multifields_revised_update.csv")

def simplified_address_revised(row):
    """Return a shortened query made of the row's 'Address' and 'State' only."""
    street = row['Address']
    state = row['State']
    return f"{street}, {state}"

# Checked several times: the flagged points are wrong regardless of the exact
# range of bounds specified here.
def is_within_bounds(point, lat_bounds=(41, 45), lon_bounds=(-87, -84)):
    """Check a point against an inclusive latitude/longitude box.

    Args:
        point: A ``(latitude, longitude)`` pair; ``None`` or a NaN float
            (how an empty CSV cell is read back) is allowed and yields False.
        lat_bounds: ``(min, max)`` latitude limits, inclusive.
        lon_bounds: ``(min, max)`` longitude limits, inclusive.

    Returns:
        True only when both coordinates are inside their limits.
    """
    try:
        lat, lon = point
    except TypeError:
        # Not a pair at all (missing value): report it as out of bounds.
        return False
    min_lat, max_lat = lat_bounds[0], lat_bounds[1]
    min_lon, max_lon = lon_bounds[0], lon_bounds[1]
    return min_lat <= lat <= max_lat and min_lon <= lon <= max_lon

# Second repair pass: retry rows that are still missing or out of bounds,
# this time with the "Address, State" query, then write the result.
for index, row in data.iterrows():
    point = row['point']
    # Assuming 'point' is stored as a string, convert it back to tuple (float, float)
    if isinstance(point, str):
        point = tuple(map(float, point.strip("()").split(',')))

    if not isinstance(point, tuple):
        # An empty cell is read back as NaN (a float) and cannot be unpacked
        # by the bounds check, so it is routed to the retry directly.
        print(f"Address {simplified_address_revised(row)} has no geocoded point")
    elif is_within_bounds(point):
        continue
    else:
        print(point)
        print(f"Address {simplified_address_revised(row)} is outside the bounds of Michigan")

    new_point = geocode_address(simplified_address_revised(row))
    if new_point:
        data.at[index, 'point'] = new_point
        print(f"Updated point to {new_point}")
    else:
        # No usable answer: leave the stored value alone.
        print("No replacement found; point left unchanged")

data.to_csv('grapids_multifields_revised_update_again.csv', index=False)

(41.78334, -99.27538)
Address 15460 Larsen Ave NE, Michigan is outside the bounds of Michigan
Geocoded 15460 Larsen Ave NE, Michigan to (43.2483803, -85.3322334)
Updated point to (43.2483803, -85.3322334)
(42.63342, -71.31617)
Address 1153 Biggs Ave NE, Michigan is outside the bounds of Michigan
Geocoded 1153 Biggs Ave NE, Michigan to (42.9852933, -85.3420117)
Updated point to (42.9852933, -85.3420117)
(52.25, 5.75)
Address 2290 Tunnel Breeze Ct, Michigan is outside the bounds of Michigan
Geocoded 2290 Tunnel Breeze Ct, Michigan to (42.7948455, -86.2046508)
Updated point to (42.7948455, -86.2046508)
(34.85262, -82.39401)
Address 9067 Wellman Rd NE, Michigan is outside the bounds of Michigan
Geocoded 9067 Wellman Rd NE, Michigan to (43.161825, -85.382207)
Updated point to (43.161825, -85.382207)
(14.5315, 7.73697)
Address 5844 Treebrook Ln NE, Michigan is outside the bounds of Michigan
Geocoded 5844 Treebrook Ln NE, Michigan to (46.238544, -84.676493)
Updated point to (46.238544, -84.67

In [18]:
# Reload the table produced by the "Address, State" repair pass
data = pd.read_csv(r"grapids_multifields_revised_update_again.csv")

def simplified_address_revised(row):
    """Build a reduced query containing just the 'Address' and 'State' fields."""
    return "{}, {}".format(row['Address'], row['State'])

# Checked several times: the flagged points are wrong regardless of the exact
# range of bounds specified here.
def is_within_bounds(point, lat_bounds=(41, 45), lon_bounds=(-87, -84)):
    """Decide whether a point sits inside a latitude/longitude box.

    Args:
        point: A ``(latitude, longitude)`` pair. A missing value (``None``
            or NaN from an empty CSV cell) is accepted and gives False.
        lat_bounds: Inclusive ``(min, max)`` latitude.
        lon_bounds: Inclusive ``(min, max)`` longitude.

    Returns:
        True when latitude and longitude are both within their bounds.
    """
    try:
        lat, lon = point
    except TypeError:
        # None / float cannot be unpacked; a missing point is not in bounds.
        return False
    return (lat_bounds[0] <= lat <= lat_bounds[1]
            and lon_bounds[0] <= lon <= lon_bounds[1])

# Report-only pass: list the rows that are still missing a point or still
# outside the bounding box. Nothing is re-geocoded and nothing is saved here.
for index, row in data.iterrows():
    point = row['point']
    # Assuming 'point' is stored as a string, convert it back to tuple (float, float)
    if isinstance(point, str):
        point = tuple(map(float, point.strip("()").split(',')))

    if not isinstance(point, tuple):
        # An empty cell is read back as NaN (a float); report it instead of
        # letting the bounds check fail while unpacking it.
        print(f"Address {simplified_address_revised(row)} has no geocoded point")
    elif not is_within_bounds(point):
        print(point)
        print(f"Address {simplified_address_revised(row)} is outside the bounds of Michigan")

(46.238544, -84.676493)
Address 5844 Treebrook Ln NE, Michigan is outside the bounds of Michigan
(46.431187, -84.468795)
Address 11750 Cedar Rock Dr NE, Michigan is outside the bounds of Michigan
