In [1]:
# Import dependencies
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from requests.exceptions import ConnectionError, Timeout
import time

In [2]:
# Read the input dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/small_dataset.csv')

In [3]:
# Create the Nominatim client used for every lookup in this notebook;
# requests identify themselves with the "ufo-sightings" user agent.
geolocator = Nominatim(
    user_agent="ufo-sightings",
)

In [4]:
# Geocoding helper that always applies a fixed per-request timeout
def geocode_with_timeout(location):
    """Look up ``location`` with the notebook's geolocator.

    Args:
        location: Query passed straight through to ``geolocator.geocode``.

    Returns:
        Whatever ``geolocator.geocode`` returns for the query, using a
        10-second timeout for the request.
    """
    lookup = geolocator.geocode
    return lookup(location, timeout=10)

In [5]:
# Throttle lookups: consecutive calls to `geocode` are spaced at least
# one second apart to stay within the geocoding service's limits.
geocode = RateLimiter(
    geocode_with_timeout,
    min_delay_seconds=1,
)

In [6]:
# Create a location string from City, State Province and Country.
# Missing or blank components are dropped instead of being joined as empty
# strings, so a row without a state yields "City, Country" rather than
# "City, , Country", and a row with no usable fields yields "" rather than
# ", , ". Values are cast to str first so non-text cells cannot break the join.
location_parts = df[["City", "State Province", "Country"]].fillna("").astype(str)
df["location"] = location_parts.apply(
    lambda parts: ", ".join(part.strip() for part in parts if part.strip()),
    axis=1,
)


In [7]:
# Define a function with error handling
def safe_geocode(location):
    """Geocode ``location`` and return its coordinates without raising.

    Args:
        location: Query handed to the rate-limited ``geocode`` callable.
            A string that is empty or contains only commas/whitespace is
            treated as "nothing to look up" and is not sent at all.

    Returns:
        pd.Series: ``[latitude, longitude]`` of the match, or
        ``[None, None]`` when there is no match, the query is blank, or
        the lookup fails.

    A connection error is retried once after a 5-second pause; a timeout
    or any other exception is reported and the row is skipped.
    """
    # Blank queries (e.g. ", , " built from all-missing fields) cannot match
    # anything, so skip the throttled network call entirely.
    if isinstance(location, str) and not location.strip(" ,"):
        return pd.Series([None, None])

    max_attempts = 2  # the initial call plus one retry after a connection error
    for attempt in range(1, max_attempts + 1):
        try:
            result = geocode(location)
        except ConnectionError as e:
            if attempt < max_attempts:
                print("🔌 Connection error. Retrying in 5 seconds...")
                time.sleep(5)
                continue  # actually perform the retry the message announces
            print(f"❌ Error for {location}: {e}")
            break
        except Timeout:
            print("⏳ Timeout. Skipping...")
            break
        except Exception as e:
            print(f"❌ Error for {location}: {e}")
            break
        if result:
            return pd.Series([result.latitude, result.longitude])
        break  # the lookup succeeded but found no match
    return pd.Series([None, None])

# Geocode each row's location string; safe_geocode returns a two-element
# Series per row, which becomes the latitude and longitude columns.
coordinates = df["location"].apply(safe_geocode)
df[["latitude", "longitude"]] = coordinates
print("✅ Done geocoding!")

✅ Done geocoding!


In [8]:
# Write the DataFrame, now including the coordinate columns, to a new CSV
# without the index column.
df.to_csv(path_or_buf="data/small_dataset_with_coords.csv", index=False)

In [14]:
# Display the coordinate columns to confirm the geocoding results
df.loc[:, ['latitude', 'longitude']]


Unnamed: 0,latitude,longitude
0,26.271192,-80.270604
1,38.870842,-94.173834
2,26.461462,-80.072820
3,53.450869,-2.078260
4,54.702354,-3.276575
...,...,...
95,43.616616,-116.200886
96,43.132950,-115.691197
97,50.877524,5.981507
98,50.905669,5.068392
