This code takes geolocation data and dates. The data is then plotted as a density plot.

Future work:
Add in date ranges to include (if applicable)
Remove data in files where a lat/lng column is present but doesn't have a location or lat/lng data

In [48]:
import folium
from geopy.geocoders import Nominatim
import numpy as np
from folium import plugins
import pandas as pd
#import geopandas
#import geojsoncontour

In [111]:
#Data loading functions
def get_geocode_api(location):
    """Gets the lattitude and longitude of location
    location: geographical location or address"""
    place = geolocator.geocode(location)
    lat = place.latitude
    lng = place.longitude
    return(lat, lng)

def is_same_as_previous(l, idx):
    """Checks if the current index in a list is the same as previous"""
    if idx==0:
        return False
    else:
        return (l[idx-1]==l[idx])

def get_geocodes(df, addr_h = 'Address',lat_h = 'Lattitude',lng_h = 'Longitude', repeat_addresses = False):
    """Gets the lattitude and longitude of all locations and returns db with lat, lng values
    addr_h: column header for locations/addresses
    repeat_addresses: Bool, if True it will check to see if that address is already found
        - May be much faster than using API mulitple times, depending on addr_h column length and number of repeats etc"""
    
    if repeat_addresses==True: 
        df = df.sort_values(by=[addr_h], ignore_index=True) #Reduce computation time when finding repeats
    for i in range(len(df)):
        if repeat_addresses and is_same_as_previous(df[addr_h],i):
            df.loc[i,[lat_h,lng_h]] = df.loc[i-1,[lat_h,lng_h]]
        else:
            df.loc[i,[lat_h,lng_h]] = get_geocode_api(df.loc[i,addr_h])
    return(df)

def fill_missing_lat_lngs(df, addr_h="Address",lat_h='Lattitude',lng_h='Longitude'):
    """Fills in missing lattitude and longitude values in an existing df
    df: dataframe with location, lat, lng data
    addr_h,lat_h,lng_h: the dataframe header for the location,lattitude and longitude data respectively"""
    for i in range(len(df)):
        if any(pd.isna(df.loc[i, [lat_h,lng_h]])):
            df.loc[i, [lat_h,lng_h]] = get_geocode_api(addr_h)
    return(df)

def remove_bad_data(df, addr_h, lat_h, lng_h):
    """Removes all rows that do not include either an address or lat/lng combination
    df: dataframe with location, lat, lng data
    addr_h,lat_h,lng_h: the dataframe header for the location,lattitude and longitude data respectively"""
    no_addr = pd.isna(df[addr_h])
    no_lat_lng = pd.isna(df[lat_h])|pd.isna(df[lng_h])
    has_either = ~(no_addr&no_lat_lng)
    return(df.loc[has_either])

def get_data_and_clean(f, addr_h='Address',lat_h='Lattitude',lng_h='Longitude', repeat_addresses = False, reload_gis=False):
    """Reads in data from a csv file fills in missing lattitude and lng data, then saves file.
    f: csv file path
    addr_h: column header for addresses in csv
    lat_h/lng_h: column header for lattitudes/longitudes in csv
    """
    df = pd.read_csv(f)
    locations = df[addr_h]
    if (lat_h in df.columns) and not reload_gis:
        df = remove_bad_data(df, addr_h, lat_h, lng_h)
        df = fill_missing_lat_lngs(df, addr_h, lat_h, lng_h)
    else:
        df = df.dropna(subset = [addr_h])
        df = get_geocodes(df, addr_h, repeat_addresses = False)
    return(df)
    

In [143]:
True & ~False

1

In [136]:
df.loc[df['Address']=='Arco','Address']=pd.NA

In [138]:
df.loc[pd.isna(df['Address'])]=3

In [139]:
df

Unnamed: 0,Address,Lattitude,Longitude,Date,Notes:
0,"Pocatello, ID",42.681613,-112.229214,,
1,"Twin Falls, ID",51.575646,-0.098647,,
2,"Boise, ID",51.575646,-0.098647,,
3,"Idaho Falls, ID",42.852022,-112.222143,,
4,"Bone, Idaho",0.0,0.0,,
5,"Arco, ID",51.575646,-0.098647,,
6,"Hailey, ID",51.575646,-0.098647,,
7,3,3.0,3.0,3.0,3.0


In [201]:
#Plotting Functions
def get_midpoint(x):
    """finds the min and max of a numpy array
    x: numpy array of numerical values
    """
    return((x.min()+x.max())/2)

def get_name(df, addr_h, lat_h, lng_h, idx):
    """Gets address if given, else returns lat/long string
    df: dataframe with the data
    addr_h: string header for the address
    idx: column index of the pandas dataframe (ie for .loc)"""
    val= df.loc[idx, addr_h]
    if not pd.isna(val):
        return(val)
    else:
        return(f'Lat: {df.loc[idx,lat_h]:.4f} Lng: {df.loc[idx,lng_h]:.4f}')

def make_heatmap(df, lat_h, lng_h):
    m = folium.Map(location=(get_midpoint(df[lat_h]), get_midpoint(df[lng_h])),zoom_start=7,)
    plugins.HeatMap(df[[lat_h,lng_h]],radius=15, blur = 10).add_to(folium.FeatureGroup(name='Heat Map')).add_to(m)
    return(m)

def make_pinmap(df, addr_h, lat_h, lng_h):
    m = folium.Map(location=(get_midpoint(df[lat_h]), get_midpoint(df[lng_h])),zoom_start=7,)
    if addr_h not in df.columns:
        df[addr_h]=[f'Lat: {df.loc[i,lat_h]:.4f} Lng: {df.loc[i,lng_h]:.4f}' for i in range(len(df))]
    for i in df.index:
        name = get_name(df, addr_h, lat_h, lng_h, i)
        folium.Marker(location=list(df.loc[i,[lat_h,lng_h]]), 
                        popup=f"<strong>{i}</strong>", 
                        tooltip=f'{name}', 
                        icon=folium.Icon(color='green')).add_to(folium.FeatureGroup(name='Markers')).add_to(m)
    return(m)

def make_html(f_out, m):
    m.save('map.html')

In [113]:
#Auxillary Funcitons
def generate_fake_data(location):
    lat, lng = get_geocode_api(location)
    rnd_lats = lat+np.random.normal(0,0.1,100)
    rnd_lngs = lng+np.random.normal(0,0.1,100)
    return(pd.DataFrame({'Lattitude':rnd_lats,'Longitude':rnd_lngs}))

def replace_blank_names(df, addr_h='Address',lat_h='Lattitude',lng_h='Longitude'):
    if any(pd.isna(df[addr_h])):
        dfTemp = df[pd.isna(df[addr_h])]
        df.loc[pd.isna(df[addr_h]),'Address'] = [f'Lat: {dfTemp.loc[i,lat_h]:.4f} Lng: {dfTemp.loc[i,lng_h]:.4f}' for i in dfTemp.index]
    return(df)

In [203]:
addr_h='Address'
lat_h='Lattitude'
lng_h= 'Longitude'
repeat_addresses = True
file='Test Data/Test_data.csv'
reload_gis = False #If True re-finds the GIS coordinates, drops rows with no address 
fake_data_location = None#'Nampa, Idaho' #Will load this if not None

In [204]:
if fake_data_location is not None:
    df=generate_fake_data(fake_data_location)
else:
    df=get_data_and_clean(file,
                          addr_h=addr_h, 
                          lat_h=lat_h, 
                          lng_h=lng_h, 
                          repeat_addresses=repeat_addresses, 
                          reload_gis=reload_gis)

In [205]:
geolocator = Nominatim(user_agent='Data_Map_App')

In [206]:
'{:4f}'.format(4.45325)

'4.453250'

In [207]:
m1 = make_heatmap(df, lat_h, lng_h)
m1

In [208]:
m2 = make_pinmap(df, addr_h, lat_h, lng_h)
m2

In [128]:
df

Unnamed: 0,Address,Lattitude,Longitude,Date,Notes:
0,"Pocatello, ID",42.681613,-112.229214,,
1,"Twin Falls, ID",51.575646,-0.098647,,
2,"Boise, ID",51.575646,-0.098647,,
3,"Idaho Falls, ID",42.852022,-112.222143,,
4,"Bone, Idaho",0.0,0.0,,
5,"Arco, ID",51.575646,-0.098647,,
6,"Hailey, ID",51.575646,-0.098647,,
7,,42.681613,-112.227214,,
