In [1]:
#!pip install geopandas
#!pip install geopy
#!pip install folium

In [None]:
# notes
# changed the heading -- some of the inconsistencies that are more human-readable are less machine-readable
# no LAST NAME 


# ideas: 
# change color of circle based on: occupation, person name, year, decade, etc.

In [None]:
#custom tiles: https://stackoverflow.com/questions/62621475/python-folium-custom-tile-setting 
#hover mouse: https://stackoverflow.com/questions/41095716/hover-in-popup-in-folium
#limiter error: https://stackoverflow.com/questions/58439692/convert-physical-addresses-to-geographic-locations-latitude-and-longitud

In [39]:
import geopandas
import pandas as pd

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

import folium

import sys
import os

import re

from datetime import date

from legend import add_categorical_legend

In [45]:
def import_data(df):
    """Load the source CSV and reduce it to the columns needed for mapping.

    Parameters
    ----------
    df : str or path-like
        Path of the CSV file to read. The parameter keeps its historical
        name, but it is a file path, not a DataFrame. The file must provide
        ``street``, ``city`` and ``state`` columns plus either a ``decade``
        or a ``year`` column.

    Returns
    -------
    pandas.DataFrame
        An independent frame with two columns: ``address`` (street, city,
        state and the text 'United States' joined by commas) and ``decade``.
    """
    records = pd.read_csv(df)

    if 'decade' not in records:
        # Floor each year to the first year of its decade, e.g. 1857 -> 1850.
        records['decade'] = records['year'] - (records['year'] % 10)

    # A missing street, city or state makes the whole address missing, because
    # string concatenation with a missing value yields a missing value.
    records['address'] = records['street'] + ',' + records['city'] + ',' + records['state'] + ',' + 'United States' # I am not sure if I need US

    # Return a copy rather than a slice of `records` so that callers can add
    # or overwrite columns without triggering SettingWithCopyWarning.
    return records[['address', 'decade']].copy()


def time_stamp():
    """Return today's local date formatted as ``MM-DD-YYYY``.

    The string is used as a suffix for the output file names in this file.
    """
    return date.today().strftime("%m-%d-%Y")


def address_standardization(row):
    """Expand old-style ordinals such as '22d' / '3d' to '22nd' / '3rd'.

    Every stand-alone number that ends in 2 or 3 and is directly followed by
    a lowercase 'd' is rewritten, so an address holding both kinds
    ("22d st. cor. 3d av.") is fully corrected. Numbers ending in 12 or 13
    are left untouched because '12nd' / '13rd' are not valid ordinals.
    All other text is returned unchanged.

    Parameters
    ----------
    row : object
        The address. Non-string values are converted with ``str()`` first,
        so a missing value (NaN) comes back as the text 'nan'.

    Returns
    -------
    str
        The address with abbreviated ordinals expanded.
    """
    def _expand(match):
        # group(1) is the number itself, without the trailing 'd'.
        number = match.group(1)
        if number.endswith(('12', '13')):
            return match.group(0)
        return number + ('nd' if number.endswith('2') else 'rd')

    # \b on both sides anchors the match to a whole token such as "22d", so
    # neither the house number nor later words ("Broad way") can be consumed.
    return re.sub(r'\b(\d*[23])d\b', _expand, str(row))


def export_unmapped_data(df, dir, d_today):
    """Write every row that has at least one missing value to a CSV file.

    The file is named ``unmapped_addresses_<d_today>.csv`` and is placed in
    ``dir``; the directory (including missing parents) is created if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    dir : str or path-like
        Output directory.
    d_today : str
        Date tag inserted into the file name.
    """
    has_missing_value = df.isnull().any(axis=1)
    unmapped_addresses = df[has_missing_value]

    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail if the directory appears between a check and the creation.
    os.makedirs(dir, exist_ok=True)

    exp = os.path.join(dir, 'unmapped_addresses_' + d_today + ".csv")

    unmapped_addresses.to_csv(exp)


def geolocate(df, dir, d_today, clean_data):
    """Geocode the ``address`` column with Nominatim and add coordinate columns.

    Side effects: writes ``unmapped_addresses_<d_today>.csv`` (through
    ``export_unmapped_data``) and ``geolocated_data_<d_today>.csv`` into
    ``dir``, and sends one request per distinct address to the geocoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``address`` column. It is not modified.
    dir : str or path-like
        Directory that receives the two CSV exports.
    d_today : str
        Date tag inserted into the export file names.
    clean_data : bool
        When true, addresses are passed through ``address_standardization``
        before being geocoded.

    Returns
    -------
    pandas.DataFrame
        The rows without any missing value, extended with ``location``,
        ``point``, ``latitude``, ``longitude`` and ``altitude`` columns.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    df = df.copy()

    has_address = df['address'].notna()

    if clean_data:
        # Only real addresses are standardised: str() on a missing value would
        # turn it into the literal text 'nan', which would then be geocoded.
        df.loc[has_address, 'address'] = df.loc[has_address, 'address'].apply(address_standardization)

    # NOTE(review): "myGeocoder" is a generic user agent -- confirm that it
    # satisfies the usage policy of the Nominatim server being queried.
    locator = Nominatim(user_agent = "myGeocoder")

    # adding a 1 second delay between requests to avoid spamming the API
    geocode = RateLimiter(locator.geocode, min_delay_seconds = 1)

    # Each distinct address is looked up once; repeated addresses reuse the
    # answer instead of costing another rate-limited request.
    lookup = {address: geocode(address) for address in df.loc[has_address, 'address'].unique()}
    df['location'] = df['address'].apply(lookup.get)

    def _to_point(loc):
        # Failed or skipped look-ups have no `.point`; they stay missing.
        point = getattr(loc, 'point', None)
        return tuple(point) if point is not None else None

    df['point'] = df['location'].apply(_to_point)

    export_unmapped_data(df, dir, d_today)

    # Rows with any missing value (no address, no geocoding result, ...) cannot
    # be split into coordinates or drawn on the map, so they are dropped here.
    df = df.dropna().copy()

    # Naming the columns keeps this step working when no row was geocoded.
    coordinates = pd.DataFrame(
        df['point'].tolist(),
        index = df.index,
        columns = ['latitude', 'longitude', 'altitude'])
    for column in coordinates.columns:
        df[column] = coordinates[column]

    exp = os.path.join(dir, "geolocated_data_" + d_today + ".csv")

    df.to_csv(exp)

    return df


def export_map(df, color_pallet, ts=None):
    """Draw one circle marker per row and save the map as an HTML file.

    The file is written to the current working directory as
    ``THADS_map_<ts>.html``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``latitude``, ``longitude``, ``address`` and ``decade``
        columns.
    color_pallet : dict
        Maps a decade to the colour used for its markers and for its legend
        entry. Rows whose decade is not a key are not drawn.
    ts : str, optional
        Date tag for the file name. Defaults to today's date formatted as
        ``MM-DD-YYYY``, so the function does not depend on a notebook-level
        variable having been defined beforehand.
    """
    if ts is None:
        ts = date.today().strftime("%m-%d-%Y")

    map1 = folium.Map(
        # starting location on map:
        location=[41.925098, -74.026143],
        tiles='cartodbpositron',
        zoom_start = 8) # I can add back min zoom if desired

    for index, row in df.iterrows():
        decade = row['decade']
        if decade not in color_pallet:
            # No colour defined for this decade: leave the row off the map.
            continue
        color = color_pallet[decade]
        folium.CircleMarker([row['latitude'], row['longitude']], popup = row['address'], fill_color=color, fill=True, color=color).add_to(map1)

    map1 = add_categorical_legend(map1, 'Decades', colors = color_pallet.values(), labels = color_pallet.keys())

    map1.save('THADS_map_' + ts + '.html')

In [41]:
# Load the source CSV and build the address / decade columns.
df = import_data('ai_life_members_1882.csv')

# Output files are written to the current working directory.
# NOTE(review): `dir` shadows the built-in dir() for the rest of the session.
dir = os.getcwd()

# Date tag used in the output file names.
ts = time_stamp()

# Uncomment to geocode only the first 200 rows for a quick trial run.
#df = df.head(200)

# Nov. 24 version takes 24 minutes
df = geolocate(df, dir, ts, clean_data = True)

# Marker colour per decade; rows whose decade is not a key here are not drawn.
color_pallet = {
    1820:'#2acaea',
    1830:'#DFFF00', 
    1840:'#FFBF00', 
    1850:'#FF7F50',
    1860:'#DE3163',
    1870:'#9FE2BF',
    1880:'#6495ED'
    }

export_map(df, color_pallet)

In [42]:
# NOTE(review): this repeats the geolocate() call of the previous cell on the
# frame that call returned, so every address is queried again; the saved
# output below shows a network error being retried during that run. Confirm
# whether this cell is still needed.
df = geolocate(df, dir, ts, clean_data = True)

RateLimiter caught an error, retrying (0/2 tries). Called with (*('New Church st.,New York,NY,United States',), **{}).
Traceback (most recent call last):
  File "/home/stephbuon/miniconda/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/stephbuon/miniconda/lib/python3.8/site-packages/urllib3/connectionpool.py", line 416, in _make_request
    httplib_response = conn.getresponse()
  File "/home/stephbuon/miniconda/lib/python3.8/http/client.py", line 1332, in getresponse
    response.begin()
  File "/home/stephbuon/miniconda/lib/python3.8/http/client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "/home/stephbuon/miniconda/lib/python3.8/http/client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/stephbuon/miniconda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock

In [46]:
# Marker colour per decade (same values as the palette defined in the cell
# that runs the full pipeline); redefined here so the map can be re-exported
# without repeating the geocoding step.
color_pallet = {
    1820:'#2acaea',
    1830:'#DFFF00', 
    1840:'#FFBF00', 
    1850:'#FF7F50',
    1860:'#DE3163',
    1870:'#9FE2BF',
    1880:'#6495ED'
    }

# Rebuild and save the HTML map from the current `df`.
export_map(df, color_pallet)

In [None]:
# Script entry point: geocode the CSV named on the command line and export the map.
# NOTE(review): inside a notebook kernel __name__ is also '__main__' and
# sys.argv holds the kernel's own arguments -- confirm this cell is only meant
# to run after the notebook has been converted to a script.
if __name__ == '__main__':
    try:
        input_file = sys.argv[1]
    except IndexError:
        # sys.exit is the supported way to stop a program; the bare exit()
        # helper is installed by the `site` module for interactive use.
        sys.exit('Missing input file argument')

    df = import_data(input_file)

    # Output files go to the current working directory, tagged with today's date.
    dir = os.getcwd()
    ts = time_stamp()

    df = geolocate(df, dir, ts, clean_data = True)

    # Marker colour per decade; rows whose decade is not a key here are not drawn.
    color_pallet = {
        1820:'#2acaea',
        1830:'#DFFF00',
        1840:'#FFBF00',
        1850:'#FF7F50',
        1860:'#DE3163',
        1870:'#9FE2BF',
        1880:'#6495ED'
        }

    export_map(df, color_pallet)