In [165]:
# Import Depencies

import pandas as pd
import polars as pl
import requests
from pprint import pprint
from requests.structures import CaseInsensitiveDict

from apikey import apikey

# Turn off warning messages
import warnings
warnings.filterwarnings("ignore")

In [166]:
# Section for user input. 
# Create empty variables to let the user input coordinates. Create city/coordinate options for users who do not want to input their own coordinates.
lat = ''
lon = ''

default = [0, 0]
cleveland = [41.4993, -81.6944]
atlanta = [33.7488, -84.3877]
los_angeles = [34.0549, -118.2426]
new_york_city = [40.7128, -74.0060]
chicago = [41.8781, -87.6298]
seattle = [47.6061, -122.3328]

# Set up variable for user to search for their own coordinates. User_defined or written to pre-determined
location_search = [123456789,123456789] 

# Create list of cities with pre-written coordinates
default_city_list =  {'Cleveland' : cleveland , 'Atlanta' : atlanta, 'Los Angeles' : los_angeles, 'New York City' : new_york_city, 'Chicago' : chicago, 'Seattle' : seattle}

# Display cities for user to consider pre-written options
print('Available list of cities\n')
for k, v in default_city_list.items():
    print(k)

Available list of cities

Cleveland
Atlanta
Los Angeles
New York City
Chicago
Seattle


In [167]:
# Ask user for input on if they would like to pick from pre-written cities
user_coordinates = input("Would you like pick from a list of available cities? yes or no")

# If they choose to use a pre-written city, have the user pick a city from the pre-written list
if user_coordinates.lower() == 'y' or user_coordinates.lower() == 'yes':

    match = input("Please pick a city from the list.\n")

    for k, v in default_city_list.items():
        if match.lower() == k.lower():
            location_search = v

# If the user does not want to pick from the pre-written list, allow them to input their own coordinates
elif user_coordinates.lower() == 'n' or user_coordinates.lower() == 'no':
    user_lat = input("Please enter the latitude. ")
    user_lon = input("Please enter the longitude. ")
    location_search = [float(user_lat), float(user_lon)]

# Let the user know that their only options are to pick from the list or not pick from the list
else:
    raise Exception("Invalid Input. Must specify Yes (y) or No (n)")

# Make sure the location search is filled in with city coordinates every time
if location_search == [123456789, 123456789]:
    raise Exception("Invalid location. Use a name that is in the list, or choose No (n) your own coordinates.")

# Display the coordinates that will be used in the location search    
print(f'Using coordinates {location_search}\n')


Using coordinates [41.4993, -81.6944]



In [168]:
# Set search parameters as default values, except for the country code which will be filled in later
radius_meters = 5000
country_code = ''
limit = 500
offset = 0 #default is zero


# Set the filter to search for locations within the parameters
location_filter = f'circle:{location_search[1]},{location_search[0]},{radius_meters}'

# Keep for future flexibility
if country_code != "":
    country_code = country_code + f"|countrycode:{country_code}"

# Format headers to be used later
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"

In [169]:
# Create list of categories to be filled in to filter data in search.
# any category can be added - it must also be added to category_dict

category_string = ""

activity = False
commercial = True
commercial_catering = False
commercial_supermarket = False
internet_access = True
accomodations = False
entertainment = True
leisure = False
parking = True
tourism = False
wheelchair = False
dogs = False
vegetarian = False
vegan = True
gluten_free = False
organic = False


In [170]:
# Create dictionary to house each category that will be captured
category_dict = {'activity' : activity, 'commercial' : commercial, 'commerical.catering' : commercial_catering, 'accomodation' : accomodations, \
                        'entertainment' : entertainment, 'leisure': leisure, 'parking' : parking, 'wheelchair.yes' : wheelchair, 'internet_access' : internet_access, \
                            'dogs': dogs, 'vegetarian': vegetarian, 'vegan' : vegan, 'gluten_free': gluten_free, 'organic' : organic, 'tourism': tourism}

# Set up string to input categories to add to base url
string_of_categories = ""

# Capture the key for each category we want to include in the search
for k,v in category_dict.items():
    if v:
        string_of_categories = string_of_categories + k + ','

# Remove last comma
string_of_categories = string_of_categories[:-1]

# API string built for flexibility
url2 = f"https://api.geoapify.com/v2/places?categories={string_of_categories}&filter={location_filter}&limit={limit}&offset={offset}&apiKey={apikey}"


In [171]:
# Search API / get API dictionary
resp = requests.get(url2, headers=headers)

# View response status to see if search was successful
print(resp.status_code)

# Store the response to manipulate data received
json_info = resp.json()

200


In [172]:
# View first response, only for debugging, may be commented out later
pprint(json_info["features"][0])

{'geometry': {'coordinates': [-81.69677670688294, 41.50747877025738],
              'type': 'Point'},
 'properties': {'address_line1': 'Great Lakes Science Center',
                'address_line2': '601 Erieside Avenue, Cleveland, OH 44114, '
                                 'United States of America',
                'categories': ['building',
                               'building.tourism',
                               'entertainment',
                               'entertainment.museum',
                               'internet_access',
                               'wheelchair',
                               'wheelchair.yes'],
                'city': 'Cleveland',
                'country': 'United States',
                'country_code': 'us',
                'county': 'Cuyahoga County',
                'datasource': {'attribution': '© OpenStreetMap contributors',
                               'license': 'Open Database Licence',
                               'raw': {'addr:

In [173]:
# Create error checking to make sure it is only US locations. String split is based on US cities.
# Do this to prevent the code from stalling with cities in other countries which have differently formatted data
if json_info["features"][0]["properties"]["country_code"] != 'us':
    raise Exception('This process is built for US cities only.')

In [174]:
# Create dataframe from the stored json response from the API search
features_pd = pd.json_normalize(json_info["features"])

In [175]:
# Create dataframe which will become the first table in the future database. Only take categories which hold address and location information as columns for new df
properties_df = features_pd[["properties.address_line1", "properties.address_line2", "properties.place_id"]]

In [176]:
# Rename columns
properties_df = properties_df.rename(columns = {"properties.address_line1": "Property", "properties.address_line2": "Address2", "properties.place_id": "Place_ID"})

# View dataframe to see if this was successful, may be commented out once confirmed
#properties_df

In [177]:
# View top of new dataframe to see what we are working with and decide next steps
properties_df.head(5)

Unnamed: 0,Property,Address2,Place_ID
0,Great Lakes Science Center,"601 Erieside Avenue, Cleveland, OH 44114, Unit...",51512b54fd976c54c059bad57810f5c04440f00102f901...
1,International Women's Air and Space Museum,"1501 North Marginal Road, Cleveland, OH 44114,...",51b2dc88dd2d6c54c059cf10648275c14440f00103f901...
2,Main Branch Cleveland Public Library,"Superior Avenue East, Cleveland, OH 44114, Uni...",519cd26ebb406c54c059c44519842bc04440f00102f901...
3,A Christmas Story House Museum,"1103 Rowley Avenue, Cleveland, OH 44109, Unite...",5164ffa097066c54c0594f6fd80affbb4440f00102f901...
4,Hilton Cleveland Downtown,"100 Lakeside Avenue East, Cleveland, OH 44114,...",5125dac4ea846c54c0594a4c1efa5ac04440f00102f901...


In [178]:
# Create new separate columns for the address fields from the renamed address column
properties_df[['Address', 'City', 'State_Zip', 'Country']] = properties_df.Address2.str.split(",", expand = True)


In [179]:
# View top of dataframe to confirm process was successful
properties_df.head(5)

Unnamed: 0,Property,Address2,Place_ID,Address,City,State_Zip,Country
0,Great Lakes Science Center,"601 Erieside Avenue, Cleveland, OH 44114, Unit...",51512b54fd976c54c059bad57810f5c04440f00102f901...,601 Erieside Avenue,Cleveland,OH 44114,United States of America
1,International Women's Air and Space Museum,"1501 North Marginal Road, Cleveland, OH 44114,...",51b2dc88dd2d6c54c059cf10648275c14440f00103f901...,1501 North Marginal Road,Cleveland,OH 44114,United States of America
2,Main Branch Cleveland Public Library,"Superior Avenue East, Cleveland, OH 44114, Uni...",519cd26ebb406c54c059c44519842bc04440f00102f901...,Superior Avenue East,Cleveland,OH 44114,United States of America
3,A Christmas Story House Museum,"1103 Rowley Avenue, Cleveland, OH 44109, Unite...",5164ffa097066c54c0594f6fd80affbb4440f00102f901...,1103 Rowley Avenue,Cleveland,OH 44109,United States of America
4,Hilton Cleveland Downtown,"100 Lakeside Avenue East, Cleveland, OH 44114,...",5125dac4ea846c54c0594a4c1efa5ac04440f00102f901...,100 Lakeside Avenue East,Cleveland,OH 44114,United States of America


In [180]:
# Drop any rows with empty values
properties_df = properties_df.dropna()

In [181]:
# Remove any spaces or whitespaces from data to more easily manipulate
properties_df["State_Zip"] = properties_df['State_Zip'].str.strip()

In [182]:
# Split State_Zip column into separate columns for the state and the zip code
properties_df[['State', 'Zip']] = properties_df.State_Zip.str.split(" ", expand = True)

In [183]:
# View top of dataframe to confirm process was successful
properties_df.head(5)

Unnamed: 0,Property,Address2,Place_ID,Address,City,State_Zip,Country,State,Zip
0,Great Lakes Science Center,"601 Erieside Avenue, Cleveland, OH 44114, Unit...",51512b54fd976c54c059bad57810f5c04440f00102f901...,601 Erieside Avenue,Cleveland,OH 44114,United States of America,OH,44114
1,International Women's Air and Space Museum,"1501 North Marginal Road, Cleveland, OH 44114,...",51b2dc88dd2d6c54c059cf10648275c14440f00103f901...,1501 North Marginal Road,Cleveland,OH 44114,United States of America,OH,44114
2,Main Branch Cleveland Public Library,"Superior Avenue East, Cleveland, OH 44114, Uni...",519cd26ebb406c54c059c44519842bc04440f00102f901...,Superior Avenue East,Cleveland,OH 44114,United States of America,OH,44114
3,A Christmas Story House Museum,"1103 Rowley Avenue, Cleveland, OH 44109, Unite...",5164ffa097066c54c0594f6fd80affbb4440f00102f901...,1103 Rowley Avenue,Cleveland,OH 44109,United States of America,OH,44109
4,Hilton Cleveland Downtown,"100 Lakeside Avenue East, Cleveland, OH 44114,...",5125dac4ea846c54c0594a4c1efa5ac04440f00102f901...,100 Lakeside Avenue East,Cleveland,OH 44114,United States of America,OH,44114


In [184]:
# Remove the address2 column now that the addresses have been separated into new columns
properties_df = properties_df.drop('Address2', axis=1)

In [185]:
# Remove the State_Zip column now that the state and zip code have been separated into new columns
properties_df = properties_df.drop('State_Zip', axis=1)

In [186]:
# Order the columns in the way in which we will put the data into the database
properties_df = properties_df.reindex(['Place_ID', 'Property', 'Address', 'City', 'State', 'Zip', 'Country'], axis=1)

In [187]:
# View top of dataframe to confirm process was successful
properties_df.head(5)

Unnamed: 0,Place_ID,Property,Address,City,State,Zip,Country
0,51512b54fd976c54c059bad57810f5c04440f00102f901...,Great Lakes Science Center,601 Erieside Avenue,Cleveland,OH,44114,United States of America
1,51b2dc88dd2d6c54c059cf10648275c14440f00103f901...,International Women's Air and Space Museum,1501 North Marginal Road,Cleveland,OH,44114,United States of America
2,519cd26ebb406c54c059c44519842bc04440f00102f901...,Main Branch Cleveland Public Library,Superior Avenue East,Cleveland,OH,44114,United States of America
3,5164ffa097066c54c0594f6fd80affbb4440f00102f901...,A Christmas Story House Museum,1103 Rowley Avenue,Cleveland,OH,44109,United States of America
4,5125dac4ea846c54c0594a4c1efa5ac04440f00102f901...,Hilton Cleveland Downtown,100 Lakeside Avenue East,Cleveland,OH,44114,United States of America


In [188]:
# Convert pandas dataframe to polars
properties_df_pl = pl.from_pandas(properties_df)

In [199]:
# View top of dataframe to confirm process was successful
properties_df_pl.head(5)

Place_ID,Property,Address,City,State,Zip,Country
str,str,str,str,str,str,str
"""51512b54fd976c…","""Great Lakes Sc…","""601 Erieside A…",""" Cleveland""","""OH""","""44114""",""" United States…"
"""51b2dc88dd2d6c…","""International …","""1501 North Mar…",""" Cleveland""","""OH""","""44114""",""" United States…"
"""519cd26ebb406c…","""Main Branch Cl…","""Superior Avenu…",""" Cleveland""","""OH""","""44114""",""" United States…"
"""5164ffa097066c…","""A Christmas St…","""1103 Rowley Av…",""" Cleveland""","""OH""","""44109""",""" United States…"
"""5125dac4ea846c…","""Hilton Clevela…","""100 Lakeside A…",""" Cleveland""","""OH""","""44114""",""" United States…"


In [190]:
# Create empty dataframe to hold data for the second table we would like to make that will house category data from the API search
category_table_df = pd.DataFrame(columns = ['Place_ID', 'Category'])

In [191]:
# Insert each row from the stored json response into categories dataframe
num_of_records = len(json_info['features'])

for i in range(num_of_records):
    place_id_var = json_info["features"][i]["properties"]["place_id"]

    for category in json_info["features"][i]["properties"]["categories"]:
        #print(j)
        new_row = {'Place_ID': place_id_var, 'Category': category}

        #https://stackoverflow.com/questions/75956209/error-dataframe-object-has-no-attribute-append
        category_table_df = category_table_df._append(new_row, ignore_index = True)

In [192]:
# Add index to category df
# https://stackoverflow.com/questions/12168648/how-to-add-a-column-with-values-1-to-lendf-to-a-dataframe
category_table_df["Index"] = range(1, len(category_table_df) + 1)

# View the top of the dataframe to make sure the column was added
category_table_df.head()

Unnamed: 0,Place_ID,Category,Index
0,51512b54fd976c54c059bad57810f5c04440f00102f901...,building,1
1,51512b54fd976c54c059bad57810f5c04440f00102f901...,building.tourism,2
2,51512b54fd976c54c059bad57810f5c04440f00102f901...,entertainment,3
3,51512b54fd976c54c059bad57810f5c04440f00102f901...,entertainment.museum,4
4,51512b54fd976c54c059bad57810f5c04440f00102f901...,internet_access,5


In [193]:
# reorder columns
category_table_df = category_table_df[['Index', 'Place_ID', 'Category']]

In [200]:
# Convert to polars dataframe, display head of dataframe so that we can see if the process was successful
category_table_df_pl = pl.from_pandas(category_table_df)
category_table_df_pl.head(5)

Index,Place_ID,Category
i64,str,str
1,"""51512b54fd976c…","""building"""
2,"""51512b54fd976c…","""building.touri…"
3,"""51512b54fd976c…","""entertainment"""
4,"""51512b54fd976c…","""entertainment.…"
5,"""51512b54fd976c…","""internet_acces…"


In [195]:
# Create CSV file for address data to be imported to PostgreSQL from dataframes, use polars
# https://docs.pola.rs/user-guide/io/csv/
properties_df_pl.write_csv("../CSV Files/Addresses.csv", separator=",")

In [196]:
# Create CSV file for category data to be imported to PostgreSQL from dataframes, use polars
# https://docs.pola.rs/user-guide/io/csv/
category_table_df_pl.write_csv("../CSV Files/Categories.csv", separator=",")