In [189]:
# Dependencies
import os
import pathlib
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

# Import the API key
from config import geoapify_key


In [128]:
# URL for GET requests to retrieve brewery data
base_url = 'https://api.openbrewerydb.org/v1/breweries?by_country=United_States'

#example of the url for the first page
#https://api.openbrewerydb.org/v1/breweries?by_country=United_States&page=1&per_page=200
#query_url = base_url + "&page="+ page + "&per_page=200"

# Define an empty list to fetch the page data for each set of data
brewery_data = []

# Loop through the pages to get all the data
for page in range(1, 156):
    query_url = base_url + "&page="+ str(page) + "&per_page=200"
    response = requests.get(query_url).json()
    brewery_data.append(response)

# Create a list of all the breweries
breweries = []
for page in brewery_data:
    for brewery in page:
        breweries.append(brewery)

# Create a dataframe from the list of breweries
breweries_df = pd.DataFrame(breweries)

breweries_df.head()


Unnamed: 0,id,name,brewery_type,address_1,address_2,address_3,city,state_province,postal_code,country,longitude,latitude,phone,website_url,state,street
0,5128df48-79fc-4f0f-8b52-d06be54d0cec,(405) Brewing Co,micro,1716 Topeka St,,,Norman,Oklahoma,73069-8224,United States,-97.46818222,35.25738891,4058160490,http://www.405brewing.com,Oklahoma,1716 Topeka St
1,9c5a66c8-cc13-416f-a5d9-0a769c87d318,(512) Brewing Co,micro,407 Radam Ln Ste F200,,,Austin,Texas,78745-1197,United States,,,5129211545,http://www.512brewing.com,Texas,407 Radam Ln Ste F200
2,ef970757-fe42-416f-931d-722451f1f59c,10 Barrel Brewing Co,large,1501 E St,,,San Diego,California,92101-6618,United States,-117.129593,32.714813,6195782311,http://10barrel.com,California,1501 E St
3,6d14b220-8926-4521-8d19-b98a2d6ec3db,10 Barrel Brewing Co,large,62970 18th St,,,Bend,Oregon,97701-9847,United States,-121.281706,44.08683531,5415851007,http://www.10barrel.com,Oregon,62970 18th St
4,e2e78bd8-80ff-4a61-a65c-3bfbd9d76ce2,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,,,Bend,Oregon,97703-2465,United States,-121.3288021,44.0575649,5415851007,,Oregon,1135 NW Galveston Ave Ste B


In [129]:
# Pretty print JSON for all launchpads

#response = requests.get(query_url).json()
#print(json.dumps(response, indent=4, sort_keys=True))

In [130]:
#create a dataframe from the json response
#breweries_df = pd.DataFrame(response)
#breweries_df.head(4)

In [131]:
#creates a summary table of the data
breweries_df.describe()


Unnamed: 0,id,name,brewery_type,address_1,address_2,address_3,city,state_province,postal_code,country,longitude,latitude,phone,website_url,state,street
count,7936,7936,7936,7153,4,0.0,7936,7936,7936,7936,5544.0,5544.0,7141,6787,7936,7153
unique,7936,7791,10,7064,4,0.0,2908,54,7695,2,5433.0,5433.0,6950,6437,54,7064
top,5128df48-79fc-4f0f-8b52-d06be54d0cec,Granite City Food & Brewery,micro,303 Main St,Estacada,,Portland,California,64108,United States,-112.0773456,33.4485866,5122442739,http://www.gcfb.net,California,303 Main St
freq,1,6,4115,3,1,,105,912,5,7935,6.0,6.0,5,23,912,3


In [193]:
#Prepare the breweries Database for analysis 
#drop brevery id, address_2, address_3, country, phone, website_url, and street
clean_breweries_df = breweries_df.drop(columns=['id', 'address_2', 'address_3', 'country', 'phone', 'website_url', 'street'])
clean_breweries_df.head()

Unnamed: 0,name,brewery_type,address_1,city,state_province,postal_code,longitude,latitude,state
0,(405) Brewing Co,micro,1716 Topeka St,Norman,Oklahoma,73069-8224,-97.46818222,35.25738891,Oklahoma
1,(512) Brewing Co,micro,407 Radam Ln Ste F200,Austin,Texas,78745-1197,,,Texas
2,10 Barrel Brewing Co,large,1501 E St,San Diego,California,92101-6618,-117.129593,32.714813,California
3,10 Barrel Brewing Co,large,62970 18th St,Bend,Oregon,97701-9847,-121.281706,44.08683531,Oregon
4,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,Bend,Oregon,97703-2465,-121.3288021,44.0575649,Oregon


In [194]:
#count missing values for each column
clean_breweries_df.isnull()
#count missing values for each column 
clean_breweries_df.isnull().sum()

# TEAM: We need to decide how to handle the missing values.  Do we drop the rows?

name                 0
brewery_type         0
address_1          783
city                 0
state_province       0
postal_code          0
longitude         2392
latitude          2392
state                0
dtype: int64

In [203]:
#create a new column in dataframe with first 5 digits of zip code - geopify doesn't look up +4
clean_breweries_df['zip_code'] = clean_breweries_df['postal_code'].str[:5]
clean_breweries_df.head()


Unnamed: 0,name,brewery_type,address_1,city,state_province,postal_code,longitude,latitude,state,zip_code
0,(405) Brewing Co,micro,1716 Topeka St,Norman,Oklahoma,73069-8224,-97.46818222,35.25738891,Oklahoma,73069
1,(512) Brewing Co,micro,407 Radam Ln Ste F200,Austin,Texas,78745-1197,,,Texas,78745
2,10 Barrel Brewing Co,large,1501 E St,San Diego,California,92101-6618,-117.129593,32.714813,California,92101
3,10 Barrel Brewing Co,large,62970 18th St,Bend,Oregon,97701-9847,-121.281706,44.08683531,Oregon,97701
4,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,Bend,Oregon,97703-2465,-121.3288021,44.0575649,Oregon,97703


In [205]:


#load dataframe into csv
clean_breweries_df.to_csv('breweries.csv', index=False)
#show count by the length of the postal code to see if there are any invalid postal codes
clean_breweries_df['zip_code'].str.len().value_counts()

5    7936
Name: zip_code, dtype: int64

In [206]:
#use dataframe from the csv breveries.csv
input_path = os.path.join("breweries.csv")
csv_breweries_df = pd.read_csv(input_path)

In [None]:
#postcode = "78745"
#target_url = f"https://api.geoapify.com/v1/geocode/autocomplete?text={postcode}&limit=1&type=postcode&format=json&apiKey={geoapify_key}"
#response = requests.get(target_url).json()
#print(json.dumps(response, indent=4, sort_keys=True))


In [207]:
#exctract rows with missing longitude and create a new dataframe
missing_long_df = csv_breweries_df[csv_breweries_df['longitude'].isnull()]
missing_long_df.head()


Unnamed: 0,name,brewery_type,address_1,city,state_province,postal_code,longitude,latitude,state,zip_code
1,(512) Brewing Co,micro,407 Radam Ln Ste F200,Austin,Texas,78745-1197,,,Texas,78745
18,12 Gates Brewing Company,brewpub,80 Earhart Dr Ste 20,Williamsville,New York,14221-7804,,,New York,14221
19,12 West Brewing Company,micro,3000 E Ray Rd Bldg 6,Gilbert,Arizona,85296-7832,,,Arizona,85296
24,12welve Eyes Brewing,micro,141 E 4th St Ste LL2,Saint Paul,Minnesota,55101-1639,,,Minnesota,55101
26,13 Stripes Brewery,brewpub,"250 Mill St, Suite PW3101",Taylors,South Carolina,29687,,,South Carolina,29687


In [202]:

#if longitude and latitude null then use the value from respective zip_code to get the longitude from the api
#store the longitude in the dataframe
for index, row in missing_long_df.iterrows():
                if pd.isnull(row['longitude']) and pd.isnull(row['latitude']):
                        postcode = row['zip_code']
                        target_url = f"https://api.geoapify.com/v1/geocode/autocomplete?text={postcode}&limit=1&type=postcode&format=json&apiKey={geoapify_key}"
                        response = requests.get(target_url).json()
                        missing_long_df.loc[index, 'longitude'] = response['results'][0]['lon']
                        missing_long_df.loc[index, 'latitude'] = response['results'][0]['lat']
                        print(f"index: {index}, postcode: {postcode}, longitude: {response['results'][0]['lon']}, latitude: {response['results'][0]['lat']}")
                        
missing_long_df.head()

index: 1, postcode: 78745, longitude: -97.792614846, latitude: 30.208605656
index: 18, postcode: 14221, longitude: -78.729920877, latitude: 42.980952841
index: 19, postcode: 85296, longitude: -111.762518433, latitude: 33.335136701
index: 24, postcode: 55101, longitude: -93.088300242, latitude: 44.955919207
index: 26, postcode: 29687, longitude: -82.327800669, latitude: 34.991467109
index: 28, postcode: 27603, longitude: -78.66089377, latitude: 35.71301003
index: 30, postcode: 56442, longitude: -94.116315338, latitude: 46.677860236
index: 32, postcode: 5478, longitude: -68.67620435, latitude: -31.547226
index: 36, postcode: 21157, longitude: -76.983937143, latitude: 39.558197671
index: 39, postcode: 22553, longitude: -77.60125509, latitude: 38.231176411
index: 49, postcode: 11741, longitude: -73.070358674, latitude: 40.794962148
index: 50, postcode: 5452, longitude: -88.922086084, latitude: 45.542678653
index: 55, postcode: 20109, longitude: -77.506105128, latitude: 38.784153209
index: 

IndexError: list index out of range