In [1]:
# Dependencies
import os
import pathlib
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from uszipcode import SearchEngine


# Import the API key
from config import geoapify_key
from config import census_key




ModuleNotFoundError: No module named 'api_keys'

DATA CLEANUP

1. The first set of data was obtained from the OpenBreweryAPI. The API is a free API, however it limits the number of queries to 200 rows and restricting sourcing of data to 2 pages at a time. In order to source all necessary data, it was necessary to create a loop to move through all 156 pages of the API and collect 100 cases per page or 200 every two pages. 

In [3]:
# URL for GET requests to retrieve brewery data
base_url = 'https://api.openbrewerydb.org/v1/breweries?by_country=United_States'


# Define an empty list to fetch the page data for each set of data
brewery_data = []

# Loop through the pages to get all the data
for page in range(1, 156):
    query_url = base_url + "&page="+ str(page) + "&per_page=200"
    response = requests.get(query_url).json()
    brewery_data.append(response)

# Create a list of all the breweries
breweries = []
for page in brewery_data:
    for brewery in page:
        breweries.append(brewery)

# Create a dataframe from the list of breweries
breweries_df = pd.DataFrame(breweries)

breweries_df.head()


Unnamed: 0,id,name,brewery_type,address_1,address_2,address_3,city,state_province,postal_code,country,longitude,latitude,phone,website_url,state,street
0,5128df48-79fc-4f0f-8b52-d06be54d0cec,(405) Brewing Co,micro,1716 Topeka St,,,Norman,Oklahoma,73069-8224,United States,-97.46818222,35.25738891,4058160490,http://www.405brewing.com,Oklahoma,1716 Topeka St
1,9c5a66c8-cc13-416f-a5d9-0a769c87d318,(512) Brewing Co,micro,407 Radam Ln Ste F200,,,Austin,Texas,78745-1197,United States,,,5129211545,http://www.512brewing.com,Texas,407 Radam Ln Ste F200
2,ef970757-fe42-416f-931d-722451f1f59c,10 Barrel Brewing Co,large,1501 E St,,,San Diego,California,92101-6618,United States,-117.129593,32.714813,6195782311,http://10barrel.com,California,1501 E St
3,6d14b220-8926-4521-8d19-b98a2d6ec3db,10 Barrel Brewing Co,large,62970 18th St,,,Bend,Oregon,97701-9847,United States,-121.281706,44.08683531,5415851007,http://www.10barrel.com,Oregon,62970 18th St
4,e2e78bd8-80ff-4a61-a65c-3bfbd9d76ce2,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,,,Bend,Oregon,97703-2465,United States,-121.3288021,44.0575649,5415851007,,Oregon,1135 NW Galveston Ave Ste B


It can be seen right away that the dataframe might be missing some values - the most importantly latitude and longitue which was important for maping visualizations. 

The describe function below shows that 2,392 rows are misisng the longitude and latitude. Since the number of rows consitutde 30% of the full data set, 
it was important to backfull the missing information.   

In [5]:
#creates a summary table of the data
breweries_df.describe()

Unnamed: 0,id,name,brewery_type,address_1,address_2,address_3,city,state_province,postal_code,country,longitude,latitude,phone,website_url,state,street
count,7936,7936,7936,7153,4,0.0,7936,7936,7936,7936,5544.0,5544.0,7141,6787,7936,7153
unique,7936,7791,10,7064,4,0.0,2908,54,7695,2,5433.0,5433.0,6950,6437,54,7064
top,5128df48-79fc-4f0f-8b52-d06be54d0cec,Granite City Food & Brewery,micro,303 Main St,Estacada,,Portland,California,64108,United States,-112.0773456,33.4485866,5122442739,http://www.gcfb.net,California,303 Main St
freq,1,6,4115,3,1,,105,912,5,7935,6.0,6.0,5,23,912,3


In [6]:
#Prepare the breweries Database for analysis 
#drop brevery id, address_2, address_3, country, phone, website_url, and street
clean_breweries_df = breweries_df.drop(columns=['id', 'address_2', 'address_3', 'country', 'phone', 'website_url', 'street'])
clean_breweries_df.head()

Unnamed: 0,name,brewery_type,address_1,city,state_province,postal_code,longitude,latitude,state
0,(405) Brewing Co,micro,1716 Topeka St,Norman,Oklahoma,73069-8224,-97.46818222,35.25738891,Oklahoma
1,(512) Brewing Co,micro,407 Radam Ln Ste F200,Austin,Texas,78745-1197,,,Texas
2,10 Barrel Brewing Co,large,1501 E St,San Diego,California,92101-6618,-117.129593,32.714813,California
3,10 Barrel Brewing Co,large,62970 18th St,Bend,Oregon,97701-9847,-121.281706,44.08683531,Oregon
4,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,Bend,Oregon,97703-2465,-121.3288021,44.0575649,Oregon


Here's further confirmation that the necessary longtidue and latitude info is misisng in large numbers.   

In [7]:
#count missing values for each column
clean_breweries_df.isnull()
#count missing values for each column 
clean_breweries_df.isnull().sum()

# TEAM: We need to decide how to handle the missing values.  Do we drop the rows?

name                 0
brewery_type         0
address_1          783
city                 0
state_province       0
postal_code          0
longitude         2392
latitude          2392
state                0
dtype: int64

Decided to use Geopify to pull latitude and longtiude for the missing zip codes. Since Geopify did not recognize the postal code +4 format; 
opted to use the uszipcode package to extract the first 5 digits. While long/lat can be also pulled via uszipcode package, the exercise was to demonstrate the use of API for sourcing additional information. Using the package will also allow pulling additional information later.  

In [8]:
#create a new column in dataframe with first 5 digits of zip code ensuring the information is stored as stings & recognized as zipcodes.

search = SearchEngine()

def extract_zipcode(zipcode):
    # Remove +4 extension if present
    zipcode = zipcode.split('-')[0]
    zipcode_obj = search.by_zipcode(zipcode)
    if zipcode_obj is not None:
        return str(zipcode_obj.zipcode)[:5].zfill(5)
    else:
        return None

clean_breweries_df['zip_code'] = clean_breweries_df['postal_code'].apply(extract_zipcode).astype(str)

#add a ditinct ID number to each row 
clean_breweries_df['brewery_id'] = clean_breweries_df.index + 1
clean_breweries_df.head()


Unnamed: 0,name,brewery_type,address_1,city,state_province,postal_code,longitude,latitude,state,zip_code,brewery_id
0,(405) Brewing Co,micro,1716 Topeka St,Norman,Oklahoma,73069-8224,-97.46818222,35.25738891,Oklahoma,73069,1
1,(512) Brewing Co,micro,407 Radam Ln Ste F200,Austin,Texas,78745-1197,,,Texas,78745,2
2,10 Barrel Brewing Co,large,1501 E St,San Diego,California,92101-6618,-117.129593,32.714813,California,92101,3
3,10 Barrel Brewing Co,large,62970 18th St,Bend,Oregon,97701-9847,-121.281706,44.08683531,Oregon,97701,4
4,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,Bend,Oregon,97703-2465,-121.3288021,44.0575649,Oregon,97703,5


In [9]:

#load dataframe into csv
clean_breweries_df.to_csv('csv building blocks/breweries.csv', index=False)
#show count by the length of the postal code to see if there are any invalid postal codes
clean_breweries_df['zip_code'].str.len().value_counts()

5    7929
4       7
Name: zip_code, dtype: int64

In [10]:
#use dataframe from the csv breveries.csv
input_path = os.path.join("csv building blocks/breweries.csv")
csv_breweries_df = pd.read_csv(input_path)

In [12]:
#Quick test if the API request url worked for a specific zip code
#postcode = "78745"
#target_url = f"https://api.geoapify.com/v1/geocode/autocomplete?text={postcode}&limit=1&type=postcode&format=json&apiKey={geoapify_key}"
#response = requests.get(target_url).json()
#print(json.dumps(response, indent=4, sort_keys=True))


Creating a dataframe with missing information to temporarily separate from the clean of the data.

In [11]:
#extract rows with missing longitude and create a new dataframe - all longitude missing values correspond to missinging latitude values
missing_long_df = csv_breweries_df[csv_breweries_df['longitude'].isnull()]

#create a new dataframe with only the missing longitude rowsbased on clean_breweries_df
missing_long_df = clean_breweries_df[clean_breweries_df['longitude'].isnull()]

missing_long_df.head()


Unnamed: 0,name,brewery_type,address_1,city,state_province,postal_code,longitude,latitude,state,zip_code,brewery_id
1,(512) Brewing Co,micro,407 Radam Ln Ste F200,Austin,Texas,78745-1197,,,Texas,78745,2
18,12 Gates Brewing Company,brewpub,80 Earhart Dr Ste 20,Williamsville,New York,14221-7804,,,New York,14221,19
19,12 West Brewing Company,micro,3000 E Ray Rd Bldg 6,Gilbert,Arizona,85296-7832,,,Arizona,85296,20
24,12welve Eyes Brewing,micro,141 E 4th St Ste LL2,Saint Paul,Minnesota,55101-1639,,,Minnesota,55101,25
26,13 Stripes Brewery,brewpub,"250 Mill St, Suite PW3101",Taylors,South Carolina,29687,,,South Carolina,29687,27


In [12]:
#drop the missing_long dataframe into a csv file

missing_long_df.to_csv('csv building blocks/breweries_missing_values.csv', index=False)


In [13]:

#use dataframe from the csv_breweries_missing_values.csv
input_path2 = os.path.join("csv building blocks/breweries_missing_values.csv")
csv_breweries_missing_df = pd.read_csv(input_path2)


Finally demonstrate use of Geopify for obtianing missing information - encountered errors were bypassed with the try/except. 

In [23]:
#for missing longitude, use geopify to look up the longitude and latitude
for index, row in csv_breweries_missing_df.iterrows():
    postcode = row['zip_code']
    target_url = f"https://api.geoapify.com/v1/geocode/autocomplete?text={postcode}&limit=1&type=postcode&format=json&apiKey={geoapify_key}"
    response = requests.get(target_url)

    try:
        json_response = json.loads(response.text)
        if json_response:
            csv_breweries_missing_df.loc[index, 'longitude'] = json_response['results'][0]['lon']
            csv_breweries_missing_df.loc[index, 'latitude'] = json_response['results'][0]['lat']
            print(f"index: {index}, postcode: {postcode}, longitude: {json_response['results'][0]['lon']}, latitude: {json_response['results'][0]['lat']}") 

        else:
            print(f"Error: Empty JSON response for index {index}, postcode {postcode}")
    except IndexError:
        print(f"Error: IndexError occurred for index {index}, postcode {postcode}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON response for index {index}, postcode {postcode}")

csv_breweries_missing_df.head()

index: 0, postcode: 78745, longitude: -97.792614846, latitude: 30.208605656
index: 1, postcode: 14221, longitude: -78.729920877, latitude: 42.980952841
index: 2, postcode: 85296, longitude: -111.762518433, latitude: 33.335136701
index: 3, postcode: 55101, longitude: -93.088300242, latitude: 44.955919207
index: 4, postcode: 29687, longitude: -82.327800669, latitude: 34.991467109
index: 5, postcode: 27603, longitude: -78.66089377, latitude: 35.71301003
index: 6, postcode: 56442, longitude: -94.116315338, latitude: 46.677860236
index: 7, postcode: 05478, longitude: -73.104390267, latitude: 44.803484605
index: 8, postcode: 21157, longitude: -76.983937143, latitude: 39.558197671
index: 9, postcode: 22553, longitude: -77.60125509, latitude: 38.231176411
index: 10, postcode: 11741, longitude: -73.070358674, latitude: 40.794962148
index: 11, postcode: 05452, longitude: -73.08637315, latitude: 44.51595767
index: 12, postcode: 20109, longitude: -77.506105128, latitude: 38.784153209
index: 13, po

In [1]:
#save the dataframe with filled info into a csv file csv_breweries_missing_completed_df
csv_breweries_missing_df.to_csv('csv building blocks/breweries_missing_completed.csv', index=False)
csv_breweries_missing_df.head()

NameError: name 'csv_breweries_missing_df' is not defined

In [20]:
# Base URL for census
base_url = 'https://api.census.gov/data/2021/acs/acs1/profile?'

# Define the parameter
# Parameters google sheet link https://docs.google.com/spreadsheets/d/1Hm1cXEWH2ccg9TD9DUkOKDIaYO88VxDwFeTTsmGfNik/edit#gid=0

parameter_list_final = ['DP02_0001E','DP02_0006E','DP02_0010E','DP02_0025E','DP02_0025PE','DP02_0031E','DP02_0031PE','DP02_0067E','DP02_0067PE','DP02_0068E',
'DP02_0068PE','DP03_0001PE','DP03_0001E','DP03_0051E','DP03_0062E','DP03_0063E','DP03_0075E','DP03_0086E','DP03_0087E','DP03_0088E','DP03_0089E',
'DP03_0090E','DP03_0091E','DP03_0119E','DP03_0119PE','DP03_0128E','DP03_0128PE','DP04_0001E','DP04_0001PE','DP04_0006E','DP04_0006PE','DP05_0001E',
'P05_0001PE','DP05_0022E','DP05_0022PE','DP05_0008E','DP05_0008PE','DP05_0026E','DP05_0026PE','DP05_0027E','DP05_0027PE','DP03_0089E']

parameter_list_test = ['DP02_0001E','DP02_0006E','DP02_0010E','DP02_0025E']

census_data = []

# Loop through parameter list
for parameter in parameter_list_test:
    query_url = base_url + "get=NAME," + parameter + "&for=place:*&in=state:*" + "&key=" + census_key 
    response = requests.get(query_url).json()
    census_data.append(response)

# Create initial dataframce
first = census_data[0]
code = census_data[0][0][1]
merged_df = pd.DataFrame(first, columns = ['City',code, 'State ID', 'City ID'])

# Append multiple dataframes together (not finished yet)
for x in census_data:
    new_df = pd.DataFrame(x, columns = ['City',x[0][1], 'State ID', 'City ID'])
    result = pd.concat([merged_df, new_df], axis=1)

# Drop duplicate columns
df2 = result.T.drop_duplicates().T

# Drop 1st row
header_row = df2.iloc[0]
final_census_df = pd.DataFrame(df2.values[1:], columns=header_row)

#census_data
final_census_df

Unnamed: 0,NAME,DP02_0001E,state,place,DP02_0025E
0,"O'Fallon city, Missouri",34412,29,54074,34707
1,"St. Louis city, Missouri",139736,29,65000,118761
2,"Passaic city, New Jersey",20446,34,56550,24484
3,"Nashua city, New Hampshire",36986,33,50260,38993
4,"Rochester city, Minnesota",49984,27,54880,46609
...,...,...,...,...,...
629,"Ankeny city, Iowa",27720,19,02305,26730
630,"Waterloo city, Iowa",29948,19,82425,27250
631,"West Des Moines city, Iowa",36066,19,83910,29474
632,"Wichita city, Kansas",156668,20,79000,154469


In [None]:
#merge back missing info into the original breweries csv:
#you might need to first drop the longitude and latitude empty rows from the original dataframe clean_breweries_df and then add 
#the csv_breweries_missing_completed_df to the clean_breweries_df


#create a new clean dataframe that we can use for analysis by dropping remaining missing lat&long


#load the the final "clean dataframe" into the csv file



In [None]:
#extract a separate data set for "closed" breweries for separate analysis - VERY FEW btw 162 - and create a new dataframe for that one

#extract a separate data set for "planning" breweries for separate analysis - should be about 704 - and create a new dataframe for that one

#QUESTION FOR THE TEAM: Do we want to focus on "micro" and "brewpubs" only? if so, delete rows with all other brewery types


In [None]:
# create a summary dataframe with number of breweries by zip_code - listing City, State, and Zip Code, and brewery count  

# load additional information regrdidng the zipcode/each brewery cluster data from https://uszipcode.readthedocs.io/
# review documentation by following the link to see how the information populates and if useful for our analysis
# Some of the data to consider --> household_income; median_household_income population, population density, families_vs_singles, 
# median_home_value, educational_attainment_for_population_25_and_over 




In [None]:
#explore Census API Community Data API for any additional information that might be useful for our analysis - gender, race etc.
#explore articles for inspiration on what other data might be useful for our analysis
# https://www.census.gov/library/visualizations/interactive/breweries.html  --> Article about breweries for the Census 
#https://cbb.census.gov/cbb/#industry0=312120&geoId=17031&geoType=county&view=report&reportType=summary --> Brewery Busisness in Illinois 


#load relevant data into the zipcode summary table 