# Acquiring Beer Data 

Source: BreweryDB (https://www.brewerydb.com/)

In [None]:
#import dependencies
import api_setup as ap
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Download the beer pages from the API and collect the fields of interest
# into `beer_data`, one row per beer.

# Fields copied from each beer record ('glasswareId' was previously listed twice).
# Original author's note: withBreweries and withIngredients are completely empty
beer_features = ['id', 'name', 'nameDisplay', 'abv', 'styleId', 'year', 'glasswareId',
                 'isOrganic', 'breweries', 'isRetired', 'status', 'statusDisplay']

rows = []
for p in tqdm(range(1, 24)):
    data = ap.get_data(page=p, custom='&withBreweries=Y&withIngredients=Y')
    try:
        beers = data['data']
    except (KeyError, TypeError) as e:
        # Page came back without a usable 'data' payload: report it and move on.
        print(e)
        continue

    # Walk the beers actually returned rather than assuming 50 per page, so a
    # short (e.g. final) page no longer ends in an IndexError.
    for i, beer in enumerate(beers):
        row = {}
        for feature in beer_features:
            try:
                if feature == 'breweries':
                    # Only the id of the first listed brewery is kept.
                    row[feature] = beer[feature][0]['id']
                else:
                    row[feature] = beer[feature]
            except (KeyError, IndexError, TypeError) as e:
                # A missing field is reported and left empty (NaN) for this beer;
                # it must not discard the remaining beers on the page.
                print(f"name:{beer.get('name')}, page:{p}, item:{i}, feature:{feature}, error:{e}")
        rows.append(row)

# Build the frame in one step: DataFrame.set_value was removed in pandas 1.0 and
# cell-by-cell insertion is slow. dtype=object keeps the raw API values untouched
# (as the cell-wise version did); fields absent from a record become NaN.
beer_data = pd.DataFrame(rows, columns=beer_features, dtype=object)
count = len(rows)  # number of beers collected
                      
                      

In [None]:
# Preview the first rows of the downloaded table (columns still carry the API's names).
beer_data.head()

> We'll rename the columns from camelCase to snake_case so that naming is consistent throughout the notebook.

In [None]:
# Old API column name -> snake_case name used for the rest of the notebook.
snake_case_names = {
    'id': 'beer_id',
    'nameDisplay': 'display_name',
    'isOrganic': 'organic',
    'isRetired': 'retired',
    'breweries': 'brewery_id',
    'styleId': 'style_id',
    'glasswareId': 'glass_id',
}
beer_data.rename(columns=snake_case_names, inplace=True)

In [None]:
# Remove the statusDisplay column from the table in place.
beer_data.drop(columns=['statusDisplay'], inplace=True)

In [None]:
# (number of rows, number of columns) after the rename and drop above.
beer_data.shape

In [None]:
# Preview the first rows again to check the new column names.
beer_data.head()

In [None]:
# Persist the table for later use. The row index is written as the first
# (unnamed) CSV column, and the data/ directory has to exist already.
beer_data.to_csv("data/beers.csv")

> We can see that only 263 of the 1109 beers have a value for `glass_id`.

In [None]:
# Keep only the beers that report a glass type. The explicit copy() makes
# beer_glass an independent frame, so the dtype conversions applied to its
# columns further down do not raise SettingWithCopyWarning.
beer_glass = beer_data[beer_data["glass_id"].notna()].copy()
print(beer_glass.shape)
beer_glass.head()

> Below, we can see the distribution of glass types. The majority of the beers use glass 5. Because several other `glass_id` values have very little data, it would not be safe to generalize these results to other beers or glasses.

In [None]:
# Number of beers (non-null beer_id values) recorded for each glass type.
beer_glass.groupby('glass_id')['beer_id'].count()

> Before we can create our plots, we need to clean the data and make sure the data types meet the requirements of the plotting functions.

In [None]:
# beer_glass.info()
# shows that all variables are categorical; abv has to become a float and
# glass_id an int before they can be plotted.
# assign() rebinds beer_glass to a new frame instead of writing columns into
# what may be a filtered slice of beer_data (avoids SettingWithCopyWarning).
beer_glass = beer_glass.assign(
    abv=beer_glass['abv'].astype(float),
    glass_id=beer_glass['glass_id'].astype(int),
)
beer_glass.info()

In [None]:
# Box plot of abv, with one box per glass_id.
abv_glass = beer_glass.boxplot(column='abv', by='glass_id')
abv_glass

In [None]:
# Beers with a recorded abv. copy() detaches the selection from beer_data so
# the column conversion below does not raise SettingWithCopyWarning.
abv_data = beer_data[beer_data["abv"].notna()].copy()
abv_data['abv'] = pd.to_numeric(abv_data['abv'])

# Mean abv as a string with three significant digits, used in the chart annotation.
avg_abv = '{0:.3}'.format(abv_data['abv'].mean())

In [None]:
# Histogram of abv over all beers that have a value, drawn on the current axes.
ax = plt.gca()
ax.hist(abv_data['abv'], bins=40)

# Title, axis labels and the mean-abv annotation box.
ax.set_title("Alcohol by Volume Distribution for All Beers")
ax.set_xlabel('abv (%)')
ax.set_ylabel('Number of Beers')
ax.grid(False)
ax.text(
    12,
    100,
    f"Mean: {avg_abv} % abv",
    bbox=dict(facecolor='yellow', alpha=0.5),
    fontsize=11,
)

# Write the figure to disk, then display it.
ax.figure.savefig("images/Alcohol by Volume Distribution for All Beers.png")
plt.show()