In [1]:
import pandas as pd
import numpy as np


In [2]:
brewers_data= pd.read_csv("resources/Brewers_Association_Data.csv", index_col=0)
brewers_data.head()

Unnamed: 0,Brewery,Address,City,State,ZIP Code
0,5 Rivers Brewing LLC,,Spanish Fort,AL,36527.0
1,Avondale Brewing Co,201 41st St S,Birmingham,AL,35222.0
2,Back Forty Beer Co,200 N 6th St,Gadsden,AL,35901.0
3,Back Forty Beer Company - Birmingham,3201 1st Avenue N,Birmingham,AL,35222.0
4,Below the Radar Brewing Co,220 Holmes Ave NE,Huntsville,AL,35801.0


In [3]:
brewers_data.dtypes

Brewery      object
Address      object
City         object
State        object
ZIP Code    float64
dtype: object

In [4]:
brewers_data["Address"] = brewers_data["Address"].astype("str")
brewers_data["Brewery"] = brewers_data["Brewery"].astype("str")
brewers_data["ZIP Code"] = brewers_data["ZIP Code"].astype("str")
brewers_data.dtypes

Brewery     object
Address     object
City        object
State       object
ZIP Code    object
dtype: object

In [5]:
brewers_data["Brewery"] = brewers_data["Brewery"].str.lower()
brewers_data["Address"] = brewers_data["Address"].str.lower()
brewers_data.head()

Unnamed: 0,Brewery,Address,City,State,ZIP Code
0,5 rivers brewing llc,,Spanish Fort,AL,36527.0
1,avondale brewing co,201 41st st s,Birmingham,AL,35222.0
2,back forty beer co,200 n 6th st,Gadsden,AL,35901.0
3,back forty beer company - birmingham,3201 1st avenue n,Birmingham,AL,35222.0
4,below the radar brewing co,220 holmes ave ne,Huntsville,AL,35801.0


In [6]:
brewers_data.count()
# Every brewery has a zip code, this may be enough. Otherwise, we could potentially extract addresses from google maps 
# programatically for the ~ 1000 missing addresses


Brewery     8501
Address     8501
City        8497
State       8501
ZIP Code    8501
dtype: int64

In [7]:
print(brewers_data["Address"].duplicated().value_counts())
# There appear to be 1051 duplicate values in the addresses

False    7450
True     1051
Name: Address, dtype: int64


In [8]:
print(brewers_data["Brewery"].duplicated().value_counts())
# 163 duplicate brewery names before anything's been dropped by address.

False    8333
True      168
Name: Brewery, dtype: int64


In [9]:
# brewers_no_duplicate_addresses_df = brewers_data.drop_duplicates(subset=brewers_data["Address"], keep= "first")

no_double_address_df = brewers_data.drop_duplicates(["Address"])

no_double_address_df.count()

Brewery     7450
Address     7450
City        7450
State       7450
ZIP Code    7450
dtype: int64

In [10]:
no_double_address_or_brewery_df = no_double_address_df.drop_duplicates(["Brewery"])
no_double_address_or_brewery_df.count()

Brewery     7328
Address     7328
City        7328
State       7328
ZIP Code    7328
dtype: int64

In [11]:
# Still a lot of " - Production Facility", " (Production Facility)" and " Production Facility " elements that need to be dropped
# So need to drop row if string contains "Production Facility"
no_duplicates_or_PF_df = no_double_address_or_brewery_df[~no_double_address_or_brewery_df["Brewery"].str.contains("production facility")]
no_duplicates_or_PF_df.count()

Brewery     7287
Address     7287
City        7287
State       7287
ZIP Code    7287
dtype: int64

In [12]:
no_duplicates_or_PF_df.to_csv("resources/brewers_association_no_duplicates_or_PF.csv")