# ETL Project Breweries and Census Data
### The purpose of this project is to practice extracting, transforming, and loading data from two sources. 

### The goal is to provide the number of US breweries in each state alongside that state's population.


In [None]:
# Dependencies
import pandas as pd
import requests
import pymongo

# (E)xtraction
### Sources used for this project:

#### From Kaggle:
* Breweries: https://www.kaggle.com/brkurzawa/us-breweries#breweries_us.csv

#### From US Census Bureau:
* Population: https://www.census.gov/newsroom/press-kits/2018/pop-estimates-national-state.html

![image.png](attachment:image.png)

In [None]:
# Load the Kaggle breweries CSV into a DataFrame and preview its first rows.

breweries_df = pd.read_csv(filepath_or_buffer="Resources/breweries_us.csv")
breweries_df.head(n=5)


In [None]:
# Load the Census Bureau population-estimates CSV into a DataFrame and preview its first rows.
census_df = pd.read_csv(filepath_or_buffer="Resources/nst-est2018-popchg2010_2018.csv")
census_df.head(n=5)

# (T)ransformation

### Transform breweries data

In [None]:
# Inspect the breweries column labels before deciding which ones to keep.
breweries_df.columns

In [None]:
# Keep only the state name and its brewery count, and rename the count
# column for simplicity.
breweries_col = ['state', 'state_breweries']
breweries = breweries_df[breweries_col].copy()
breweries = breweries.rename(columns={'state_breweries': 'count'})

# Normalise state names to lower case with spaces instead of hyphens, the
# same lower-case form used for the census state names they are merged with.
# regex=False makes the hyphen an explicit literal: the default value of
# `regex` differs between pandas versions.
breweries['state'] = breweries['state'].str.lower()
breweries['state'] = breweries['state'].str.replace('-', ' ', regex=False)

# Remove fully duplicated (state, count) rows.
breweries.drop_duplicates(inplace=True)
breweries


In [None]:
# Inspect the census column labels before deciding which ones to keep.
census_df.columns

In [None]:
# Transform and clean-up: preview the first 10 census rows.

census_df.head(10)



In [None]:
# Drop the rows that are not states by keeping only those whose STATE code
# is greater than 0.
census_df = census_df.loc[census_df['STATE'].gt(0)]
census_df

In [None]:
# We are only interested in the state name and the most recent population,
# i.e. the 2018 estimate.  The rank and population-change columns are not
# relevant to this project.

census_columns = ['NAME', 'POPESTIMATE2018']
census = census_df[census_columns].copy()
census = census.rename(columns={'NAME': 'state', 'POPESTIMATE2018': 'pop_2018'})
census['state'] = census['state'].str.lower()

# Renumber the rows from 0 and discard the old index in one step, rather than
# materialising it as an 'index' column and deleting that column afterwards.
census = census.reset_index(drop=True)

census


In [None]:
# Rename the District of Columbia entry to 'washington dc' (presumably the
# spelling used in the breweries data — verify).  The row is selected by its
# value rather than by a hard-coded position (previously row 8, column 0), so
# the edit still targets the right row if the census rows are reordered, and
# re-running the cell is harmless.
# NOTE(review): assumes the lower-cased census name is 'district of columbia'
# — confirm against the CSV.
census['state'] = census['state'].replace('district of columbia', 'washington dc')
census

In [None]:
# Combine the breweries and census data into one dataframe keyed on state name.
# The outer join keeps states that appear in only one of the two sources.

breweries_by_state = breweries.merge(census, on='state', how='outer')
breweries_by_state.sort_values(by='state')

In [None]:
# Replace missing brewery counts with zero.  Rows that came only from the
# census side of the outer merge have NaN in 'count'.  Filling by column
# rather than by a hard-coded position (previously row 51, column 1) avoids
# overwriting a valid count when the merged row order changes, e.g. on pandas
# versions where an outer merge sorts the keys.
breweries_by_state['count'] = breweries_by_state['count'].fillna(0)
breweries_by_state

# (L)oad

# Heatmap