In [13]:
# Imports

import requests
import pandas as pd
from bs4 import BeautifulSoup

from eda_functions import data_exploration, draw_histogram, draw_countplot


In [14]:
# Functions

def scrape_table(url, n_table, timeout=30):
    """Scrape the "n_table"-th table of the web page at "url".

    Parameters
    ----------
    url : str
        Address of the page to download.
    n_table : int
        Zero-based position of the wanted <table> among all the tables
        found in the page (negative values count from the end).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per <tr> that holds at least one <td>; the columns are
        named after the stripped text of every <th> found in the table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    IndexError
        If the page has no table at position "n_table".
    """
    # Without a timeout, the request may wait forever on a stalled server
    response = requests.get(url, timeout=timeout)
    # Fail here rather than parsing an error page that holds no table
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the table
    tables = soup.find_all('table')
    try:
        table = tables[n_table]
    except IndexError:
        raise IndexError(
            'table index {} is out of range: {} table(s) found at {}'
            .format(n_table, len(tables), url)) from None
    # NOTE: every <th> of the table is used as a column name, so a table
    # that also uses <th> for row labels gets more names than data columns
    headers = [header.text.strip() for header in table.find_all('th')]
    # Read each row, skipping the ones without any <td> (e.g. the header row)
    rows = []
    for row in table.find_all('tr'):
        values = [cell.text.strip() for cell in row.find_all('td')]
        if values:
            rows.append(values)
    return pd.DataFrame(rows, columns=headers)


In [15]:
# Download the FIPS state codes of the USA (first table of the Wikipedia page)
url = 'https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code'

df = scrape_table(url, n_table=0)
data_exploration(df)

# Write the scraped table to a csv file
df.to_csv(path_or_buf='state_code.csv')


Unnamed: 0,Name,Alpha code,Numeric code,Status
0,Alabama,AL,1,State; counties
1,Alaska,AK,2,State; boroughs
2,American Samoa,AS,60,Outlying area under U.S. sovereignty
3,American Samoa *,,3,(FIPS 5-1 reserved code)
4,Arizona,AZ,4,State; counties


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          74 non-null     object
 1   Alpha code    74 non-null     object
 2   Numeric code  74 non-null     object
 3   Status        74 non-null     object
dtypes: object(4)
memory usage: 2.4+ KB


Duplicates: 0 



Unnamed: 0,count,unique,top,freq
Name,74,74,Alabama,1
Alpha code,74,66,,5
Numeric code,74,74,01,1
Status,74,9,State; counties,48


In [16]:
# Download the column layout of the main table (CDC BRFSS 2021, first table of the page)
url = 'https://www.cdc.gov/brfss/annual_data/2021/llcp_varlayout_21_onecolumn.html'

df = scrape_table(url, n_table=0)
data_exploration(df)

# Write the scraped table to a csv file
df.to_csv(path_or_buf='columns_name.csv')


Unnamed: 0,Starting Column,Variable Name,Field Length
0,1,_STATE,2
1,17,FMONTH,2
2,19,IDATE,8
3,19,IMONTH,2
4,21,IDAY,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Starting Column  303 non-null    object
 1   Variable Name    303 non-null    object
 2   Field Length     303 non-null    object
dtypes: object(3)
memory usage: 7.2+ KB


Duplicates: 0 



Unnamed: 0,count,unique,top,freq
Starting Column,303,301,19,2
Variable Name,303,303,_STATE,1
Field Length,303,8,1,220
