In [1]:
# This notebook pulls Land Area data from the government census bureau since they didn't have a download button.
# We are going to use this data to determine the road density of a state by comparing its land area with the 
# distance of roadways calculated in the Road_Data.ipynb file

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
target = 'https://www.census.gov/geographies/reference-files/2010/geo/state-area.html'

In [4]:
whole_page = requests.get(target).content
soup = BeautifulSoup(whole_page)

In [5]:
#confirming there is only one table to scrape on this page so we don't miss the mark
len(soup.find_all('table'))

1

In [6]:
table = soup.find('table')
rows = table.find_all('tr')

In [7]:
len(rows)

65

In [8]:
# 3 rows of headers, data starts at 3, headers will be manually added because this is a mess
rows[3]

<tr><td><strong>Total<sup><a href="https://www.census.gov/geo/reference/state-area.html#n3">3</a></sup></strong></td>
<td><strong>3,805,927</strong></td>
<td><strong>9,857,306</strong></td>
<td><strong>3,535,932</strong></td>
<td><strong>9,158,022</strong></td>
<td><strong>269,995</strong></td>
<td><strong>699,284</strong></td>
<td><strong>85,763</strong></td>
<td><strong>222,125</strong></td>
<td><strong>42,371</strong></td>
<td><strong>109,742</strong></td>
<td><strong>60,094</strong></td>
<td><strong>155,643</strong></td>
<td><strong>81,767</strong></td>
<td><strong>211,774</strong></td>
<td><br/></td>
<td><br/></td>
</tr>

In [9]:
matrix_out = {}
for i in rows[3:]:
    cells = i.find_all('td')
    
    for j,k in enumerate(cells[:5]):
        try:
            matrix_out[j].append(k.text)
        except:
            matrix_out[j] = []
            matrix_out[j].append(k.text)

In [10]:
matrix_out

{0: ['Total3',
  '',
  'United States4',
  'Alabama',
  'Alaska',
  'Arizona',
  'Arkansas',
  'California',
  'Colorado',
  'Connecticut',
  'Delaware',
  'District of Columbia',
  'Florida',
  'Georgia',
  'Hawaii',
  'Idaho',
  'Illinois',
  'Indiana',
  'Iowa',
  'Kansas',
  'Kentucky',
  'Louisiana',
  'Maine',
  'Maryland',
  'Massachusetts',
  'Michigan',
  'Minnesota',
  'Mississippi',
  'Missouri',
  'Montana',
  'Nebraska',
  'Nevada',
  'New Hampshire',
  'New Jersey',
  'New Mexico',
  'New York',
  'North Carolina',
  'North Dakota',
  'Ohio',
  'Oklahoma',
  'Oregon',
  'Pennsylvania',
  'Rhode Island',
  'South Carolina',
  'South Dakota',
  'Tennessee',
  'Texas',
  'Utah',
  'Vermont',
  'Virginia',
  'Washington',
  'West Virginia',
  'Wisconsin',
  'Wyoming',
  '',
  'Puerto Rico',
  '',
  'Island Areas:5',
  'American Samoa',
  'Guam',
  'Northern Mariana Islands',
  'U.S. Virgin Islands'],
 1: ['3,805,927',
  '',
  '3,796,742',
  '52,420',
  '665,384',
  '113,990',

In [11]:
headers = ['State','Total_SqMi','Total_SqKm','LandArea_SqMi','LandArea_SqKm']
land_area_df = pd.DataFrame(data = matrix_out)
land_area_df.head()

Unnamed: 0,0,1,2,3,4
0,Total3,3805927.0,9857306.0,3535932.0,9158022.0
1,,,,,
2,United States4,3796742.0,9833517.0,3531905.0,9147593.0
3,Alabama,52420.0,135767.0,50645.0,131171.0
4,Alaska,665384.0,1723337.0,570641.0,1477953.0


In [12]:
land_area_df.columns = headers
land_area_df = land_area_df[3:]


In [13]:
land_area_df

Unnamed: 0,State,Total_SqMi,Total_SqKm,LandArea_SqMi,LandArea_SqKm
3,Alabama,52420.0,135767.0,50645.0,131171.0
4,Alaska,665384.0,1723337.0,570641.0,1477953.0
5,Arizona,113990.0,295234.0,113594.0,294207.0
6,Arkansas,53179.0,137732.0,52035.0,134771.0
7,California,163695.0,423967.0,155779.0,403466.0
8,Colorado,104094.0,269601.0,103642.0,268431.0
9,Connecticut,5543.0,14357.0,4842.0,12542.0
10,Delaware,2489.0,6446.0,1949.0,5047.0
11,District of Columbia,68.0,177.0,61.0,158.0
12,Florida,65758.0,170312.0,53625.0,138887.0


In [14]:
land_area_df = land_area_df.drop(labels = [54, 56])

In [15]:
for i in [j for j in land_area_df.columns if j != 'State']:
    land_area_df[i] = land_area_df[i].apply(lambda x: int(x.replace(',','')))

In [16]:
land_area_df

Unnamed: 0,State,Total_SqMi,Total_SqKm,LandArea_SqMi,LandArea_SqKm
3,Alabama,52420,135767,50645,131171
4,Alaska,665384,1723337,570641,1477953
5,Arizona,113990,295234,113594,294207
6,Arkansas,53179,137732,52035,134771
7,California,163695,423967,155779,403466
8,Colorado,104094,269601,103642,268431
9,Connecticut,5543,14357,4842,12542
10,Delaware,2489,6446,1949,5047
11,District of Columbia,68,177,61,158
12,Florida,65758,170312,53625,138887


In [17]:
land_area_df.to_csv('output_tables/land_area_by_state_scraped.csv', index = False)

# Getting Weed Legalization
Another Dataset without a download so we're stealing it

In [18]:
target = 'https://disa.com/map-of-marijuana-legality-by-state'
page = requests.get(target).content

In [19]:
soup = BeautifulSoup(page)

In [20]:
#confirming there is only one table to scrape:
len(soup.find_all('table'))

1

In [21]:
table = soup.find('table')
rows = table.find_all('tr')

In [22]:
matrix_out = {}
for i in rows:
    cells = i.find_all('td')
    
    for j,k in enumerate(cells):
        try:
            matrix_out[j].append(k.text)
        except:
            matrix_out[j] = []
            matrix_out[j].append(k.text)

In [23]:
headers = table.find('thead').find_all('th')

In [24]:
weed_df = pd.DataFrame(data = matrix_out)
weed_df.columns = [i.text for i in headers]
for i in weed_df.columns:
    weed_df[i] = weed_df[i].str.replace('*','')
weed_df

  weed_df[i] = weed_df[i].str.replace('*','')


Unnamed: 0,State,Legal Status,Medicinal,Decriminalized,State Laws
0,Alabama,Mixed,Yes,No,View State Laws
1,Alaska,Fully Legal,Yes,Yes,View State Laws
2,Arizona,Fully Legal,Yes,Yes,View State Laws
3,Arkansas,Mixed,Yes,No,View State Laws
4,California,Fully Legal,Yes,Yes,View State Laws
5,Colorado,Fully Legal,Yes,Yes,View State Laws
6,Connecticut,Mixed,Yes,Yes,View State Laws
7,Delaware,Mixed,Yes,Yes,View State Laws
8,District of Columbia,Fully Legal,Yes,Yes,View State Laws
9,Florida,Mixed,Yes,No,View State Laws


In [26]:
weed_df.to_csv('output_tables/weed_legalization_raw_scraped.csv', index = False)