In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from splinter import Browser
import datefinder

In [2]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless = False)

In [3]:
nyt_url = 'https://www.nytimes.com/interactive/2020/us/states-reopen-map-coronavirus.html'

In [4]:
browser.visit(nyt_url)

In [5]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [6]:
#Grabbing the states from the article.

dirty_states = soup.find_all('div', class_= 'g-name')
print(dirty_states[0])

<div class="g-name">Alaska</div>


In [7]:
#Stripping down to just the states.

states = []
for state in dirty_states:
    #print(state.contents[0])
    states.append(state.contents[0])

In [8]:
print(states[0:10])

['Alaska', 'Iowa', 'Kansas', 'Kentucky', 'Minnesota', 'Missouri', 'Montana', 'Nebraska', 'New Hampshire', 'North Dakota']


In [9]:
#Grabbing the order types from the article.

dirty_orders = soup.find_all('div', class_= 'g-date-details')
print(dirty_orders[0])

<div class="g-date-details">
<span>Stay-at-home order</span>
<span class="g-date-details-text">expired on April 24.</span>
</div>


In [10]:
order_type = []
for order in dirty_orders:
    order_type.append(order.contents[1].text)

In [11]:
#Cleaned Order Types
order_type[0:5]

['Stay-at-home order',
 'Did not have a statewide stay-at-home order.',
 'Stay-at-home order',
 'Healthy at home',
 'Stay-at-home order']

In [12]:
#grabbing descriptions and dates of the orders
dirty_desc = soup.find_all('span', class_= 'g-date-details-text')
len(dirty_desc)

52

In [13]:
kinda_dirty = []
for stuff in dirty_desc: 
    kinda_dirty.append(stuff.contents[0])

In [14]:
''''''
kinda_dirty[0]

'expired on April 24.'

In [15]:
#status_list = ['expired', 'struck down', 'in effect', 'Did not have a statewide stay-at-home order.']

In [16]:
status = []

In [17]:
print('''important lists {order_type} {status} {states}
still need to make {dates}''')

important lists {order_type} {status} {states}
still need to make {dates}


In [18]:
for dirt in kinda_dirty:
    if dirt[0:7] == 'expired':
        status.append(dirt[0:7])
    elif dirt[0:11] == 'struck down':
        status.append(dirt[0:11])
    elif dirt[0:9] == 'in effect':
        status.append(dirt[0:9])
    else: 
        status.append('N/A');

In [19]:
print(f'''States: {len(states)}
Order Types: {len(order_type)}
Statuses: {len(status)}''')

States: 52
Order Types: 52
Statuses: 52


In [20]:
#Now to collect the dates from the 'kinda_dirty' list
#kinda_dirty[28] is Puerto Rico, which has 2 dates listed. That gets sorted out later.
kinda_dirty[28]

'expired on May 8.'

In [21]:
kinda_dirty.index('in effect since March 15 and set to expire July 22.')

27

In [22]:
dates = []

In [23]:
for dirt in kinda_dirty:
    matches = datefinder.find_dates(dirt)
    for match in matches:
        dates.append(match)

In [24]:
#Sorting out the Puerto Rico dates problem
del dates[29]

In [25]:
#some states don't have dates to scrape, so i'm just looking for where these states are in my lists
indices = [i for i, x in enumerate(kinda_dirty) if x == "Did not have a statewide stay-at-home order."]

In [26]:
indices

[1, 7, 9, 10, 12, 13, 32, 44]

In [27]:
#placing date values as N/A for states with no stay-at-home orders
for i in indices:
    dates.insert(i, "N/A")

In [28]:
#"Did not have a statewide stay-at-home order." 
# ^ is REALLY wordy, so we're just replacing those with a order type of "None"
# and removing redundancy of the word "order" after "stay-at-home" 
for i, x in enumerate(order_type):
    if x == "Did not have a statewide stay-at-home order.":
        order_type[i] = "None"
    elif x == "Stay-at-home order":
        order_type[i] = "Stay-at-home"

In [29]:
print(f'''State       Order Type
{states[32]}        {order_type[32]}
{states[40]}        {order_type[40]}''')

State       Order Type
Arkansas        None
North Carolina        Stay-at-home


In [30]:
print(f'''Dates:{len(dates)}
States:{len(states)}
Orders:{len(order_type)}
Statusi:{len(status)}''')

Dates:52
States:52
Orders:52
Statusi:52


In [31]:
#Now we're good to put everything into into a dataframe

In [32]:
df = pd.DataFrame(list(zip(states, order_type, dates, status)), 
               columns =['State', 'Order_type', 'Date', 'Status'])
df.head()

Unnamed: 0,State,Order_type,Date,Status
0,Alaska,Stay-at-home,2020-04-24 00:00:00,expired
1,Iowa,,,
2,Kansas,Stay-at-home,2020-05-03 00:00:00,expired
3,Kentucky,Healthy at home,2020-03-26 00:00:00,in effect
4,Minnesota,Stay-at-home,2020-05-17 00:00:00,expired


In [33]:
#df.to_csv('CSVs/Quarantine_Dates_By_State.csv', encoding='utf-8', index=False)

In [34]:
df.sort_values('State', ascending=True, inplace=True)

In [35]:
df

Unnamed: 0,State,Order_type,Date,Status
31,Alabama,Stay-at-home,2020-05-15 00:00:00,expired
0,Alaska,Stay-at-home,2020-04-24 00:00:00,expired
45,Arizona,Stay-at-home,2020-05-15 00:00:00,expired
32,Arkansas,,,
46,California,Stay-at-home,2020-03-19 00:00:00,in effect
47,Colorado,Stay-at-home,2020-04-26 00:00:00,expired
33,Connecticut,Stay-at-home,2020-04-30 00:00:00,expired
34,Delaware,Shelter in place,2020-05-20 00:00:00,expired
17,District of Columbia,Stay-at-home,2020-05-29 00:00:00,expired
48,Florida,Stay-at-home,2020-05-04 00:00:00,expired


In [36]:
df.set_index('State', inplace=True)

In [37]:
df

Unnamed: 0_level_0,Order_type,Date,Status
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,Stay-at-home,2020-05-15 00:00:00,expired
Alaska,Stay-at-home,2020-04-24 00:00:00,expired
Arizona,Stay-at-home,2020-05-15 00:00:00,expired
Arkansas,,,
California,Stay-at-home,2020-03-19 00:00:00,in effect
Colorado,Stay-at-home,2020-04-26 00:00:00,expired
Connecticut,Stay-at-home,2020-04-30 00:00:00,expired
Delaware,Shelter in place,2020-05-20 00:00:00,expired
District of Columbia,Stay-at-home,2020-05-29 00:00:00,expired
Florida,Stay-at-home,2020-05-04 00:00:00,expired


In [44]:
df.to_csv(r'C:\Users\Blu\bootcamp\Project2_Covid\MapFiles\csv\nyt_csv.csv')

In [45]:
len(df)

52