In [31]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
from pandas import DataFrame
from datetime import datetime

In [3]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [4]:
# Define database and collection
db = client.stayathome_db
collection = db.state_quarantine

In [5]:
print(db.state_orders.find_one())
db.state_orders.insert_one({'key': 'value'})

{'_id': ObjectId('5e7fb2005ce6cf72a85ffc22'), 'key': 'value'}


<pymongo.results.InsertOneResult at 0x233abf80c88>

In [6]:
# URL of page to be scraped
url = 'https://www.nytimes.com/interactive/2020/us/coronavirus-stay-at-home-order.html'

# Retrieve page with the requests module
response = requests.get(url)

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
# print(soup.find_all('div', class_='state-wrap'))

In [8]:
ca_measures_state = soup.find('div', id='item-CA').find('h3').text
ca_measures_order = soup.find('div', id='item-CA').find('p', class_='l-order').text

print(ca_measures_state)
print(ca_measures_order)

California About 39.6 million people
Stay at home, effective March 19


In [9]:
# soup.find_all('div', class_="state-wrap")
# soup.find_all('div', class_="state-wrap")

In [10]:
# # Retrieve the parent divs for all states
sah_measures = soup.find_all('div', class_="state-wrap")


# loop over results to get article data
for each_measure in sah_measures:
    # scrape the article header     
    state = each_measure.find('h3').text
    
    # scrape the article subheader
    sah_order = each_measure.find('p', class_='l-order').text
    
    # scrape the datetime
    effective_date = each_measure.find('span', class_='l-date').text
    
    # print article data
    print('-----------------')
    print(state)
    print(sah_order)
    print(effective_date)

    # Dictionary to be inserted into MongoDB
    state_measures = {
        'state': state,
        'sah_order': sah_order,
        'effective_date': effective_date,
    }
    
    

    # Insert dictionary into MongoDB as a document
    collection.insert_one(state_measures)

-----------------
Alabama
Shelter in place, effective March 24 at 12 p.m.
, effective March 24 at 12 p.m.
-----------------
Alaska
Stay at home, effective March 22
, effective March 22
-----------------
California About 39.6 million people
Stay at home, effective March 19
, effective March 19
-----------------
Colorado About 5.7 million people
Stay at home, effective March 26 at 6 a.m.
, effective March 26 at 6 a.m.
-----------------
Connecticut About 3.6 million people
Stay at home, effective March 23 at 8 p.m.
, effective March 23 at 8 p.m.
-----------------
Delaware About 973,000 people
Shelter in place, effective March 24 at 8 a.m.
, effective March 24 at 8 a.m.
-----------------
Florida
Stay at home, effective March 24 at 12:01 a.m.
, effective March 24 at 12:01 a.m.
-----------------
Georgia
Stay at home, effective March 24 at 12 a.m.
, effective March 24 at 12 a.m.
-----------------
Hawaii About 1.4 million people
Stay at home, effective March 25 at 12:01 a.m.
, effective March 

In [11]:
top_six = ['New York','New Jersey','California','Washington','Michigan','Florida']
state_measures

{'state': 'Wisconsin About 5.8 million people',
 'sah_order': 'Stay at home, effective March 25 at 8 a.m.',
 'effective_date': ', effective March 25 at 8 a.m.',
 '_id': ObjectId('5e812334b941078a4b56dc2a')}

In [12]:
# state_measures_df = DataFrame(list(state_measures.items()), columns = ['state','sah_order','effective_date'])
state_measures_df = DataFrame(list(db.state_quarantine.find({})))

In [13]:
state_measures_df = state_measures_df[['state','sah_order']]
state_measures_df

Unnamed: 0,state,sah_order
0,Missouri,"Stay at home, effective March 24 at 12:01 a.m."
1,Montana About 1.1 million people,"Stay at home, effective March 28 at 12:01 a.m."
2,New Hampshire About 1.4 million people,"Stay at home, effective March 27 at 11:59 p.m."
3,New Jersey About 8.9 million people,"Stay at home, effective March 21 at 9 p.m."
4,New Mexico About 2.1 million people,"Stay at home, effective March 24 at 8 a.m."
...,...,...
135,Utah,"Stay at home, effective March 27 at 12:01 a.m."
136,"Vermont About 626,000 people","Stay at home, effective March 25 at 5 p.m."
137,Washington About 7.5 million people,"Stay at home, effective March 23"
138,West Virginia About 1.8 million people,"Stay at home, effective March 24 at 8 p.m."


In [14]:
state_measures_df[['order','date']] = state_measures_df.sah_order.str.split(', effective ', expand=True)

state_measures_df.head(2)

Unnamed: 0,state,sah_order,order,date
0,Missouri,"Stay at home, effective March 24 at 12:01 a.m.",Stay at home,March 24 at 12:01 a.m.
1,Montana About 1.1 million people,"Stay at home, effective March 28 at 12:01 a.m.",Stay at home,March 28 at 12:01 a.m.


In [15]:
state_measures_df[['State','Population']] = state_measures_df.state.str.split(' About ', expand=True)

state_measures_df.head(2)

Unnamed: 0,state,sah_order,order,date,State,Population
0,Missouri,"Stay at home, effective March 24 at 12:01 a.m.",Stay at home,March 24 at 12:01 a.m.,Missouri,
1,Montana About 1.1 million people,"Stay at home, effective March 28 at 12:01 a.m.",Stay at home,March 28 at 12:01 a.m.,Montana,1.1 million people


In [16]:
top_six = ['New York','New Jersey','California','Washington','Michigan','Florida']
state_measures_df = state_measures_df[['State','order','date']]
state_measures_df.head(3)


Unnamed: 0,State,order,date
0,Missouri,Stay at home,March 24 at 12:01 a.m.
1,Montana,Stay at home,March 28 at 12:01 a.m.
2,New Hampshire,Stay at home,March 27 at 11:59 p.m.


In [17]:
state_measures_df.iloc[2,0]

'New Hampshire'

In [18]:
top_six_df = state_measures_df[state_measures_df['State'].isin(top_six)]
top_six_df1=top_six_df.iloc[:6]
top_six_df1

Unnamed: 0,State,order,date
3,New Jersey,Stay at home,March 21 at 9 p.m.
5,New York,Stay at home,March 22 at 8 p.m.
17,Washington,Stay at home,March 23
22,California,Stay at home,March 19
26,Florida,Stay at home,March 24 at 12:01 a.m.
37,Michigan,Stay at home,March 24 at 12:01 a.m.


In [19]:
# db.users.remove({})
# collection.insert_one(state_measures)

collection.db.state_quarantine.remove({})

  after removing the cwd from sys.path.


{'n': 0, 'ok': 1.0}

In [20]:
top_six_df = top_six_df1
top_six_df

Unnamed: 0,State,order,date
3,New Jersey,Stay at home,March 21 at 9 p.m.
5,New York,Stay at home,March 22 at 8 p.m.
17,Washington,Stay at home,March 23
22,California,Stay at home,March 19
26,Florida,Stay at home,March 24 at 12:01 a.m.
37,Michigan,Stay at home,March 24 at 12:01 a.m.


In [21]:
top_six_df.set_index('State', inplace=True)
top_six_df

Unnamed: 0_level_0,order,date
State,Unnamed: 1_level_1,Unnamed: 2_level_1
New Jersey,Stay at home,March 21 at 9 p.m.
New York,Stay at home,March 22 at 8 p.m.
Washington,Stay at home,March 23
California,Stay at home,March 19
Florida,Stay at home,March 24 at 12:01 a.m.
Michigan,Stay at home,March 24 at 12:01 a.m.


In [22]:
top_six_df.to_csv('top_six.csv')

AttributeError: 'DataFrame' object has no attribute 'to_datetime'

In [24]:
top_six_df[['date','time']] = top_six_df.date.str.split(' at ', expand=True)
top_six_df


Unnamed: 0_level_0,order,date,time
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
New Jersey,Stay at home,March 21,9 p.m.
New York,Stay at home,March 22,8 p.m.
Washington,Stay at home,March 23,
California,Stay at home,March 19,
Florida,Stay at home,March 24,12:01 a.m.
Michigan,Stay at home,March 24,12:01 a.m.


In [27]:
top_six_df = top_six_df[['order','date']]
top_six_df


Unnamed: 0_level_0,order,date
State,Unnamed: 1_level_1,Unnamed: 2_level_1
New Jersey,Stay at home,March 21
New York,Stay at home,March 22
Washington,Stay at home,March 23
California,Stay at home,March 19
Florida,Stay at home,March 24
Michigan,Stay at home,March 24


In [36]:
datetime.strptime('March 21 2020', '%B %d %Y')

datetime.datetime(2020, 3, 21, 0, 0)

In [37]:
top_six_df['date'] = top_six_df['date'] + ' 2020'

In [43]:
top_six_df['date'] = pd.to_datetime(top_six_df.date, format='%B %d %Y')


In [44]:
top_six_df.to_csv()

Unnamed: 0_level_0,order,date
State,Unnamed: 1_level_1,Unnamed: 2_level_1
New Jersey,Stay at home,2020-03-21
New York,Stay at home,2020-03-22
Washington,Stay at home,2020-03-23
California,Stay at home,2020-03-19
Florida,Stay at home,2020-03-24
Michigan,Stay at home,2020-03-24
