**Step 1:** Get articles from [Nature](https://www.nature.com/search?q=covid).
We can use the requests library to do this.

In [2]:
# import statements
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Fetch the Nature search results page for "covid".
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of letting later cells parse an error page.
r = requests.get("https://www.nature.com/search?q=covid", timeout=30)
r.raise_for_status()

**Step 2:** Use BeautifulSoup to parse the HTML.
Use the "lxml" parser rather than "html5lib".
Displaying the entire parsed page would produce more output than this notebook can comfortably load, so we omit a print statement here.


In [4]:
# Parse the downloaded page text into a navigable tree with the lxml parser.
soup = BeautifulSoup(markup=r.text, features="lxml")

**Step 3:** Find all articles
Use BeautifulSoup's find_all method to select elements by tag type and class name. In Chrome, you can right-click an item on the web page and click "Inspect" to view its HTML.

In [5]:
# Find all articles: select every <li> element whose class is "mb20".
# find_all is the current Beautiful Soup 4 method name; findAll is only kept
# as a deprecated alias.
articles = soup.find_all("li", class_="mb20")

In [31]:
# Extract the link of the second result: the href attribute of the first <a>
# found inside its first <h2>.
link = articles[1].select_one("h2").select_one("a").attrs["href"]
link

'/articles/s41375-020-0836-7'

In [9]:
# Extract the article title: the whitespace-trimmed text of the first <h2>.
articles[1].select_one("h2").text.strip()

'COVID-19 in persons with haematological cancers'

In [10]:
# Extract the article author: text of the first <li itemprop="creator">.
articles[1].find("li", itemprop="creator").text.strip()

'Wenjuan He'

In [11]:
# Extract the article subject: text of the first <a class="emphasis">.
articles[1].find("a", class_="emphasis").text.strip()

'Leukemia'

In [19]:
# Extract the article type: the first text node that is a direct child of the
# first <p> (recursive=False ignores text nested inside child tags).
# `string=` is the current name of this filter; `text=` is deprecated.
articles[1].p.find(string=True, recursive=False).strip()

'Research'

In [23]:
# Extract the published date: the datetime attribute of the first
# <time itemprop="datePublished"> element.
articles[1].find("time", itemprop="datePublished").attrs["datetime"]

'2020-04-24'

## Create dataset from All Articles

In [28]:
# Create data list: one row per search result, in the column order
# Title, Link, Author, Abstract, Type, Subject, PublishedDate.
def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None if the tag is missing."""
    return tag.get_text().strip() if tag is not None else None


data = []
for article in articles:
    heading = article.select_one("h2")

    # The type is the first text node directly under the first <p>;
    # `string=` replaces the deprecated `text=` filter.
    type_paragraph = article.p
    type_text = (
        type_paragraph.find(string=True, recursive=False)
        if type_paragraph is not None
        else None
    )

    published = article.find("time", {"itemprop": "datePublished"})

    data.append([
        # Title and link are required fields and are read as-is.
        heading.get_text().strip(),
        heading.select_one("a")['href'],
        # Optional metadata: a missing element yields None instead of
        # aborting the whole loop.
        _text_or_none(article.find("li", {"itemprop": "creator"})),
        "",  # Abstract placeholder: not extracted from the search results
        type_text.strip() if type_text is not None else None,
        _text_or_none(article.find("a", {"class": "emphasis"})),
        published['datetime'] if published is not None else None,
    ])

In [30]:
# Build the DataFrame from the collected rows; the column names are listed in
# the same order as the values inside each row of `data`.
df = pd.DataFrame(
    data=data,
    columns=[
        'Title',
        'Link',
        'Author',
        'Abstract',
        'Type',
        'Subject',
        'PublishedDate',
    ],
)
df.head()

Unnamed: 0,Title,Link,Author,Abstract,Type,Subject,PublishedDate
0,Immune cell profiling of COVID-19 patients in ...,/articles/s41421-020-0168-9,Wen Wen,,Research,Cell Discovery,2020-05-04
1,COVID-19 in persons with haematological cancers,/articles/s41375-020-0836-7,Wenjuan He,,Research,Leukemia,2020-04-24
2,Pathological inflammation in patients with COV...,/articles/s41577-020-0331-4,Miriam Merad,,Reviews,Nature Reviews Immunology,2020-05-06
3,Single-cell landscape of bronchoalveolar immun...,/articles/s41591-020-0901-9,Mingfeng Liao,,Research,Nature Medicine,2020-05-12
4,Coronavirus: the first three months as it happ...,/articles/d41586-020-00154-w,,,News,Nature,2020-04-22


In [57]:
# Export CSV
# Create the output directory first: to_csv does not create missing parent
# directories, so the export would fail when data/ does not exist yet.
csv_path = Path('data/emea_articles.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path)

In [49]:
# Get abstract and DOI from an article link
def get_abstract(article_link=None):
    """Fetch a Nature article page and extract its abstract and DOI.

    Args:
        article_link: Site-relative article path, e.g.
            '/articles/s41375-020-0836-7'. When omitted, the notebook-level
            `link` variable is used, so the zero-argument call keeps working.

    Returns:
        An ``(abstract, doi)`` tuple. ``abstract`` is the stripped text of the
        <div id="Abs1-content"> element. ``doi`` is the text of the "view doi"
        link inside #article-info-content with the "https://doi.org/" prefix
        removed. Either value is None when its element is not on the page.

    Raises:
        requests.HTTPError: If the article page responds with an error status.
    """
    path = link if article_link is None else article_link
    d = requests.get("https://www.nature.com" + path, timeout=30)
    d.raise_for_status()
    content_soup = BeautifulSoup(d.text, "lxml")

    abstract_tag = content_soup.find("div", {"id": "Abs1-content"})
    abstract = abstract_tag.get_text().strip() if abstract_tag is not None else None

    info = content_soup.select_one("#article-info-content")
    doi_tag = (
        info.find("a", {"data-track-action": "view doi"})
        if info is not None
        else None
    )
    doi = None
    if doi_tag is not None:
        # str.replace returns a new string, so its result must be kept.
        doi = doi_tag.get_text().strip().replace("https://doi.org/", "")

    return abstract, doi

'10.1038/s41375-020-0836-7'