**Step 1:** Get articles from [The Lancet](https://www.thelancet.com).
We can use the `requests` library to do this.

In [17]:
# import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# fetch web page
# A timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook here on a 4xx/5xx response instead
# of letting the later cells parse an error page.
r = requests.get("https://www.thelancet.com/coronavirus", timeout=30)
r.raise_for_status()

**Step 2:** Use BeautifulSoup to parse the HTML.
Use the "lxml" parser rather than "html5lib".
Printing the entire parsed page may overload the space available to load this notebook, so we omit a print statement here.


In [3]:
# Parse the downloaded page body into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=r.text, features="lxml")

**Step 3:** Find all article citations
Use BeautifulSoup's `find_all` method to select elements by tag type and class name. In Chrome, you can right-click an item on the web page and click "Inspect" to view its HTML.

In [4]:
# Collect every <div class="articleCitation"> element in the parsed page,
# then report how many were found.
articles = soup.find_all("div", class_="articleCitation")
print('Number of articles:', len(articles))

Number of articles: 43


In [5]:
# Show the indented HTML of the first article citation so its structure
# (class names, nesting) can be inspected before writing selectors.
first_article_html = articles[0].prettify()
print(first_article_html)

<div class="articleCitation">
 <li>
  <div class="detail">
   <div class="article-details">
    <div class="articleType doctopic-1-analysisAndInterpretation label-correspondence">
     Correspondence
    </div>
    <div class="articleTitle">
     <h4 class="title" id="S0140-6736(20)30980-6-title">
      <div class="rightTitleInfo">
       <div class="icons atype-cor">
        <!--${freeContentIcon: 10.1016/S0140-6736(20)30980-6}-->
       </div>
      </div>
      <a href="/journals/lancet/article/PIIS0140-6736(20)30980-6/fulltext">
       Atypical presentation of COVID-19 in young infants
      </a>
     </h4>
    </div>
    <div class="authors" id="S0140-6736(20)30980-6-au">
     Nadia Nathan, Blandine Prevost, Harriet Corvol
    </div>
    <div class="doi" data-doi="10.1016/S0140-6736(20)30980-6">
     DOI:
     <a href="https://doi.org/10.1016/S0140-6736(20)30980-6">
      https://doi.org/10.1016/S0140-6736(20)30980-6
     </a>
    </div>
    <div class="citation">
     <span class

In [56]:
# Article type of the first citation: text of its first ".articleType"
# element, with surrounding whitespace removed.
type_node = articles[0].select_one(".articleType")
type_node.text.strip()

'Correspondence'

In [34]:
# Title of the first citation: text of its first ".articleTitle" element,
# with surrounding whitespace removed.
title_node = articles[0].select_one(".articleTitle")
title_node.text.strip()

'Atypical presentation of COVID-19 in young infants'

In [35]:
# Digital Object Identifier of the first citation: text of its first ".doi"
# element (this still includes the leading "DOI:" label).
doi_node = articles[0].select_one(".doi")
doi_node.text.strip()

'DOI: https://doi.org/10.1016/S0140-6736(20)30980-6'

In [55]:
# Authors of the first citation: text of its first ".authors" element,
# with surrounding whitespace removed.
authors_node = articles[0].select_one(".authors")
authors_node.text.strip()

'Nadia Nathan, Blandine Prevost, Harriet Corvol'

In [57]:
# Citation of the first article: text of its first ".citation" element,
# with surrounding whitespace removed.
citation_node = articles[0].select_one(".citation")
citation_node.text.strip()

'The Lancet'

In [58]:
# Publication date of the first article: text of its first ".published-online"
# element (this still includes the leading "Published:" label).
published_node = articles[0].select_one(".published-online")
published_node.text.strip()

'Published: April 27, 2020'

In [14]:
# Extract availability
# select_one() interprets its argument as a CSS selector, so ".OALabel" matches
# an element with class "OALabel". find_all(".OALabel") would instead look for a
# tag literally *named* ".OALabel", which never matches, so the result was
# always 'Closed'.
# NOTE(review): assumes open-access articles carry an element with class
# "OALabel" -- confirm against the live page markup.
'Open' if articles[0].select_one(".OALabel") is not None else 'Closed'

'Closed'

## Create dataset from All Articles

In [89]:
# Create data list
def _first_text(node, selector, default='N/A'):
    """Return the stripped text of the first element under `node` matching
    the CSS `selector`, or `default` when nothing matches."""
    found = node.select_one(selector)
    return found.get_text().strip() if found is not None else default


def _text_after_label(node, selector, label, default='N/A'):
    """Like `_first_text`, but drop everything up to and including the first
    occurrence of `label` (e.g. 'DOI:'). If the label is absent, the whole
    stripped text is returned instead of raising IndexError."""
    text = _first_text(node, selector, default=None)
    if text is None:
        return default
    return text.split(label, 1)[-1].strip()


def _title_link(node, default='N/A'):
    """Return the `href` of the anchor inside the article title, or `default`
    when the title has no anchor or the anchor has no `href`."""
    anchor = node.select_one(".articleTitle a")
    return anchor.get('href', default) if anchor is not None else default


# One row per article, in the column order expected by the DataFrame cell:
# Type, Title, Link, DOI_link, Authors, Citation, PublishedDate, Availability.
data = [
    [
        _first_text(article, ".articleType"),
        _first_text(article, ".articleTitle"),
        _title_link(article),
        _text_after_label(article, ".doi", 'DOI:'),
        _first_text(article, ".authors"),
        _first_text(article, ".citation"),
        _text_after_label(article, ".published-online", 'Published:'),
        # select_one() takes a CSS selector; find_all(".OALabel") searched for a
        # tag *named* ".OALabel" and so marked every article as 'Closed'.
        'Open' if article.select_one(".OALabel") is not None else 'Closed',
    ]
    for article in articles
]

In [93]:
# Build a DataFrame from the scraped rows; the column names follow the order
# in which each row's fields were collected.
column_names = [
    'Type',
    'Title',
    'Link',
    'DOI_link',
    'Authors',
    'Citation',
    'PublishedDate',
    'Availability',
]
df = pd.DataFrame(data=data, columns=column_names)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Type,Title,Link,DOI_link,Authors,Citation,PublishedDate,Availability
0,Correspondence,Atypical presentation of COVID-19 in young inf...,/journals/lancet/article/PIIS0140-6736(20)3098...,https://doi.org/10.1016/S0140-6736(20)30980-6,"Nadia Nathan, Blandine Prevost, Harriet Corvol",The Lancet,"April 27, 2020",Closed
1,Correspondence,What does it mean to be made vulnerable in the...,/journals/lancet/article/PIIS0140-6736(20)3097...,https://doi.org/10.1016/S0140-6736(20)30979-X,"Ayesha Ahmad, Ryoa Chung, Lisa Eckenwiler, Ago...",The Lancet,"April 27, 2020",Closed
2,Correspondence,An international registry for emergent pathoge...,/journals/lancet/article/PIIS0140-6736(20)3098...,https://doi.org/10.1016/S0140-6736(20)30981-8,"Alice Panchaud, Guillaume Favre, Leo Pomar, Ma...",The Lancet,"April 27, 2020",Closed
3,Comment,What policy makers need to know about COVID-19...,/journals/lancet/article/PIIS0140-6736(20)3098...,https://doi.org/10.1016/S0140-6736(20)30985-5,"Daniel M Altmann, Daniel C Douek, Rosemary J B...",The Lancet,"April 27, 2020",Closed
4,Comment,Management of patients with multiple myeloma d...,/journals/lanhae/article/PIIS2352-3026(20)3012...,https://doi.org/10.1016/S2352-3026(20)30124-1,"Florent Malard, Mohamad Mohty",The Lancet Haematology,"April 27, 2020",Closed


In [92]:
# Write the scraped table to disk as CSV. With to_csv's defaults the
# DataFrame index is written as the first column.
csv_path = 'data/the_lancet_articles.csv'
df.to_csv(csv_path)