In [1]:
import requests
from bs4 import BeautifulSoup

### One Article

In [2]:
article_url = 'https://edition.cnn.com/2025/02/17/europe/pope-francis-polymicrobial-infection-intl/index.html'

In [3]:
html = requests.get(article_url)
if html.status_code < 300 and html.status_code>=200:
    article_soup = BeautifulSoup(html.content)
else:
    print(f"Can't retrieve source code from {article_url}. \nException: {html.reason}. \nStatus Code: {html.status_code}")
    article_soup = None

Print the soup

In [4]:
article_soup

<!DOCTYPE html>
<html data-layout-uri="cms.cnn.com/_layouts/layout-with-rail/instances/world-article-v1@published" data-uri="cms.cnn.com/_pages/cm790qrmg005a2cp51vir47av@published" lang="en">
<head>
<link href="//tpc.googlesyndication.com" rel="dns-prefetch"/>
<link href="//tpc.googlesyndication.com" rel="preconnect"/>
<link href="//pagead2.googlesyndication.com" rel="dns-prefetch"/>
<link href="//pagead2.googlesyndication.com" rel="preconnect"/>
<link href="//www.googletagservices.com" rel="dns-prefetch"/>
<link href="//www.googletagservices.com" rel="preconnect"/>
<link href="//www.google.com" rel="dns-prefetch"/>
<link href="//www.google.com" rel="preconnect"/>
<link href="//c.amazon-adsystem.com" rel="dns-prefetch"/>
<link href="//c.amazon-adsystem.com" rel="preconnect"/>
<link href="//ib.adnxs.com" rel="dns-prefetch"/>
<link href="//ib.adnxs.com" rel="preconnect"/>
<link href="//cdn.adsafeprotected.com" rel="dns-prefetch"/>
<link href="//cdn.adsafeprotected.com" rel="preconnect"/>

#### Step 1. Grap the title

###### Get the HTML element


In [7]:
title_element = article_soup.find( 'h1' , {'id' : 'maincontent'} )
print(title_element)

<h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      Pope Francis‚Äô condition ‚Äòslightly improving‚Äô after pneumonia diagnosis, Vatican says
    </h1>


###### Get the element's text

In [8]:
title = title_element.text
print(title)


      Pope Francis‚Äô condition ‚Äòslightly improving‚Äô after pneumonia diagnosis, Vatican says
    


###### Format the text


In [9]:
print(title.strip())

Pope Francis‚Äô condition ‚Äòslightly improving‚Äô after pneumonia diagnosis, Vatican says


In [10]:
title = title.strip()
print(title)

Pope Francis‚Äô condition ‚Äòslightly improving‚Äô after pneumonia diagnosis, Vatican says


#### Step 2. Grap the date

###### Get the HTML element


In [11]:
date_element = article_soup.find( 'div' , { 'class' : 'timestamp'})
print(date_element)

<div class="timestamp vossi-timestamp" data-editable="settings" data-uri="cms.cnn.com/_components/timestamp/instances/cm790qrnk005o2cp55m0hfy8c@published">
    Updated
          2:27 PM EST, Thu February 20, 2025
      </div>


###### Get the text

In [12]:
date = date_element.text
print(date)


    Updated
          2:27 PM EST, Thu February 20, 2025
      


###### Format the text


In [13]:
def parse_date(date_string):
  date_string = date_string.strip()

  date_list = date_string.split()[1:]

  return {
      'time': ' '.join(date_list[0:2]),
      'zone': date_list[2].replace(',',''),
      'week_day': date_list[3],
      'day': int(date_list[5].replace(',','')),
      'month': date_list[4],
      'year': int(date_list[6]),
  }

In [14]:
date = parse_date(date)
print(date)

{'time': '2:27 PM', 'zone': 'EST', 'week_day': 'Thu', 'day': 20, 'month': 'February', 'year': 2025}


#### Step 3. Grap the Authors

###### Get the HTML element


In [15]:
authors_element = article_soup.find( 'div' , { 'class' : 'byline__names' })
print(authors_element)

<div class="byline__names vossi-byline__names">
			By <span class="byline__name">Christopher Lamb</span>, <a class="byline__link vossi-byline__link" href="https://www.cnn.com/profiles/antonia-mortensen"><span class="byline__name">Antonia Mortensen</span></a> and <span class="byline__name">Sharon Braithwaite</span>, CNN
		</div>


###### Get the text

In [16]:
authors = authors_element.text
print(authors)


			By Christopher Lamb, Antonia Mortensen and Sharon Braithwaite, CNN
		


###### Format the text

In [17]:
def format_authors(author_string):
  author_string = author_string.replace('\n','').replace('\r','').replace('\t', ' ')
  author_string = author_string.replace('By','').replace(', CNN','').strip()
  return author_string

In [18]:
authors = authors_element.text
print(authors_element)

<div class="byline__names vossi-byline__names">
			By <span class="byline__name">Christopher Lamb</span>, <a class="byline__link vossi-byline__link" href="https://www.cnn.com/profiles/antonia-mortensen"><span class="byline__name">Antonia Mortensen</span></a> and <span class="byline__name">Sharon Braithwaite</span>, CNN
		</div>


#### Step 4. Grap the main article

###### Get the HTML elements

In [26]:
paragraphs = article_soup.find_all( 'p' , {'class' : 'paragraph'} )

How many paragraphs?

In [25]:
len(paragraphs)

27

###### Get the text and format it

In [21]:
def format_paragraph(text):
  text = text.replace('\n','')
  return text.strip()

In [22]:
full_text = ''

for paragraph in paragraphs:
  processed_paragraph = format_paragraph(paragraph.text)
  full_text = f'{full_text} {processed_paragraph}'.strip()



In [23]:
full_text

'Pope Francis‚Äô condition is ‚Äúslightly improving‚Äù and he continues to have no fever, the Vatican said on Thursday evening, adding that he carried out work activities after receiving Eucharist this morning. A spokesperson for the Vatican said Thursday that the pope‚Äôs slight improvement in condition indicates that he is reacting positively to his treatment, adding that his heart is holding up well. He is breathing on his own but receiving oxygen, the spokesperson added. Francis has been hospitalized since last week after being plagued by a string of lung-related medical struggles. He had a CT scan and was diagnosed with pneumonia in both lungs on Tuesday, following an earlier diagnosis of ‚Äúpolymicrobial infection‚Äù of respiratory tract. The Vatican said at the time that the medical tests continued to indicate ‚Äúa complex picture‚Äù for one of the oldest popes in the church‚Äôs history. On Wednesday, the Vatican described Francis‚Äô clinical condition as ‚Äústable‚Äù and said h

This can also be done in one line:

``` python
full_text = ' '.join([format_paragraph(item.text) for item in paragraphs])
```

### Get all articles from the catalog

In [27]:
main_url = 'https://edition.cnn.com/'

In [28]:
html = requests.get(main_url)
if html.status_code < 300:
    soup = BeautifulSoup(html.content)
else:
    print(f"Can't retrieve source code from {main_url}. \nException: {html.reason}. \nStatus Code: {html.status_code}")

    soup = None

#### Grab all the article hyperlinks

In [33]:
articles_elements = soup.find_all( 'a' , { 'data-link-type' : 'article'})

How many articles?

In [34]:
len(articles_elements)

111

Check the first one

In [35]:
articles_elements[0]

<a class="container__link container__link--type-article container_ribbon__link container_ribbon__left container_ribbon__light" data-link-type="article" href="/2025/03/03/politics/trump-administration-ukraine-aid/index.html">
<div class="container__text container_ribbon__text">
<div class="container__headline container_ribbon__headline">
<!-- This needs to be all one one line or it will cause unwanted spacing due to handlebar output -->
<span class="container__headline-text" data-editable="headline">Ukraine military aid</span>
</div>
</div>
</a>

Grab the hperlink (href)

In [36]:
articles_elements[0]['href']

'/2025/03/03/politics/trump-administration-ukraine-aid/index.html'

Do it for all

In [57]:
articles = []
base_url = 'https://edition.cnn.com'
for item in articles_elements:

  # Create temporary URL
  current_url = f'{ base_url }{ item["href"] }'

  # Append to articles
  articles.append(current_url)

Print articles

In [58]:
articles

['https://edition.cnn.com/2025/03/03/politics/trump-administration-ukraine-aid/index.html',
 'https://edition.cnn.com/2025/03/04/economy/trade-mexico-canada-china-tariffs-trump-hnk-intl/index.html',
 'https://edition.cnn.com/2025/03/04/asia/india-avalanche-survivors-mana-intl-hnk/index.html',
 'https://edition.cnn.com/2025/03/03/europe/pope-francis-respiratory-failure-intl/index.html',
 'https://edition.cnn.com/2025/03/03/science/starship-launch-test-flight-8/index.html',
 'https://edition.cnn.com/2025/03/03/asia/japan-prince-hisahito-news-conference-intl-hnk/index.html',
 'https://edition.cnn.com/2025/03/03/travel/disney-cruise-biggest-ship-adventure-singapore/index.html',
 'https://edition.cnn.com/2025/03/03/style/julia-fox-oscars-after-party-lotw/index.html',
 'https://edition.cnn.com/2025/03/04/economy/global-markets-trump-tariffs/index.html',
 'https://edition.cnn.com/2025/03/03/politics/trump-tariffs-economy-gamble/index.html',
 'https://edition.cnn.com/2025/03/04/economy/trade-m

#### Iterate through these links to scrape the content


In [59]:
import pandas as pd

In [60]:
data = pd.DataFrame(columns = [
    'URL',
    'Title',
    'Authors',
    'Time',
    'Time Zone',
    'Week Day',
    'Day',
    'Month',
    'Year',
    'Text'
])

data

Unnamed: 0,URL,Title,Authors,Time,Time Zone,Week Day,Day,Month,Year,Text


In [61]:
index = 0
for article_url in articles:
    html = requests.get(article_url)
    if html.status_code < 300:
        # Step 0. Create soup
        article_soup = BeautifulSoup(html.content)

        # Step 1. Grap the title
        title_element = article_soup.find( 'h1' , {'id' : 'maincontent'} )
        title = title_element.text.strip()

        # Step 2. Grap the date
        date_element = article_soup.find( 'div' , { 'class' : 'timestamp'})
        date = parse_date(date_element.text)

        # Step 3. Grap the authors
        authors_element = article_soup.find( 'div' , { 'class' : 'byline__names' })
        authors = format_authors(authors_element.text)

        # Step 4. Grap the Text
        paragraphs = article_soup.find_all( 'p' , {'class' : 'paragraph'} )
        full_text = ' '.join([format_paragraph(item.text) for item in paragraphs])

        data.at[index, 'URL'] = article_url
        data.at[index, 'Title'] = title
        data.at[index, 'Authors'] = authors
        data.at[index, 'Time'] = date['time']
        data.at[index, 'Time Zone'] = date['zone']
        data.at[index, 'Week Day'] = date['week_day']
        data.at[index, 'Day'] = date['day']
        data.at[index, 'Month'] = date['month']
        data.at[index, 'Year'] = date['year']
        data.at[index, 'Text'] = full_text

        index = len(data)
    else:
        print(f"Can't retrieve source code from {article_url}. \nException: {html.reason}. \nStatus Code: {html.status_code}")
        article_soup = None

In [62]:
data

Unnamed: 0,URL,Title,Authors,Time,Time Zone,Week Day,Day,Month,Year,Text
0,https://edition.cnn.com/2025/03/03/politics/tr...,Trump pauses military aid to Ukraine after Ova...,"Kevin Liptak, Samantha Waldenberg and Oren Lie...",9:45 PM,EST,Mon,3,March,2025,President Donald Trump is ordering a pause on ...
1,https://edition.cnn.com/2025/03/04/economy/tra...,China and Canada immediately retaliate against...,Elisabeth Buchwald,11:25 AM,EST,Tue,4,March,2025,President Donald Trump’s blanket 25% tariffs o...
2,https://edition.cnn.com/2025/03/04/asia/india-...,Workers survive 36 hours buried under India av...,Kathleen Magramo and Esha Mitra,1:30 AM,EST,Tue,4,March,2025,Dozens of construction workers have been pulle...
3,https://edition.cnn.com/2025/03/03/europe/pope...,Pope Francis had two episodes of ‚Äòacute resp...,"Antonia Mortensen, Christopher Lamb and Hira H...",7:21 AM,EST,Tue,4,March,2025,Pope Francis is being treated with supplementa...
4,https://edition.cnn.com/2025/03/03/science/sta...,SpaceX‚Äôs Starship: The most powerful rocket ...,Jackie Wattles,7:39 PM,EST,Mon,3,March,2025,SpaceX is standing down from the eighth uncrew...
...,...,...,...,...,...,...,...,...,...,...
106,https://edition.cnn.com/2025/02/26/business/tr...,Tropicana is in big financial trouble,Nathaniel Meyersohn,11:07 AM,EST,Wed,26,February,2025,Florida has been blasted with stronger hurrica...
107,https://edition.cnn.com/2025/02/26/australia/e...,Christian sect members who watched 8-year-old ...,Hilary Whiteman,5:59 AM,EST,Wed,26,February,2025,The parents of an 8-year-old girl who died aft...
108,https://edition.cnn.com/2025/02/26/media/joe-b...,New book on Biden by Jake Tapper and Alex Thom...,Brian Stelter,6:00 AM,EST,Wed,26,February,2025,The day after Donald Trump won the 2024 electi...
109,https://edition.cnn.com/2025/02/24/europe/germ...,Germany‚Äôs far-right may be frozen out of pow...,Analysis by Sophie Tanno and Nadine Schmidt,8:27 AM,EST,Mon,24,February,2025,Germany‚Äôs political system is set up to excl...


Statistics for fun :)

In [63]:
data.describe(include='all')

Unnamed: 0,URL,Title,Authors,Time,Time Zone,Week Day,Day,Month,Year,Text
count,111,111,111,111,111,111,111,111,111,111
unique,75,75,61,69,1,7,11,2,1,75
top,https://edition.cnn.com/2025/03/04/asia/india-...,Workers survive 36 hours buried under India av...,Taylor Nicioli,10:00 AM,EST,Tue,4,March,2025,Dozens of construction workers have been pulle...
freq,3,3,6,4,111,56,56,90,111,3


### Save data to CSV & Excel

In [64]:
data.to_csv('cnn_dataset.csv', index=False)

In [65]:
data.to_excel('cnn_dataset.xlsx', index=False)

### Load data

In [66]:
data_copy = pd.read_csv('cnn_dataset.csv')

In [67]:
data_copy.head()

Unnamed: 0,URL,Title,Authors,Time,Time Zone,Week Day,Day,Month,Year,Text
0,https://edition.cnn.com/2025/03/03/politics/tr...,Trump pauses military aid to Ukraine after Ova...,"Kevin Liptak, Samantha Waldenberg and Oren Lie...",9:45 PM,EST,Mon,3,March,2025,President Donald Trump is ordering a pause on ...
1,https://edition.cnn.com/2025/03/04/economy/tra...,China and Canada immediately retaliate against...,Elisabeth Buchwald,11:25 AM,EST,Tue,4,March,2025,President Donald Trump’s blanket 25% tariffs o...
2,https://edition.cnn.com/2025/03/04/asia/india-...,Workers survive 36 hours buried under India av...,Kathleen Magramo and Esha Mitra,1:30 AM,EST,Tue,4,March,2025,Dozens of construction workers have been pulle...
3,https://edition.cnn.com/2025/03/03/europe/pope...,Pope Francis had two episodes of ‚Äòacute resp...,"Antonia Mortensen, Christopher Lamb and Hira H...",7:21 AM,EST,Tue,4,March,2025,Pope Francis is being treated with supplementa...
4,https://edition.cnn.com/2025/03/03/science/sta...,SpaceX‚Äôs Starship: The most powerful rocket ...,Jackie Wattles,7:39 PM,EST,Mon,3,March,2025,SpaceX is standing down from the eighth uncrew...


### Can you get the articles for a specific topic?

In [73]:
main_url = 'https://edition.cnn.com/search?q=refugees&from=0&size=10&page=1&sort=newest&types=article&section='

html = requests.get(main_url)
if html.status_code < 300:
    soup = BeautifulSoup(html.content)
else:
    print(f"Can't retrieve source code from {main_url}. \nException: {html.reason}. \nStatus Code: {html.status_code}")
    soup = None

In [71]:
articles_elements = soup.find_all('div', {'class': 'container__item'})

In [74]:
print(f'Found {len(articles_elements)} Articles')

Found 0 Articles
