## ABS-CBN Latest News Scraper
Scrapes news.abs-cbn.com for the latest news.

In [1]:
import requests

# Landing page whose latest-news list is scraped below.
URL = "https://news.abs-cbn.com/news"

In [2]:
# Fetch the news landing page.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of letting later
# cells try to parse an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()


### Get initial list of latest news
This section retrieves the list of latest news from the [ABS-CBN news page](https://news.abs-cbn.com/news) and saves the link to each article so it can be visited later.

In [3]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [4]:
# Locate the container that holds the latest-news list.
latest_news_div = soup.find(id='latest-news')
if latest_news_div is None:
    # Without this guard a missing container surfaces as an opaque
    # AttributeError on None in the next statement.
    raise RuntimeError(
        "No element with id 'latest-news' was found; the page layout may have changed."
    )

# Every <li> inside the container is one news entry.
latest_news_list = latest_news_div.find_all('li')

In [5]:
# Collect the href of the first <a> in each entry.
# Entries without a link (or whose link has no href) are skipped instead of
# raising TypeError/KeyError and aborting the whole cell.
news_links_list = [
    anchor['href']
    for anchor in (entry.find('a') for entry in latest_news_list)
    if anchor is not None and anchor.has_attr('href')
]
news_links_list

['/sports/03/19/21/mobile-legends-what-to-expect-on-mpl7s-opening-weekend',
 '/entertainment/03/19/21/iigo-pascual-collaborates-with-foreign-artists-annal-mateus-asato-mfmf-for-goodbye',
 '/news/03/19/21/palace-hits-query-on-slow-vaccine-procurement-despite-billions-in-loans',
 '/life/03/19/21/papet-pasyon-a-puppet-show-on-passion-of-christ-continues-online',
 '/news/03/19/21/pasig-city-enforces-granular-lockdown-in-37-areas-as-covid-19-cases-surge',
 '/video/news/03/19/21/naka-lockdown-na-govt-agencies-tuloy-pa-rin-ang-serbisyo-csc',
 '/news/03/19/21/duterte-oks-using-all-astrazeneca-doses-on-hand-as-first-jab-for-health-workers',
 '/business/03/19/21/antitrust-body-oks-manila-city-govt-waterfront-manila-joint-venture-for-reclamation-project',
 '/life/03/19/21/ph-international-dive-expo-goes-digital-amid-pandemic',
 '/entertainment/03/19/21/sylvia-sanchez-ibinahagi-ang-magandang-kapalit-nang-pagkakaroon-noon-ng-covid-19',
 '/life/03/19/21/look-after-dawn-zulueta-andrea-torres-dresses-

### Get individual news article details
Visits each article link collected above and extracts the article's title, author, publication date and body text.

In [19]:
import time
from urllib.parse import urljoin

# Site root that the scraped (root-relative) article links are resolved against.
BASE_URL = 'https://news.abs-cbn.com/'
# Pause between article requests; the 10 second figure comes from the site's
# robots.txt according to the original note, with one extra second of margin.
REQUEST_DELAY_SECONDS = 11
# Without a timeout a stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30
# A trailing paragraph equal to one of these is dropped from the article text.
RELATED_VIDEO_MARKERS = ('related videos:', 'related video:')


def _text_of(parent, class_name):
    """Return the stripped text of the first element with class `class_name`
    under `parent`, or None when `parent` or the element is missing."""
    if parent is None:
        return None
    node = parent.find(class_=class_name)
    return None if node is None else node.text.strip()


def _extract_content(article_soup):
    """Return the article body as a single string.

    Tries the known containers in order: 'article-content', then
    'media-caption' (media articles), then 'block-content' (pages with a
    different DOM structure, where only the last paragraph is used).
    Returns None when none of the containers is present.
    """
    for class_name in ('article-content', 'media-caption'):
        container = article_soup.find(class_=class_name)
        if container is None:
            continue
        paragraphs = [p.text.strip() for p in container.find_all('p')]
        # Guard on emptiness: a container without <p> tags must not raise IndexError.
        if paragraphs and paragraphs[-1].lower() in RELATED_VIDEO_MARKERS:
            paragraphs = paragraphs[:-1]
        return ' '.join(paragraphs)

    container = article_soup.find(class_='block-content')
    if container is None:
        return None
    paragraphs = container.find_all('p')
    return paragraphs[-1].text.strip() if paragraphs else ''


def _parse_article(article_soup):
    """Build the record dict for one article page, or return None when the
    page has no title or no recognised content container."""
    title_node = article_soup.find(class_='news-title')
    if title_node is None or not title_node.contents:
        return None
    content = _extract_content(article_soup)
    if content is None:
        return None
    author_block = article_soup.find(class_='author-block')
    return {
        'title': title_node.contents[0].strip(),
        'author': _text_of(author_block, 'editor'),       # None when absent
        'date': _text_of(author_block, 'date-posted'),    # None when absent
        'content': content,
    }


# Accumulates one dict per successfully parsed article.
news_list_json = []

for link in news_links_list:
    # urljoin avoids the double slash produced by concatenating the base URL
    # with a link that already starts with '/'.
    news_link = urljoin(BASE_URL, link)

    # Separate names are used here so the `page` and `soup` objects created
    # by the earlier cells are not overwritten.
    article_page = requests.get(news_link, timeout=REQUEST_TIMEOUT_SECONDS)

    if not article_page.ok:
        print(f'Skipped {news_link}: HTTP {article_page.status_code}')
    else:
        article_soup = BeautifulSoup(article_page.content, 'html.parser')
        record = _parse_article(article_soup)
        if record is None:
            print(f'Skipped {news_link}: unrecognised page layout')
        else:
            news_list_json.append(record)
            print(f"Saved article {record['title']}")

    # Pause after every request, including skipped ones.
    time.sleep(REQUEST_DELAY_SECONDS)

print('Done!')

Saved article Mobile Legends: What to expect on MPL7’s opening weekend
Saved article Iñigo Pascual collaborates with foreign artists Annalé, Mateus Asato, MFMF for “Goodbye”
Saved article 'Nasaan ka bakuna?': Palace hits query on slow vaccine procurement despite billions in loans
Saved article 'Papet Pasyon,' a puppet show on Passion of Christ, continues online
Saved article Pasig City enforces granular lockdown in 37 areas as COVID-19 cases surge
Saved article Naka-lockdown na gov't agencies tuloy pa rin ang serbisyo: CSC
Saved article Duterte OKs using all AstraZeneca doses on hand as first jab for health workers
Saved article Antitrust body OKs Manila city gov't, Waterfront Manila joint venture for reclamation project
Saved article PH international dive expo goes digital amid pandemic
Saved article Sylvia Sanchez, ibinahagi ang 'magandang' kapalit nang pagkakaroon noon ng COVID-19
Saved article LOOK: After Dawn Zulueta, Andrea Torres dresses up as 'Bridgerton' character
Saved articl

In [21]:
# Spot-check the scraped records: show how many were collected and only the
# first few, so the full article texts are not dumped into the cell output.
print(f'{len(news_list_json)} articles scraped')
news_list_json[:3]

[{'title': 'Mobile Legends: What to expect on MPL7’s opening weekend',
  'author': 'ABS-CBN News',
  'date': 'Mar 19 2021 02:44 PM',
  'content': 'MANILA - The Philippine leg of the Mobile Legends: Bang Bang Professional League kicks off its 7th installment Friday, March 19, with a string of games expected to keep Pinoy MLBB enthusiasts eyes peeled. Here are the matches scheduled on the opening weekend of the games, from March 19 to 21: MARCH 19 (FRIDAY) Last year’s first runner-up Smart Omega will go up against third placers Execration ML to open the season with a game starting at 4 p.m. on March 19. After getting in the tournament through the qualifying round, Work Auster Force will square up against AURA Philippines at 6 p.m. on the same day. MARCH 20 (SATURDAY) Another qualifying round clincher, Laus Playbook Esports Team, will go head to head with Nexplay Esports on Day 2 of the tournament at around 4 p.m. on March 20. Meanwhile, BREN Esports looks to carry the momentum from its d

In [23]:
# Export the scraped records to a JSON file.
import json

with open('news_data.json', 'w', encoding='utf-8') as outfile:
    # ensure_ascii=False writes non-ASCII characters (e.g. "ñ", curly quotes)
    # as real UTF-8 text instead of \uXXXX escapes, matching the encoding the
    # file is opened with. Read the file back with encoding='utf-8'.
    # indent=2 keeps the file human-readable.
    json.dump(news_list_json, outfile, ensure_ascii=False, indent=2)