## ABS-CBN Latest News Scraper
Scrapes news.abs-cbn.com for the latest news.

In [1]:
import requests

URL="https://news.abs-cbn.com/news"

In [2]:
page=requests.get(URL)


### Get initial list of latest news
this portion will retrieve the official list of latest news found in [abs-cbn news page](https://news.abs-cbn.com/news). Saves every link to the actual article for navigation later.

In [3]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
#get latest news div
latest_news_div=soup.find(id='latest-news')
#get each latest news
latest_news_list = latest_news_div.find_all('li')

In [5]:
news_links_list = list(map(lambda x: x.find('a')['href'], latest_news_list))
news_links_list

['/news/03/19/21/sws-65-pct-of-filipinos-say-dangerous-to-publish-critical-news-vs-duterte-administration',
 '/news/03/19/21/sws-65-pct-of-filipinos-say-dangerous-to-publish-critical-news-vs-duterte-administration',
 '/sports/03/19/21/nba-julius-randle-triple-double-leads-knicks-past-magic',
 '/sports/03/19/21/nba-bogdan-bogdanovic-leads-streaking-hawks-to-over-thunder',
 '/sports/03/19/21/jaworski-recovering-from-pneumonia-tests-negative-for-coranivrus',
 '/sports/03/19/21/mobile-legends-what-to-expect-on-mpl7s-opening-weekend',
 '/entertainment/03/19/21/iigo-pascual-collaborates-with-foreign-artists-annal-mateus-asato-mfmf-for-goodbye',
 '/news/03/19/21/palace-hits-query-on-slow-vaccine-procurement-despite-billions-in-loans',
 '/life/03/19/21/papet-pasyon-a-puppet-show-on-passion-of-christ-continues-online',
 '/news/03/19/21/pasig-city-enforces-granular-lockdown-in-37-areas-as-covid-19-cases-surge',
 '/video/news/03/19/21/naka-lockdown-na-govt-agencies-tuloy-pa-rin-ang-serbisyo-csc',

### Get individual news article details
Navigates to every link of the actual news article obtained earlier to obtain details.

In [6]:
import time

#array to store news data
news_list_json = []

multimedia = False
for link in news_links_list:
    #forge news link to navigate
    news_link = 'https://news.abs-cbn.com/' + link
    
    #get news article page
    page = requests.get(news_link)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    #get details
    title=soup.find(class_='news-title').contents[0].strip()
    
    #author div
    author_block = soup.find(class_='author-block') 
    author = author_block.find(class_='editor').text.strip()
    date = author_block.find(class_='date-posted').text.strip()
    
    #full article contents
    article_content = soup.find(class_='article-content')
    
    #for media articles
    if(article_content == None):
        article_content = soup.find(class_='media-caption')
    
    #for articles with different DOM structure
    if(article_content == None):
        article_content = soup.find(class_='block-content')
        multimedia = True

    if(multimedia): #articles with different DOM architecture
        article = article_content.find_all('p') #get each paragraph<p> of article
        full_article = article[len(article) - 1].text.strip()  
    else:
        article_paragraphs = article_content.find_all('p') #get each paragraph<p> of article
        clean_paragraphs = list(map(lambda x: x.text.strip(), article_paragraphs)) #get text from each p
        
         #remove 'related videos:' if included
        if(clean_paragraphs[-1].lower() == 'related videos:' or clean_paragraphs[-1].lower() == 'related video:' ):
            clean_paragraphs = clean_paragraphs[0:-1]
        full_article = ' '.join(clean_paragraphs) #join paragraphs into one text
    
    #save to array
    news_list_json.append({
        'title': title,
        'author': author,
        'date': date,
        'content': full_article
    })
    
    multimedia = False
    print(f'Saved article {title}')
    time.sleep(11) #10 sec delay from robots.txt

print('Done!')

Saved article SWS: 65 pct of Filipinos say it's dangerous to publish critical news vs Duterte administration
Saved article SWS: 65 pct of Filipinos say it's dangerous to publish critical news vs Duterte administration
Saved article NBA: Julius Randle triple-double leads Knicks past Magic
Saved article NBA: Bogdan Bogdanovic leads streaking Hawks to over Thunder
Saved article Jaworski recovering from pneumonia, tests negative for coronavirus
Saved article Mobile Legends: What to expect on MPL7’s opening weekend
Saved article Iñigo Pascual collaborates with foreign artists Annalé, Mateus Asato, MFMF for “Goodbye”
Saved article 'Nasaan ka bakuna?': Palace hits query on slow vaccine procurement despite billions in loans
Saved article 'Papet Pasyon,' a puppet show on Passion of Christ, continues online
Saved article Pasig City enforces granular lockdown in 37 areas as COVID-19 cases surge
Saved article Naka-lockdown na gov't agencies tuloy pa rin ang serbisyo: CSC
Saved article Duterte OKs 

In [7]:
#check contents
news_list_json

[{'title': "SWS: 65 pct of Filipinos say it's dangerous to publish critical news vs Duterte administration",
  'author': 'Job Manahan, ABS-CBN News',
  'date': 'Mar 19 2021 03:07 PM',
  'content': 'MANILA - Majority or 65 percent of Filipinos agree that it is dangerous to broadcast or print anything critical of President Rodrigo Duterte\'s administration, a survey by the Social Weather Stations (SWS) showed Friday. This as the administration saw a shutdown of a major broadcast network and arrests, prosecution and red tagging of journalists. The survey, conducted from Nov. 21 to 25 last year, also found that less than a fifth (16 percent) of Filipinos disagreed with the statement: “It is dangerous to print or broadcast anything critical of the administration, even if it is the truth." Meanwhile, 18 percent were undecided. The responses have a net agreement score of +49, considered by the pollster as "strong." >EMBED CHART 1 HERE This net score increased by 28 points from the +21 reporte

In [8]:
#export
import json

with open('news_data.json', 'w', encoding='utf-8') as outfile:
    json.dump(news_list_json, outfile) 