In [None]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scraping the data

In [None]:
# Ask the posts endpoint how many 100-post pages exist for posts dated after
# 2022-01-01; the total is read from the `X-WP-TotalPages` response header.
api_url = 'https://rappler.com/wp-json/wp/v2/posts?page=1&per_page=100&after=2022-01-01T00:00:00'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(api_url, timeout=30)
# Fail loudly on an HTTP error instead of a confusing KeyError on the header lookup.
response.raise_for_status()

pages_count = response.headers['X-WP-TotalPages']
n = int(pages_count)

print(f'Total number of pages: {n}')

In [None]:
# Trial run: request posts dated after 2022-01-01 (no page/per_page given)
# and flatten the JSON payload into a table to inspect its columns.
trial_params = {'after': '2022-01-01 00:00:00.000'}
trial_response = requests.get('http://rappler.com/wp-json/wp/v2/posts', params=trial_params)
r = trial_response.json()

sample = pd.json_normalize(r)
display(sample)

In [None]:
# Walk result pages 1..n and accumulate the raw post records in `data`.
# If a request fails part-way, the exception stops the loop but everything
# collected so far remains available in `data`.
data = list()

for current_page in range(1, n + 1):
    print(f'Scraping page: {current_page}...')
    page_response = requests.get('http://rappler.com/wp-json/wp/v2/posts',
                                 params={'after': '2022-01-01 00:00:00.000',
                                         'page': current_page,
                                         'per_page': 100},
                                 timeout=30)
    # Surface HTTP errors immediately rather than parsing an error body.
    page_response.raise_for_status()

    r = page_response.json()
    # An error payload is a JSON object (dict); extending a list with a dict
    # would silently append its string keys and corrupt the dataset.
    if not isinstance(r, list):
        raise ValueError(f'Unexpected payload on page {current_page}: {r!r}')
    data.extend(r)

    # Pause between requests; randint's upper bound is exclusive, so 2-4 seconds.
    time.sleep(np.random.randint(2, 5))

In [None]:
# Flatten the scraped records into a table, then show every row whose `id`
# appears more than once, ordered by id so repeats sit next to each other.
df = pd.json_normalize(data)
repeated_id_mask = df.duplicated(subset='id', keep=False)
df.loc[repeated_id_mask].sort_values(by='id')

In [None]:
# Keep only the first occurrence of each post id (assignment instead of
# inplace=True so re-running the cell is harmless).
df = df.drop_duplicates('id', keep='first')

# A later cell reads this CSV back, so it must exist on a fresh run.
# Write it only when absent so an existing scrape is never overwritten;
# delete the file manually to persist a newer scrape.
raw_csv_path = Path('2022-rappler-articles.csv')
if not raw_csv_path.exists():
    df.to_csv(raw_csv_path, index=False)

# Filtering data for processing

In [None]:
# Load the saved articles CSV and report its (rows, columns) dimensions.
raw_articles_csv = '2022-rappler-articles.csv'
df = pd.read_csv(raw_articles_csv)
df.shape

In [None]:
# Columns removed from the frame before cleaning, listed one per line and
# grouped by their dotted prefix.
drop_features = [
    # prepublish checks
    'prepublish_checks.xbs-valid-conversions.status',
    'prepublish_checks.xbs-valid-conversions.message',
    'prepublish_checks.xbs-valid-conversions.data',
    'prepublish_checks.xbs-valid-fallback.status',
    'prepublish_checks.xbs-valid-fallback.message',
    'prepublish_checks.xbs-valid-fallback.data',
    # A/B tests on titles
    'ab_tests.titles.started',
    'ab_tests.titles.start_time',
    'ab_tests.titles.end_time',
    'ab_tests.titles.traffic_percentage',
    'ab_tests.titles.variant_traffic_percentage',
    'ab_tests.titles.paused',
    # A/B tests on featured images
    'ab_tests.featured_images.started',
    'ab_tests.featured_images.start_time',
    'ab_tests.featured_images.end_time',
    'ab_tests.featured_images.traffic_percentage',
    'ab_tests.featured_images.variant_traffic_percentage',
    'ab_tests.featured_images.paused',
    # link collections
    '_links.self',
    '_links.collection',
    '_links.about',
    '_links.replies',
    '_links.version-history',
    '_links.predecessor-version',
    '_links.wp:featuredmedia',
    '_links.wp:attachment',
    '_links.wp:term',
    '_links.curies',
    # meta fields
    'meta.claim_author_type',
    'meta.claim_author_name',
    'meta.claim_reviewed',
    'meta.review_rating',
    'meta.schema_article_type',
    # remaining top-level and flag columns
    'template',
    'status',
    'type',
    'ab_test_titles',
    'ab_test_featured_images',
    'sticky',
    'format',
    'content.protected',
    'excerpt.protected',
]

df = df.drop(columns=drop_features)

In [None]:
def _html_to_text(html):
    """Return the text content of an HTML fragment with all tags removed.

    Non-string values (such as the NaN that ``read_csv`` yields for an empty
    cell) are returned unchanged, since BeautifulSoup cannot parse them.
    """
    if not isinstance(html, str):
        return html
    # Name the parser explicitly: otherwise bs4 picks whichever is installed
    # and emits a warning, making results depend on the environment.
    return BeautifulSoup(html, 'html.parser').get_text()


df['title.rendered'] = df['title.rendered'].apply(_html_to_text)

# `.string` is None whenever the markup has more than one child (e.g. several
# <p> tags), which made `.string.strip()` raise AttributeError; get_text()
# concatenates all text nodes instead.
df['content.rendered'] = (
    df['content.rendered']
    .apply(_html_to_text)
    .str.strip()
    .str.replace('\n', ' ', regex=False)  # collapse each article onto one line
)

In [None]:
# Write only the id, date, link, title and content columns to the cleaned CSV,
# without the index.
rel_cols = ['id', 'date', 'link', 'title.rendered', 'content.rendered']
df.loc[:, rel_cols].to_csv('2022-rappler-articles-clean.csv', index=False)