In [1]:
import requests
import pandas as pd
import re
from time import sleep

In [2]:
# Posts endpoint; it ends with '?' so a query string can be appended directly.
base_url = 'https://www.techinasia.com/wp-json/techinasia/2.0/posts?'

# Browser-like User-Agent header passed to every requests.get call.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}

# Query-string prefix; the page number is appended to it when paginating.
params = 'page='

In [3]:
# Query the endpoint once (no page number) to inspect the shape of the response.
# The timeout keeps the cell from hanging forever on an unresponsive server.
main_req = requests.get(url=base_url, headers=headers, timeout=30)
# Fail with a clear HTTPError on a 4xx/5xx reply instead of an opaque JSON/KeyError later.
main_req.raise_for_status()
main_json = main_req.json()
main_json.keys()

dict_keys(['total', 'per_page', 'current_page', 'total_pages', 'posts'])

In [4]:
# Show the fields available on the first post of the response.
main_json['posts'][0].keys()

dict_keys(['id', 'date_gmt', 'modified_gmt', 'title', 'slug', 'status', 'type', 'link', 'content', 'vsitems', 'live_items', 'excerpt', 'author', 'editor', 'comment_status', 'comments_count', 'comments', 'featured_image', 'post_images', 'seo', 'categories', 'tags', 'companies', 'is_sponsored', 'sponsor', 'is_partnership', 'external_scripts', 'show_ads', 'is_subscriber_exclusive', 'is_paywalled', 'is_inappbrowser', 'read_time'])

In [5]:
def clean_tags(text):
    '''
    Clean html tags in content.
    ---------------------------
    Parameter:
    text(str): input text containing html.
    ---------------------------
    Return:
    clean_text(str): the text with every <...> tag, every newline and
    every non-breaking space (U+00A0) removed.
    '''
    # re.DOTALL lets '.' match newlines, so a tag whose attributes wrap onto
    # the next line is removed as a whole. Without it such a tag survives the
    # substitution and its pieces are glued together when newlines are
    # stripped below.
    without_tags = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    clean_text = without_tags.replace('\n', '').replace('\xa0', '')
    return clean_text

In [6]:
def _post_to_record(post):
    '''
    Flatten one post object from the API response into a single-level dict.
    ---------------------------
    Parameter:
    post(dict): one element of the response's 'posts' list.
    ---------------------------
    Return:
    dict with one entry per output column.
    ---------------------------
    Raises:
    KeyError when an expected field is missing from the post.
    '''
    return {
        'id': post['id'],
        'date_gmt': post['date_gmt'],
        'modified_gmt': post['modified_gmt'],
        'title': post['title'],
        'slug': post['slug'],
        'status': post['status'],
        'type': post['type'],
        'link': post['link'],
        # Content arrives as HTML; keep only the text.
        'content': clean_tags(post['content']),
        'vsitems': post['vsitems'],
        'live_items': post['live_items'],
        'excerpt': post['excerpt'],
        # The nested author object is spread over three flat columns.
        'author_id': post['author']['id'],
        'author_name': post['author']['display_name'],
        'author_is_staff': post['author']['is_staff'],
        # Strip the fixed prefix so only the editor name(s) remain.
        'editor': post['editor'].replace('Editing by ', ''),
        'comment_status': post['comment_status'],
        'comments_count': post['comments_count'],
        'comments': post['comments'],
        # Categories and tags are lists of objects; keep just their names.
        'categories': [cat['name'] for cat in post['categories']],
        'tags': [tag['name'] for tag in post['tags']],
        'companies': post['companies'],
        'is_sponsored': post['is_sponsored'],
        'sponsor_name': post['sponsor']['name'],
        'is_partnership': post['is_partnership'],
        'show_ads': post['show_ads'],
        'is_subscriber_exclusive': post['is_subscriber_exclusive'],
        'is_paywalled': post['is_paywalled'],
        'is_inappbrowser': post['is_inappbrowser'],
        'read_time': post['read_time']
    }


def get_data(url, params, headers, pages=30, delay=3, timeout=30):
    '''
    Get data from API
    ---------------------------
    Parameters:
    url(str): input url to get data.
    params(str): page parameter for pagination; the page number is appended to it.
    headers(dict): headers to avoid 418 teapot.
    pages(int): number of pages to fetch, starting at page 1 (default 30).
    delay(float): seconds to wait between two consecutive requests (default 3).
    timeout(float): seconds to wait for the server before giving up (default 30).
    ---------------------------
    Return:
    pandas DataFrame with one row per post.
    ---------------------------
    Raises:
    requests.HTTPError when a page answers with a 4xx/5xx status.
    '''
    all_data = []

    for page in range(1, pages + 1):
        print(f"Page: {page}/{pages}", end='\r')
        req = requests.get(url=url + params + str(page), headers=headers, timeout=timeout)
        # Surface HTTP errors here rather than as a confusing JSON/KeyError below.
        req.raise_for_status()
        payload = req.json()

        all_data.extend(_post_to_record(post) for post in payload['posts'])

        # Pause between requests only; no need to wait after the final page.
        if page < pages:
            sleep(delay)
    return pd.DataFrame(all_data)

In [7]:
# Run the scraper and preview the last rows of the resulting frame.
df = get_data(url=base_url, params=params, headers=headers)
df.tail()

Page: 30/30

Unnamed: 0,id,date_gmt,modified_gmt,title,slug,status,type,link,content,vsitems,...,tags,companies,is_sponsored,sponsor_name,is_partnership,show_ads,is_subscriber_exclusive,is_paywalled,is_inappbrowser,read_time
895,724749,2021-12-14T05:19:21,2021-12-14T05:19:21,Fomo Pay to join DBS Bank’s crypto exchange,sg-fintech-firm-fomo-pay-joins-dbs-banks-crypt...,publish,post,https://www.techinasia.com/sg-fintech-firm-fom...,Singapore-based payment firm Fomo Pay is joini...,[],...,"[crypto currency, crypto exchange, DBS, dbs ba...",[],False,,False,True,False,True,False,1
896,724745,2021-12-14T05:05:48,2021-12-15T11:10:42,"Carousell HK, Atome, GoGoX join hands to serve...",carousell-gogox-atome-join-hands-caroubiz,publish,post,https://www.techinasia.com/carousell-gogox-ato...,"The Hong Kong unit of Carousell, the Singapore...",[],...,"[Atome, BNPL, CarouBiz, Carousell, Ecommerce, ...",[],False,,False,True,False,True,False,1
897,724633,2021-12-14T05:00:48,2021-12-13T12:54:19,"The hidden alliances between SEA’s VCs, uncove...",hidden-alliances-seas-vcs-uncovered-2021-edition,publish,visual-story,https://www.techinasia.com/visual-story/hidden...,You can view the actual number of co-investmen...,"[<p><iframe style=""width: 100%; height: 1350px...",...,"[investments, Southeast Asia, VC, venture capi...",[],False,,False,True,True,True,False,4
898,724737,2021-12-14T04:07:02,2021-12-14T04:07:02,Zoomcar to pump $25m into Philippine launch,zoomcar-pump-25m-philippine-launch,publish,post,https://www.techinasia.com/zoomcar-pump-25m-ph...,"Zoomcar, an India-based car rental startup, ha...",[],...,"[car rentals, Gene Angelo Ferrer, online car r...",[],False,,False,True,False,True,False,2
899,724732,2021-12-14T03:14:37,2021-12-14T03:14:37,East Ventures leads pre-series A round of supp...,praktis-pre-series-a-funding,publish,post,https://www.techinasia.com/praktis-pre-series-...,"Praktis (previously known as PTS.sc), an Indon...",[],...,"[Adrian Gilrandy, Dhimas Syahendra, Dipta Iman...",[],False,,False,True,False,True,False,1


In [8]:
# List the column names of the scraped frame.
df.columns

Index(['id', 'date_gmt', 'modified_gmt', 'title', 'slug', 'status', 'type',
       'link', 'content', 'vsitems', 'live_items', 'excerpt', 'author_id',
       'author_name', 'author_is_staff', 'editor', 'comment_status',
       'comments_count', 'comments', 'categories', 'tags', 'companies',
       'is_sponsored', 'sponsor_name', 'is_partnership', 'show_ads',
       'is_subscriber_exclusive', 'is_paywalled', 'is_inappbrowser',
       'read_time'],
      dtype='object')

In [9]:
# Columns to drop: those that hold only a single value, are entirely null, or are redundant.
to_drop = [
    'status',
    'link',
    'vsitems',
    'live_items',
    'comment_status',
    'comments',
    'companies',
    'is_partnership'
]

In [10]:
# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=to_drop, errors='ignore')

In [11]:
# Save the trimmed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('tia_posts.csv', index=False)