In [7]:
import pandas as pd 
import requests
import re
from time import sleep

In [2]:
base_url = 'https://www.techinasia.com/api/2.0/job-postings'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
params = '?page='

In [3]:
base_req = requests.get(url=base_url, params=params, headers=headers)
base_json = base_req.json()
base_json.keys()

dict_keys(['data', 'current_page', 'from', 'last_page', 'next_page_url', 'per_page', 'prev_page_url', 'to', 'total', 'after', 'before'])

In [4]:
base_json['data'][0].keys()

dict_keys(['id', 'status', 'title', 'is_remote', 'is_salary_visible', 'salary_min', 'salary_max', 'has_equity', 'vacancy_count', 'experience_min', 'experience_max', 'description', 'external_link', 'published_at', 'updated_at', 'expires_at', 'is_featured', 'is_saved', 'is_using_paid_quota', 'saved_job_id', 'applied_job_id', 'application_status', 'city', 'company', 'contacts', 'currency', 'job_skills', 'job_type', 'position', 'taxonomies'])

In [5]:
def clean_tags(text):
    '''
    Clean html tags in content.
    ---------------------------
    Parameter:
    text(str): input text containing html.
    ---------------------------
    Return:
    clean_text
    '''
    pattern = re.compile('<.*?>')
    clean_text = re.sub(pattern, '', text)
    return clean_text

def get_jobs(url, params, headers):
    '''
    Get jobs data from API
    ---------------------------
    Parameters:
    url(str): input url to get data.
    params(str): page parameter for pagination.
    headers(*args): headers to avoid 418 teapot.
    ---------------------------
    Return:
    pandas DataFrame
    '''
    all_data = []

    for page in range(1, base_json['last_page']):
        req = requests.get(url=base_url + params + str(page), headers=headers)
        json = req.json()
        print(f"Page: {page}/135", end='\r')

        for idx, job in enumerate(json['data']):
            data = {
                'id': job['id'],
                'title': job['title'],
                'is_remote': job['is_remote'],
                'salary_min': job['salary_min'],
                'salary_max': job['salary_max'],
                'has_equity': job['has_equity'],
                'experience_min': job['experience_min'],
                'experience_max': job['experience_max'],
                'description': clean_tags(job['description']),
                'city': json['data'][idx]['city']['name'],
                'company_name': json['data'][idx]['company']['name'],
                'currency': json['data'][idx]['currency']['currency_code'],
                'skills': [skill['name'] for skill in json['data'][idx]['job_skills']],
                'job_type': json['data'][idx]['job_type']['name'],
                'position': json['data'][idx]['position']['name']
            }
            all_data.append(data)
        sleep(3)
    return pd.DataFrame(all_data)

In [8]:
df = get_jobs(url=base_url, params=params, headers=headers)
df.tail()

Page: 134/135

Unnamed: 0,id,title,is_remote,salary_min,salary_max,has_equity,experience_min,experience_max,description,city,company_name,currency,skills,job_type,position
3345,0f09879b-de42-4f6d-b757-9fe3a663f1cc,Account Manager,False,8000000,17000000,False,1,4,Responsibilities:\nIdentify and source new bus...,Jakarta,PT. ASLI Rancangan Indonesia,IDR,"[Account Management, Customer Acquisition, Sal...",Full-time,Sales & Business Development
3346,400b5cc9-d924-4dac-bb1c-5e618fdf317a,DevOps,False,6000000,7500000,False,1,4,Job Descriptions \nManage infrastructure use C...,Sidoarjo,PT Bimasakti Multi Sinergi,IDR,"[Linux, Windows XP, DevOps, Information Techno...",Full-time,DevOps & Cloud Management
3347,5f24795c-c1b0-4170-818d-ae3a46710e5a,Business Manager,False,0,0,False,4,7,Job Description:\nSeek &amp; Assess New Busine...,Jakarta,Aruna,IDR,"[Business Development & Partnerships, Business...",Full-time,Sales & Business Development
3348,103c30a7-d39e-4d98-a68e-adb48ebc7b44,Content Marketing Specialist,True,0,0,True,1,4,RESPONSIBILITIES\nPlan and execute a series of...,Jakarta,PT Lingkaran Edukasi Kreatif,IDR,"[Sales & Marketing, Digital Marketing, Marketi...",Full-time,Marketing & PR
3349,71227dfb-d2f2-4540-8184-b5cf69740c08,React Native Developer,False,20000000,50000000,False,7,10,WE ARE HIRING!\nWho are we looking for?\nSpeed...,"Denpasar, Bali",Speedoc,IDR,"[React.js, Mobile Application Development, Ama...",Full-time,Enterprise Software & Systems


In [9]:
df.to_csv('tia_jobs.csv', index=False)