In [6]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
import numpy as np
import dotenv
import base64
import os
import json

# EdX Course

In [6]:
url = "https://igsyv1z1xi-dsn.algolia.net/1/indexes/*/queries"
headers = {
    "x-algolia-agent": "Algolia for JavaScript (5.0.0); Browser",
    "x-algolia-api-key": "6658746ce52e30dacfdd8ba5f8e8cf18",
    "x-algolia-application-id": "IGSYV1Z1XI",
    "content-type": "application/json"
}

payload = {
    "requests": [
        {
            "indexName": "product",
            "clickAnalytics": True,
            "facets": [
                "availability", "language", "learning_type", "level",
                "partner", "product", "program_type", "skills.skill", "subject"
            ],
            "hitsPerPage": 150,
            "page": 0,
            "filters": "",
            "query": ""
        }
    ]
}

all_courses = []
page = 0
product_type = ["course", "program"]

total_pages = None

for ptype in product_type:
    while True:
        print(f"[INFO] Fetching page {page} for product {ptype}...")
        payload["requests"][0]["page"] = page
        payload["requests"][0]["filters"] = f"product:{ptype}"
        response = requests.post(url, headers=headers, json=payload)
        data = response.json()

        hits = data["results"][0]["hits"]
        if total_pages is None:
            total_pages = data["results"][0].get("nbPages", 1)
            print(f"[INFO] Total pages available for product {ptype}: {total_pages}")

        if not hits:
            break

        for course in hits:
            record = {
                "title": course.get("title", "Missing"),
                "partner": course.get("partner", ["Missing"]),
                "primary_description": BeautifulSoup(course.get("primary_description", "Missing"), "html.parser").get_text(),
                "secondary_description": BeautifulSoup(course.get("secondary_description", "Missing"), "html.parser").get_text(),
                "tertiary_description": BeautifulSoup(course.get("tertiary_description", "Missing"), "html.parser").get_text(),
                "availability": course.get("availability", ["Missing"]),
                "subject": course.get("subject", ["Missing"]),
                "level": course.get("level", ["Missing"]),
                "language": course.get("language", ["Missing"]),
                "product": course.get("product", "Missing"),
                "program_type": course.get("program_type", ["Missing"]),
                "staff": course.get("staff", ["Missing"]),
                "translation_language": course.get("ai_languages", {}).get("translation_languages", ["Missing"]),
                "transcription_language": course.get("ai_languages", {}).get("transcription_languages", ["Missing"]),
                "recent_enrollment_count": course.get("recent_enrollment_count", "Missing"),
                "marketing_url": course.get("marketing_url", "Missing"),
                "weeks_to_complete": course.get("weeks_to_complete", "Missing"),
                "skill": (
                    [s["skill"] for s in course.get("skills", []) if isinstance(s, dict)]
                    if isinstance(course.get("skills", []), list) else ["Missing"]
                )
            }
            all_courses.append(record)

        page += 1
        if page >= total_pages:
            page = 0
            total_pages = None
            break

        print(f"[INFO] Sleeping 10 seconds to respect crawl delay...")
        time.sleep(10)


df = pd.DataFrame(all_courses)
df.to_csv("scrape_result/edx_courses.csv", index=False)
print("[DONE] Saved to edx_courses.csv")

[INFO] Fetching page 0 for product course...
[INFO] Total pages available for product course: 7
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 1 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 2 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 3 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 4 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 5 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 6 for product course...
[INFO] Fetching page 0 for product program...
[INFO] Total pages available for product program: 5


  "primary_description": BeautifulSoup(course.get("primary_description", "Missing"), "html.parser").get_text(),


[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 1 for product program...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 2 for product program...


  "tertiary_description": BeautifulSoup(course.get("tertiary_description", "Missing"), "html.parser").get_text(),


[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 3 for product program...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 4 for product program...
[DONE] Saved to edx_courses.csv


In [10]:
df = pd.read_csv("scrape_result/edx_courses.csv")
df

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill
0,How to Learn Online,['edX'],This course will prepare you with strategies t...,"\nHistory, benefits, and foundational concepts...",This course harnesses science-backed technique...,['Available now'],['Education & Teacher Training'],['Introductory'],['English'],Course,[],[],"['Arabic', 'English', 'Spanish (Latin America)...","['Portuguese - Brazil', 'Indonesian', 'Arabic'...",47714,https://www.edx.org/learn/how-to-learn/edx-how...,2.0,['Learning Design']
1,The Science of Happiness,"['University of California, Berkeley']",The first MOOC to teach positive psychology. L...,\nWhat happiness really means and why it matte...,"""A free eight-week Science of Happiness course...","['Available now', 'Upcoming']",['Social Sciences'],['Introductory'],['English'],Course,[],"['dacher-keltner', 'emiliana-simon-thomas']","['Arabic', 'English', 'Spanish (Latin America)...","['Russian', 'Indonesian', 'Spanish', 'Portugue...",16821,https://www.edx.org/learn/happiness/university...,11.0,"['Empathy', 'Evolutionary Biology', 'Psychology']"
2,Remote Work Revolution for Everyone,['Harvard University'],"In Remote Work Revolution for Everyone, you wi...",\nUnderstand the key elements of remote work a...,How are you thriving or surviving in your remo...,['Available now'],['Business & Management'],['Introductory'],['English'],Course,['Professional Certificate'],['tsedal-neeley'],"['Arabic', 'English', 'Spanish (Latin America)...","['Spanish', 'Arabic', 'Thai', 'Korean', 'Chine...",45954,https://www.edx.org/learn/remote-work/harvard-...,3.0,"['Telecommuting', 'Customer Relationship Build..."
3,CS50's Introduction to Computer Science,['Harvard University'],An introduction to the intellectual enterprise...,\nA broad and robust understanding of computer...,"This is CS50x , Harvard University's introduct...",['Available now'],"['Computer Science', 'Engineering']",['Introductory'],['English'],Course,"['Professional Certificate', 'Professional Cer...","['doug-lloyd', 'david-j-malan', 'brian-yu']","['Arabic', 'English', 'Spanish (Latin America)...","['Telugu', 'Turkish', 'Swahili', 'Arabic', 'Fr...",425063,https://www.edx.org/learn/computer-science/har...,12.0,"['Cryptography', 'Resource Management', 'Algor..."
4,Data Visualization and Building Dashboards wit...,['IBM'],Build the fundamental knowledge necessary to u...,\nDescribe the important role charts play in t...,Please Note: Learners who successfully complet...,['Available now'],['Data Analysis & Statistics'],['Introductory'],['English'],Course,"['Professional Certificate', 'Professional Cer...","['steve-ryan', 'sandip-sasha-joy']","['Arabic', 'English', 'Spanish (Latin America)...","['Russian', 'Portuguese - Brazil', 'Indonesian']",11885,https://www.edx.org/learn/data-visualization/i...,4.0,"['Data Visualization', 'Data Analysis', 'Micro..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651,Corporate Innovation,['The University of Queensland'],Learn how to apply state-of-the-art methods to...,The ability to innovate is crucial for busines...,Fundamentals of creative and design thinking t...,['Archived'],"['Business & Management', 'Communication', 'Da...",['Intermediate'],['English'],Program,['MicroMasters'],"['martie-louise-verreynne', 'rachel-fitzgerald...",[],[],-269,https://www.edx.org/masters/micromasters/uqx-c...,,[]
1652,IBM: Ciencia de datos,['IBM'],,La ciencia de datos y las habilidades de apren...,"Aplicar varias habilidades, técnicas y herrami...",['Archived'],"['Data Analysis & Statistics', 'Computer Scien...","['Introductory', 'Intermediate']",['Spanish'],Program,['Professional Certificate'],"['romeo-kienzler', 'saeed-aghabozorgi', 'josep...",[],[],-328,https://www.edx.org/certificates/professional-...,,"['Data Science', 'Python (Programming Language..."
1653,C++ Programming Essentials,['IBM'],Become a skilled C++ developer who is fluent i...,This Professional Certificate program takes yo...,Fundamental concepts of programming using C++ ...,['Archived'],['Computer Science'],"['Introductory', 'Intermediate']",['English'],Program,['Professional Certificate'],"['nisha-p-2', 'sripriya-s', 'sathya-ponmalar-h...",[],[],-444,https://www.edx.org/certificates/professional-...,,"['Object-Oriented Programming (OOP)', 'C++ (Pr..."
1654,Marketing Digital,['Universidad Galileo'],Aprende a conectar tu marca con el mundo. Cono...,Gana una ventaja competitiva convirtiéndote en...,Diseñar estrategias de marketing en redes soci...,['Archived'],"['Communication', 'Business & Management']",['Introductory'],['Spanish'],Program,['Professional Certificate'],"['rocael-hernandez-ph-d', 'miguel-morales-ph-d...",[],[],-659,https://www.edx.org/certificates/professional-...,,"['Digital Marketing', 'Advertising Campaigns',..."


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1656 entries, 0 to 1655
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    1656 non-null   object 
 1   partner                  1656 non-null   object 
 2   primary_description      1511 non-null   object 
 3   secondary_description    1653 non-null   object 
 4   tertiary_description     1650 non-null   object 
 5   availability             1656 non-null   object 
 6   subject                  1656 non-null   object 
 7   level                    1656 non-null   object 
 8   language                 1656 non-null   object 
 9   product                  1656 non-null   object 
 10  program_type             1656 non-null   object 
 11  staff                    1656 non-null   object 
 12  translation_language     1656 non-null   object 
 13  transcription_language   1656 non-null   object 
 14  recent_enrollment_count 

# Linkedin Jobs

In [None]:
location = "Indonesia"
exp_levels = range(1,7)
start="0"
id_list = []

for exp_level in exp_levels:
    for start in np.arange(0, 500, 10):
        list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?location={location}&f_E={exp_level}&start={start}"
        response = requests.get(list_url)
        
        list_data = response.text
        list_soup = BeautifulSoup(list_data, "html.parser")
        page_jobs = list_soup.find_all("li")
        
        for job in page_jobs:
            base_card_div = job.find(class_= "base-card")
            job_id = base_card_div.get("data-entity-urn").split(":")[3]
            id_list.append(job_id)
        time.sleep(2)

In [17]:
# Try again so we can get 1000 jobs for each level
for exp_level in exp_levels:
    for start in np.arange(510, 1010, 10):
        list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?location={location}&f_E={exp_level}&start={start}"
        response = requests.get(list_url)
        
        list_data = response.text
        list_soup = BeautifulSoup(list_data, "html.parser")
        page_jobs = list_soup.find_all("li")
        
        for job in page_jobs:
            base_card_div = job.find(class_= "base-card")
            job_id = base_card_div.get("data-entity-urn").split(":")[3]
            id_list.append(job_id)
        time.sleep(2)

In [24]:
len(id_list)

4510

In [None]:
# Initialize an empty list to store job information
job_list = []

# chunk by chunk to avoid rate limit issues
for job_id in id_list[4432:4510]:
    # Construct the URL for each job using the job ID
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    # Send a GET request to the job URL and parse the reponse
    job_response = requests.get(job_url, headers=headers)
    job_soup = BeautifulSoup(job_response.text, "html.parser")
    
     # Create a dictionary to store job details
    job_post = {}
    
    try:
        job_post["job_link"] = job_soup.find("a", {"class":"topcard__link"}).get('href').strip().split('?')[0]
    except:
        job_post["job_link"] = None
    
    # Try to extract and store the job title
    try:
        job_post["job_title"] = job_soup.find("h2", {"class":"top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
    except:
        job_post["job_title"] = None
        
    # Try to extract and store the company name
    try:
        job_post["company_name"] = job_soup.find("a", {"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
    except:
        job_post["company_name"] = None
    
    try:
        job_post["location"] = job_soup.find("span", {"class": "topcard__flavor topcard__flavor--bullet"}).text.strip()
    except:
        job_post["location"] = None
    
    try:
        uls = job_soup.select(".description__text.description__text--rich section ul")
        responsibilities = [li.text.strip() for li in uls[0].find_all("li")]
        job_post["responsibilities"] = responsibilities
    except:
        job_post["responsibilities"] = None
        
    try:
        uls = job_soup.select(".description__text.description__text--rich section ul")
        requirements = [li.text.strip() for li in uls[1].find_all("li")]
        job_post["requirements"] = requirements
    except:
        job_post["requirements"] = None
        
    try:
        additional_details_title = job_soup.find_all(class_="description__job-criteria-subheader")
        additional_details_content = job_soup.find_all(class_="description__job-criteria-text")
        for title in additional_details_title:
            key = None
            if title.text.strip() == "Seniority level":
                key = "level"
            elif title.text.strip() == "Employment type":
                key = "employment_type"
            elif title.text.strip() == "Job function":
                key = "job_function"
            elif title.text.strip() == "Industries":
                key = "industries"
            content_text = additional_details_content[additional_details_title.index(title)].text.strip()
            if key:
                job_post[key] = content_text
    except:
        pass
        
    # Try to extract and store the time posted
    try:
        job_post["time_posted"] = job_soup.find("span", {"class": "posted-time-ago__text topcard__flavor--metadata"}).text.strip()
    except:
        job_post["time_posted"] = None
        
    # Try to extract and store the number of applicants
    try:
        job_post["num_applicants"] = job_soup.find("span", {"class": "num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet"}).text.strip()
    except:
        job_post["num_applicants"] = None
    
        
    # Append the job details to the job_list
    job_list.append(job_post)
    time.sleep(2)

In [None]:
job_df = pd.DataFrame(job_list)
job_df.to_csv("scrape_result/linkedin_jobs.csv", index=False)

In [62]:
job_df.head()

Unnamed: 0,job_link,job_title,company_name,location,responsibilities,requirements,level,employment_type,job_function,industries,time_posted,num_applicants
0,https://id.linkedin.com/jobs/view/general-affa...,General Affair Internship,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia","[Membuat konten kreatif (foto, video, caption)...","[Mahasiswa aktif dari jurusan Komunikasi, Mana...",Internship,Internship,"Other, Information Technology, and Management",Food and Beverage Services,1 month ago,
1,https://id.linkedin.com/jobs/view/data-analyst...,Data Analyst Intern,PT Lion Super Indo,"Jakarta, Indonesia",[Understand the day-to-day issues that our bus...,[Student of Bachelor degree in Statistics or A...,Internship,Internship,Information Technology and Business Development,Retail,1 week ago,
2,https://id.linkedin.com/jobs/view/project-mana...,Project Management Internship,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia",[Assist in compiling & updating the project ti...,[Willing to be placed in Kalbe Morinaga Cikamp...,Internship,Internship,Project Management and Information Technology,Food and Beverage Services,1 week ago,
3,https://id.linkedin.com/jobs/view/improvement-...,Improvement Campaign & Communication Intern,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia",[Melaksanakan observasi lapangan dan pencatata...,[Mendukung pelaksanaan event TPM dan campaign ...,Internship,Internship,Marketing and Sales,Food and Beverage Services,1 week ago,
4,https://id.linkedin.com/jobs/view/management-t...,Management Trainee,PT Astra International Tbk,"Jakarta, Jakarta, Indonesia","[Gelar sarjana dari jurusan apa pun, Lulusan b...",,Internship,Full-time,Education and Training,Automation Machinery Manufacturing,2 months ago,


In [12]:
job_df = pd.read_csv("scrape_result/linkedin_jobs.csv")
job_df

Unnamed: 0,job_link,job_title,company_name,location,responsibilities,requirements,level,employment_type,job_function,industries,time_posted,num_applicants
0,https://id.linkedin.com/jobs/view/general-affa...,General Affair Internship,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia","['Membuat konten kreatif (foto, video, caption...","['Mahasiswa aktif dari jurusan Komunikasi, Man...",Internship,Internship,"Other, Information Technology, and Management",Food and Beverage Services,1 month ago,
1,https://id.linkedin.com/jobs/view/data-analyst...,Data Analyst Intern,PT Lion Super Indo,"Jakarta, Indonesia",['Understand the day-to-day issues that our bu...,['Student of Bachelor degree in Statistics or ...,Internship,Internship,Information Technology and Business Development,Retail,1 week ago,
2,https://id.linkedin.com/jobs/view/project-mana...,Project Management Internship,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia",['Assist in compiling & updating the project t...,['Willing to be placed in Kalbe Morinaga Cikam...,Internship,Internship,Project Management and Information Technology,Food and Beverage Services,1 week ago,
3,https://id.linkedin.com/jobs/view/improvement-...,Improvement Campaign & Communication Intern,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia",['Melaksanakan observasi lapangan dan pencatat...,['Mendukung pelaksanaan event TPM dan campaign...,Internship,Internship,Marketing and Sales,Food and Beverage Services,1 week ago,
4,https://id.linkedin.com/jobs/view/management-t...,Management Trainee,PT Astra International Tbk,"Jakarta, Jakarta, Indonesia","['Gelar sarjana dari jurusan apa pun', 'Lulusa...",,Internship,Full-time,Education and Training,Automation Machinery Manufacturing,2 months ago,
...,...,...,...,...,...,...,...,...,...,...,...,...
4505,https://id.linkedin.com/jobs/view/koordinator-...,Koordinator TH (Kuta Selatan),byOrange,"Kecamatan Kuta Selatan, Bali, Indonesia",['Membuat Plan Schedule Shifting untuk team Tr...,"['Laki-laki/Perempuan', 'Usia min 21 tahun', '...",Mid-Senior level,Contract,Other,Internet Publishing,4 months ago,
4506,https://id.linkedin.com/jobs/view/business-ana...,Business Analyst,"NTT DATA, Inc.","South Jakarta, Jakarta, Indonesia",['Acts as a bridge between the business proble...,['Demonstrated understanding of business pract...,Mid-Senior level,Full-time,"Research, Analyst, and Information Technology",IT Services and IT Consulting,3 days ago,88 applicants
4507,https://id.linkedin.com/jobs/view/housekeeppin...,Housekeepping Coordinator,PT. Bondar Loyo Management,"Kecamatan Kuta Selatan, Bali, Indonesia",,,Mid-Senior level,Full-time,Management and Manufacturing,Hospitality,2 days ago,
4508,https://id.linkedin.com/jobs/view/pr%C3%A9vent...,Préventeur Santé Sécurité H/F,Sulzer Turbo Services Indonesia,"Purwakarta, West Java, Indonesia",['Soutenir les équipes de proximité dans la pr...,['Avoir un poste basé à Nantes et profiter de ...,Mid-Senior level,Full-time,Management and Manufacturing,Machinery Manufacturing,1 month ago,


In [13]:
job_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4510 entries, 0 to 4509
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_link          4498 non-null   object
 1   job_title         4498 non-null   object
 2   company_name      4484 non-null   object
 3   location          4498 non-null   object
 4   responsibilities  4149 non-null   object
 5   requirements      3567 non-null   object
 6   level             4498 non-null   object
 7   employment_type   4498 non-null   object
 8   job_function      4491 non-null   object
 9   industries        4472 non-null   object
 10  time_posted       4417 non-null   object
 11  num_applicants    1358 non-null   object
dtypes: object(12)
memory usage: 422.9+ KB


# Majors & Universities

In [None]:
import requests

def check_robots_txt(domain):
    robots_url = f"https://{domain}/robots.txt"
    try:
        response = requests.get(robots_url)
        response.raise_for_status()
        print(f"Content of {robots_url}:\n")
        print(response.text)
    except requests.exceptions.RequestException as e:
        print(f"Could not retrieve robots.txt for {domain}: {e}")

check_robots_txt("banpt.or.id")



Content of https://banpt.or.id/robots.txt:

User-agent: *
Disallow: /direktori/
Disallow: /bianglala/



In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Setup headless browser
options = Options()
# options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Buka halaman awal
url = "https://www.banpt.or.id/direktori/prodi/pencarian_prodi.php"
driver.get(url)

time.sleep(3)  # Tunggu load awal

# Inisialisasi data
all_data = []
current_page = 1

while True:
    time.sleep(4)  # beri waktu untuk load halaman

    # Ambil baris tabel
    rows = driver.find_elements(By.CSS_SELECTOR, "table#table tbody tr")
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) >= 4:
            universitas = cols[0].text.strip()
            prodi = cols[1].text.strip()
            jenjang = cols[2].text.strip()
            status = cols[3].text.strip()
            all_data.append({
                "Universitas": universitas,
                "Prodi": prodi,
                "Jenjang": jenjang,
                "Status": status
            })

    print(f"✅ Page {current_page} berhasil discape")
    current_page += 1

    # Cek apakah tombol Next sudah disable
    try:
        next_li = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "table_next"))
        )
        class_attr = next_li.get_attribute("class")

        if "disabled" in class_attr.lower():
            print(f"❌ Tombol Next sudah nonaktif di halaman {current_page - 1}. Selesai scraping.")
            break

        # Klik tombol Next di dalam <li>
        next_button = next_li.find_element(By.TAG_NAME, "a")
        driver.execute_script("arguments[0].click();", next_button)
        print(f"✅ Halaman {current_page - 1} berhasil diklik Next.")

    except Exception as e:
        print(f"❌ Gagal menemukan atau klik tombol next di halaman {current_page - 1}: {e}")
        break

# Tutup browser
driver.quit()

# Simpan hasil
df = pd.DataFrame(all_data)
df.to_csv("scrape_result/jurusan_result.csv", index=False)
print(f"✅ Total {len(df)} baris data berhasil disimpan ke 'scrape_result/jurusan_result.csv'")


✅ Page 1 berhasil discape
✅ Halaman 1 berhasil diklik Next.
✅ Page 2 berhasil discape
✅ Halaman 2 berhasil diklik Next.
✅ Page 3 berhasil discape
✅ Halaman 3 berhasil diklik Next.
✅ Page 4 berhasil discape
✅ Halaman 4 berhasil diklik Next.
✅ Page 5 berhasil discape
✅ Halaman 5 berhasil diklik Next.
✅ Page 6 berhasil discape
✅ Halaman 6 berhasil diklik Next.
✅ Page 7 berhasil discape
✅ Halaman 7 berhasil diklik Next.
✅ Page 8 berhasil discape
✅ Halaman 8 berhasil diklik Next.
✅ Page 9 berhasil discape
✅ Halaman 9 berhasil diklik Next.
✅ Page 10 berhasil discape
✅ Halaman 10 berhasil diklik Next.
✅ Page 11 berhasil discape
✅ Halaman 11 berhasil diklik Next.
✅ Page 12 berhasil discape
✅ Halaman 12 berhasil diklik Next.
✅ Page 13 berhasil discape
✅ Halaman 13 berhasil diklik Next.
✅ Page 14 berhasil discape
✅ Halaman 14 berhasil diklik Next.
✅ Page 15 berhasil discape
✅ Halaman 15 berhasil diklik Next.
✅ Page 16 berhasil discape
✅ Halaman 16 berhasil diklik Next.
✅ Page 17 berhasil discape

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Setup headless Chrome browser (jika ingin tampilkan browser, komentari headless)
options = Options()
# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open the QS Rankings Indonesia page
url = "https://www.topuniversities.com/world-university-rankings?countries=id"
driver.get(url)

wait = WebDriverWait(driver, 20)

# Try to dismiss popup if present
try:
    # Tunggu hingga tombol Submit muncul
    submit_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']")))
    submit_btn.click()
    print("Popup dismissed.")
except Exception as e:
    print("No popup found or unable to dismiss:", e)

# Tunggu elemen utama tabel ranking muncul
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.visible-rows-page-number")))

# Scroll berulang agar semua universitas termuat
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_attempts = 0
max_attempts = 10

while scroll_attempts < max_attempts:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        scroll_attempts += 1
        time.sleep(1)
    else:
        scroll_attempts = 0
    last_height = new_height

# Tunggu hingga semua nama universitas termuat
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.uni-link")))

# Ambil data nama universitas dan ranking
names = driver.find_elements(By.CSS_SELECTOR, "a.uni-link")
ranks = driver.find_elements(By.CSS_SELECTOR, "span.rank-no")

universities = [name.text.strip() for name in names]
ranks_text = [rank.text.strip() for rank in ranks]

# Debug jika kosong
if not universities:
    print("Universities list is empty, trying innerText fallback...")
    universities = [name.get_attribute('innerText').strip() for name in names]

# Sinkronisasi panjang list
max_len = min(len(universities), len(ranks_text))
universities = universities[:max_len]
ranks_text = ranks_text[:max_len]

# Simpan ke DataFrame
df = pd.DataFrame({
    "Rank": ranks_text,
    "University": universities
})

print(df)
df.to_csv("scrape_result/universitas_indonesia_qs.csv", index=False)

driver.quit()


No popup found or unable to dismiss: Message: 
Stacktrace:
#0 0x5f63c744514a <unknown>
#1 0x5f63c6ee2b80 <unknown>
#2 0x5f63c6f340e9 <unknown>
#3 0x5f63c6f34271 <unknown>
#4 0x5f63c6f82de4 <unknown>
#5 0x5f63c6f59efd <unknown>
#6 0x5f63c6f8014a <unknown>
#7 0x5f63c6f59ca3 <unknown>
#8 0x5f63c6f25f08 <unknown>
#9 0x5f63c6f27071 <unknown>
#10 0x5f63c740eb5b <unknown>
#11 0x5f63c7412ae2 <unknown>
#12 0x5f63c73fa967 <unknown>
#13 0x5f63c74136d4 <unknown>
#14 0x5f63c73dec7f <unknown>
#15 0x5f63c7433cd8 <unknown>
#16 0x5f63c7433ea9 <unknown>
#17 0x5f63c7443fc6 <unknown>
#18 0x782a6fc9caa4 <unknown>
#19 0x782a6fd29c3c <unknown>

         Rank                                         University
0         189                              Universitas Indonesia
1         224                             Gadjah Mada University
2         255                   Institut Teknologi Bandung (ITB)
3        =287                              Universitas Airlangga
4         399  IPB University (aka Bogor Agri

In [14]:
uni_df = pd.read_csv("scrape_result/universitas_indonesia_qs.csv")
uni_df

Unnamed: 0,Rank,University
0,189,Universitas Indonesia
1,224,Gadjah Mada University
2,255,Institut Teknologi Bandung (ITB)
3,=287,Universitas Airlangga
4,399,IPB University (aka Bogor Agricultural Univers...
5,=509,Institut Teknologi Sepuluh Nopember (ITS Surab...
6,=515,Universitas Padjadjaran (UNPAD)
7,=624,Diponegoro University
8,=680,Universitas Brawijaya
9,851-900,Bina Nusantara University (BINUS)


In [15]:
uni_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Rank        26 non-null     object
 1   University  26 non-null     object
dtypes: object(2)
memory usage: 548.0+ bytes


In [16]:
major_df = pd.read_csv("scrape_result/jurusan_result.csv")
major_df

Unnamed: 0,Universitas,Prodi,Jenjang,Status
0,"POLITEKNIK ANGKATAN LAUT, SURABAYA",FARMASI,D-III,PTKL
1,Universitas Brawijaya,Pendidikan Profesi Arsitek,Profesi,PTN
2,Sekolah Tinggi Agama Kristen Protestan Negeri ...,Pendidikan Agama Kristen,S1,PTAN
3,Universitas Lambung Mangkurat,Administrasi Bisnis,S2,PTN
4,Sekolah Tinggi Ilmu Ekonomi Tri Dharma Widya,Akuntansi,S1,03
...,...,...,...,...
33683,STMIK Musi Rawas,Teknik Informatika,S1,02
33684,"UNIVERSITAS INDONESIA, JAKARTA",KEDOKTERAN,S1,PTN
33685,"UNIVERSITAS INDONESIA, JAKARTA",DOKTER,Profesi,PTN
33686,Universitas Bina Bangsa,Pendidikan Ilmu Pengetahuan Sosial,S1,04


In [17]:
major_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33688 entries, 0 to 33687
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Universitas  33687 non-null  object
 1   Prodi        33687 non-null  object
 2   Jenjang      33688 non-null  object
 3   Status       33685 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


# Career

In [5]:
# Make a request to get career codes from O*NET

url = "https://services.onetcenter.org/ws/mnm/careers/?start=1&end=923"

dotenv.load_dotenv()
headers = {
        'User-Agent': 'python-OnetWebService/1.00 (bot)',
        'Authorization': 'Basic ' + base64.standard_b64encode((os.getenv('ONET_USERNAME') + ':' + os.getenv('ONET_PASSWORD')).encode()).decode(),
        'Accept': 'application/json'
    }
    
r = requests.get(url, headers=headers)

career_codes = []
if r.status_code != 200:
    raise Exception(f"Failed to fetch data: {r.status_code} - {r.text}")
else:
    for career in r.json()['career']:
        career_codes.append(career['code'])

print(f"{len(career_codes)} career codes fetched successfully.")

923 career codes fetched successfully.


In [8]:
career_codes

['13-2011.00',
 '27-2011.00',
 '15-2011.00',
 '29-1291.00',
 '29-1141.01',
 '25-2059.01',
 '51-9191.00',
 '23-1021.00',
 '11-3012.00',
 '25-3011.00',
 '29-1141.02',
 '11-2011.00',
 '41-3011.00',
 '17-3021.00',
 '17-2011.00',
 '13-1011.00',
 '17-2021.00',
 '45-2091.00',
 '45-2011.00',
 '25-1041.00',
 '19-4012.00',
 '53-2021.00',
 '53-1041.00',
 '49-3011.00',
 '53-6032.00',
 '51-2011.00',
 '53-2022.00',
 '53-2011.00',
 '29-1229.01',
 '53-3011.00',
 '39-3091.00',
 '29-1071.01',
 '29-1211.00',
 '45-2021.00',
 '39-2021.00',
 '33-9011.00',
 '19-1011.00',
 '39-2011.00',
 '19-3091.00',
 '25-1061.00',
 '13-2023.00',
 '13-2022.00',
 '23-1022.00',
 '17-1011.00',
 '17-3011.00',
 '11-9041.00',
 '25-1031.00',
 '25-4011.00',
 '25-1062.00',
 '27-1011.00',
 '29-1129.01',
 '25-1121.00',
 '19-2011.00',
 '27-2021.00',
 '29-9091.00',
 '19-2021.00',
 '25-1051.00',
 '27-4011.00',
 '29-1181.00',
 '49-2097.00',
 '53-6031.00',
 '49-3021.00',
 '17-3027.01',
 '17-2141.02',
 '49-3022.00',
 '49-3023.00',
 '53-6051.

In [9]:
# Make another request to get career details for each code

career_details = []
for code in career_codes:
    report_url = f"https://services.onetcenter.org/ws/mnm/careers/{code}/report"
    report_response = requests.get(report_url, headers=headers)
    if report_response.status_code != 200:
        raise Exception(f"Error fetching career report: {report_response.status_code} - {report_response.text}")

    report_data = report_response.json()
    code = report_data.get('career', {}).get('code', '')
    title = report_data.get('career', {}).get('title', '')
    also_called = report_data.get('career', {}).get('also_called', {}).get('title', [])
    what_they_do = report_data.get('career', {}).get('what_they_do', 'N/A')
    on_the_job = report_data.get('career', {}).get('on_the_job', {}).get('task', [])

    # Process knowledges
    c_knowledges = []
    for knowledge in report_data.get('knowledge', {}).get('group', []):
        c_knowledges.append(knowledge['title']['name'])
        for element in knowledge.get('element', []):
            c_knowledges.append(element['name'])

    # Process skills
    c_skills = []
    for skill in report_data.get('skills', {}).get('group', []):
        c_skills.append(skill['title']['name'])
        for element in skill.get('element', []):
            c_skills.append(element['name'])

    # Process abilities
    c_abilities = []
    for ability in report_data.get('abilities', {}).get('group', []):
        c_abilities.append(ability['title']['name'])
        for element in ability.get('element', []):
            c_abilities.append(element['name'])

    # Process technologies
    c_technologies = []
    for tech in report_data.get('technology', {}).get('category', []):
        c_technologies.append(tech['title']['name'])
        for example in tech.get('example', []):
            c_technologies.append(example['name'])

    # Job outlook
    outlook = report_data.get('job_outlook', {}).get('outlook', 'N/A')
    
    career_details.append({
        "code": code,
        "title": title,
        "also_called": also_called,
        "what_they_do": what_they_do,
        "on_the_job": on_the_job,
        "knowledges": c_knowledges,
        "skills": c_skills,
        "abilities": c_abilities,
        "technologies": c_technologies,
        "job_outlook": outlook
    })
    
    time.sleep(0.5)

career_json = json.dumps(career_details, indent=4)
with open("scrape_result/onet_careers.json", "w") as outfile:
    outfile.write(career_json)