In [1]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

# Shared NLP resources, built once and reused by the text helpers in this cell.
# `stemmer` is only referenced by the commented-out `stemming` helper.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Stored as a set: `stopword_removal` tests every token for membership, and a
# set answers that in constant time instead of scanning a list per token.
stop_words = set(stopwords.words('english'))

def lowering(text: str) -> str:
    """Return ``text`` converted to lowercase."""
    return text.lower()

def remove_punctuation_and_symbol(text: str) -> str:
    """Delete every character that is neither a word character nor whitespace.

    Matched characters are removed outright (not replaced by a space), so
    tokens joined by punctuation are fused: ``"state-of-the-art"`` becomes
    ``"stateoftheart"``. Underscores count as word characters and are kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

def stopword_removal(text: str) -> str:
    """Drop the tokens of ``text`` that appear in ``stop_words``.

    The text is split on whitespace, each token is compared to the stopword
    collection as-is (exact, case-sensitive match), and the surviving tokens
    are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)

# def stemming(text: str) -> str:
#     text = " ".join([stemmer.stem(word) for word in text.split()])
#     return text

def lemmatization(text: str) -> str:
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to the shared ``lemmatizer`` with its default
    arguments, and the results are re-joined with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

def remove_html_tags(text: str) -> str:
    """Strip HTML markup from ``text``.

    Each ``<...>`` tag is replaced by a single space rather than deleted, so
    words separated only by markup (``"<p>foo</p><p>bar</p>"``) remain
    separate tokens instead of fusing into ``"foobar"``. Character references
    such as ``&amp;`` are then decoded to the characters they stand for.

    The result may contain runs of spaces; callers that tokenize with
    ``str.split()`` (as ``preprocessing`` does downstream) are unaffected.

    Args:
        text: Raw string that may contain HTML tags and entities.

    Returns:
        The text with tags replaced by spaces and entities decoded.
    """
    import html  # local import keeps this helper self-contained

    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode only after the tags are gone, so escaped markup such as
    # "&lt;b&gt;" is kept as literal text instead of being stripped as a tag.
    return html.unescape(text)

def preprocessing(text: str, remove_html:bool=True) -> str:
    """Run the full text-cleaning pipeline on a single value.

    Steps, in order: optional HTML tag removal, lowercasing, punctuation and
    symbol removal, stopword removal, lemmatization.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.
        remove_html: When true, strip HTML tags before the other steps.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    steps = [lowering, remove_punctuation_and_symbol, stopword_removal, lemmatization]
    if remove_html:
        steps.insert(0, remove_html_tags)

    for step in steps:
        text = step(text)
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
import time
from deep_translator import GoogleTranslator
from langdetect import detect


def detect_language(text: str) -> str:
    """Return the language code that langdetect assigns to ``text``.

    Args:
        text: Text whose language should be detected.

    Returns:
        The detected language code, or ``"en"`` when ``text`` is not a string,
        is blank, or detection raises.
    """
    # Nothing can be detected in a non-string or blank value; return the
    # default directly instead of going through the exception path.
    if not isinstance(text, str) or not text.strip():
        return "en"
    try:
        # langdetect is randomised; fixing the factory seed makes the same
        # text yield the same language on every run of the notebook.
        from langdetect import DetectorFactory
        DetectorFactory.seed = 0
        return detect(text)
    except Exception as e:
        print(f"Language detection error: {e}")
        return "en"  # default to English if detection fails

def translate_to_english(text: str) -> str:
    """
    Translate text to English using GoogleTranslator (free, unofficial).

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    spaces. Each sentence is translated on its own and the results are
    re-joined with single spaces. A sentence whose translation fails is kept
    in its original form.

    Args:
        text: Text to translate. Missing or blank values are returned as-is.

    Returns:
        The translated text, or ``str(text)`` if an unexpected error occurs
        outside the per-sentence handling.
    """
    try:
        if pd.isna(text) or str(text).strip() == "":
            return text

        sentences = re.split(r'(?<=[.!?]) +', str(text))
        translated_sentences = []
        # One translator instance serves every sentence of this text.
        translator = GoogleTranslator(source='auto', target='en')

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            try:
                # Fall back to the source sentence on an empty/None result so
                # the join below cannot fail and discard earlier translations
                # (TODO confirm whether deep_translator can return None).
                translation = translator.translate(sentence) or sentence
            except Exception as inner_e:
                print(f"Translation error (sentence): {inner_e}")
                translation = sentence
            print(f"Original: {sentence} | Translated: {translation}")
            translated_sentences.append(translation)
            time.sleep(0.5)  # optional: stay safely under the rate limit
        return ' '.join(translated_sentences)

    except Exception as e:
        print(f"Translation error (full text): {e}")
        return str(text)

# EDX Courses

In [None]:
df = pd.read_csv("scrape_result/edx_courses.csv")
df

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill
0,How to Learn Online,['edX'],This course will prepare you with strategies t...,"\nHistory, benefits, and foundational concepts...",This course harnesses science-backed technique...,['Available now'],['Education & Teacher Training'],['Introductory'],['English'],Course,[],[],"['Arabic', 'English', 'Spanish (Latin America)...","['Portuguese - Brazil', 'Indonesian', 'Arabic'...",47714,https://www.edx.org/learn/how-to-learn/edx-how...,2.0,['Learning Design']
1,The Science of Happiness,"['University of California, Berkeley']",The first MOOC to teach positive psychology. L...,\nWhat happiness really means and why it matte...,"""A free eight-week Science of Happiness course...","['Available now', 'Upcoming']",['Social Sciences'],['Introductory'],['English'],Course,[],"['dacher-keltner', 'emiliana-simon-thomas']","['Arabic', 'English', 'Spanish (Latin America)...","['Russian', 'Indonesian', 'Spanish', 'Portugue...",16821,https://www.edx.org/learn/happiness/university...,11.0,"['Empathy', 'Evolutionary Biology', 'Psychology']"
2,Remote Work Revolution for Everyone,['Harvard University'],"In Remote Work Revolution for Everyone, you wi...",\nUnderstand the key elements of remote work a...,How are you thriving or surviving in your remo...,['Available now'],['Business & Management'],['Introductory'],['English'],Course,['Professional Certificate'],['tsedal-neeley'],"['Arabic', 'English', 'Spanish (Latin America)...","['Spanish', 'Arabic', 'Thai', 'Korean', 'Chine...",45954,https://www.edx.org/learn/remote-work/harvard-...,3.0,"['Telecommuting', 'Customer Relationship Build..."
3,CS50's Introduction to Computer Science,['Harvard University'],An introduction to the intellectual enterprise...,\nA broad and robust understanding of computer...,"This is CS50x , Harvard University's introduct...",['Available now'],"['Computer Science', 'Engineering']",['Introductory'],['English'],Course,"['Professional Certificate', 'Professional Cer...","['doug-lloyd', 'david-j-malan', 'brian-yu']","['Arabic', 'English', 'Spanish (Latin America)...","['Telugu', 'Turkish', 'Swahili', 'Arabic', 'Fr...",425063,https://www.edx.org/learn/computer-science/har...,12.0,"['Cryptography', 'Resource Management', 'Algor..."
4,Data Visualization and Building Dashboards wit...,['IBM'],Build the fundamental knowledge necessary to u...,\nDescribe the important role charts play in t...,Please Note: Learners who successfully complet...,['Available now'],['Data Analysis & Statistics'],['Introductory'],['English'],Course,"['Professional Certificate', 'Professional Cer...","['steve-ryan', 'sandip-sasha-joy']","['Arabic', 'English', 'Spanish (Latin America)...","['Russian', 'Portuguese - Brazil', 'Indonesian']",11885,https://www.edx.org/learn/data-visualization/i...,4.0,"['Data Visualization', 'Data Analysis', 'Micro..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651,Corporate Innovation,['The University of Queensland'],Learn how to apply state-of-the-art methods to...,The ability to innovate is crucial for busines...,Fundamentals of creative and design thinking t...,['Archived'],"['Business & Management', 'Communication', 'Da...",['Intermediate'],['English'],Program,['MicroMasters'],"['martie-louise-verreynne', 'rachel-fitzgerald...",[],[],-269,https://www.edx.org/masters/micromasters/uqx-c...,,[]
1652,IBM: Ciencia de datos,['IBM'],,La ciencia de datos y las habilidades de apren...,"Aplicar varias habilidades, técnicas y herrami...",['Archived'],"['Data Analysis & Statistics', 'Computer Scien...","['Introductory', 'Intermediate']",['Spanish'],Program,['Professional Certificate'],"['romeo-kienzler', 'saeed-aghabozorgi', 'josep...",[],[],-328,https://www.edx.org/certificates/professional-...,,"['Data Science', 'Python (Programming Language..."
1653,C++ Programming Essentials,['IBM'],Become a skilled C++ developer who is fluent i...,This Professional Certificate program takes yo...,Fundamental concepts of programming using C++ ...,['Archived'],['Computer Science'],"['Introductory', 'Intermediate']",['English'],Program,['Professional Certificate'],"['nisha-p-2', 'sripriya-s', 'sathya-ponmalar-h...",[],[],-444,https://www.edx.org/certificates/professional-...,,"['Object-Oriented Programming (OOP)', 'C++ (Pr..."
1654,Marketing Digital,['Universidad Galileo'],Aprende a conectar tu marca con el mundo. Cono...,Gana una ventaja competitiva convirtiéndote en...,Diseñar estrategias de marketing en redes soci...,['Archived'],"['Communication', 'Business & Management']",['Introductory'],['Spanish'],Program,['Professional Certificate'],"['rocael-hernandez-ph-d', 'miguel-morales-ph-d...",[],[],-659,https://www.edx.org/certificates/professional-...,,"['Digital Marketing', 'Advertising Campaigns',..."


In [None]:
# Object columns that must NOT go through the free-text pipeline; they are
# left untouched by this cell.
exclude_cols = [
    'marketing_url', 'availability', 'subject', 'level', 'product', 'program_type',
    'staff', 'translation_language', 'transcription_language', 'partner'
]

for col in df.columns:
    if df[col].dtype == 'object' and col not in exclude_cols:
        df[col] = df[col].apply(preprocessing)
        # Mark empty results as missing. The dict form is used on purpose:
        # `replace("", None)` forward-fills instead of inserting None on
        # pandas versions before 1.4.
        df[col] = df[col].replace({"": None})

In [None]:
df

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill
0,learn online,['edX'],course prepare strategy successful online lear...,history benefit foundational concept online le...,course harness sciencebacked technique applica...,['Available now'],['Education & Teacher Training'],['Introductory'],english,Course,[],[],"['Arabic', 'English', 'Spanish (Latin America)...","['Portuguese - Brazil', 'Indonesian', 'Arabic'...",47714,https://www.edx.org/learn/how-to-learn/edx-how...,2.0,learning design
1,science happiness,"['University of California, Berkeley']",first mooc teach positive psychology learn sci...,happiness really mean matter increase happines...,free eightweek science happiness course offer ...,"['Available now', 'Upcoming']",['Social Sciences'],['Introductory'],english,Course,[],"['dacher-keltner', 'emiliana-simon-thomas']","['Arabic', 'English', 'Spanish (Latin America)...","['Russian', 'Indonesian', 'Spanish', 'Portugue...",16821,https://www.edx.org/learn/happiness/university...,11.0,empathy evolutionary biology psychology
2,remote work revolution everyone,['Harvard University'],remote work revolution everyone learn excel vi...,understand key element remote work use knowled...,thriving surviving remote work environment rem...,['Available now'],['Business & Management'],['Introductory'],english,Course,['Professional Certificate'],['tsedal-neeley'],"['Arabic', 'English', 'Spanish (Latin America)...","['Spanish', 'Arabic', 'Thai', 'Korean', 'Chine...",45954,https://www.edx.org/learn/remote-work/harvard-...,3.0,telecommuting customer relationship building
3,cs50s introduction computer science,['Harvard University'],introduction intellectual enterprise computer ...,broad robust understanding computer science pr...,cs50x harvard university introduction intellec...,['Available now'],"['Computer Science', 'Engineering']",['Introductory'],english,Course,"['Professional Certificate', 'Professional Cer...","['doug-lloyd', 'david-j-malan', 'brian-yu']","['Arabic', 'English', 'Spanish (Latin America)...","['Telugu', 'Turkish', 'Swahili', 'Arabic', 'Fr...",425063,https://www.edx.org/learn/computer-science/har...,12.0,cryptography resource management algorithm pyt...
4,data visualization building dashboard excel co...,['IBM'],build fundamental knowledge necessary use exce...,describe important role chart play telling dat...,please note learner successfully complete ibm ...,['Available now'],['Data Analysis & Statistics'],['Introductory'],english,Course,"['Professional Certificate', 'Professional Cer...","['steve-ryan', 'sandip-sasha-joy']","['Arabic', 'English', 'Spanish (Latin America)...","['Russian', 'Portuguese - Brazil', 'Indonesian']",11885,https://www.edx.org/learn/data-visualization/i...,4.0,data visualization data analysis microsoft exc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651,corporate innovation,['The University of Queensland'],learn apply stateoftheart method foster innova...,ability innovate crucial business survival gro...,fundamental creative design thinking enable in...,['Archived'],"['Business & Management', 'Communication', 'Da...",['Intermediate'],english,Program,['MicroMasters'],"['martie-louise-verreynne', 'rachel-fitzgerald...",[],[],-269,https://www.edx.org/masters/micromasters/uqx-c...,,
1652,ibm ciencia de datos,['IBM'],,la ciencia de datos la habilidades de aprendiz...,aplicar varias habilidades técnicas herramient...,['Archived'],"['Data Analysis & Statistics', 'Computer Scien...","['Introductory', 'Intermediate']",spanish,Program,['Professional Certificate'],"['romeo-kienzler', 'saeed-aghabozorgi', 'josep...",[],[],-328,https://www.edx.org/certificates/professional-...,,data science python programming language machi...
1653,c programming essential,['IBM'],become skilled c developer fluent complete syn...,professional certificate program take right fu...,fundamental concept programming using c includ...,['Archived'],['Computer Science'],"['Introductory', 'Intermediate']",english,Program,['Professional Certificate'],"['nisha-p-2', 'sripriya-s', 'sathya-ponmalar-h...",[],[],-444,https://www.edx.org/certificates/professional-...,,objectoriented programming oop c programming l...
1654,marketing digital,['Universidad Galileo'],aprende conectar tu marca con el mundo conoce ...,gana una ventaja competitiva convirtiéndote en...,diseñar estrategias de marketing en redes soci...,['Archived'],"['Communication', 'Business & Management']",['Introductory'],spanish,Program,['Professional Certificate'],"['rocael-hernandez-ph-d', 'miguel-morales-ph-d...",[],[],-659,https://www.edx.org/certificates/professional-...,,digital marketing advertising campaign marketi...


In [None]:
import ast

# Columns whose cells hold stringified Python lists such as "['a', 'b']".
list_cols = [
    'availability', 'subject', 'level', 'program_type',
    'staff', 'translation_language', 'transcription_language', 'partner'
]


def _flatten_list_literal(x):
    """Turn a "[...]" string into a comma-separated string.

    Empty strings and "[]" become None; any other value is returned unchanged.
    """
    if isinstance(x, str) and x.startswith('[') and x.strip() != "[]":
        return ', '.join(ast.literal_eval(x))
    if x == "" or x == "[]":
        return None
    return x


for col in list_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(_flatten_list_literal)
    df[col] = df[col].str.lower()

df

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill
0,learn online,edx,course prepare strategy successful online lear...,history benefit foundational concept online le...,course harness sciencebacked technique applica...,available now,education & teacher training,introductory,english,Course,,,"arabic, english, spanish (latin america), indo...","portuguese - brazil, indonesian, arabic, spanish",47714,https://www.edx.org/learn/how-to-learn/edx-how...,2.0,learning design
1,science happiness,"university of california, berkeley",first mooc teach positive psychology learn sci...,happiness really mean matter increase happines...,free eightweek science happiness course offer ...,"available now, upcoming",social sciences,introductory,english,Course,,"dacher-keltner, emiliana-simon-thomas","arabic, english, spanish (latin america), indo...","russian, indonesian, spanish, portuguese - bra...",16821,https://www.edx.org/learn/happiness/university...,11.0,empathy evolutionary biology psychology
2,remote work revolution everyone,harvard university,remote work revolution everyone learn excel vi...,understand key element remote work use knowled...,thriving surviving remote work environment rem...,available now,business & management,introductory,english,Course,professional certificate,tsedal-neeley,"arabic, english, spanish (latin america), indo...","spanish, arabic, thai, korean, chinese - china...",45954,https://www.edx.org/learn/remote-work/harvard-...,3.0,telecommuting customer relationship building
3,cs50s introduction computer science,harvard university,introduction intellectual enterprise computer ...,broad robust understanding computer science pr...,cs50x harvard university introduction intellec...,available now,"computer science, engineering",introductory,english,Course,"professional certificate, professional certifi...","doug-lloyd, david-j-malan, brian-yu","arabic, english, spanish (latin america), indo...","telugu, turkish, swahili, arabic, french, germ...",425063,https://www.edx.org/learn/computer-science/har...,12.0,cryptography resource management algorithm pyt...
4,data visualization building dashboard excel co...,ibm,build fundamental knowledge necessary use exce...,describe important role chart play telling dat...,please note learner successfully complete ibm ...,available now,data analysis & statistics,introductory,english,Course,"professional certificate, professional certifi...","steve-ryan, sandip-sasha-joy","arabic, english, spanish (latin america), indo...","russian, portuguese - brazil, indonesian",11885,https://www.edx.org/learn/data-visualization/i...,4.0,data visualization data analysis microsoft exc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651,corporate innovation,the university of queensland,learn apply stateoftheart method foster innova...,ability innovate crucial business survival gro...,fundamental creative design thinking enable in...,archived,"business & management, communication, data ana...",intermediate,english,Program,micromasters,"martie-louise-verreynne, rachel-fitzgerald, ti...",,,-269,https://www.edx.org/masters/micromasters/uqx-c...,,
1652,ibm ciencia de datos,ibm,,la ciencia de datos la habilidades de aprendiz...,aplicar varias habilidades técnicas herramient...,archived,"data analysis & statistics, computer science, ...","introductory, intermediate",spanish,Program,professional certificate,"romeo-kienzler, saeed-aghabozorgi, joseph-sant...",,,-328,https://www.edx.org/certificates/professional-...,,data science python programming language machi...
1653,c programming essential,ibm,become skilled c developer fluent complete syn...,professional certificate program take right fu...,fundamental concept programming using c includ...,archived,computer science,"introductory, intermediate",english,Program,professional certificate,"nisha-p-2, sripriya-s, sathya-ponmalar-h, abin...",,,-444,https://www.edx.org/certificates/professional-...,,objectoriented programming oop c programming l...
1654,marketing digital,universidad galileo,aprende conectar tu marca con el mundo conoce ...,gana una ventaja competitiva convirtiéndote en...,diseñar estrategias de marketing en redes soci...,archived,"communication, business & management",introductory,spanish,Program,professional certificate,"rocael-hernandez-ph-d, miguel-morales-ph-d, jo...",,,-659,https://www.edx.org/certificates/professional-...,,digital marketing advertising campaign marketi...


In [None]:
df.isnull().sum()

title                         0
partner                       0
primary_description         145
secondary_description         3
tertiary_description          6
availability                  0
subject                       2
level                         3
language                      3
product                       0
program_type                398
staff                        97
translation_language        873
transcription_language     1033
recent_enrollment_count       0
marketing_url                 0
weeks_to_complete           657
skill                       191
dtype: int64

In [None]:
original_df = pd.read_csv("scrape_result/edx_courses.csv")
# The search text is built from the cleaned frame `df` and attached to the
# freshly loaded original rows; the two frames are matched by index.
original_df['text'] = df['title'].fillna("") + " " + df['secondary_description'].fillna("") + " " + df['subject'].fillna("") + " " + df['skill'].fillna("")
# Replace the ", " separators with a space. Deleting them outright fused
# neighbouring items ("computer science, engineering" became
# "computer scienceengineering").
original_df['text'] = original_df['text'].map(lambda x: str(x).replace(", ", " "))

In [None]:
# Save to JSON
import json

# Columns whose CSV cells hold stringified Python lists.
list_cols = [
    'availability', 'subject', 'level', 'program_type',
    'staff', 'translation_language', 'transcription_language', 'partner', 'language', 'skill'
]

ori_dict = original_df.to_dict(orient="records")

for course in ori_dict:
    # Turn "[...]" strings back into real lists; other values stay as they are.
    for key in list_cols:
        raw = course.get(key)
        if isinstance(raw, str) and raw.startswith('['):
            course[key] = ast.literal_eval(raw)

    # Blank out missing scalars so no NaN is written to the JSON file.
    for key, value in course.items():
        if not isinstance(value, (list, dict)) and pd.isna(value):
            course[key] = ""

# Only get courses that are available
ori_dict = [entry for entry in ori_dict if "Available now" in entry.get("availability", [])]

courses_json = json.dumps(ori_dict, indent=4)

with open("preprocessed/edx_courses.json", "w") as outfile:
    outfile.write(courses_json)

# LinkedIn

In [12]:
job_df = pd.read_csv("scrape_result/linkedin_jobs.csv")

In [None]:
# Convert to dict and turn string lists into actual lists

job_dict = job_df.to_dict(orient="records")

list_cols = [
    'responsibilities', 'requirements'
]

for job_record in job_dict:
    # Parse "[...]" strings into lists; anything else is kept unchanged.
    for key in list_cols:
        raw = job_record.get(key)
        if isinstance(raw, str) and raw.startswith('['):
            job_record[key] = ast.literal_eval(raw)

    # make sure no nan values
    for key, value in job_record.items():
        if not isinstance(value, (list, dict)) and pd.isna(value):
            job_record[key] = ""

In [None]:
# Some text in responsibilities and requirements are not in English, so we will translate them to English

def _ensure_english(items):
    """Return ``items`` as a list in which every non-English entry is translated.

    A field that is a plain string (not a parsed list) is treated as a single
    entry, or as no entries when blank. Without this, iterating the string
    would process it one character at a time.
    """
    if isinstance(items, str):
        items = [items] if items.strip() else []
    return [
        text if detect_language(text) == 'en' else translate_to_english(text)
        for text in items
    ]


for job in job_dict:
    for field in ('responsibilities', 'requirements'):
        job[field] = _ensure_english(job[field])

Original: Membuat konten kreatif (foto, video, caption) dan familiar dengan platform seperti Instagram, TikTok. | Translated: Creating creative content (photo, video, caption) and familiar with platforms such as Instagram, Tiktok.
Original: Bisa menggunakan aplikasi editing seperti Canva, Capcut, dll. | Translated: Can use editing applications such as Canva, Capcut, etc.
Original: Mampu menggunakan Ms. | Translated: Able to use Ms.
Original: Word, Excel & PowerPoint. | Translated: Word, Excel & PowerPoint.
Original: Menyusun perancangan event kantor. | Translated: Arranging office event designs.
Original: Menjadi nilai plus jika mampu membuat Desain 3D. | Translated: Become a plus if it is able to make a 3D design.
Original: Mahasiswa aktif dari jurusan Komunikasi, Manajemen, Public Relations, atau bidang terkait. | Translated: Active students from the Department of Communication, Management, Public Relations, or related fields.
Original: Mampu perencanaan event dan media sosial (spesi

In [25]:
# Responsibilities & requirements translated

print(job_dict[0]['responsibilities'])
print(job_dict[1]['requirements'])

['Creating creative content (photo, video, caption) and familiar with platforms such as Instagram, Tiktok.', 'Can use editing applications such as Canva, Capcut, etc.', 'Able to use Ms. Word, Excel & PowerPoint.', 'Arranging office event designs.', 'Become a plus if it is able to make a 3D design.']
['Student of Bachelor degree in Statistics or Applied Mathematics or Information Technology or equivalent experience', 'Passionate in Data Analysis/Business Analysis', 'Familiar with business process / blueprint documentation / software development lifecycle concept', 'Familiar with web programming', 'Having good analytical thinking & communication skills', 'Having experience in using analytical tools (Power BI/Tableau/Looker) is a plus', 'Willing to WFO in SOUTH JAKARTA']


In [28]:
# Preprocess job titles, responsibilities, requirements, job function, and industries

for job in job_dict:
    # Collect every cleaned piece in the order it should appear in the text.
    parts = [preprocessing(job['job_title'], remove_html=False)]
    parts.extend(preprocessing(item, remove_html=False) for item in job['responsibilities'])
    parts.extend(preprocessing(item, remove_html=False) for item in job['requirements'])
    parts.append(preprocessing(job['job_function'], remove_html=False))
    parts.append(preprocessing(job['industries'], remove_html=False))
    # Join only the non-empty pieces: this yields "" when everything is empty
    # and avoids the leading/double spaces that empty pieces used to leave.
    job['text'] = " ".join(part for part in parts if part)

In [29]:
# Job text is preprocessed
job_dict[0]['text']

'general affair internship creating creative content photo video caption familiar platform instagram tiktok use editing application canva capcut etc able use m word excel powerpoint arranging office event design become plus able make 3d design active student department communication management public relation related field able plan event social medium specific instagram tiktok able work together team communicative honest neat proactive able complete task according time given information technology management food beverage service'

In [None]:
# save back to JSON file

import json

jobs_json = json.dumps(job_dict, indent=4)

with open("preprocessed/linkedin_jobs.json", "w") as outfile:
    outfile.write(jobs_json)

# Majors & Universities

## Merge Major & Uni

In [3]:
major = pd.read_csv('scrape_result/jurusan_result.csv')
rank = pd.read_csv('scrape_result/universitas_indonesia_qs.csv')

In [4]:
major

Unnamed: 0,Universitas,Prodi,Jenjang,Status
0,"POLITEKNIK ANGKATAN LAUT, SURABAYA",FARMASI,D-III,PTKL
1,Universitas Brawijaya,Pendidikan Profesi Arsitek,Profesi,PTN
2,Sekolah Tinggi Agama Kristen Protestan Negeri ...,Pendidikan Agama Kristen,S1,PTAN
3,Universitas Lambung Mangkurat,Administrasi Bisnis,S2,PTN
4,Sekolah Tinggi Ilmu Ekonomi Tri Dharma Widya,Akuntansi,S1,03
...,...,...,...,...
33683,STMIK Musi Rawas,Teknik Informatika,S1,02
33684,"UNIVERSITAS INDONESIA, JAKARTA",KEDOKTERAN,S1,PTN
33685,"UNIVERSITAS INDONESIA, JAKARTA",DOKTER,Profesi,PTN
33686,Universitas Bina Bangsa,Pendidikan Ilmu Pengetahuan Sosial,S1,04


In [5]:
rank

Unnamed: 0,Rank,University
0,189,Universitas Indonesia
1,224,Gadjah Mada University
2,255,Institut Teknologi Bandung (ITB)
3,=287,Universitas Airlangga
4,399,IPB University (aka Bogor Agricultural Univers...
5,=509,Institut Teknologi Sepuluh Nopember (ITS Surab...
6,=515,Universitas Padjadjaran (UNPAD)
7,=624,Diponegoro University
8,=680,Universitas Brawijaya
9,851-900,Bina Nusantara University (BINUS)


In [6]:
major['Jenjang'].unique()

array(['D-III', 'Profesi', 'S1', 'S2', 'Spesialis', 'S3', 'D-IV', 'D4',
       'Subspesialis', '-', 'D-II', 'D-I', 'S2 Terapan', 'S3 Terapan',
       'D3', 'Pro.', 'S1 Terapan', 'Sarjana', 'Magister', 'Diploma-III',
       'Sarjana Terapan'], dtype=object)

In [7]:
# Degree levels ("Jenjang") whose rows are dropped from `major`.
excluded_jenjang = [
    'Profesi', 'S2', 'Spesialis', 'S3', 'Subspesialis', '-', 
    'S2 Terapan', 'S3 Terapan', 'Pro.', 'Magister'
]

is_excluded = major['Jenjang'].isin(excluded_jenjang)
major = major.loc[~is_excluded].reset_index(drop=True)

In [8]:
import re

def clean_text(text):
    """Normalise a name for matching.

    Parenthesised segments are removed, only the part before the first comma
    is kept, the text is lowercased, every character other than ``a-z``,
    ``0-9`` and whitespace is dropped, and whitespace runs are collapsed.

    Returns ``""`` for any input that is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    without_parens = re.sub(r"\([^)]*\)", "", text)
    before_comma, _, _ = without_parens.partition(",")
    alnum_only = re.sub(r"[^a-z0-9\s]", "", before_comma.lower())
    return re.sub(r"\s+", " ", alnum_only).strip()

In [9]:
major['Universitas'] = major['Universitas'].apply(clean_text)
rank['University'] = rank['University'].apply(clean_text)

In [10]:
major

Unnamed: 0,Universitas,Prodi,Jenjang,Status
0,politeknik angkatan laut,FARMASI,D-III,PTKL
1,sekolah tinggi agama kristen protestan negeri ...,Pendidikan Agama Kristen,S1,PTAN
2,sekolah tinggi ilmu ekonomi tri dharma widya,Akuntansi,S1,03
3,universitas pasifik morotai,Akuntansi,S1,12
4,sekolah tinggi ilmu ekonomi ekadharma indonesia,Akuntansi,S1,04
...,...,...,...,...
26790,stai bakti negara tegal,Pendidikan Agama Islam,S1,PTAS
26791,stmik musi rawas,Teknik Informatika,S1,02
26792,universitas indonesia,KEDOKTERAN,S1,PTN
26793,universitas bina bangsa,Pendidikan Ilmu Pengetahuan Sosial,S1,04


In [11]:
# Cleaned names from the ranking table -> the name they should be matched
# under in `major['Universitas']`.
univ_mapping = {
    'gadjah mada university': 'universitas gadjah mada',
    'ipb university': 'institut pertanian bogor',
    'diponegoro university': 'universitas diponegoro',
    'atma jaya catholic university of indonesia': 'universitas katolik indonesia atma jaya',
    'state university of malang': 'universitas negeri malang',
    'petra christian university': 'universitas kristen petra',
    'bina nusantara university': 'universitas bina nusantara',
}

lowered_names = rank['University'].str.lower()
# Names without a mapping entry are kept as they are.
rank['University'] = lowered_names.map(lambda name: univ_mapping.get(name, name))

rank

Unnamed: 0,Rank,University
0,189,universitas indonesia
1,224,universitas gadjah mada
2,255,institut teknologi bandung
3,=287,universitas airlangga
4,399,institut pertanian bogor
5,=509,institut teknologi sepuluh nopember
6,=515,universitas padjadjaran
7,=624,universitas diponegoro
8,=680,universitas brawijaya
9,851-900,universitas bina nusantara


In [12]:
# Replace the QS rank labels (which include ties such as "=287" and bands such
# as "851-900") with a plain 1-based position derived from the row index.
# NOTE(review): this relies on the rows already being ordered best-first — confirm.
rank['Rank'] = rank.index + 1
rank

Unnamed: 0,Rank,University
0,1,universitas indonesia
1,2,universitas gadjah mada
2,3,institut teknologi bandung
3,4,universitas airlangga
4,5,institut pertanian bogor
5,6,institut teknologi sepuluh nopember
6,7,universitas padjadjaran
7,8,universitas diponegoro
8,9,universitas brawijaya
9,10,universitas bina nusantara


In [13]:
merged_df = major.merge(rank, left_on='Universitas', right_on='University', how='left')
merged_df = merged_df.drop(columns=['University'])
merged_df

Unnamed: 0,Universitas,Prodi,Jenjang,Status,Rank
0,politeknik angkatan laut,FARMASI,D-III,PTKL,
1,sekolah tinggi agama kristen protestan negeri ...,Pendidikan Agama Kristen,S1,PTAN,
2,sekolah tinggi ilmu ekonomi tri dharma widya,Akuntansi,S1,03,
3,universitas pasifik morotai,Akuntansi,S1,12,
4,sekolah tinggi ilmu ekonomi ekadharma indonesia,Akuntansi,S1,04,
...,...,...,...,...,...
26790,stai bakti negara tegal,Pendidikan Agama Islam,S1,PTAS,
26791,stmik musi rawas,Teknik Informatika,S1,02,
26792,universitas indonesia,KEDOKTERAN,S1,PTN,1.0
26793,universitas bina bangsa,Pendidikan Ilmu Pengetahuan Sosial,S1,04,


In [14]:
# Rows that found no partner in the ranking table have a missing Rank after
# the left merge; 999 is used as the placeholder value for them.
merged_df['Rank'] = merged_df['Rank'].fillna(999)
merged_df

Unnamed: 0,Universitas,Prodi,Jenjang,Status,Rank
0,politeknik angkatan laut,FARMASI,D-III,PTKL,999.0
1,sekolah tinggi agama kristen protestan negeri ...,Pendidikan Agama Kristen,S1,PTAN,999.0
2,sekolah tinggi ilmu ekonomi tri dharma widya,Akuntansi,S1,03,999.0
3,universitas pasifik morotai,Akuntansi,S1,12,999.0
4,sekolah tinggi ilmu ekonomi ekadharma indonesia,Akuntansi,S1,04,999.0
...,...,...,...,...,...
26790,stai bakti negara tegal,Pendidikan Agama Islam,S1,PTAS,999.0
26791,stmik musi rawas,Teknik Informatika,S1,02,999.0
26792,universitas indonesia,KEDOKTERAN,S1,PTN,1.0
26793,universitas bina bangsa,Pendidikan Ilmu Pengetahuan Sosial,S1,04,999.0


In [15]:
merged_df.drop(columns=['Jenjang', 'Status'], inplace=True)
merged_df

Unnamed: 0,Universitas,Prodi,Rank
0,politeknik angkatan laut,FARMASI,999.0
1,sekolah tinggi agama kristen protestan negeri ...,Pendidikan Agama Kristen,999.0
2,sekolah tinggi ilmu ekonomi tri dharma widya,Akuntansi,999.0
3,universitas pasifik morotai,Akuntansi,999.0
4,sekolah tinggi ilmu ekonomi ekadharma indonesia,Akuntansi,999.0
...,...,...,...
26790,stai bakti negara tegal,Pendidikan Agama Islam,999.0
26791,stmik musi rawas,Teknik Informatika,999.0
26792,universitas indonesia,KEDOKTERAN,1.0
26793,universitas bina bangsa,Pendidikan Ilmu Pengetahuan Sosial,999.0


In [16]:
# Normalise study-programme names: lowercase, keep only a-z / 0-9 / whitespace,
# collapse whitespace runs, then trim.
merged_df['Prodi'] = (
    merged_df['Prodi']
    .str.lower()
    .str.replace(r"[^a-z0-9\s]", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    # Strip last: removing punctuation can expose new leading/trailing
    # whitespace (e.g. "- akuntansi"), which an earlier strip would miss.
    .str.strip()
)

In [17]:
merged_df

Unnamed: 0,Universitas,Prodi,Rank
0,politeknik angkatan laut,farmasi,999.0
1,sekolah tinggi agama kristen protestan negeri ...,pendidikan agama kristen,999.0
2,sekolah tinggi ilmu ekonomi tri dharma widya,akuntansi,999.0
3,universitas pasifik morotai,akuntansi,999.0
4,sekolah tinggi ilmu ekonomi ekadharma indonesia,akuntansi,999.0
...,...,...,...
26790,stai bakti negara tegal,pendidikan agama islam,999.0
26791,stmik musi rawas,teknik informatika,999.0
26792,universitas indonesia,kedokteran,1.0
26793,universitas bina bangsa,pendidikan ilmu pengetahuan sosial,999.0


## Preprocess Text

Load the translated majors file and preprocess its `text` field.

In [23]:
# get translated prodi names

translated_major = pd.read_csv("translated/major_final.csv")

translated_major

Unnamed: 0,Universitas,Prodi,Rank,text
0,politeknik angkatan laut,farmasi,999.0,Pharmacy
1,sekolah tinggi agama kristen protestan negeri ...,pendidikan agama kristen,999.0,Christian Religious Education
2,sekolah tinggi ilmu ekonomi tri dharma widya,akuntansi,999.0,accountancy
3,universitas pasifik morotai,akuntansi,999.0,accountancy
4,sekolah tinggi ilmu ekonomi ekadharma indonesia,akuntansi,999.0,accountancy
...,...,...,...,...
26789,stai bakti negara tegal,pendidikan agama islam,999.0,Islamic Religious Education
26790,stmik musi rawas,teknik informatika,999.0,Informatics Engineering
26791,universitas indonesia,kedokteran,1.0,medical
26792,universitas bina bangsa,pendidikan ilmu pengetahuan sosial,999.0,social science education


In [24]:
# Normalise the translated major names with `clean_text`, the same helper used
# for the university names above.
# NOTE(review): `clean_text` keeps only the part before the first comma and
# drops parenthesised segments, so a translated name containing either is
# shortened — confirm this is intended for major names.
translated_major['text'] = translated_major['text'].apply(clean_text)

In [25]:
import json

# The records get their own name so `translated_major` stays a DataFrame.
# Rebinding it to a list made this cell fail when run a second time
# (a list has no `to_dict`).
major_records = translated_major.to_dict(orient='records')
major_json = json.dumps(major_records, indent=4)

with open("preprocessed/major_final.json", "w") as outfile:
    outfile.write(major_json)

# Career

In [None]:
with open("scrape_result/onet_careers.json", "rb") as f:
    career_data = json.load(f)

In [None]:
for career in career_data:
    # Collect every cleaned piece in the order it should appear in the text.
    parts = [preprocessing(career['title'], remove_html=False)]
    parts.extend(preprocessing(x, remove_html=False) for x in career['also_called'])
    parts.append(preprocessing(career['what_they_do'], remove_html=False))
    parts.extend(preprocessing(x, remove_html=False) for x in career['on_the_job'])
    # Join only the non-empty pieces so an empty field or list entry does not
    # leave leading or doubled spaces in the final text.
    career['text'] = " ".join(part for part in parts if part)

In [None]:
# preprocessed result
career_data[0]['text']

'accountant auditor accountant auditor certified public accountant cpa financial auditor examine analyze interpret accounting record prepare financial statement give advice audit evaluate statement prepared others install advise system recording cost financial budgetary data prepare detailed report audit finding report management asset utilization audit result recommend change operation financial activity collect analyze data detect deficient control duplicated effort extravagance fraud noncompliance law regulation management policy'

In [None]:
career_json = json.dumps(career_data, indent=4)
with open("preprocessed/onet_careers.json", "w") as f:
    f.write(career_json)