In [None]:
import pandas as pd
from dateutil.parser import parse
import re
from deep_translator import GoogleTranslator
from langdetect import detect
import spacy

### Data Summaries

In [111]:
df = pd.read_csv("lokerid_data.csv")
df.head()

Unnamed: 0,Job ID,Title,Company,Location,Salary,Category,Posted Date,Link,Function,Type,Education,Level,Description
0,15758013,Staff Engineering,Rusunami CityPark,Jakarta Barat,Rp 3 – 4 Juta,Kontrak;SMA / SMK / STM;1-2 Tahun;Staff / Offi...,7/9/2025 17:01,https://www.loker.id/teknik/elektro/staff-engi...,Teknisi Elektro,Kontrak,SMA / SMK / STM,Staff / Officer,"Badan Pengelola Rusunami City Park adalah, Pen..."
1,15785089,Leader Quality Control,PT. Mega Indo Prima,Tangerang,Rp 4 – 5 Juta,Kontrak;Diploma/D1/D2/D3;Sarjana / S1;1-2 Tahu...,7/9/2025 16:50,https://www.loker.id/pabrik-dan-manufaktur/qua...,Quality Control,Kontrak,Diploma/D1/D2/D3;Sarjana / S1,Supervisor / Coordinator,Seseorang yang melakukan pengecekan dan penguj...
2,15784987,Sales Executive,PT Niagamas Lestari Gemilang,Jakarta Barat,Negosiasi,Full Time;SMA / SMK / STM;1-2 Tahun;Staff / Of...,7/9/2025 16:50,https://www.loker.id/penjualan/konsultan-penju...,Sales Executive,Full Time,SMA / SMK / STM,Staff / Officer,PT Niagamas Lestari Gemilang adalah perusahaan...
3,15784306,Teknisi Telekomunikasi,PT Pangestu Suryaning Famili,Klaten,Rp 4 – 5 Juta,Kontrak;SMA / SMK / STM;1-2 Tahun;Fresh Gradua...,7/9/2025 16:45,https://www.loker.id/information-technology/te...,Teknisi Jaringan,Kontrak,SMA / SMK / STM,Junior / Entry Level,PT PANGESTU SURYANING FAMILI adalah perusahaan...
4,15784729,Sales Engineer,PT ASGI Industrial Indonesia,Cikarang,Rp 6 – 7 Juta,Full Time;Diploma/D1/D2/D3;Sarjana / S1;1-2 Ta...,7/9/2025 16:28,https://www.loker.id/penjualan/konsultan-penju...,Sales Engineer,Full Time,Diploma/D1/D2/D3;Sarjana / S1,Staff / Officer,PT ASGI Industrial Indonesia is a growing indu...


In [112]:
df.shape

(2373, 13)

There are 2373 rows and 13 columns in the dataset

In [113]:
df.dtypes

Job ID          int64
Title          object
Company        object
Location       object
Salary         object
Category       object
Posted Date    object
Link           object
Function       object
Type           object
Education      object
Level          object
Description    object
dtype: object

In [114]:
# Check missing value
df.isnull().sum()

Job ID         0
Title          0
Company        0
Location       0
Salary         0
Category       0
Posted Date    0
Link           0
Function       0
Type           0
Education      0
Level          0
Description    0
dtype: int64

In [115]:
df.describe(include='object')

Unnamed: 0,Title,Company,Location,Salary,Category,Posted Date,Link,Function,Type,Education,Level,Description
count,2373,2373,2373,2373,2373,2373,2373,2373,2373,2373,2373,2373
unique,1396,1288,112,14,1627,2241,2229,512,5,17,5,2231
top,Sales Marketing,Indonesia Global Grup,Jakarta Barat,Negosiasi,Full Time;SMA / SMK / STM;1-2 Tahun;Junior / E...,6/18/2025 8:15,https://www.loker.id/customer-care/customer-se...,Staff Accounting,Full Time,SMA / SMK / STM;Diploma/D1/D2/D3;Sarjana / S1,Junior / Entry Level,"PT. PSKM, bergerak di bidang perdagangan bunga..."
freq,55,31,251,999,11,4,2,64,1690,542,1062,2


### Posted Date Field Standardization

In [116]:
print("Posted date format 1:", df['Posted Date'].iloc[0])
print("Posted date format 2:", df['Posted Date'].iloc[-1])

Posted date format 1: 7/9/2025 17:01
Posted date format 2: 2025-06-16 04:39:14


The date format is inconsistent and should be standardized to match format 2

In [117]:
# Standardize the posted date format
df['Posted Date'] = df['Posted Date'].apply(
    lambda x: (
        x if len(str(x).split(':')) == 3
        else parse(x).strftime('%Y-%m-%d %H:%M:%S')
    ) if pd.notnull(x) else x
)

In [118]:
print("Posted date 1:", df['Posted Date'].iloc[0])
print("Posted date 2:", df['Posted Date'].iloc[-1])

Posted date 1: 2025-07-09 17:01:00
Posted date 2: 2025-06-16 04:39:14


### Duplicate Handling

In [119]:
# Check for duplicates based on all field
duplicates = df[df.duplicated(keep=False)]
len(duplicates)

12

In [120]:
# Check duplicates by Job ID
duplicates = df[df["Job ID"].duplicated()]
len(duplicates)

144

In [121]:
# Get one example duplicate value
example_job_id = df["Job ID"][df["Job ID"].duplicated()].iloc[0]
example_duplicates = df[df["Job ID"] == example_job_id]
example_duplicates

Unnamed: 0,Job ID,Title,Company,Location,Salary,Category,Posted Date,Link,Function,Type,Education,Level,Description
273,15765600,Guru Bahasa Inggris,CV. IRDH Penerbitan,Kota Batu,Negosiasi,Part Time;SMA / SMK / STM;Diploma/D1/D2/D3;Sar...,2025-07-07 08:13:34,https://www.loker.id/pendidikan-pelatihan/tuto...,Guru Bahasa Inggris,Part Time,SMA / SMK / STM;Diploma/D1/D2/D3;Sarjana / S1;...,Junior / Entry Level,CV IRDH Malang (www.irdhcenter.com) membuka ke...
336,15765600,Guru Bahasa Inggris,CV. IRDH Penerbitan,Kota Batu,Negosiasi,Part Time;SMA / SMK / STM;Diploma/D1/D2/D3;Sar...,2025-07-07 08:13:00,https://www.loker.id/pendidikan-pelatihan/tuto...,Guru Bahasa Inggris,Part Time,SMA / SMK / STM;Diploma/D1/D2/D3;Sarjana / S1;...,Junior / Entry Level,CV IRDH Malang (www.irdhcenter.com) membuka ke...


The job listing is identical, with the only difference being the posted date

In [122]:
# Change 'Posted Date' to datetime format
df["Posted Date"] = pd.to_datetime(df["Posted Date"], errors='coerce')

# Sort by newest posted date first
df = df.sort_values(by="Posted Date", ascending=False)

# Drop duplicates keeping the latest one
df = df.drop_duplicates(subset=[col for col in df.columns if col != "Posted Date"], keep="first")
df = df.reset_index(drop=True)

In [123]:
# Get the example Job ID used before
example_job_id = 15765600

# Check how many times it still appears
count = df[df["Job ID"] == example_job_id].shape[0]

print(f"Job ID '{example_job_id}' appears {count} time(s).")

Job ID '15765600' appears 1 time(s).


The duplicates by Job ID removed successfully

In [124]:
# Check again the duplicates by Job ID
duplicates = df[df["Job ID"].duplicated()]
len(duplicates)

3

In [125]:
# Get one example duplicate value
example_job_id = df["Job ID"][df["Job ID"].duplicated()].iloc[0]
example_duplicates = df[df["Job ID"] == example_job_id]
example_duplicates

Unnamed: 0,Job ID,Title,Company,Location,Salary,Category,Posted Date,Link,Function,Type,Education,Level,Description
1406,15008530,HSE Officer,PT Mitra Purnama Engineering,Lombok,Rp 3 – 4 Juta,Full Time;Diploma/D1/D2/D3;Sarjana / S1;1-2 Ta...,2025-06-18 08:15:04,https://www.loker.id/kesehatan-dan-kedokteran/...,Health and Safety Enviromental / HSE,Full Time,Diploma/D1/D2/D3;Sarjana / S1,Junior / Entry Level,PT. Mitra Purnama Engineering membutuhkan posi...
1407,15008530,HSE Officer,PT Mitra Purnama Engineering,Lombok,Rp 3 – 4 Juta,Full Time;Diploma/D1/D2/D3;Sarjana / S1;1-2 Ta...,2025-06-18 08:15:00,https://www.loker.id/kesehatan-dan-kedokteran/...,Health and Safety Enviromental / HSE,Full Time,Diploma/D1/D2/D3;Sarjana / S1,Junior / Entry Level,PT. Mitra Purnama Engineering membutuhkan posi...


In [126]:
if df["Description"].iloc[1406] != df["Description"].iloc[1407]:
    print("Different description, same job ID")

Different description, same job ID


All fields is identical except the "Description"

In [127]:
# Keep only the first row for jobs with identical Job ID but different Description
df.drop_duplicates(subset='Job ID', keep='first', inplace=True)

In [128]:
# Check again the duplicates by Job ID
duplicates = df[df["Job ID"].duplicated()]
len(duplicates)

0

### Cleaning and Formatting Company Names

In [129]:
print("Example with all uppercase and a '.' after 'PT':")
print("    ", df["Company"].iloc[520])
print("Example where 'CV' is written incorrectly and has a '.' after it:")
print("    ", df["Company"].iloc[1092])
print("Example with a '.' after 'PT':")
print("    ", df["Company"].iloc[1720])
print("Example with all lowercase:")
print("    ", df["Company"].iloc[1449])
print("Example with all uppercase:")
print("    ", df["Company"].iloc[207])

Example with all uppercase and a '.' after 'PT':
     PT.ALAM RAYA ELYNDO TANGERANG
Example where 'CV' is written incorrectly and has a '.' after it:
     Cv. Prima Kreasi Dewata
Example with a '.' after 'PT':
     PT. Trimitra Kemasindo
Example with all lowercase:
     bright printing
Example with all uppercase:
     SMK JAKARTA DUA


In [130]:
# Standardize company names
df['Company'] = df['Company'].str.replace(
    r'^\s*(PT|Pt|pt|CV|Cv|cv|PD|Pd|pd|UD|Ud|ud)[\.\s]*',
    lambda m: m.group(1) + ' ',
    regex=True
)

In [131]:
# Capitalize company names
df['Company'] = df['Company'].str.replace(
    r'^(.)', lambda m: m.group(1).upper(), regex=True
)

In [132]:
# Convert company names written in all uppercase to title case
# df['Company'] = df['Company'].apply(
#     lambda x: (
#         x.split(' ', 1)[0] + ' ' + x.split(' ', 1)[1].title()
#         if  re.match(r'^(PT|CV|PD|UD|TK SDK|SMK|TK / SD)\s', str(x))
#         else x.title() if str(x)
#         else x
#     )
# )

df['Company'] = df['Company'].apply(
    lambda x: (
        re.match(r'^(PT|CV|PD|UD|TK SDK|TK / SD|SMK|KJPP)[\.\s/]+', str(x), re.IGNORECASE).group(1).upper() + ' ' +
        re.sub(r'^(PT|CV|PD|UD|TK SDK|TK / SD|SMK|KJPP)[\.\s/]+', '', str(x), flags=re.IGNORECASE).title()
        if re.match(r'^(PT|CV|PD|UD|TK SDK|TK / SD|SMK|KJPP)[\.\s/]+', str(x), re.IGNORECASE)
        else str(x).title()
    )
)


In [133]:
print("Cleaned and formatted company names:")
print("    ", df["Company"].iloc[520])
print("    ", df["Company"].iloc[1092])
print("    ", df["Company"].iloc[1720])
print("    ", df["Company"].iloc[1449])
print("    ", df["Company"].iloc[207])

Cleaned and formatted company names:
     PT Alam Raya Elyndo Tangerang
     CV Prima Kreasi Dewata
     PT Trimitra Kemasindo
     Bright Printing
     SMK Jakarta Dua


### Salary Field Extraction and Transformation

In [134]:
# Create "Salary Negotiation" column
df['Salary Negotiation'] = df['Salary'].apply(
    lambda x: 1 if 'negosiasi' in str(x).lower() else 0
)

In [135]:
# Extract minimum salary and convert from 'juta' format to full numeric value
df['Min Salary'] = df['Salary'].apply(
    lambda x: int(re.match(r'Rp\s*(\d+)', str(x)).group(1)) * 1_000_000
    if re.match(r'Rp\s*(\d+)\s*[–-]', str(x)) else
    int(re.match(r'Rp\s*(\d+)', str(x)).group(1)) * 1_000_000
    if 'lebih' in str(x).lower() else pd.NA
)

# Extract maximum salary
df['Max Salary'] = df['Salary'].apply(
    lambda x: int(re.match(r'Rp\s*\d+\s*[–-]\s*(\d+)', str(x)).group(1)) * 1_000_000
    if re.match(r'Rp\s*\d+\s*[–-]\s*(\d+)', str(x)) else pd.NA
)

In [136]:
df[['Salary', 'Min Salary', 'Max Salary', 'Salary Negotiation']].iloc[780:783]

Unnamed: 0,Salary,Min Salary,Max Salary,Salary Negotiation
780,Negosiasi,,,1
781,Rp 20 Juta Lebih,20000000.0,,0
782,Rp 3 – 4 Juta,3000000.0,4000000.0,0


- The word "Negosiasi" indicates no fixed amount, so Min Salary and Max Salary are empty and Salary Negotiation is set to Yes
- Since the word "Lebih" (more than) is present but no upper limit is stated, 20 million as Min Salary and Max Salary remains empty. Negotiation is not mentioned, so Salary Negotiation is No.
- Both Min Salary and Max Salary are extracted and converted to numeric form. No negotiation mentioned, so Salary Negotiation is No.

### Category Field Cleaning and Splitting

In [29]:
df['Category'].iloc[0]

'Kontrak;SMA / SMK / STM;1-2 Tahun;Staff / Officer;Teknik;Teknik Elektro / Elektronika'

The 'Category' column contains multiple job attributes, it need to be splitted to keep only the category in this column

In [30]:
# Remove Type, Education, and Level because it's already stored
# Extract values related to experience (contains "tahun") to 'Year of Experience'
# Make 'Fresh Graduate' column that will be marked as 'Yes' if mentioned in Category or Description
# The remaining items is the Category
df[['Category', 'Year of Experience', 'Fresh Graduate']] = df.apply(
    lambda row: pd.Series(
        (
            lambda items, type_val, edu_vals, level_val, desc: (
                ';'.join([
                    i for i in items
                    if i != type_val
                    and i not in edu_vals
                    and i != level_val
                    and 'tahun' not in i.lower()
                    and 'fresh graduate' not in i.lower()
                ]),

                # Year of Experience
                ';'.join([i for i in items if 'tahun' in i.lower()]) or '0',

                # Fresh Graduate
                1 if any('fresh graduate' in i.lower() for i in items) or 'fresh graduate' in desc.lower() else 0
            )
        )(
            str(row['Category']).split(';'),
            str(row['Type']),
            str(row['Education']).split(';'),
            str(row['Level']),
            str(row.get('Description', '')).lower()
        )
    ),
    axis=1
)

In [31]:
# Current result after processing
df[['Category', 'Type', 'Education', 'Category', 'Fresh Graduate', 'Year of Experience', 'Level']].iloc[0:1]

Unnamed: 0,Category,Type,Education,Category.1,Fresh Graduate,Year of Experience,Level
0,Teknik;Teknik Elektro / Elektronika,Kontrak,SMA / SMK / STM,Teknik;Teknik Elektro / Elektronika,0,1-2 Tahun,Staff / Officer


After processing,the Category field only consist of category, there's no other job attributes

In [32]:
grouped = df['Category'].str.count(';').value_counts().sort_index()
print(grouped)

Category
0      16
1    2213
Name: count, dtype: int64


The category consists of a main category and a subcategory. There are 16 jobs that only have a main category.

In [33]:
# Example row with no semicolon in 'Category' (this means it only has a main category and no subcategory)
df[df['Category'].str.count(';') == 0].head(1)

Unnamed: 0,Job ID,Title,Company,Location,Salary,Category,Posted Date,Link,Function,Type,Education,Level,Description,Salary Negotiation,Min Salary,Max Salary,Year of Experience,Fresh Graduate
1191,15663049,Marketing Associate,PT Metalindo Multiperkasa,Bandung Kota,Negosiasi,Marketing / Pemasaran,2025-06-18 17:04:19,https://www.loker.id/pemasaran/marketing-assoc...,Marketing Executive,Full Time,SMA / SMK / STM;Diploma/D1/D2/D3;Sarjana / S1,Junior / Entry Level,Kami mencari seorang Marketing Associate yang ...,Yes,,,1-2 Tahun,0


In [34]:
# Example row with semicolon in 'Category' (this means it main category and subcategory)
df[df['Category'].str.count(';') == 1].head(1)

Unnamed: 0,Job ID,Title,Company,Location,Salary,Category,Posted Date,Link,Function,Type,Education,Level,Description,Salary Negotiation,Min Salary,Max Salary,Year of Experience,Fresh Graduate
0,15758013,Staff Engineering,Rusunami Citypark,Jakarta Barat,Rp 3 – 4 Juta,Teknik;Teknik Elektro / Elektronika,2025-07-09 17:01:00,https://www.loker.id/teknik/elektro/staff-engi...,Teknisi Elektro,Kontrak,SMA / SMK / STM,Staff / Officer,"Badan Pengelola Rusunami City Park adalah, Pen...",No,3000000,4000000,1-2 Tahun,0


In [35]:
# Split into main and subcategory
split_cat = df['Category'].str.split(';', n=1, expand=True)

# Assign main category and subcategory
df['Category'] = split_cat[0]
df['Subcategory'] = split_cat[1]  # This will be NaN if there's no subcategory

In [36]:
# After processing results
df[['Category', 'Subcategory']].iloc[0:1]

Unnamed: 0,Category,Subcategory
0,Teknik,Teknik Elektro / Elektronika


### Standardizing Job Type and Education Fields

##### Standardizing Job Type Language

In [37]:
# Check unique values from "Type" column
print(df['Type'].unique())

['Kontrak' 'Full Time' 'Part Time' 'Magang' 'Freelance']


In [38]:
# Translate job types from Indonesian to English for consistency
df['Type'] = df['Type'].replace({
    'Kontrak': 'Contract',
    'Magang': 'Internship'
})
print(df['Type'].unique())

['Contract' 'Full Time' 'Part Time' 'Internship' 'Freelance']


##### Education Field Handling 

In [39]:
df['Education'].iloc[0]

'SMA / SMK / STM'

In [40]:
# Clean up slashes and extra spaces in "Education" column
df['Education'] = df['Education'].str.replace(r'\s*/\s*', '/', regex=True)
df['Education'] = df['Education'].str.replace(r'\s{2,}', ' ', regex=True).str.strip()
df['Education'].iloc[0] # example results

'SMA/SMK/STM'

In [41]:
# Check unique values from "Education" column
all_edu = df['Education'].dropna().astype(str).str.split(';')
flat_list = [item.strip() for sublist in all_edu for item in sublist]
unique_values = sorted(set(flat_list))
unique_values

['Diploma/D1/D2/D3', 'Doctor/S3', 'Master/S2', 'SMA/SMK/STM', 'Sarjana/S1']

In [42]:
# Turn each education level into its own column
edu_binary = df['Education'].fillna('').astype(str).str.get_dummies(sep=';')
df = pd.concat([df, edu_binary], axis=1)

In [43]:
df[['Education', 'SMA/SMK/STM', 'Diploma/D1/D2/D3', 'Sarjana/S1', 'Master/S2', 'Doctor/S3']].iloc[4:7] # After processing results example

Unnamed: 0,Education,SMA/SMK/STM,Diploma/D1/D2/D3,Sarjana/S1,Master/S2,Doctor/S3
4,Diploma/D1/D2/D3;Sarjana/S1,0,1,1,0,0
5,SMA/SMK/STM,1,0,0,0,0
6,SMA/SMK/STM;Diploma/D1/D2/D3;Sarjana/S1,1,1,1,0,0


### Parsing and Extracting Years of Experience Field

In [44]:
# Extract the minimum number of years from the 'Year of Experience' column
df['Min Experience'] = df['Year of Experience'].astype(str).str.lower().str.split(';').apply(
    lambda parts: min(
        sum([
            [int(re.findall(r'\d+', p)[0]), int(re.findall(r'\d+', p)[1])]
            if re.match(r'\d+\s*-\s*\d+', p)
            else [10] if 'lebih dari' in p
            else [int(re.findall(r'\d+', p)[0])] if re.match(r'\d+', p)
            else []
            for p in parts
        ], [])
    ) if parts != [''] else pd.NA
)

# Extract the maximum number of years from the 'Year of Experience' column
df['Max Experience'] = df.apply(
    lambda row: max(
        sum([
            [int(re.findall(r'\d+', p)[0]), int(re.findall(r'\d+', p)[1])]
            if re.match(r'\d+\s*-\s*\d+', p)
            else [10] if 'lebih dari' in p
            else [int(re.findall(r'\d+', p)[0])] if re.match(r'\d+', p)
            else []
            for p in str(row['Year of Experience']).lower().split(';')
        ], [])
    ) if 'lebih dari' not in str(row['Year of Experience']).lower() and str(row['Year of Experience']).strip() != ''
    else pd.NA,
    axis=1
)

In [45]:
df[['Year of Experience', 'Min Experience', 'Max Experience']].iloc[0:2] # After processing results example

Unnamed: 0,Year of Experience,Min Experience,Max Experience
0,1-2 Tahun,1,2
1,1-2 Tahun;3-5 Tahun,1,5


### Skills Extraction

In [46]:
# Extract text "Keahlian" in Description column as skills
df['Skills'] = df['Description'].str.extract(r'Keahlian *:([\s\S]+?)Kualifikasi *:')
df['Skills'] = df['Skills'].fillna(df['Description']).str.strip()

In [47]:
# Remove bullet points and unnecessary symbols (e.g., '-', '•', '*')
df['Skills'] = df['Skills'].apply(lambda x: re.sub(r'[\-\•\●\*]+', '', x))

# Remove numbered list formats like "1.", "1,", or "1 ."
df['Skills'] = df['Skills'].apply(lambda x: re.sub(r'\b\d+\s*[\.,]\s*', '', x))

# Remove alphabetic list formats like "a.", "b," or "c ."
df['Skills'] = df['Skills'].apply(lambda x: re.sub(r'\b[a-zA-Z]\s*[\.,]\s*', '', x))

# Remove unwanted characters, keeping only letters, numbers, whitespace, and basic punctuation
df['Skills'] = df['Skills'].apply(lambda x: re.sub(r'[^\w\s\.,/&]', '', x))

# Split by line breaks, clean up, and ensure each sentence ends with a period
df['Skills'] = df['Skills'].apply(
    lambda x: '. '.join([
        i.strip().rstrip('.') + '.' for i in re.split(r'[\n\r]', x) if i.strip()
    ]) if x else ''
)

# Remove repeated periods, extra spaces, and leading punctuation
df['Skills'] = df['Skills'].str.replace(r'\.\s*\.', '.', regex=True)
df['Skills'] = df['Skills'].str.replace(r'\s{2,}', ' ', regex=True)
df['Skills'] = df['Skills'].str.replace(r'^\.\s*', '', regex=True)
df['Skills'] = df['Skills'].str.strip()

In [79]:
# Translate only if the language is not English
df['Skills_En'] = df['Skills'].apply(
    lambda x: (
        GoogleTranslator(source='auto', target='en').translate(x)
        if detect(x) != 'en' else x
    ) if pd.notna(x) and x.strip() != '' else ''
)

In [80]:
df[['Skills', 'Skills_En']].iloc[0:1]

Unnamed: 0,Skills,Skills_En
0,Mengerti mekanikal elektrikal dan plumbing. Me...,Understand the mechanical electrical and plumb...


In [81]:
nlp = spacy.load("en_core_web_trf")

df['Skill_Set'] = df['Skills_En'].apply(
    lambda text: [
        ' '.join([token.text for token in chunk if not token.is_stop and not token.is_punct]).strip()
        for chunk in nlp(re.sub(r'[^\w\s\-\.,]', '', text)).noun_chunks
        if any(not token.is_stop for token in chunk)
    ] if pd.notna(text) and text.strip() != '' else []
)

In [None]:
# Count unique skills
all_skills = [skill for sublist in df['Skill_Set'] for skill in sublist]
jumlah_skill_unik = len(set(all_skills))
print("Unique skill:", jumlah_skill_unik)

Unique skill: 7310


In [90]:
df['Skill_Set'] = df['Skill_Set'].apply(
    lambda skills: [
        re.sub(r'^[\W_]+|[\W_]+$', '', re.sub(r'[^\w\s\-]', '', skill)).strip()
        for skill in skills
        if isinstance(skill, str) and skill.strip() != ''
    ]
)

In [91]:
# Count unique skills
all_skills = [skill for sublist in df['Skill_Set'] for skill in sublist]
jumlah_skill_unik = len(set(all_skills))
print("Unique skill:", jumlah_skill_unik)

Unique skill: 7301


In [106]:
print(df.loc[10, 'Skill_Set']) # Example

['good communication skills', 'products', 'Google Office Google Doc', 'Google Sheets', 'Photographic skills', 'installation', 'content', 'background', 'interior', 'architecture', 'social media', 'ecommerce platforms', 'additional advantage']


In [None]:
# Standardize the skills format
df['Skill_Set'] = df['Skill_Set'].apply(
    lambda skills: [skill.title() for skill in skills if isinstance(skill, str)]
)

In [108]:
print(df.loc[10, 'Skill_Set']) # After processing example

['Good Communication Skills', 'Products', 'Google Office Google Doc', 'Google Sheets', 'Photographic Skills', 'Installation', 'Content', 'Background', 'Interior', 'Architecture', 'Social Media', 'Ecommerce Platforms', 'Additional Advantage']


In [109]:
# Choose important column and save data to csv format
df[['Job ID', 'Title', 'Company', 'Location', 'Category', 'Subcategory',
    'Function', 'Type', 'Level', 'Min Salary', 'Max Salary', 'Salary Negotiation',
    'Min Experience', 'Max Experience', 'Fresh Graduate',
    'SMA/SMK/STM', 'Diploma/D1/D2/D3', 'Sarjana/S1', 'Master/S2', 'Doctor/S3',
    'Posted Date', 'Description', 'Skill_Set', 'Link']].to_csv('job_postings_processed.csv', index=False)