# Data Preparation 

1) Pengumpulan Data

In [1]:
# Import Libraries

# Mengimpor library dasar
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load Dataset

# Mengimpor dataset dari penyimpanan internal 
data_set = 'bahan/final bahan/mergeFile-en new.csv'
df = pd.read_csv(data_set, encoding='utf-8', sep=';')

2) Penanganan Data yang Hilang

In [3]:
# Menggunakan isnull() dan sum()
missing_count = df["descriptions"].isnull().sum()
print(f"Jumlah missing value dalam kolom 'descriptions': {missing_count}")

Jumlah missing value dalam kolom 'descriptions': 107


In [4]:
# Menampilkan baris dengan missing value
rows_with_missing = df[df["descriptions"].isnull()]
print("\nBaris dengan missing value dalam kolom 'desccriptions':")
print(rows_with_missing)


Baris dengan missing value dalam kolom 'desccriptions':
                             job_title  \
20                   Engineering Staff   
44                Senior Web Developer   
47         Web Developer (WIX) and SEO   
54                   Software Engineer   
58         Back end developer / Devops   
...                                ...   
2258                   Systems Analyst   
2288  GA System Apps. Engineer, IT, SS   
2290                   Systems Analyst   
2320  GA System Apps. Engineer, IT, SS   
2322                   Systems Analyst   

                                      company descriptions  \
20                     PT PRIMA PRESISI ABADI          NaN   
44                    PT Monitor News Sources          NaN   
47               PT Petrolindo Energi Perkasa          NaN   
54            PT Tigernix Solutions Indonesia          NaN   
58                    PT CahayaThanks Perfect          NaN   
...                                       ...          ...   
2258

In [5]:
# Menghapus baris yang mengandung missing values
df["descriptions"] = df["descriptions"].dropna()

3) Penanganan Outlier

In [6]:
# Analisis Panjang String
df['string_length'] = df['descriptions'].str.len()
length_mean = df['string_length'].mean()
length_std = df['string_length'].std()
length_outliers = df[(df['string_length'] > length_mean + 2*length_std) | 
                     (df['string_length'] < length_mean - 2*length_std)]

print("Outliers berdasarkan panjang string:")
print(length_outliers[['descriptions', 'string_length']])

Outliers berdasarkan panjang string:
                                           descriptions  string_length
3     Minimum working experience 2 years; Passion fo...         3266.0
9     Mastering one of the programming languages; Ma...         3668.0
138   Define and establish the end to end operation ...         3565.0
212   Manage work within the SOC scope and be respon...         2614.0
257   Apply emerging technologies to help our client...         2954.0
...                                                 ...            ...
2259  Performs analysis and makes recommendations in...         2801.0
2278  Project Management Experience\n: Strong projec...         2738.0
2291  Performs analysis and makes recommendations in...         2801.0
2310  Project Management Experience\n: Strong projec...         2738.0
2323  Performs analysis and makes recommendations in...         2801.0

[111 rows x 2 columns]


In [7]:
# Analisis Frekuensi Kata
def word_count(text):
    return len(str(text).split())

df['word_count'] = df['descriptions'].apply(word_count)
word_count_mean = df['word_count'].mean()
word_count_std = df['word_count'].std()
word_count_outliers = df[(df['word_count'] > word_count_mean + 2*word_count_std) | 
                         (df['word_count'] < word_count_mean - 2*word_count_std)]

print("\nOutliers berdasarkan jumlah kata:")
print(word_count_outliers[['descriptions', 'word_count']])


Outliers berdasarkan jumlah kata:
                                           descriptions  word_count
3     Minimum working experience 2 years; Passion fo...         463
9     Mastering one of the programming languages; Ma...         512
138   Define and establish the end to end operation ...         508
212   Manage work within the SOC scope and be respon...         353
241   Who handles technical duties, focused on selli...         369
...                                                 ...         ...
2195  Performs analysis and makes recommendations in...         377
2227  Performs analysis and makes recommendations in...         377
2259  Performs analysis and makes recommendations in...         377
2291  Performs analysis and makes recommendations in...         377
2323  Performs analysis and makes recommendations in...         377

[82 rows x 2 columns]


In [8]:
# Analisis Kata Unik
def unique_word_ratio(text):
    words = str(text).lower().split()
    return len(set(words)) / len(words) if len(words) > 0 else 0

df['unique_word_ratio'] = df['descriptions'].apply(unique_word_ratio)
unique_ratio_mean = df['unique_word_ratio'].mean()
unique_ratio_std = df['unique_word_ratio'].std()
unique_ratio_outliers = df[(df['unique_word_ratio'] > unique_ratio_mean + 2*unique_ratio_std) | 
                           (df['unique_word_ratio'] < unique_ratio_mean - 2*unique_ratio_std)]

print("\nOutliers berdasarkan rasio kata unik:")
print(unique_ratio_outliers[['descriptions', 'unique_word_ratio']])


Outliers berdasarkan rasio kata unik:
                                          descriptions  unique_word_ratio
3    Minimum working experience 2 years; Passion fo...           0.466523
21   Support for Setting Promotion on Invoices; Val...           0.389831
22   Education: Diploma or Bachelor's degree in Com...           0.431373
23   Maximum age 35 years. Minimum education: Bache...           0.459259
33   Minimum Bachelor's Degree in Information Techn...           0.430556
75   Vocational school/Diploma/Bachelor's degree in...           0.435897
102  Work in all phases of front-end and back-end d...           0.466321
138  Define and establish the end to end operation ...           0.490157


In [9]:
df

Unnamed: 0,job_title,company,descriptions,location,subcategory,type,salary,date_ingestion,string_length,word_count,unique_word_ratio
0,Microsoft Dynamics / Power Apps Developer,PT Xumei Food Tech Indonesia,"Bachelor's degree in computer science, enginee...","West Jakarta, Jakarta",Developer/Programmer,full time,,16/01/2024,1296.0,181,0.679558
1,Frontend Internship,PT. Radha Mustika Chakra,Develop new user interface features that meet ...,Jakarta,Engineering - Software,Casual/Vacation,,16/01/2024,931.0,138,0.797101
2,Fullstack Engineer,PT Teknologi Pamadya Analytical,Deeply engaged in the full development lifecyc...,"Tangerang, Banten",Developer/Programmer,full time,,16/01/2024,2395.0,318,0.694969
3,Web Developer,PT Supra Boga Lestari Tbk,Minimum working experience 2 years; Passion fo...,Jakarta,Engineering - Software,full time,,16/01/2024,3266.0,463,0.466523
4,Senior Python IT Staff,PT. Sinar Mas Insurance,"Male/Female, max age 28 years; Bachelor's degr...","Central Jakarta, Jakarta",Engineering - Software,full time,,16/01/2024,344.0,49,0.938776
...,...,...,...,...,...,...,...,...,...,...,...
2335,SYSTEM SUPPORT,PT Asuransi Harta Aman Pratama Tbk,\nRole descriptions Salary insights Tools to h...,Greater Jakarta,Network & System Administration,full time,,27/06/2024,70.0,11,1.000000
2336,Senior Systems Analyst,PT. K-LINK INDONESIA,What's your expected monthly basic salary? How...,South Jakarta,Business/System Analyst,full time,"IDR 10,000,000 – IDR 14,000,000 per month",27/06/2024,232.0,37,0.783784
2337,Inspur IT Server Engineer,TITANICOM TECH LIMITED,Coordinating with team members in order to ide...,Greater Jakarta,Engineering - Hardware,full time,"IDR 13,000,000 – IDR 14,000,000 per month",27/06/2024,2118.0,287,0.679443
2338,IT System Administration and General Affairs -...,PT Tri Dharma Protection,What's your expected monthly basic salary? Whi...,South Jakarta,Network & System Administration,full time,,27/06/2024,593.0,97,0.546392


4) Konversi Tipe Data

In [10]:
df["descriptions"] = df["descriptions"].astype(str)

Pemilihan Dataframe Sementara

In [11]:
# Memilih fitur dan label proses
df_process = df[['descriptions', 'subcategory', 'string_length', 'word_count', 'unique_word_ratio']]
df_process

Unnamed: 0,descriptions,subcategory,string_length,word_count,unique_word_ratio
0,"Bachelor's degree in computer science, enginee...",Developer/Programmer,1296.0,181,0.679558
1,Develop new user interface features that meet ...,Engineering - Software,931.0,138,0.797101
2,Deeply engaged in the full development lifecyc...,Developer/Programmer,2395.0,318,0.694969
3,Minimum working experience 2 years; Passion fo...,Engineering - Software,3266.0,463,0.466523
4,"Male/Female, max age 28 years; Bachelor's degr...",Engineering - Software,344.0,49,0.938776
...,...,...,...,...,...
2335,\nRole descriptions Salary insights Tools to h...,Network & System Administration,70.0,11,1.000000
2336,What's your expected monthly basic salary? How...,Business/System Analyst,232.0,37,0.783784
2337,Coordinating with team members in order to ide...,Engineering - Hardware,2118.0,287,0.679443
2338,What's your expected monthly basic salary? Whi...,Network & System Administration,593.0,97,0.546392


5) Pembersihan Data

In [12]:
import re

# Fungsi untuk membersihkan teks
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'\d+', '', text)  # Menghapus angka
        text = re.sub(r'[^\w\s]', '', text)  # Menghapus tanda baca dan karakter khusus kecuali spasi
        text = re.sub(r'\W', ' ', text)  # Menghapus karakter non-alphanumeric
        text = re.sub(r'\s+', ' ', text)  # Menghapus spasi berlebih
        text = text.lower()  # Mengubah teks menjadi huruf kecil
        return text
    else:
        return ''  # Kembalikan string kosong jika bukan string
    
# Contoh penerapan fungsi clean_text pada kolom 'descriptions'
df_process['descriptions'] = df_process['descriptions'].apply(clean_text)
df_process

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,descriptions,subcategory,string_length,word_count,unique_word_ratio
0,bachelors degree in computer science engineeri...,Developer/Programmer,1296.0,181,0.679558
1,develop new user interface features that meet ...,Engineering - Software,931.0,138,0.797101
2,deeply engaged in the full development lifecyc...,Developer/Programmer,2395.0,318,0.694969
3,minimum working experience years passion for p...,Engineering - Software,3266.0,463,0.466523
4,malefemale max age years bachelors degree in i...,Engineering - Software,344.0,49,0.938776
...,...,...,...,...,...
2335,role descriptions salary insights tools to he...,Network & System Administration,70.0,11,1.000000
2336,whats your expected monthly basic salary how m...,Business/System Analyst,232.0,37,0.783784
2337,coordinating with team members in order to ide...,Engineering - Hardware,2118.0,287,0.679443
2338,whats your expected monthly basic salary which...,Network & System Administration,593.0,97,0.546392


6) Stopword removal

In [13]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Model Pemrosesan Bahasa dengan spaCy
nlp = spacy.load("en_core_web_sm")
docs = list(nlp.pipe(df_process['descriptions']))

# Membaca daftar stopword dari file eksternal
with open('bahan/selected_stopwords.txt', 'r') as file:
    external_stopwords = set(file.read().splitlines())  # Menggunakan set untuk performa lebih baik

# Tampilkan daftar stopword yang dibaca
print(external_stopwords)

# Definisi fungsi untuk tokenisasi, penghapusan stop words, huruf asing, dan lemmatization
def preprocess_text(text):
    doc = nlp(text)
    
    # Proses tokenisasi, lemmatization, penghapusan stop words, huruf asing, dan stop words eksternal
    tokens = [
        token.lemma_.strip() for token in doc 
        if not token.is_stop                      # Hapus stop words dari spaCy
        and len(token.lemma_) > 1                 # Hapus token yang hanya 1 karakter
        and token.text.lower() not in STOP_WORDS  # Hapus stop words default dari spaCy
        and token.text.lower() not in external_stopwords  # Hapus stop words dari file eksternal
        and not (len(token.text) == 1 and token.is_alpha) # Hapus huruf asing yang berdiri sendiri
    ]
    
    return tokens

# Contoh penggunaan pada kolom descriptions
df_process['result_descriptions'] = df_process['descriptions'].apply(preprocess_text)

# Fungsi untuk menghapus stopword yang tersisa setelah preprocessing
def remove_stopwords(text, stopwords):
    if not isinstance(text, str):
        text = str(text)  # Mengonversi nilai non-string menjadi string
    words = text.split()
    cleaned_text = ' '.join([word for word in words if word.lower() not in stopwords])
    return cleaned_text

# Terapkan fungsi remove_stopwords pada kolom 'result_descriptions' (bisa dipakai jika diperlukan setelah lemmatization)
df_process['result_descriptions'] = df_process['result_descriptions'].apply(lambda x: remove_stopwords(' '.join(x), external_stopwords))

{'', 'karawaci', 'stun', 'expense', 'contractual', 'completeness', 'mindset', 'adjustment', 'health', 'interesting', 'simulation', 'project', 'judgment', 'rau', 'panel', 'remark', 'efficiently', 'display', 'committed', 'name', 'mathematicsphysicstelecommunication', 'liaise', 'adhoc', 'super', 'multidisciplinary', 'mainly', 'javascripttypescript', 'interchange', 'protect', 'ecommerce', 'reading', 'undergo', 'demonstrate', 'custodian', 'nd', 'crimping', 'culture', 'er', 'eagerness', 'deftly', 'tv', 'methodologie', 'spoken', 'assemble', 'wife', 'add', 'semarang', 'preferred', 'crossfuntional', 'modern', 'free', 'optimize', 'willing', 'paloalto', 'administer', 'confidently', 'normal', 'host', 'avoid', 'py', 'incld', 'assessment', 'craft', 'voip', 'packing', 'collaboration', 'orchestration', 'yarn', 'synthesis', 'involve', 'session', 'solver', 'mastery', 'element', 'active', 'aspmvc', 'pronunciation', 'question', 'encourage', 'upper', 'milestone', 'agent', 'archive', 'fastmove', 'exercise',

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
df_process

Unnamed: 0,descriptions,subcategory,string_length,word_count,unique_word_ratio,result_descriptions
0,bachelors degree in computer science engineeri...,Developer/Programmer,1296.0,181,0.679558,science engineering app development develop mi...
1,develop new user interface features that meet ...,Engineering - Software,931.0,138,0.797101,develop uiux design web design design backend ...
2,deeply engaged in the full development lifecyc...,Developer/Programmer,2395.0,318,0.694969,engage development lifecycle design develop te...
3,minimum working experience years passion for p...,Engineering - Software,3266.0,463,0.466523,programming programming js php mysql nosql res...
4,malefemale max age years bachelors degree in i...,Engineering - Software,344.0,49,0.938776,informatics engineer python programming progra...
...,...,...,...,...,...,...
2335,role descriptions salary insights tools to he...,Network & System Administration,70.0,11,1.000000,
2336,whats your expected monthly basic salary how m...,Business/System Analyst,232.0,37,0.783784,analyst programming sql query
2337,coordinating with team members in order to ide...,Engineering - Hardware,2118.0,287,0.679443,troubleshoot device server maintenance install...
2338,whats your expected monthly basic salary which...,Network & System Administration,593.0,97,0.546392,microsoft administrator office software admini...


In [15]:
# Analisis Panjang String
df_process['string_length_00'] = df_process['result_descriptions'].str.len()
length_mean = df_process['string_length_00'].mean()
length_std = df_process['string_length_00'].std()
length_outliers = df_process[(df_process['string_length_00'] > length_mean + 2*length_std) | 
                     (df_process['string_length_00'] < length_mean - 2*length_std)]

print("Outliers berdasarkan panjang string:")
print(length_outliers[['result_descriptions', 'string_length_00']])

Outliers berdasarkan panjang string:
                                    result_descriptions  string_length_00
2     engage development lifecycle design develop te...               641
3     programming programming js php mysql nosql res...               714
6     scienceinformation web programming web program...               491
9     programming netbean sql oracle sql server java...              1053
23    education engineering web developer tsql mysql...               532
...                                                 ...               ...
2186  server engineering scienceinformation manage s...               546
2218  server engineering scienceinformation manage s...               546
2250  server engineering scienceinformation manage s...               546
2282  server engineering scienceinformation manage s...               546
2314  server engineering scienceinformation manage s...               546

[104 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
# Analisis Frekuensi Kata
def word_count(text):
    return len(str(text).split())

df_process['word_count_00'] = df_process['result_descriptions'].apply(word_count)
word_count_mean = df_process['word_count_00'].mean()
word_count_std = df_process['word_count_00'].std()
word_count_outliers = df_process[(df_process['word_count_00'] > word_count_mean + 2*word_count_std) | 
                         (df_process['word_count_00'] < word_count_mean - 2*word_count_std)]

print("\nOutliers berdasarkan jumlah kata:")
print(word_count_outliers[['result_descriptions', 'word_count_00']])


Outliers berdasarkan jumlah kata:
                                    result_descriptions  word_count_00
2     engage development lifecycle design develop te...             78
3     programming programming js php mysql nosql res...             94
6     scienceinformation web programming web program...             62
9     programming netbean sql oracle sql server java...            134
23    education engineering web developer tsql mysql...             67
...                                                 ...            ...
2186  server engineering scienceinformation manage s...             61
2218  server engineering scienceinformation manage s...             61
2250  server engineering scienceinformation manage s...             61
2282  server engineering scienceinformation manage s...             61
2314  server engineering scienceinformation manage s...             61

[91 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [17]:
# Analisis Kata Unik
def unique_word_ratio(text):
    words = str(text).lower().split()
    return len(set(words)) / len(words) if len(words) > 0 else 0

df_process['unique_word_ratio_00'] = df_process['result_descriptions'].apply(unique_word_ratio)
unique_ratio_mean = df_process['unique_word_ratio_00'].mean()
unique_ratio_std = df_process['unique_word_ratio_00'].std()
unique_ratio_outliers = df_process[(df_process['unique_word_ratio_00'] > unique_ratio_mean + 2*unique_ratio_std) | 
                           (df_process['unique_word_ratio_00'] < unique_ratio_mean - 2*unique_ratio_std)]

print("\nOutliers berdasarkan rasio kata unik:")
print(unique_ratio_outliers[['result_descriptions', 'unique_word_ratio_00']])


Outliers berdasarkan rasio kata unik:
     result_descriptions  unique_word_ratio_00
34                                         0.0
45                                         0.0
146                                        0.0
148                                        0.0
156                                        0.0
...                  ...                   ...
2303                                       0.0
2319                                       0.0
2321                                       0.0
2325                                       0.0
2335                                       0.0

[261 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
df_process

Unnamed: 0,descriptions,subcategory,string_length,word_count,unique_word_ratio,result_descriptions,string_length_00,word_count_00,unique_word_ratio_00
0,bachelors degree in computer science engineeri...,Developer/Programmer,1296.0,181,0.679558,science engineering app development develop mi...,249,30,0.500000
1,develop new user interface features that meet ...,Engineering - Software,931.0,138,0.797101,develop uiux design web design design backend ...,168,25,0.680000
2,deeply engaged in the full development lifecyc...,Developer/Programmer,2395.0,318,0.694969,engage development lifecycle design develop te...,641,78,0.602564
3,minimum working experience years passion for p...,Engineering - Software,3266.0,463,0.466523,programming programming js php mysql nosql res...,714,94,0.361702
4,malefemale max age years bachelors degree in i...,Engineering - Software,344.0,49,0.938776,informatics engineer python programming progra...,84,9,0.777778
...,...,...,...,...,...,...,...,...,...
2335,role descriptions salary insights tools to he...,Network & System Administration,70.0,11,1.000000,,0,0,0.000000
2336,whats your expected monthly basic salary how m...,Business/System Analyst,232.0,37,0.783784,analyst programming sql query,29,4,1.000000
2337,coordinating with team members in order to ide...,Engineering - Hardware,2118.0,287,0.679443,troubleshoot device server maintenance install...,365,41,0.634146
2338,whats your expected monthly basic salary which...,Network & System Administration,593.0,97,0.546392,microsoft administrator office software admini...,60,6,0.666667


Lanjutan Penanganan Outlier

In [20]:
df_before = df_process[['descriptions', 'string_length', 'word_count', 'unique_word_ratio']]
df_after = df_process[['result_descriptions', 'string_length_00', 'word_count_00', 'unique_word_ratio_00']]

In [21]:
df_before

Unnamed: 0,descriptions,string_length,word_count,unique_word_ratio
0,bachelors degree in computer science engineeri...,1296.0,181,0.679558
1,develop new user interface features that meet ...,931.0,138,0.797101
2,deeply engaged in the full development lifecyc...,2395.0,318,0.694969
3,minimum working experience years passion for p...,3266.0,463,0.466523
4,malefemale max age years bachelors degree in i...,344.0,49,0.938776
...,...,...,...,...
2335,role descriptions salary insights tools to he...,70.0,11,1.000000
2336,whats your expected monthly basic salary how m...,232.0,37,0.783784
2337,coordinating with team members in order to ide...,2118.0,287,0.679443
2338,whats your expected monthly basic salary which...,593.0,97,0.546392


In [22]:
df_after

Unnamed: 0,result_descriptions,string_length_00,word_count_00,unique_word_ratio_00
0,science engineering app development develop mi...,249,30,0.500000
1,develop uiux design web design design backend ...,168,25,0.680000
2,engage development lifecycle design develop te...,641,78,0.602564
3,programming programming js php mysql nosql res...,714,94,0.361702
4,informatics engineer python programming progra...,84,9,0.777778
...,...,...,...,...
2335,,0,0,0.000000
2336,analyst programming sql query,29,4,1.000000
2337,troubleshoot device server maintenance install...,365,41,0.634146
2338,microsoft administrator office software admini...,60,6,0.666667


In [24]:
df_compare = df_process[['descriptions', 'result_descriptions']]
df_compare

Unnamed: 0,descriptions,result_descriptions
0,bachelors degree in computer science engineeri...,science engineering app development develop mi...
1,develop new user interface features that meet ...,develop uiux design web design design backend ...
2,deeply engaged in the full development lifecyc...,engage development lifecycle design develop te...
3,minimum working experience years passion for p...,programming programming js php mysql nosql res...
4,malefemale max age years bachelors degree in i...,informatics engineer python programming progra...
...,...,...
2335,role descriptions salary insights tools to he...,
2336,whats your expected monthly basic salary how m...,analyst programming sql query
2337,coordinating with team members in order to ide...,troubleshoot device server maintenance install...
2338,whats your expected monthly basic salary which...,microsoft administrator office software admini...


7) Seleksi Fitur

In [19]:
# Memilih fitur dan label untuk pemodelan
df_selected = df_process[['subcategory', 'result_descriptions']]

In [20]:
df_selected

Unnamed: 0,subcategory,result_descriptions
0,Developer/Programmer,science engineering app development develop mi...
1,Engineering - Software,develop uiux design web design design backend ...
2,Developer/Programmer,engage development lifecycle design develop te...
3,Engineering - Software,programming programming js php mysql nosql res...
4,Engineering - Software,informatics engineer python programming progra...
...,...,...
2335,Network & System Administration,
2336,Business/System Analyst,analyst programming sql query
2337,Engineering - Hardware,troubleshoot device server maintenance install...
2338,Network & System Administration,microsoft administrator office software admini...


In [21]:
# Menyimpan dataset yang sudah diubah
#df_selected.to_csv("bahan/final bahan/result data prep (01).csv", index=False)