## Processing Data

In [1]:
# import library
import re
import pandas as pd

In [2]:
# load the cleaned dataset and print a per-column summary (non-null counts, dtypes)
data = pd.read_csv(filepath_or_buffer='dataset/data_cleaned.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24824 entries, 0 to 24823
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                24824 non-null  int64 
 1   job_title         24824 non-null  object
 2   location          24824 non-null  object
 3   career_level      24824 non-null  object
 4   experience_level  24824 non-null  object
 5   education_level   24824 non-null  object
 6   employment_type   24824 non-null  object
 7   job_function      24824 non-null  object
 8   company_industry  24824 non-null  object
 9   job_description   24824 non-null  object
dtypes: int64(1), object(9)
memory usage: 1.9+ MB


In [3]:
# job_description and job_title are not encoded anywhere below, so remove them up front
data = data.drop(columns=['job_description', 'job_title'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24824 entries, 0 to 24823
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                24824 non-null  int64 
 1   location          24824 non-null  object
 2   career_level      24824 non-null  object
 3   experience_level  24824 non-null  object
 4   education_level   24824 non-null  object
 5   employment_type   24824 non-null  object
 6   job_function      24824 non-null  object
 7   company_industry  24824 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.5+ MB


### Encode `employment_type`

In [4]:
# split the comma-separated employment_type strings into one 0/1 indicator column per value
df = data['employment_type'].str.get_dummies(sep=',')
df

Unnamed: 0,Kontrak,Paruh Waktu,Penuh Waktu,Temporer
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
24819,1,0,0,0
24820,1,0,0,0
24821,0,0,1,0
24822,0,0,1,0


In [5]:
# Replace employment_type with its indicator columns.
# The drop happens inside the same expression, before `data` is rebound: if this
# cell is re-run after employment_type is already gone, the KeyError is raised
# while `data` is still intact, instead of after duplicate indicator columns
# have been appended to it.
data = pd.concat([data.drop(columns=['employment_type']), df], axis=1)
data.head()

Unnamed: 0,id,location,career_level,experience_level,education_level,job_function,company_industry,Kontrak,Paruh Waktu,Penuh Waktu,Temporer
0,1,Jakarta Raya,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1),Diploma Pascasarjana,Gelar Profes...","Manufaktur,Pembelian/Manajemen Material",Manajemen/Konsulting HR,0,0,1,0
1,2,Jakarta Barat,Supervisor/Koordinator,4 tahun,Sarjana (S1),"Penjualan / Pemasaran,Penjualan Ritel",Umum & Grosir,0,0,1,0
2,3,Cirebon,Supervisor/Koordinator,5 tahun,"Sarjana (S1),Diploma Pascasarjana,Gelar Profes...","Pelayanan,Logistik/Rantai Pasokan",Retail/Merchandise,0,0,1,0
3,4,Bekasi,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional,D3 (Diploma),D4 (Diplo...","Lainnya,Jurnalis/Editor",Manajemen/Konsulting HR,0,0,1,0
4,5,Tangerang,Pegawai (non-manajemen & non-supervisor),1 tahun,"SMA,SMU/SMK/STM","Penjualan / Pemasaran,E-commerce",Retail/Merchandise,0,0,1,0


### Encode `education_level`

In [6]:
# split the comma-separated education_level strings into one 0/1 indicator column per value
df = data['education_level'].str.get_dummies(sep=',')
df

Unnamed: 0,D3 (Diploma),D4 (Diploma),Diploma Pascasarjana,Doktor (S3),Gelar Professional,Magister (S2),SMA,SMU/SMK/STM,Sarjana (S1),Sertifikat Professional,Tidak terspesifikasi
0,0,0,1,0,1,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,1,1,0,0,1,0,0
3,1,1,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
24819,0,0,0,0,0,0,0,0,1,0,0
24820,0,0,0,0,0,0,0,0,1,0,0
24821,0,0,0,0,0,0,0,0,1,0,0
24822,0,0,0,0,0,0,0,0,1,0,0


In [7]:
# Replace education_level with its indicator columns.
# The drop happens inside the same expression, before `data` is rebound: if this
# cell is re-run after education_level is already gone, the KeyError is raised
# while `data` is still intact, instead of after duplicate indicator columns
# have been appended to it.
data = pd.concat([data.drop(columns=['education_level']), df], axis=1)
data.head()

Unnamed: 0,id,location,career_level,experience_level,job_function,company_industry,Kontrak,Paruh Waktu,Penuh Waktu,Temporer,...,D4 (Diploma),Diploma Pascasarjana,Doktor (S3),Gelar Professional,Magister (S2),SMA,SMU/SMK/STM,Sarjana (S1),Sertifikat Professional,Tidak terspesifikasi
0,1,Jakarta Raya,Manajer/Asisten Manajer,5 tahun,"Manufaktur,Pembelian/Manajemen Material",Manajemen/Konsulting HR,0,0,1,0,...,0,1,0,1,1,0,0,1,0,0
1,2,Jakarta Barat,Supervisor/Koordinator,4 tahun,"Penjualan / Pemasaran,Penjualan Ritel",Umum & Grosir,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Cirebon,Supervisor/Koordinator,5 tahun,"Pelayanan,Logistik/Rantai Pasokan",Retail/Merchandise,0,0,1,0,...,0,1,0,1,1,0,0,1,0,0
3,4,Bekasi,Pegawai (non-manajemen & non-supervisor),2 tahun,"Lainnya,Jurnalis/Editor",Manajemen/Konsulting HR,0,0,1,0,...,1,0,0,0,0,0,0,1,1,0
4,5,Tangerang,Pegawai (non-manajemen & non-supervisor),1 tahun,"Penjualan / Pemasaran,E-commerce",Retail/Merchandise,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0


### Encode `job_function`

In [8]:
# split the comma-separated job_function strings into one 0/1 indicator column per value
df = data['job_function'].str.get_dummies(sep=',')
df

Unnamed: 0,Aktuaria/Statistik,Akuntansi / Keuangan,Akuntansi Umum / Pembiayaan,Angkatan Bersenjata,Arsitek/Desain Interior,Audit & Pajak,Bangunan/Konstruksi,Biomedis,Bioteknologi,Diagnosa/Lainnya,...,Teknik Elektronika,Teknik Industri,Teknik Kimia,Teknik Lainnya,Teknik Lingkungan,Teknik Sipil/Konstruksi Bangunan,Teknikal & Bantuan Pelanggan,Teknologi Makanan/Ahli Gizi,Telesales/Telemarketing,Top Management / Manajemen Tingkat Atas
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24819,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24820,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24821,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24822,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Replace job_function with its indicator columns.
# The drop happens inside the same expression, before `data` is rebound: if this
# cell is re-run after job_function is already gone, the KeyError is raised
# while `data` is still intact, instead of after duplicate indicator columns
# have been appended to it.
data = pd.concat([data.drop(columns=['job_function']), df], axis=1)
data.head()

Unnamed: 0,id,location,career_level,experience_level,company_industry,Kontrak,Paruh Waktu,Penuh Waktu,Temporer,D3 (Diploma),...,Teknik Elektronika,Teknik Industri,Teknik Kimia,Teknik Lainnya,Teknik Lingkungan,Teknik Sipil/Konstruksi Bangunan,Teknikal & Bantuan Pelanggan,Teknologi Makanan/Ahli Gizi,Telesales/Telemarketing,Top Management / Manajemen Tingkat Atas
0,1,Jakarta Raya,Manajer/Asisten Manajer,5 tahun,Manajemen/Konsulting HR,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jakarta Barat,Supervisor/Koordinator,4 tahun,Umum & Grosir,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Cirebon,Supervisor/Koordinator,5 tahun,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Bekasi,Pegawai (non-manajemen & non-supervisor),2 tahun,Manajemen/Konsulting HR,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Tangerang,Pegawai (non-manajemen & non-supervisor),1 tahun,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Encode `location`

In [10]:
# Known location names. A name's position in this list is its integer code,
# so reordering or inserting entries changes every code that follows.
locations = [
    'Aceh', 'Ambon', 'Asahan Kisaran', 'Badung', 'Balangan', 'Bali', 'Balikpapan', 'Bandar Lampung', 'Bandung',
    'Bangka', 'Bangka Belitung', 'Banjar', 'Banjarbaru', 'Banjarmasin', 'Banjarnegara', 'Banten', 'Bantul',
    'Banyuwangi', 'Barito Utara', 'Batam', 'Batu', 'Bekasi', 'Bengkulu', 'Bima', 'Binjai', 'Bintan', 'Bitung',
    'Blitar', 'Bogor', 'Bone', 'Bontang', 'Brebes', 'Bukittinggi', 'Bulungan', 'Cianjur', 'Cibinong', 'Cikarang',
    'Cikupa', 'Cilacap', 'Cilegon', 'Cileungsi', 'Cimahi', 'Cirebon', 'Citeureup', 'Demak', 'Denpasar', 'Depok',
    'Dumai', 'Gianyar', 'Gorontalo', 'Gowa', 'Gresik', 'Gunung Mas', 'Halmahera', 'Hulu Sungai Tengah',
    'Jakarta Barat', 'Jakarta Pusat', 'Jakarta Raya', 'Jakarta Selatan', 'Jakarta Timur', 'Jakarta Utara', 'Jambi',
    'Jawa Barat', 'Jawa Tengah', 'Jawa Timur', 'Jayapura', 'Jember', 'Jepara', 'Kalimantan Barat',
    'Kalimantan Selatan', 'Kalimantan Tengah', 'Kalimantan Timur', 'Kalimantan Utara', 'Kapuas', 'Karangasem',
    'Karawang', 'Kediri', 'Kendari', 'Kepulauan Riau', 'Kepulauan Seribu', 'Ketapang', 'Klaten', 'Klungkung',
    'Kota Banda Aceh', 'Kotabaru', 'Kotawaringin Barat', 'Kotawaringin Timur', 'Kudus', 'Kulon Progo', 'Kupang',
    'Kuta', 'Kutai Barat', 'Kutai Kartanegara', 'Kutai Timur', 'Lampung', 'Lhokseumawe', 'Lombok', 'Madiun',
    'Madura', 'Magelang', 'Mahakam Ulu', 'Makassar', 'Malang', 'Malinau', 'Maluku', 'Maluku Barat', 'Maluku Utara',
    'Mamuju', 'Manado', 'Manokwari', 'Maros', 'Mataram', 'Medan', 'Merauke', 'Metro', 'Meulaboh', 'Minahasa',
    'Mojokerto', 'Muara Enim', 'Murung Raya', 'Nias', 'Nunukan', 'Nusa Tenggara Barat', 'Nusa Tenggara Timur',
    'Padang', 'Padang Sidempuan', 'Palangkaraya', 'Palembang', 'Palopo', 'Palu', 'Pandeglang', 'Pangandaran',
    'Pangkal Pinang', 'Papua', 'Papua Barat', 'Pare-Pare', 'Paser', 'Pasuruan', 'Pekalongan', 'Pekanbaru',
    'Pemalang', 'Pematangsiantar', 'Penajam Paser Utara', 'Ponorogo', 'Pontianak', 'Poso', 'Prabumulih',
    'Probolinggo', 'Purbalingga', 'Purwakarta', 'Purwokerto', 'Purworejo', 'Rangkasbitung', 'Riau', 'Salatiga',
    'Samarinda', 'Semarang', 'Seminyak', 'Serang', 'Sibolga', 'Sidoarjo', 'Singkawang', 'Sintang', 'Sleman',
    'Sorong', 'Sukabumi', 'Sulawesi Barat', 'Sulawesi Selatan', 'Sulawesi Tengah', 'Sulawesi Tenggara',
    'Sulawesi Utara', 'Sumatera Barat', 'Sumatera Selatan', 'Sumatera Utara', 'Sumbawa', 'Surabaya', 'Surakarta',
    'Tabalong', 'Tanah Bumbu', 'Tanah Laut', 'Tangerang', 'Tanjung Balai', 'Tanjung Pinang', 'Tapanuli', 'Tarakan',
    'Tasikmalaya', 'Tegal', 'Ternate', 'Timika', 'Tuban', 'Ubud', 'Ungaran', 'Wonogiri', 'Yogyakarta'
]

# name -> 0-based position in `locations`
location_encoding = dict(zip(locations, range(len(locations))))

len(locations)

194

In [11]:
# encode location column using encoded values above
location_codes = data['location'].map(location_encoding)

# Series.map yields NaN for every value that is not a key of the dict, so a
# location missing from `locations` (or a re-run of this cell on the already
# encoded column) would silently blank values out. Fail loudly instead and
# leave `data` untouched.
unmapped_locations = data.loc[location_codes.isna(), 'location'].unique().tolist()
if unmapped_locations:
    raise ValueError(f'location values without an encoding: {unmapped_locations}')

data['location'] = location_codes
data.head()

Unnamed: 0,id,location,career_level,experience_level,company_industry,Kontrak,Paruh Waktu,Penuh Waktu,Temporer,D3 (Diploma),...,Teknik Elektronika,Teknik Industri,Teknik Kimia,Teknik Lainnya,Teknik Lingkungan,Teknik Sipil/Konstruksi Bangunan,Teknikal & Bantuan Pelanggan,Teknologi Makanan/Ahli Gizi,Telesales/Telemarketing,Top Management / Manajemen Tingkat Atas
0,1,57,Manajer/Asisten Manajer,5 tahun,Manajemen/Konsulting HR,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,55,Supervisor/Koordinator,4 tahun,Umum & Grosir,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,42,Supervisor/Koordinator,5 tahun,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,21,Pegawai (non-manajemen & non-supervisor),2 tahun,Manajemen/Konsulting HR,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,180,Pegawai (non-manajemen & non-supervisor),1 tahun,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Encode `career_level`

In [12]:
# Known career_level values. A value's position in this list is its integer
# code; the order is the one written here, not a seniority ranking.
career_levels = [
    'Manajer/Asisten Manajer',
    'Supervisor/Koordinator',
    'Pegawai (non-manajemen & non-supervisor)',
    'Lulusan baru/Pengalaman kerja kurang dari 1 tahun',
    'CEO/GM/Direktur/Manajer Senior',
]

# value -> 0-based position in `career_levels`
career_level_encoding = dict(zip(career_levels, range(len(career_levels))))

len(career_levels)

5

In [13]:
# encode career_level column using encoded values above
career_level_codes = data['career_level'].map(career_level_encoding)

# Series.map yields NaN for every value that is not a key of the dict, so a
# career level missing from `career_levels` (or a re-run of this cell on the
# already encoded column) would silently blank values out. Fail loudly instead
# and leave `data` untouched.
unmapped_career_levels = data.loc[career_level_codes.isna(), 'career_level'].unique().tolist()
if unmapped_career_levels:
    raise ValueError(f'career_level values without an encoding: {unmapped_career_levels}')

data['career_level'] = career_level_codes
data.head()

Unnamed: 0,id,location,career_level,experience_level,company_industry,Kontrak,Paruh Waktu,Penuh Waktu,Temporer,D3 (Diploma),...,Teknik Elektronika,Teknik Industri,Teknik Kimia,Teknik Lainnya,Teknik Lingkungan,Teknik Sipil/Konstruksi Bangunan,Teknikal & Bantuan Pelanggan,Teknologi Makanan/Ahli Gizi,Telesales/Telemarketing,Top Management / Manajemen Tingkat Atas
0,1,57,0,5 tahun,Manajemen/Konsulting HR,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,55,1,4 tahun,Umum & Grosir,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,42,1,5 tahun,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,21,2,2 tahun,Manajemen/Konsulting HR,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,180,2,1 tahun,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Encode `experience_level`

In [14]:
# Known experience_level values. A value's position in this list is its integer
# code. NOTE(review): the list is not sorted by number of years ('5 tahun' is
# code 0, '4 tahun' is code 1, ...), so the codes carry no ordinal meaning.
experience_levels = [
    '5 tahun', '4 tahun', '2 tahun', '1 tahun', '3 tahun',
    '7 tahun', '6 tahun', '10 tahun', '8 tahun', '9 tahun',
    '15 tahun', '12 tahun', '14 tahun', '18 tahun', '20 tahun',
    'Lebih dari 20 Tahun', '11 tahun', '16 tahun', '17 tahun', '13 tahun',
]

# value -> 0-based position in `experience_levels`
experience_level_encoding = dict(zip(experience_levels, range(len(experience_levels))))

len(experience_levels)

20

In [15]:
# encode experience_level column using encoded values above
experience_level_codes = data['experience_level'].map(experience_level_encoding)

# Series.map yields NaN for every value that is not a key of the dict, so an
# experience level missing from `experience_levels` (or a re-run of this cell
# on the already encoded column) would silently blank values out. Fail loudly
# instead and leave `data` untouched.
unmapped_experience_levels = data.loc[experience_level_codes.isna(), 'experience_level'].unique().tolist()
if unmapped_experience_levels:
    raise ValueError(f'experience_level values without an encoding: {unmapped_experience_levels}')

data['experience_level'] = experience_level_codes
data.head()

Unnamed: 0,id,location,career_level,experience_level,company_industry,Kontrak,Paruh Waktu,Penuh Waktu,Temporer,D3 (Diploma),...,Teknik Elektronika,Teknik Industri,Teknik Kimia,Teknik Lainnya,Teknik Lingkungan,Teknik Sipil/Konstruksi Bangunan,Teknikal & Bantuan Pelanggan,Teknologi Makanan/Ahli Gizi,Telesales/Telemarketing,Top Management / Manajemen Tingkat Atas
0,1,57,0,0,Manajemen/Konsulting HR,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,55,1,1,Umum & Grosir,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,42,1,0,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,21,2,2,Manajemen/Konsulting HR,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,180,2,3,Retail/Merchandise,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Encode `company_industry`

In [16]:
# Known company_industry values. A value's position in this list is its integer
# code, so reordering or inserting entries changes every code that follows.
company_industries = [
    'Agrikultural/Perkebunan/Peternakan Unggas/Perikanan', 'Akunting / Audit / Layanan Pajak', 'Asuransi',
    'Automobil/Mesin Tambahan Automotif/Kendaraan', 'Bahan Kimia/Pupuk/Pestisida', 'BioTeknologi/Farmasi/Riset klinik',
    'Call Center/IT-Enabled Services/BPO', 'Elektrikal & Elektronik', 'Hiburan/Media', 'Hotel/Pariwisata', 'Hukum/Legal',
    'Ilmu Pengetahuan & Teknologi', 'Industri Berat/Mesin/Peralatan', 'Jual Beli Saham/Sekuritas', 'Jurnalisme',
    'Kayu/Fiber/Kertas', 'Keamanan/Penegak hukum', 'Kelautan/Aquakultur', 'Kesehatan/Medis',
    'Komputer/Teknik Informatika (Perangkat Keras)', 'Komputer/Teknik Informatika (Perangkat Lunak)',
    'Konstruksi/Bangunan/Teknik', 'Konsultasi (Bisnis & Manajemen)', 'Konsultasi (IT, Ilmu Pengetahuan, Teknis & Teknikal)',
    'Lainnya', 'Layanan Umum/Tenaga Penggerak', 'Lingkungan/Kesehatan/Keamanan', 'Luar Angkasa/Aviasi/Pesawat Terbang',
    'Makanan & Minuman/Katering/Restoran', 'Manajemen/Konsulting HR', 'Manufaktur/Produksi', 'Minyak/Gas/Petroleum', 'Olahraga',
    'Organisasi Nirlaba/Pelayanan Sosial/LSM', 'Pakaian', 'Pameran/Manajemen acara/PIKP', 'Pelayanan Arsitek/Desain Interior',
    'Pelayanan Perbaikan & Pemeliharaan', 'Pemerintahan/Pertahanan', 'Pendidikan', 'Perawatan/Kecantikan/Fitnes',
    'Perbankan/Pelayanan Keuangan', 'Percetakan/Penerbitan', 'Periklanan/Marketing/Promosi/Hubungan Masyarakat',
    'Permata/Perhiasan', 'Pertambangan', 'Polymer/Plastik/Karet/Ban', 'Produk Konsumen/Barang konsumen yang bergerak cepat',
    'Properti/Real Estate', 'R&D', 'Retail/Merchandise', 'Seni/Desain/Fashion', 'Tekstil/Garment', 'Telekomunikasi',
    'Tembakau', 'Transportasi/Logistik', 'Travel/Pariwisata', 'Umum & Grosir'
]

# value -> 0-based position in `company_industries`
company_industry_encoding = dict(zip(company_industries, range(len(company_industries))))

len(company_industries)

58

In [17]:
# encode company_industry column using encoded values above
company_industry_codes = data['company_industry'].map(company_industry_encoding)

# Series.map yields NaN for every value that is not a key of the dict, so an
# industry missing from `company_industries` (or a re-run of this cell on the
# already encoded column) would silently blank values out. Fail loudly instead
# and leave `data` untouched.
unmapped_company_industries = data.loc[company_industry_codes.isna(), 'company_industry'].unique().tolist()
if unmapped_company_industries:
    raise ValueError(f'company_industry values without an encoding: {unmapped_company_industries}')

data['company_industry'] = company_industry_codes
data.head()

Unnamed: 0,id,location,career_level,experience_level,company_industry,Kontrak,Paruh Waktu,Penuh Waktu,Temporer,D3 (Diploma),...,Teknik Elektronika,Teknik Industri,Teknik Kimia,Teknik Lainnya,Teknik Lingkungan,Teknik Sipil/Konstruksi Bangunan,Teknikal & Bantuan Pelanggan,Teknologi Makanan/Ahli Gizi,Telesales/Telemarketing,Top Management / Manajemen Tingkat Atas
0,1,57,0,0,29,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,55,1,1,57,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,42,1,0,50,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,21,2,2,29,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,180,2,3,50,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# write the fully encoded frame to disk; index=False leaves the row index out of the file
data.to_csv(path_or_buf='dataset/data_encoded.csv', index=False)