## Data Cleaning

In [1]:
# import library
import pandas as pd
import re

In [3]:
# read dataset
# The raw file is pipe-delimited, so the separator must be given explicitly.
raw_path = 'dataset/data_raw.csv'
data_raw = pd.read_csv(raw_path, sep='|')

# Print column names, non-null counts and dtypes of the freshly loaded frame.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34746 entries, 0 to 34745
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34746 non-null  int64  
 1   job_title             34746 non-null  object 
 2   location              34746 non-null  object 
 3   salary_currency       34742 non-null  object 
 4   career_level          34746 non-null  object 
 5   experience_level      30205 non-null  object 
 6   education_level       34746 non-null  object 
 7   employment_type       33402 non-null  object 
 8   job_function          34746 non-null  object 
 9   job_benefits          27330 non-null  object 
 10  company_process_time  24555 non-null  object 
 11  company_size          29103 non-null  object 
 12  company_industry      33132 non-null  object 
 13  job_description       34745 non-null  object 
 14  salary                9352 non-null   float64
dtypes: float64(1), int64(1), object(13)

In [5]:
# check dataset's head (first five rows)
data_raw.head(n=5)

Unnamed: 0,id,job_title,location,salary_currency,career_level,experience_level,education_level,employment_type,job_function,job_benefits,company_process_time,company_size,company_industry,job_description,salary
0,1,Facility Maintenance & Smart Warehouse Manager,Bandung,IDR,Manajer/Asisten Manajer,5 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Manufaktur,Pemeliharaan",,,,,Deskripsi PekerjaanRequirements :D3/SI from re...,
1,2,Procurement Department Head,Jakarta Raya,IDR,Manajer/Asisten Manajer,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Manufaktur,Pembelian/Manajemen Material",,25 days,51 - 200 pekerja,Manajemen/Konsulting HR,Job Role: 1. Responsible for material availabi...,
2,3,SALES ADMIN,Jakarta Barat,IDR,Supervisor/Koordinator,4 tahun,Sarjana (S1),Penuh Waktu,"Penjualan / Pemasaran,Penjualan Ritel","Waktu regular, Senin - Jumat;Bisnis (contoh: K...",30 days,51 - 200 pekerja,Umum & Grosir,Internal Sales & AdminJob Description :We are ...,
3,4,City Operation Lead Shopee Express (Cirebon),Cirebon,IDR,Supervisor/Koordinator,5 tahun,"Sarjana (S1), Diploma Pascasarjana, Gelar Prof...",Penuh Waktu,"Pelayanan,Logistik/Rantai Pasokan","Tip;Waktu regular, Senin - Jumat;Kasual (conto...",21 days,2001 - 5000 pekerja,Retail/Merchandise,Job Description:Responsible for HSE implementa...,
4,5,Japanese Interpreter,Bekasi,IDR,Pegawai (non-manajemen & non-supervisor),2 tahun,"Sertifikat Professional, D3 (Diploma), D4 (Dip...",Penuh Waktu,"Lainnya,Jurnalis/Editor",,23 days,201 - 500 pekerja,Manajemen/Konsulting HR,Overview: Our clients is manufacture for autom...,


In [6]:
# drop unnecessary features, null, and duplicates
# Columns removed before cleaning; every remaining column must be non-null.
unused_columns = ['id', 'job_benefits', 'salary_currency', 'company_process_time', 'company_size', 'salary']
data_dropped = (
    data_raw
    .drop(columns=unused_columns)
    .dropna()            # discard rows with a missing value in any remaining column
    .drop_duplicates()   # discard rows that are exact repeats across the remaining columns
)

# Summarize the reduced frame.
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24824 entries, 1 to 34745
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_title         24824 non-null  object
 1   location          24824 non-null  object
 2   career_level      24824 non-null  object
 3   experience_level  24824 non-null  object
 4   education_level   24824 non-null  object
 5   employment_type   24824 non-null  object
 6   job_function      24824 non-null  object
 7   company_industry  24824 non-null  object
 8   job_description   24824 non-null  object
dtypes: object(9)
memory usage: 1.9+ MB


In [7]:
# generate id
# Number the remaining rows 1..N in their current order.
id_sequence = range(1, len(data_dropped) + 1)
data_dropped['id'] = [*id_sequence]

# Move the new column to the first position (pop it, then re-insert at index 0).
id_column = data_dropped.pop('id')
data_dropped.insert(loc=0, column='id', value=id_column)

In [8]:
# clean education_level and employment_type feature
# Both columns get the same treatment: every ', ' becomes a bare ','.
for column in ('education_level', 'employment_type'):
    data_dropped[column] = data_dropped[column].str.replace(', ', ',')

In [9]:
# dataset info: print column names, non-null counts and dtypes of data_dropped at this point
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24824 entries, 1 to 34745
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                24824 non-null  int64 
 1   job_title         24824 non-null  object
 2   location          24824 non-null  object
 3   career_level      24824 non-null  object
 4   experience_level  24824 non-null  object
 5   education_level   24824 non-null  object
 6   employment_type   24824 non-null  object
 7   job_function      24824 non-null  object
 8   company_industry  24824 non-null  object
 9   job_description   24824 non-null  object
dtypes: int64(1), object(9)
memory usage: 2.1+ MB


In [10]:
# clean job_description feature (remove unusual symbol)
# All three steps use the vectorized .str accessor, which passes missing values
# through unchanged instead of raising the way a per-row `re.sub` lambda would.
data_dropped['job_description'] = (
    data_dropped['job_description']
    # Literal replacements: regex=False pins plain-string matching regardless of
    # the pandas version (the default of `regex` changed in pandas 2.0).
    .str.replace('Â', ' ', regex=False)
    # '\xa0' (non-breaking space) needs its own step: `\s` in the filter below
    # treats it as whitespace, so the filter alone would leave it in the text.
    .str.replace('\xa0', ' ', regex=False)
    # Replace every character that is not an ASCII letter, a digit, whitespace,
    # or one of . , ? ! / with a single space.
    .str.replace(r'[^a-zA-Z0-9\s.,?!/]', ' ', regex=True)
)

In [11]:
# save data to csv
# index=False keeps the DataFrame index out of the written file.
cleaned_path = 'dataset/data_cleaned.csv'
data_dropped.to_csv(cleaned_path, index=False)