## Tiền xử lý dữ liệu

In [1]:
# Make sure the working directory is the project root (HR Analytics) so the
# relative paths used in later cells (e.g. 'data/raw/aug_train.csv') resolve.
# Imported locally because this cell runs before the notebook's import cell.
import os

# Step up only while still inside the notebooks/ folder: an unconditional
# `%cd ..` climbs one more level every time the cell is re-run, which silently
# breaks every relative path below.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
print(os.getcwd())

/mnt/d/Hp/Documents/University/Nam 3/HKI/LT_DS/HW02/HR Analytics/notebooks
/mnt/d/Hp/Documents/University/Nam 3/HKI/LT_DS/HW02/HR Analytics


In [2]:
# Import libraries
from pathlib import Path

import numpy as np
# NOTE(review): this binds `plt` to the top-level matplotlib package, not to
# matplotlib.pyplot, so pyplot-style calls such as plt.plot(...) would fail.
import matplotlib as plt
import seaborn as sns

from src import *

### Load và đọc dữ liệu

In [3]:
# Load the raw training set through the project loader and show its dtype.
file_train_path = 'data/raw/aug_train.csv'
data = load_data(file_train_path)
print(data.dtype)

# Build an all-string 2-D copy of the structured array (one column per field)
# so that rows can be printed as tab-separated text.
columns = data.dtype.names
table = np.column_stack([data[field].astype(str) for field in columns])

# Preview the first five records.
for record in table[:5]:
    print(*record, sep="\t")

[('f0', '<i8'), ('f1', '<U8'), ('f2', '<f8'), ('f3', '<U6'), ('f4', '<U23'), ('f5', '<U16'), ('f6', '<U14'), ('f7', '<U15'), ('f8', '<U3'), ('f9', '<U9'), ('f10', '<U19'), ('f11', '<U5'), ('f12', '<i8'), ('f13', '<f8')]
8949	city_103	0.92	Male	Has relevent experience	no_enrollment	Graduate	STEM	>20			1	36	1.0
29725	city_40	0.7759999999999999	Male	No relevent experience	no_enrollment	Graduate	STEM	15	50-99	Pvt Ltd	>4	47	0.0
11561	city_21	0.624		No relevent experience	Full time course	Graduate	STEM	5			never	83	0.0
33241	city_115	0.789		No relevent experience		Graduate	Business Degree	<1		Pvt Ltd	never	52	1.0
666	city_162	0.767	Male	Has relevent experience	no_enrollment	Masters	STEM	>20	50-99	Funded Startup	4	8	0.0


### Check for duplicate rows

In [4]:
# Duplicates exist when the number of distinct records is smaller than the
# total number of records.
# NOTE(review): at this point the records still contain the ID field 'f0', so
# rows that are identical except for their ID are counted as distinct.
n_records = len(data)
n_distinct = np.unique(data).size
has_duplicates = n_distinct != n_records
has_duplicates

False

##### Không có dữ liệu trùng lặp

#### Ta thấy cột đầu tiên là ID, không có ý nghĩa gì nên ta sẽ loại bỏ cột này

In [5]:
# Keep every field except the ID field 'f0'.
new_fields = list(filter(lambda field: field != 'f0', data.dtype.names))
data = data[new_fields]
print(data.dtype)

# Preview the first five records of the reduced array as tab-separated text.
columns = data.dtype.names
table = np.column_stack([data[field].astype(str) for field in columns])
for record in table[:5]:
    print(*record, sep="\t")

{'names': ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13'], 'formats': ['<U8', '<f8', '<U6', '<U23', '<U16', '<U14', '<U15', '<U3', '<U9', '<U19', '<U5', '<i8', '<f8'], 'offsets': [8, 40, 48, 72, 164, 228, 284, 344, 356, 392, 468, 488, 496], 'itemsize': 504}
city_103	0.92	Male	Has relevent experience	no_enrollment	Graduate	STEM	>20			1	36	1.0
city_40	0.7759999999999999	Male	No relevent experience	no_enrollment	Graduate	STEM	15	50-99	Pvt Ltd	>4	47	0.0
city_21	0.624		No relevent experience	Full time course	Graduate	STEM	5			never	83	0.0
city_115	0.789		No relevent experience		Graduate	Business Degree	<1		Pvt Ltd	never	52	1.0
city_162	0.767	Male	Has relevent experience	no_enrollment	Masters	STEM	>20	50-99	Funded Startup	4	8	0.0


#### Ta thấy cột company_size (f9) chứa nhiều khoảng giá trị nên ta sẽ ánh xạ (mapping) các khoảng này về các nhóm Startup / Small / Medium / Large

In [6]:
# Raw company-size labels grouped by the coarse bucket they collapse into.
size_buckets = {
    'Startup': ('<10',),
    'Small': ('10/49', '50-99'),
    'Medium': ('100-500', '500-999'),
    'Large': ('1000-4999', '5000-9999', '10000+'),
}

# Flatten into a raw label -> bucket lookup.
company_size_map = {
    raw_label: bucket
    for bucket, raw_labels in size_buckets.items()
    for raw_label in raw_labels
}

# Work on a string copy of field f9 and swap each raw label for its bucket;
# values that are not in the lookup (e.g. the empty string) are left as they are.
col = data['f9'].astype(str)
for raw_label, bucket in company_size_map.items():
    col = np.where(col == raw_label, bucket, col)

# Write the bucketed labels back into the structured array and display it.
data['f9'] = col
data

array([('city_103', 0.92 , 'Male', 'Has relevent experience', 'no_enrollment', 'Graduate', 'STEM', '>20', '', '', '1',  36, 1.),
       ('city_40', 0.776, 'Male', 'No relevent experience', 'no_enrollment', 'Graduate', 'STEM', '15', 'Small', 'Pvt Ltd', '>4',  47, 0.),
       ('city_21', 0.624, '', 'No relevent experience', 'Full time course', 'Graduate', 'STEM', '5', '', '', 'never',  83, 0.),
       ...,
       ('city_103', 0.92 , 'Male', 'Has relevent experience', 'no_enrollment', 'Graduate', 'STEM', '>20', 'Small', 'Pvt Ltd', '4',  44, 0.),
       ('city_65', 0.802, 'Male', 'Has relevent experience', 'no_enrollment', 'High School', '', '<1', 'Medium', 'Pvt Ltd', '2',  97, 0.),
       ('city_67', 0.855, '', 'No relevent experience', 'no_enrollment', 'Primary School', '', '2', '', '', '1', 127, 0.)],
      dtype={'names': ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13'], 'formats': ['<U8', '<f8', '<U6', '<U23', '<U16', '<U14', '<U15', '<U3', '<U9', '<U

### Xử lý missing values

In [7]:
# Fill in missing entries with the helper process_missing_value (not defined in
# this notebook; presumably provided by `from src import *`).
# NOTE(review): in the displayed result, fields that were empty strings before
# (f3, f7, f9, f10) are populated, e.g. with 'Male', 'STEM', 'Small', 'Pvt Ltd'.
# The exact imputation rule is not visible here -- confirm it in src.
data = process_missing_value(data)
data

array([('city_103', 0.92 , 'Male', 'Has relevent experience', 'no_enrollment', 'Graduate', 'STEM', '>20', 'Small', 'Pvt Ltd', '1',  36, 1.),
       ('city_40', 0.776, 'Male', 'No relevent experience', 'no_enrollment', 'Graduate', 'STEM', '15', 'Small', 'Pvt Ltd', '>4',  47, 0.),
       ('city_21', 0.624, 'Male', 'No relevent experience', 'Full time course', 'Graduate', 'STEM', '5', 'Small', 'Pvt Ltd', 'never',  83, 0.),
       ...,
       ('city_103', 0.92 , 'Male', 'Has relevent experience', 'no_enrollment', 'Graduate', 'STEM', '>20', 'Small', 'Pvt Ltd', '4',  44, 0.),
       ('city_65', 0.802, 'Male', 'Has relevent experience', 'no_enrollment', 'High School', 'STEM', '<1', 'Medium', 'Pvt Ltd', '2',  97, 0.),
       ('city_67', 0.855, 'Male', 'No relevent experience', 'no_enrollment', 'Primary School', 'STEM', '2', 'Small', 'Pvt Ltd', '1', 127, 0.)],
      dtype={'names': ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13'], 'formats': ['<U8', '<f8', '<U6

### Encode dữ liệu dạng chuỗi

In [8]:
# Fields to encode: every string-typed field, i.e. all fields except the
# numeric ones (f2, f12, f13).
cols_to_encode = ['f1', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11']
# encode_data is not defined in this notebook (presumably provided by
# `from src import *`); in the displayed result the listed fields have an
# integer (int64) dtype. The category -> code scheme is not visible here.
data = encode_data(data, cols_to_encode)
data

array([( 5, 0.92 , 1, 0, 2, 0, 5, 21, 2, 5, 0,  36, 1.),
       (77, 0.776, 1, 1, 2, 0, 5,  6, 2, 5, 4,  47, 0.),
       (64, 0.624, 1, 1, 0, 0, 5, 15, 2, 5, 5,  83, 0.), ...,
       ( 5, 0.92 , 1, 0, 2, 0, 5, 21, 2, 5, 3,  44, 0.),
       (94, 0.802, 1, 0, 2, 1, 5, 20, 1, 5, 1,  97, 0.),
       (95, 0.855, 1, 1, 2, 4, 5, 11, 2, 5, 0, 127, 0.)],
      dtype=[('f1', '<i8'), ('f2', '<f8'), ('f3', '<i8'), ('f4', '<i8'), ('f5', '<i8'), ('f6', '<i8'), ('f7', '<i8'), ('f8', '<i8'), ('f9', '<i8'), ('f10', '<i8'), ('f11', '<i8'), ('f12', '<i8'), ('f13', '<f8')])

### Lưu dữ liệu sau khi đã preprocessing

In [10]:
# Persist the preprocessed structured array as a .npy file.
save_path = "./data/processed/train.npy"

# np.save does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout without data/processed/ fails here
# with FileNotFoundError after all preprocessing has already run.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
np.save(save_path, data)