## Import thư viện

In [105]:
import pandas as pd
import re
import numpy as np
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Đọc dữ liệu từ bộ dataset

In [106]:
data_df = pd.read_csv("./analysis_df.csv")
data_df.head()

Unnamed: 0,Age,Gender,Salary,Title,Formal Education,Learning Platforms,Coding Experience,ML Experience,Important Activities,Languages,...,Published Papers,ML Research,ML Serve,Ethical AI tools,Industry Type,Data Storage Products,Data Products,ML Monitor Tools,Country,Year
0,18-21,Man,1528.0,Student,Bachelor's degree,Coursera -- -1,< 1 year,< 1 year,Build and/or run a machine learning service th...,Python -- Java -- C/C++ -- -1,...,,,,,I am a student,,,,Viet Nam,2018
1,18-21,Man,1528.0,Student,Some college/university study without<br>earni...,Udacity -- Coursera -- edX -- Udemy -- -1,< 1 year,< 1 year,Analyze and understand data to influence produ...,Python -- -1,...,,,,,Computers/Technology,,,,Viet Nam,2018
2,18-21,Man,,Student,Bachelor's degree,Coursera -- -1,1-5 years,1-2 years,Build and/or run a machine learning service th...,Python -- -1,...,,,,,Computers/Technology,,,,Viet Nam,2018
3,22-24,Man,1528.0,Student,Master's degree,Coursera -- DataQuest -- -1,< 1 year,< 1 year,Do research that advances the state of the art...,Python -- R -- Bash -- -1,...,,,,,I am a student,,,,Viet Nam,2018
4,22-24,Man,1528.0,Data Scientist,Bachelor's degree,Coursera -- -1,1-5 years,< 1 year,Analyze and understand data to influence produ...,R -- SQL -- -1,...,,,,,Insurance/Risk Assessment,,,,Viet Nam,2018


## Tiền xử lý dữ liệu để huấn luyện

In [107]:
new_df = data_df[["Age", "Gender", "Salary", "Title", "Formal Education", "Coding Experience", "ML Experience", "Country", "Year"]]
new_df = new_df.dropna()

new_df

Unnamed: 0,Age,Gender,Salary,Title,Formal Education,Coding Experience,ML Experience,Country,Year
0,18-21,Man,1528.0,Student,Bachelor's degree,< 1 year,< 1 year,Viet Nam,2018
1,18-21,Man,1528.0,Student,Some college/university study without<br>earni...,< 1 year,< 1 year,Viet Nam,2018
3,22-24,Man,1528.0,Student,Master's degree,< 1 year,< 1 year,Viet Nam,2018
4,22-24,Man,1528.0,Data Scientist,Bachelor's degree,1-5 years,< 1 year,Viet Nam,2018
6,18-21,Woman,1528.0,Student,Bachelor's degree,< 1 year,< 1 year,Viet Nam,2018
...,...,...,...,...,...,...,...,...,...
51727,25-29,Man,10223.0,Data Scientist,Bachelor's degree,5-10 years,2-3 years,India🇮🇳,2022
51739,22-24,Man,2555.0,Data Engineer,Master's degree,1-5 years,< 1 year,India🇮🇳,2022
51748,45-49,Man,32860.0,Manager,Master's degree,20+ years,I do not use machine<br>learning methods,India🇮🇳,2022
51751,35-39,Man,730.0,Data Scientist,Master's degree,1-5 years,1-2 years,India🇮🇳,2022


### Xử lý cột Age

In [108]:
new_df["Age"].unique()

array(['18-21', '22-24', '30-34', '25-29', '40-44', '35-39', '45-49',
       '50-54', '55-59', '60-69', '70+'], dtype=object)

In [109]:
new_df["Age"] = new_df["Age"].apply(lambda x: x.replace("+", "")).apply(lambda x: sum(map(lambda i: int(i), x.split("-"))) / len(x.split("-")))

### Xử lý cột Gender

In [110]:
gender_list = new_df["Gender"].unique()
gender_list

array(['Man', 'Woman', 'Prefer not to say', 'Prefer to self-describe',
       'Nonbinary'], dtype=object)

In [111]:
gender_map = {gender: idx for gender, idx in zip(list(gender_list), range(1, len(list(gender_list)) + 1))}

new_df["Gender"] = new_df["Gender"].map(gender_map)

### Xử lý cột Title

In [112]:
title_list = new_df["Title"].unique()
title_list

array(['Student', 'Data Scientist', 'Software Engineer', 'Manager',
       'Consultant', 'Data Analyst', 'Other', 'Product/Project Manager',
       'Data Engineer', 'Research Assistant', 'Business Analyst',
       'Chief Officer', 'Research Scientist', 'DBA/Database Engineer',
       'Machine Learning Engineer', 'Statistician', 'Product Manager',
       'Program/Project Manager', 'Machine Learning/ MLops Engineer',
       'Teacher / professor', 'Engineer (non-software)',
       'Developer Advocate', 'Principal Investigator',
       'Marketing Analyst', 'Salesperson', 'Data Journalist',
       'Developer Relations/Advocacy', 'Data Architect',
       'Data Administrator'], dtype=object)

In [113]:
title_map = {title: idx for title, idx in zip(list(title_list), range(1, len(list(title_list)) + 1))}

new_df["Title"] = new_df["Title"].map(title_map)

### Xử lý cột Formal Education

In [114]:
education_list = new_df["Formal Education"].unique()
education_list

array(["Bachelor's degree",
       "Some college/university study without<br>earning a bachelor's degree",
       "Master's degree", 'Doctoral degree',
       'No formal education past high school', 'Professional degree',
       'I prefer not to answer'], dtype=object)

In [115]:
education_map = {education: idx for education, idx in zip(list(education_list), [10, 7, 20, 30, 3, 15, 8])}

new_df["Formal Education"] = new_df["Formal Education"].map(education_map)

### Xử lý cột Coding Experience

In [116]:
pattern = r'\b([0-9]+)\b'

In [117]:
exp_list = new_df["Coding Experience"].unique()
exp_list

array(['< 1 year', '1-5 years', 'I have never written code', '5-10 years',
       '10-20 years', '20+ years'], dtype=object)

In [118]:
exp_numeric = []

for exp in exp_list:
    matches = re.findall(pattern, exp)
    numbers = [int(match[:2]) for match in matches]
    if not numbers:
        numbers = [0]
    exp_numeric.append(numbers)

In [119]:
exp_map = {k: sum(v) / len(v) for k, v in zip(list(exp_list), exp_numeric)}

new_df["Coding Experience"] = new_df["Coding Experience"].map(exp_map)

### Xử lý cột ML Experience

In [120]:
ml_list = new_df["ML Experience"].unique()
ml_list

array(['< 1 year', '2-3 years', '5-10 years', '1-2 years', '3-4 years',
       'I do not use machine<br>learning methods', '4-5 years',
       '10-20 years', '20+ years'], dtype=object)

In [121]:
ml_numeric = []

for ml in ml_list:
    matches = re.findall(pattern, ml)
    numbers = [int(match[:2]) for match in matches]
    if not numbers:
        numbers = [0]
    ml_numeric.append(numbers)

In [122]:
ml_map = {k: sum(v) / len(v) for k, v in zip(list(ml_list), ml_numeric)}

new_df["ML Experience"] = new_df["ML Experience"].map(ml_map)

### Xử lý cột Country

In [123]:
country_list = new_df["Country"].unique()

In [124]:
country_map = {k: v for k, v in zip(list(country_list), [5, 20, 18, 15])}

new_df["Country"] = new_df["Country"].map(country_map)

## Chia tập dữ liệu thành train và set

In [125]:
X = new_df.drop("Salary", axis=1)
y = new_df["Salary"]

X = X.to_numpy()
y = y.to_numpy()
y = zscore(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18094, 8), (4524, 8), (18094,), (4524,))

## Huấn luyện mô hình bằng Linear Regression

### Bài toán: Dự đoán mức lương của 1 người tham gia trả lời dựa vào 1 số thông tin cá nhân của họ

**Các cột dữ liệu được sử dụng làm feature:**
- Age: tuổi
- Gender: giới tính
- Title: vị trí/vai trò
- Formal Education: bằng cấp
- Coding Experience: số năm kinh nghiệm lập trình
- ML Experience: số năm kinh nghiệm về machine learning
- Country: quốc gia
- Year: năm người tham gia trả lời câu hỏi

**Cột dữ liệu làm label:**
- Salary: mức lương của người đó

**Mô hình:** Linear Regression

**Metrics đánh giá:** RMSE và MAE

In [127]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

LinearRegression()

## Đánh giá mô hình bằng Linear Regression

In [128]:
linear_predict = linear_model.predict(X_test)
linear_rmse = np.sqrt(mean_squared_error(y_test, linear_predict))
linear_mae = mean_absolute_error(y_test, linear_predict)
print("Linear Regression RMSE:", linear_rmse)
print("Linear Regression MAE", linear_mae)

Linear Regression RMSE: 0.9094569355861088
Linear Regression MAE 0.44278705430694054
