In [None]:
# !pip install -U pandas catboost scikit-learn

# [v1] - Training & Evaluation (baseline, dummy, PoC, prototype)

> The goal of this notebook is to train and evaluate a Machine Learning model.

---

## Get Training and Testing datasets

In [None]:
import pandas as pd

# Load the raw training data from the CSV file in the working directory.
train_df = pd.read_csv(filepath_or_buffer="Train_rev1.csv")

---

## Get the independent and dependent (target) variables to train the model

In [None]:
# Feature matrix: every column except the row identifier ('Id'), the raw
# location string ('LocationRaw') and the target ('SalaryNormalized').
# All remaining values are cast to str before being handed to the model.
#
# NOTE(review): 'SalaryRaw' stays in X, and in the rows displayed below the
# target looks like the midpoint of the SalaryRaw range (e.g. "20000 - 30000"
# with a target of 25000). That is very likely target leakage -- confirm, and
# if so drop it here together with its entry in the Pool `text_features` lists.
X = (
    train_df
    .drop(['Id', 'LocationRaw', 'SalaryNormalized'], axis="columns")
    .astype(str)
)
X.head()

Unnamed: 0,Title,FullDescription,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SourceName
0,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,cv-library.co.uk
1,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,cv-library.co.uk
2,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,cv-library.co.uk
3,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,Surrey,,permanent,Gregory Martin International,Engineering Jobs,25000 - 30000/annum 25K-30K negotiable,cv-library.co.uk
4,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...",Surrey,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,cv-library.co.uk


In [None]:
# Target vector: the normalized salary column the model is trained to predict.
y = train_df.loc[:, "SalaryNormalized"]
y

0         25000
1         30000
2         30000
3         27500
4         25000
          ...  
244763    22800
244764    22800
244765    22800
244766    22800
244767    42500
Name: SalaryNormalized, Length: 244768, dtype: int64

---

## Split the training dataset into "training" and "validation" sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

---

## Training the model

In [None]:
from catboost import Pool

# Column roles, declared once so the training and validation pools cannot
# drift apart when a feature is added or removed.
CAT_FEATURES = ['LocationNormalized', 'ContractType', 'ContractTime', 'Company', 'Category', 'SourceName']
TEXT_FEATURES = ['Title', 'FullDescription', 'SalaryRaw']


def _make_pool(features, target):
    """Wrap a feature frame and its target in a CatBoost ``Pool``.

    Args:
        features: DataFrame holding the columns named in ``CAT_FEATURES``
            and ``TEXT_FEATURES``.
        target: Target values aligned with ``features``.

    Returns:
        A ``Pool`` with the categorical and text columns declared.
    """
    return Pool(
        features,
        target,
        cat_features=CAT_FEATURES,
        text_features=TEXT_FEATURES,
    )


# Encapsulate training data.
pool_train = _make_pool(X_train, y_train)

# Encapsulate validation data.
pool_valid = _make_pool(X_valid, y_valid)

In [None]:
from catboost import CatBoostRegressor
import time

# Train a CatBoost regressor with default hyper-parameters and time the fit.
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# is not distorted if the system wall clock is adjusted during the long fit.
#
# NOTE(review): pool_valid is passed as eval_set here and the same rows are used
# later to compute the MAE. If CatBoost picks its best iteration on eval_set,
# the reported MAE is optimistic -- consider scoring on a separate test split.
start = time.perf_counter()
model = CatBoostRegressor()
model.fit(
    pool_train,
    eval_set=pool_valid,
    silent=True,  # suppress the per-iteration training log
)
end = time.perf_counter()
elapsed_time = end - start  # training duration in seconds

In [None]:
# Report the training duration, converted from seconds to minutes and
# rounded to three decimals.
minutes = elapsed_time / 60
minutes_rounded = round(minutes, 3)
print(f"The model took '{minutes_rounded}' minutes to train.")

The model took '31.681' minutes to train.


---

## Making some predictions

In [None]:
# Predict salaries for the validation rows.
# Predicting from pool_valid (rather than the bare X_valid DataFrame) reuses the
# categorical/text column declarations made when the pool was built, so the
# feature roles at prediction time are guaranteed to match those used in training.
salaries_predicted = model.predict(pool_valid)
salaries_predicted

array([26503.40674528, 25828.66207557, 33846.38709445, ...,
       19153.90994752, 25062.25875614, 26303.63654707])

In [None]:
# Sanity check: expect a 1-D array with one prediction per validation row.
salaries_predicted.shape

(73431,)

---

## Comparing "predicted salaries" with actual salaries (y_valid)

**Preparing statistics for predicted salaries:**  
I had problems adding the mode() statistic at the end of the DataFrame. So I had to take a manual approach with dictionaries.

In [None]:
# Wrap the predictions in a single-column DataFrame so that the pandas
# summary statistics can be applied to them.
df_salaries_predicted = pd.Series(salaries_predicted, name='Salary Predicted').to_frame()

In [None]:
# Collect the describe() statistics (count, mean, std, min, quartiles, max) of
# the single prediction column as a {statistic: value} dictionary.
# describe() is evaluated once and converted directly, instead of being computed
# twice and zipped together row by row.
predicted_dict = df_salaries_predicted.iloc[:, 0].describe().to_dict()

In [None]:
# Add the mode as an extra entry: take the first value reported by mode()
# for the single prediction column.
predicted_dict['mode'] = df_salaries_predicted.iloc[:, 0].mode().iloc[0]

In [None]:
# Turn the statistics dictionary into a one-column DataFrame whose index is
# the statistic name, in the order the entries were inserted.
salaries_predicted_statistics = pd.DataFrame(
    data={'Statistics of the "Predicted Salaries"': predicted_dict},
    index=list(predicted_dict),
)
salaries_predicted_statistics

Unnamed: 0,"Statistics of the ""Predicted Salaries"""
count,73431.0
mean,34067.298335
std,15468.280694
min,-447.580548
25%,23176.563273
50%,30983.179159
75%,41065.47907
max,157548.13268
mode,23656.885773


**Preparing statistics for actual salaries (y_valid):**

In [None]:
# Wrap the true validation salaries (y_valid) in a single-column DataFrame,
# keeping y_valid's original index.
df_actual_salaries = y_valid.to_frame(name='Actual Salaries')

In [None]:
# Collect the describe() statistics (count, mean, std, min, quartiles, max) of
# the single actual-salary column as a {statistic: value} dictionary.
# describe() is evaluated once and converted directly, instead of being computed
# twice and zipped together row by row.
actual_salaries_dict = df_actual_salaries.iloc[:, 0].describe().to_dict()

In [None]:
# Add the mode as an extra entry: take the first value reported by mode()
# for the single actual-salary column.
actual_salaries_dict['mode'] = df_actual_salaries.iloc[:, 0].mode().iloc[0]

In [None]:
# Turn the statistics dictionary into a one-column DataFrame whose index is
# the statistic name, in the order the entries were inserted.
actual_salaries_statistics = pd.DataFrame(
    data={'Statistics of the "y_valid"': actual_salaries_dict},
    index=list(actual_salaries_dict),
)
actual_salaries_statistics

Unnamed: 0,"Statistics of the ""y_valid"""
count,73431.0
mean,34070.297531
std,17589.390641
min,5000.0
25%,21500.0
50%,30000.0
75%,42500.0
max,200000.0
mode,35000.0


**Create a diff_df to compare the values:**

In [None]:
# Place the two statistics tables side by side (aligned on the statistic name)
# so predicted and actual salaries can be compared row by row.
diff_df = pd.concat(
    objs=[salaries_predicted_statistics, actual_salaries_statistics],
    axis="columns",
)

In [None]:
# Display the side-by-side comparison of predicted vs. actual salary statistics.
diff_df

Unnamed: 0,"Statistics of the ""Predicted Salaries""","Statistics of the ""y_valid"""
count,73431.0,73431.0
mean,34067.298335,34070.297531
std,15468.280694,17589.390641
min,-447.580548,5000.0
25%,23176.563273,21500.0
50%,30983.179159,30000.0
75%,41065.47907,42500.0
max,157548.13268,200000.0
mode,23656.885773,35000.0


---

## Evaluating the model

> Finally, let's **evaluate the model**.

The **Evaluation Metric** is **[MAE](https://en.wikipedia.org/wiki/Mean_absolute_error)**.

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error between the true validation salaries and the predictions
# (expressed in the same units as the target).
mae = mean_absolute_error(y_true=y_valid, y_pred=salaries_predicted)

In [None]:
# Display the validation MAE computed in the previous cell.
mae

4308.30220874477

---

## Saving the model

In [None]:
# Persist the trained model to disk in CatBoost's native binary format.
model.save_model(fname="model-v1.cbm", format="cbm")

---

# [v1] - Training & Evaluation (Summary)

 - **In this model, we use the features:**
   - **Independent variable:**
     - Title.
     - FullDescription.
     - LocationNormalized.
     - ContractType.
     - ContractTime.
     - Company.
     - Category.
     - SalaryRaw.
     - SourceName.
   - **Dependent variable:**
     - SalaryNormalized
 - **Preprocessing:**
   - For the first training, I just trained the model without *preprocessing*.
   - That is because this is the "baseline (dummy, PoC, prototype)".
 - **Comparison between predicted data and validation data (y_valid):**
   - **Mean:**
     - Salary predicted: 34,067
     - y_valid: 34,070
   - **Standard Deviation (std):**
     - Salary predicted: 15,468
     - y_valid: 17,589
   - **Min value:**
     - Salary predicted: -447.580548
     - y_valid: 5,000
   - **25% = Lower quartile, or first quartile (Q1):**
     - Salary predicted: 23,176
     - y_valid: 21,500
   - **50% = Second quartile (Q2, or the Median):**
     - Salary predicted: 30,983
     - y_valid: 30,000
   - **75% = The upper quartile, or third quartile (Q3):**
     - Salary predicted: 41,065
     - y_valid: 42,500
   - **Max value:**
     - Salary predicted: 157,548
     - y_valid: 200,000
   - **Mode:**
     - Salary predicted: 23,656
     - y_valid: 35,000
 - **The result of the Evaluation Metric (MAE) was:**
   - 4,308

---

Ro**drigo** **L**eite da **S**ilva - **drigols**