In [2]:
# !pip install -U pandas catboost scikit-learn

# [v1] - Training & Evaluation (baseline, dummy, PoC, prototype)

> The notebook's goal is to Training & Evaluation Machine Learning model.

---

## Get Training and Testing datasets

In [3]:
import pandas as pd

train_df = pd.read_csv("Train_rev1.csv")

---

## Get Independent and Dependent (target) variables to training the model

In [4]:
X = train_df.drop(columns=[
    'Id',
    'LocationRaw',
    'SalaryRaw',
    'SalaryNormalized',
]).astype(str)
X.info()
X.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244768 entries, 0 to 244767
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Title               244768 non-null  object
 1   FullDescription     244768 non-null  object
 2   LocationNormalized  244768 non-null  object
 3   ContractType        244768 non-null  object
 4   ContractTime        244768 non-null  object
 5   Company             244768 non-null  object
 6   Category            244768 non-null  object
 7   SourceName          244768 non-null  object
dtypes: object(8)
memory usage: 14.9+ MB


Unnamed: 0,Title,FullDescription,LocationNormalized,ContractType,ContractTime,Company,Category,SourceName
0,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
1,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
2,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
3,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,Surrey,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
4,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...",Surrey,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk


In [5]:
y = train_df["SalaryNormalized"]
y

0         25000
1         30000
2         30000
3         27500
4         25000
          ...  
244763    22800
244764    22800
244765    22800
244766    22800
244767    42500
Name: SalaryNormalized, Length: 244768, dtype: int64

---

## Split Training dataset "training" and "validation"

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42
)

In [7]:
X_train.shape

(171337, 8)

In [8]:
y_train.shape

(171337,)

In [9]:
X_valid.shape

(73431, 8)

In [10]:
y_valid.shape

(73431,)

---

## Training the model

In [11]:
from catboost import Pool

# Pool training data.
pool_train = Pool(
    X_train,
    y_train,
    cat_features= ['LocationNormalized', 'ContractType', 'ContractTime', 'Company', 'Category', 'SourceName'],
    text_features = ['Title', 'FullDescription'],
)

# Pool validate data.
pool_valid = Pool(
    X_valid,
    y_valid,
    cat_features= ['LocationNormalized', 'ContractType', 'ContractTime', 'Company', 'Category', 'SourceName'],
    text_features = ['Title', 'FullDescription'],
)

In [12]:
from catboost import CatBoostRegressor
import time

start = time.time()
model = CatBoostRegressor()
model.fit(
    pool_train,
    eval_set=pool_valid,
    silent=True,
)
end = time.time()
elapsed_time = end - start

In [13]:
minutes = elapsed_time / 60
print(f"The model took '{round(minutes, 3)}' minutes to train.")

The model took '21.163' minutes to train.


---

## Making some predictions

**Manual Predict:**

In [14]:
data = {
    'Title': 'Engineering Systems Analyst',
    'FullDescription': 'Engineering Systems Analyst Dorking Surrey Salary ****K Our client is located in Dorking, Surrey and are looking for Engineering Systems Analyst our client provides specialist software development Keywords Mathematical Modelling, Risk Analysis, System Modelling, Optimisation, MISER, PIONEEER Engineering Systems Analyst Dorking Surrey Salary ****K',
    'LocationNormalized': 'Dorking',
    'ContractType': 'full_time',
    'ContractTime': 'permanent',
    'Company': 'Gregory Martin International',
    'Category': 'Engineering Jobs',
    'SourceName': 'cv-library.co.uk'
}

In [17]:
manual_predict = model.predict([*data])

In [18]:
manual_predict

29041.310879113495

**Predicts from "X_valid":**

In [19]:
# Predictions to validation data.
salaries_predicted = model.predict(X_valid)
salaries_predicted

array([29580.07092388, 23685.2599251 , 29864.38212494, ...,
       21278.34007141, 24466.79012426, 33353.1406066 ])

In [20]:
salaries_predicted.shape

(73431,)

## Comparing "predicted salaries" with actual salaries (y_valid)

**Preparing statistics for predicted salaries:**  
I had problems adding the mode() statistic at the end of the DataFrame. So I had to take a manual approach with dictionaries.

In [21]:
# Create a DataFrame to store salaries predicted.
df_salaries_predicted = pd.DataFrame({'Salary Predicted': salaries_predicted})

In [22]:
# Create a dictionary to store describe() method statistics.
predicted_dict = {}
for index, value in zip(df_salaries_predicted.describe().index, df_salaries_predicted.describe().values):
    predicted_dict[index] = value[0]

In [23]:
# Append mode() statistics to dictionary.
predicted_dict['mode'] = df_salaries_predicted.mode().iloc[0, 0]

In [24]:
# Create a DataFrame to store statistics of the predicted salaries.
salaries_predicted_statistics = pd.DataFrame({'Statistics of the "Predicted Salaries"': predicted_dict}, predicted_dict.keys())
salaries_predicted_statistics

Unnamed: 0,"Statistics of the ""Predicted Salaries"""
count,73431.0
mean,34133.796893
std,14212.498298
min,538.33649
25%,23519.597509
50%,32051.749541
75%,42155.870355
max,142030.628833
mode,41641.180291


**Preparing statistics for actual salaries (y_valid):**

In [25]:
# Create a DataFrame to store actual salaries (y_valid).
df_actual_salaries = pd.DataFrame({'Actual Salaries': y_valid})

In [26]:
# Create a dictionary to store describe() method statistics.
actual_salaries_dict = {}
for index, value in zip(df_actual_salaries.describe().index, df_actual_salaries.describe().values):
    actual_salaries_dict[index] = value[0]

In [27]:
# Append mode() statistics to dictionary.
actual_salaries_dict['mode'] = df_actual_salaries.mode().iloc[0, 0]

In [28]:
# Create a DataFrame to store statistics of the actual salaries (y_valid).
actual_salaries_statistics = pd.DataFrame({'Statistics of the "y_valid"': actual_salaries_dict}, actual_salaries_dict.keys())
actual_salaries_statistics

Unnamed: 0,"Statistics of the ""y_valid"""
count,73431.0
mean,34070.297531
std,17589.390641
min,5000.0
25%,21500.0
50%,30000.0
75%,42500.0
max,200000.0
mode,35000.0


**Create a diff_df to compare the values:**

In [29]:
diff_df = pd.concat([salaries_predicted_statistics, actual_salaries_statistics], axis=1)

In [30]:
diff_df

Unnamed: 0,"Statistics of the ""Predicted Salaries""","Statistics of the ""y_valid"""
count,73431.0,73431.0
mean,34133.796893,34070.297531
std,14212.498298,17589.390641
min,538.33649,5000.0
25%,23519.597509,21500.0
50%,32051.749541,30000.0
75%,42155.870355,42500.0
max,142030.628833,200000.0
mode,41641.180291,35000.0


---

## Evaluation the model

> Finally, let's **Evaluation the model**.

The **Evaluation Metric** is **[MAE](https://en.wikipedia.org/wiki/Mean_absolute_error)**.

In [31]:
from sklearn.metrics import mean_absolute_error

In [32]:
mae = mean_absolute_error(y_valid, salaries_predicted)

In [33]:
mae

6586.161901115121

---

## Saving the model

In [34]:
model.save_model("model-v1.cbm")

---

# [v1] - Training & Evaluation (Resume)

 - **In this model, we use the features:**
   - **Independent variable:**
     - **Categorical Features:**
       - LocationNormalized
       - ContractType
       - ContractTime
       - Company
       - Category
       - SourceName
     - **Text Features:**
       - Title
       - FullDescription
   - **Dependent variable:**
     - SalaryNormalized
 - **Preprocessing:**
   - For the first training, I just trained the model without *preprocessing*.
   - That is because this is the "baseline (dummy, PoC, prototype)".
 - **The result of Evaluation Metric (MAE) was:**
   - 6.586

---

Ro**drigo** **L**eite da **S**ilva - **drigols**