In [133]:
import numpy as np
from numpy import ndarray
from typing import List

import pandas as pd
from pathlib import Path
import re
import ast

In [134]:
def infer_salary(salary: str) -> float:
  """Estimate one monthly salary figure from a free-text salary string.

  All whitespace is removed, the first one or two numbers found are read as
  the bounds of a range (a comma is the decimal separator) and their mean is
  returned. If any bound is below ``HOURLY_MONTHLY_THRESHOLD`` the figures are
  treated as hourly rates and every bound is multiplied by ``MONTHLY_HOURS``.

  Args:
    salary: Raw salary text, e.g. ``"5 000 - 7 000 zł"`` or ``"25-35 zł"``.

  Returns:
    The mean of the parsed bound(s), or ``nan`` when the text holds no number.
  """
  MONTHLY_HOURS = 168
  HOURLY_MONTHLY_THRESHOLD = 1000
  # Raw string: "\d" inside a plain literal is an invalid escape sequence.
  # NOTE(review): "{BB}" looks like a leftover format placeholder; as written
  # it can only match the literal text "{BB}" - confirm the intent.
  PATTERN = r"(?:umowa|contract|kontrakt)*(?:[a-zA-Z,])*(?:{BB})?(\d*,?\d*)-?(\d*,?\d*)?(?:zł)?"

  # str.split() without arguments drops every kind of whitespace, including
  # tabs and non-breaking spaces used as thousands separators. "B2B" is then
  # rewritten so that its digit is not picked up as an amount.
  compact = ''.join(salary.split()).replace('B2B', 'BB')
  matches = re.findall(PATTERN, compact, re.UNICODE | re.IGNORECASE)

  bounds: List[float] = []
  for groups in matches:
    for token in groups:
      # A capture may be empty or a lone ","; neither is a number.
      if any(ch.isdigit() for ch in token):
        bounds.append(float(token.replace(',', '.')))

  # Only the first range in the text is considered.
  bounds = bounds[:2]
  if not bounds:
    return float('nan')

  if any(value < HOURLY_MONTHLY_THRESHOLD for value in bounds):
    bounds = [value * MONTHLY_HOURS for value in bounds]

  return float(np.mean(bounds))

def infer_schedule(schedule: str) -> List[int]:
    """Turn a work-schedule label into ``[full_time, part_time, temporary]`` flags.

    The label is split on commas and each part switches on one flag, so any
    combination of the known Polish/English parts is accepted, in any order.

    Args:
        schedule: Raw schedule label such as ``"pełny etat, część etatu"``,
            or a null value (``None`` / ``NaN``).

    Returns:
        A new list of three 0/1 ints; all zeros for a null value or ``"nan"``.

    Raises:
        KeyError: If a comma-separated part is not a known schedule name.
    """
    # Position of the flag each known part switches on.
    flag_index = {
        "pełny etat": 0,
        "full-time": 0,
        "część etatu": 1,
        "part time": 1,
        "dodatkowa / tymczasowa": 2,
        "dodatkowa/tymczasowa": 2,
        "additional / temporary": 2,
        "additional/temporary": 2,
    }

    flags = [0, 0, 0]
    if pd.isnull(schedule) or schedule == "nan":
        return flags

    for part in schedule.split(","):
        flags[flag_index[part.strip()]] = 1

    return flags

def infer_employment_type(employment_type: str) -> str:
  """Collapse a raw position-level label into a coarse category.

  Known Polish and English labels, including comma-joined combinations, are
  looked up in a fixed table. A null value is looked up under the key 'nan'.

  Args:
    employment_type: Raw label, or a null value (``None`` / ``NaN``).

  Returns:
    One of 'manual worker', 'assistant', 'junior specialist', 'specialist',
    'senior specialist', 'manager', 'director' or 'undefined'.

  Raises:
    KeyError: If the label is not present in the table.
  """
  category_by_label = {
      # manual worker
      'pracownik fizyczny': 'manual worker',
      'entry level & blue collar': 'manual worker',
      # assistant
      'asystent': 'assistant',
      'assistant': 'assistant',
      'praktykant / stażysta': 'assistant',
      'praktykant / stażysta, asystent': 'assistant',
      'asystent, młodszy specjalista (junior)': 'assistant',
      'assistant, junior specialist (junior)': 'assistant',
      'trainee': 'assistant',
      'trainee, assistant': 'assistant',
      # junior specialist
      'młodszy specjalista (junior)': 'junior specialist',
      'junior specialist (junior)': 'junior specialist',
      # specialist
      'specjalista (mid / regular)': 'specialist',
      'specialist (mid / regular)': 'specialist',
      'specjalista (mid / regular), starszy specjalista (senior)': 'specialist',
      'specjalista (mid / regular), młodszy specjalista (junior)': 'specialist',
      'specialist (mid / regular), senior specialist (senior)': 'specialist',
      'specialist (mid / regular), junior specialist (junior)': 'specialist',
      # senior specialist
      'starszy specjalista (senior)': 'senior specialist',
      'senior specialist (senior)': 'senior specialist',
      'senior specialist (senior), expert': 'senior specialist',
      'starszy specjalista (senior), ekspert': 'senior specialist',
      'ekspert': 'senior specialist',
      # manager
      'menedżer': 'manager',
      'kierownik / koordynator': 'manager',
      'team manager': 'manager',
      'manager / supervisor': 'manager',
      'kierownik / koordynator, menedżer': 'manager',
      'manager / supervisor, team manager': 'manager',
      # director
      'dyrektor, prezes': 'director',
      'dyrektor': 'director',
      'dyrektor, menedżer': 'director',
      'director': 'director',
      'prezes': 'director',
      # missing value
      'nan': 'undefined',
  }

  lookup_key = "nan" if pd.isnull(employment_type) else employment_type
  return category_by_label[lookup_key]

In [143]:
# The CSV is expected in the directory the notebook is run from.
DATA_PATH = Path.cwd() / 'pracuj_powiaty_wojew_final.csv'

# Load the offers, discard the columns the cells below never read, and keep
# only rows that have a salary value.
# TODO: revisit whether some of the dropped columns are worth keeping as features.
df = (
    pd.read_csv(DATA_PATH)
    .drop(columns=[
        'benefits', 'url', 'work_modes', 'remote', 'contract', 'locations',
        'location_url', 'duties', 'requirements', 'offered', 'about_us',
        'marker_coords', 'geopy_address', 'county', 'inferred_location',
    ])
    .dropna(subset=['salary'])
)

In [144]:
# Normalise the raw columns. Each transform reads a single column, so
# Series.map / Series.str is used instead of a row-wise DataFrame.apply.
# The .str accessor also passes missing values through instead of raising.
# NOTE: this cell consumes the 'schedule' column and overwrites 'salary' with
# floats, so re-running it without re-running the load cell fails.
df = df.assign(
  salary=df['salary'].map(infer_salary),
  employment_type=df['employment_type'].map(infer_employment_type),
  voivodeship=df['voivodeship'].str.replace('województwo ', '', regex=False),
  state=df['state'].str.replace('powiat ', '', regex=False),
)

# One row of [full_time, part_time, temporary] flags per offer, aligned on the
# original index so the concat below lines up row by row.
df_schedules = pd.DataFrame(df['schedule'].map(infer_schedule).tolist(), index=df.index)
df = (
  pd.concat([df, df_schedules], axis='columns')
  .rename(columns={0: 'full_time', 1: 'part_time', 2: 'temporary'})
  .drop(columns='schedule')
)
df

Unnamed: 0,employment_type,salary,voivodeship,state,full_time,part_time,temporary
0,specialist,10050.0,mazowieckie,Warszawa,1,0,0
1,manual worker,3535.0,śląskie,Żory,1,0,0
2,manual worker,2456.5,śląskie,Żory,0,0,1
3,manager,13000.0,pomorskie,Gdańsk,1,0,0
4,manual worker,3535.0,śląskie,Katowice,1,0,0
...,...,...,...,...,...,...,...
4460,manual worker,8250.0,śląskie,Tychy,1,0,0
4461,manual worker,5880.0,pomorskie,wejherowski,1,0,0
4462,specialist,12500.0,mazowieckie,Warszawa,1,0,0
4463,manual worker,6250.0,mazowieckie,warszawski zachodni,1,0,0


# Model

In [198]:
import numpy as np
import pandas as pd
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

# Seed passed to the train/test split and to both regressors below.
SEED = 0

In [196]:
# Target is the inferred salary; features are whatever remains after dropping
# the target, the schedule flags and 'state'.
y = df['salary']
X = df.drop(columns=['salary', 'full_time', 'part_time', 'temporary', 'state'])

# Integer-encode the categorical columns. The fitted encoders are kept, in the
# same order as labels_to_encode, so predictions can be decoded later.
# NOTE(review): LabelEncoder is documented for target labels; its integer codes
# impose an arbitrary order on the categories - consider OrdinalEncoder or
# OneHotEncoder for features.
labels_to_encode = ['employment_type', 'voivodeship']
encoders = []
for label in labels_to_encode:
  encoder = preprocessing.LabelEncoder()
  X[label] = encoder.fit_transform(X[label])
  encoders.append(encoder)

# 70/30 hold-out split, reproducible through SEED.
train_X, test_X, train_y, test_y = train_test_split(
  X, y, test_size=0.3, random_state=SEED
)

In [189]:
# Gradient-boosted trees trained on the encoded features.
# random_state is the documented seed parameter of the scikit-learn wrapper;
# `seed` is only a legacy alias.
xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                        n_estimators=100, random_state=SEED)
xgb_r.fit(train_X, train_y)

pred = xgb_r.predict(test_X)

# Hold-out metrics. RMSE is in the same units as the salary column; an R2 at
# or below zero means the model is no better than predicting the mean salary.
rmse = np.sqrt(MSE(test_y, pred))
r2 = r2_score(test_y, pred)
print("RMSE : % f" %(rmse))
print("R2 : % f" %(r2))

RMSE :  93443.718736
R2 : -0.000994


In [214]:
# Random-forest regressor on the same split, for comparison with XGBoost.
# fit() returns the estimator itself, so construction and training are chained.
regr = RandomForestRegressor(random_state=SEED).fit(train_X, train_y)
pred_regr = regr.predict(test_X)

# Same hold-out metrics as in the XGBoost cell.
r2 = r2_score(test_y, pred_regr)
rmse = np.sqrt(MSE(test_y, pred_regr))
print("RMSE : % f" % rmse)
print("R2 : % f" % r2)

RMSE :  93446.282031
R2 : -0.001049


In [216]:
# Single example: encode the raw category values with the fitted encoders
# (same order as the training columns) and wrap them in a one-row frame.
data = pd.DataFrame(
  [[encoder.transform([value])[0]
    for value, encoder in zip(['junior specialist', 'mazowieckie'], encoders)]],
  columns=['employment_type', 'voivodeship'],
)

# Predict on the encoded features only, then attach the result.
data['predicted_salary'] = xgb_r.predict(data)

# Decode the integer codes back to their labels for display.
for label, encoder in zip(labels_to_encode, encoders):
  data[label] = encoder.inverse_transform(data[label])

data

Unnamed: 0,employment_type,voivodeship,predicted_salary
0,junior specialist,mazowieckie,5436.119629
