Импорт библиотек и функций

Импортируем библиотеки, необходимые для кодирования данных и построения моделей

In [161]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold


from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


import warnings
warnings.filterwarnings(action='ignore')

Чтение данных

In [162]:
# Load the salary records from 'salaries_final.csv'
# (the path is relative to the current working directory).
df = pd.read_csv('salaries_final.csv')

In [163]:
# Display the loaded frame to eyeball its columns and values.
df

Unnamed: 0,Year,Name,Primary Job Title,Base Pay,Department,College
0,2010,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,"Abaied, Jamie L.",Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,"Abaied, Jamie L.",Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,"Abaied, Jamie L.",Assistant Professor,68658.0,Department of Psychological Science,CAS
...,...,...,...,...,...,...
14465,2016,"van der Vliet, Albert",Professor,163635.0,Department of Pathology&Laboratory Medicine,COM
14466,2017,"van der Vliet, Albert",Professor,175294.0,Department of Pathology&Laboratory Medicine,COM
14467,2018,"van der Vliet, Albert",Professor,191000.0,Department of Pathology&Laboratory Medicine,COM
14468,2019,"van der Vliet, Albert",Professor,196000.0,Department of Pathology&Laboratory Medicine,COM


In [164]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14470 entries, 0 to 14469
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               14470 non-null  int64  
 1   Name               14470 non-null  object 
 2   Primary Job Title  14470 non-null  object 
 3   Base Pay           14470 non-null  float64
 4   Department         14470 non-null  object 
 5   College            14470 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 678.4+ KB


Обработка данных

In [165]:
# Drop the Name column and rename the target column ('Base Pay' -> 'Salary').
# Each call returns a new frame, so no defensive copy is needed first.
df = df.drop(columns='Name').rename(columns={'Base Pay': 'Salary'})

# Shuffle the rows with a fixed seed. The K-fold split in evaluate_model does
# not shuffle, so the fold composition is decided entirely by this row order;
# seeding it makes the reported metrics reproducible between runs.
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Split df into features (x) and target (y)
x = df.drop(columns='Salary')
y = df['Salary']

In [166]:
# Display the feature frame (every column of df except Salary).
x

Unnamed: 0,Year,Primary Job Title,Department,College
0,2013,Professor,Department of Political Science,CAS
1,2018,Lecturer (Part-Time),Grossman School of Business,Business
2,2012,Assistant Professor,Department of Med-Pulmonary,COM
3,2018,Assistant Professor,Department of Family Medicine,COM
4,2020,Clinical Practice Phys,Department of Orthopaedics & Rehabilitation,COM
...,...,...,...,...
14465,2019,Lecturer,Department of Classics,CAS
14466,2018,Police Officer Senior,Department of Surg-Urology,COM
14467,2020,Professor,Department of English,CAS
14468,2015,Assistant Professor,Department of Med-Gen Internal Med,COM


In [167]:
# Display the target series (the Salary column of df).
y

0        116824.0
1          5727.0
2        140250.0
3         30000.0
4         24000.0
           ...   
14465     38330.0
14466     82139.0
14467     98513.0
14468     27000.0
14469     65805.0
Name: Salary, Length: 14470, dtype: float64

Перевод категориальных данных в бинарные признаки (one-hot кодирование, 1/0)

In [168]:
# Preview the one-hot (dummy) encoding of the College column.
# The result is only displayed, not assigned; the encoding actually used for
# modelling is done by the OneHotEncoder inside create().
pd.get_dummies(x['College'])

Unnamed: 0,Business,CALS,CAS,CEMS,CESS,CNHS,COM,Department of Ext,LCOMEO,Learning and Info Tech,Library,RSENR
0,0,0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14465,0,0,1,0,0,0,0,0,0,0,0,0
14466,0,0,0,0,0,0,1,0,0,0,0,0
14467,0,0,1,0,0,0,0,0,0,0,0,0
14468,0,0,0,0,0,0,1,0,0,0,0,0


In [169]:
def create(reg):
    """Wrap a regressor in the shared preprocessing pipeline.

    The returned pipeline one-hot encodes the categorical columns
    'Primary Job Title', 'Department' and 'College', passes every other
    column through unchanged, standardises the resulting matrix and then
    hands it to ``reg``.

    Parameters
    ----------
    reg : estimator
        Regressor placed as the final 'regressor' step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps 'preprocessor', 'scaler' and
        'regressor'.
    """
    # scikit-learn 1.2 renamed OneHotEncoder's ``sparse`` argument to
    # ``sparse_output`` and 1.4 removed the old name. Try the new spelling
    # first and fall back, so the code runs on both old and new versions.
    try:
        onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

    # Kept as a one-step Pipeline so nested parameter names
    # (e.g. 'preprocessor__nominal__onehot__...') stay the same.
    transformer = Pipeline(steps=[
        ('onehot', onehot)
    ])

    # Encode the nominal columns; remainder='passthrough' keeps the rest.
    preproc = ColumnTransformer(transformers=[
        ('nominal', transformer, ['Primary Job Title', 'Department', 'College'])
    ], remainder='passthrough')

    model = Pipeline(steps=[
        ('preprocessor', preproc),
        ('scaler', StandardScaler()),
        ('regressor', reg)
    ])

    return model

Модели, которые будем использовать

In [174]:
# Candidate regressors, each wrapped in the shared preprocessing pipeline.
# The tree-based estimators are seeded so that their fitted trees do not vary
# between runs on the same data.
models = {
    "Linear Regression (Ridge)": create(Ridge()),
    "Decision Tree": create(DecisionTreeRegressor(random_state=42)),
    "Random Forest": create(RandomForestRegressor(random_state=42)),
}

Сравнение моделей (K-Fold CV)

In [175]:
def evaluate_model(model, x, y, n_splits=5):
    """Cross-validate ``model`` with K-fold and return its mean RMSE and R^2.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so it is left fitted on the last training split.
    x : pandas.DataFrame
        Feature frame; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned row-for-row with ``x``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    tuple
        ``(mean_rmse, mean_r2)`` averaged over the folds.
    """
    # KFold is used without shuffling: folds follow the row order of x.
    kf = KFold(n_splits=n_splits)
    rmses = []
    r2s = []

    for train_idx, test_idx in kf.split(x):
        # Fit on the training part of this fold
        model.fit(x.iloc[train_idx, :], y.iloc[train_idx])

        # Slice the held-out target once and reuse it for both metrics
        y_test = y.iloc[test_idx]
        residuals = y_test - model.predict(x.iloc[test_idx, :])

        # RMSE on the held-out fold
        rmses.append(np.sqrt(np.mean(residuals**2)))

        # R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the fold mean
        ss_res = np.sum(residuals**2)
        ss_tot = np.sum((y_test - y_test.mean())**2)
        r2s.append(1 - (ss_res / ss_tot))

    # Return average RMSE and R^2
    return np.mean(rmses), np.mean(r2s)

In [177]:
# Report the cross-validated RMSE (element 0 of evaluate_model's result)
# for every candidate model.
for name, model in models.items():
    print(f"{name} RMSE: {evaluate_model(model, x, y)[0]:.2f}")

Linear Regression (Ridge) RMSE: 28446.38
Decision Tree RMSE: 30011.30


In [None]:
# Report the cross-validated R^2 (element 1 of evaluate_model's result)
# for every candidate model. This runs the full cross-validation again.
for name, model in models.items():
    print(f"{name} R^2: {evaluate_model(model, x, y)[1]:.5f}")

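Логарифмируем зарплаты для улучшения точности моделей. Обратите внимание: после этого RMSE считается в логарифмической шкале и напрямую не сравним с RMSE, полученным выше в исходных единицах.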

In [156]:
# Replace Salary with its natural logarithm, overwriting the column in place.
# NOTE(review): assumes every salary is strictly positive — np.log gives -inf
# for 0 and NaN for negative values; TODO confirm against the data.
# NOTE: re-running this cell takes the log of already-logged values.
df['Salary'] = np.log(df['Salary'])
# Show the last rows to check the transformed values.
df.tail()

Unnamed: 0,Year,Primary Job Title,Salary,Department,College
14465,2017,Senior Lecturer,11.095954,Department of Mathematics & Statistics,CEMS
14466,2014,Outreach Professional Sr,11.242769,Department of Music,CAS
14467,2018,Professor,11.618942,Department of Psychological Science,CAS
14468,2020,Associate Professor,10.463103,Department of Surg-Vascular,COM
14469,2012,Professor,12.411459,Department of Psychiatry,COM


In [157]:
# Rebuild the target and the feature frame from the current df, so that y
# picks up the log-transformed Salary column.
y = df['Salary']
x = df.drop(columns=['Salary'])

In [158]:
# Report the cross-validated RMSE per model on the log-salary target
# (values are in log units).
for name, model in models.items():
    print(f"{name} RMSE: {evaluate_model(model, x, y)[0]:.2f}")

Linear Regression (Ridge) RMSE: 0.41
Decision Tree RMSE: 0.43


In [159]:
# Report the cross-validated R^2 per model on the log-salary target.
# This runs the full cross-validation again.
for name, model in models.items():
    print(f"{name} R^2: {evaluate_model(model, x, y)[1]:.5f}")

Linear Regression (Ridge) R^2: 0.67658
Decision Tree R^2: 0.64254
