In [3]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation / convergence warnings - confirm that is intended.
warnings.filterwarnings('ignore')

In [4]:
# Read the salary table from the Kaggle input directory and display it.
csv_path = '/kaggle/input/university-salaries/university-salaries/salaries_final.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,Year,Name,Primary Job Title,Base Pay,Department,College
0,2010,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,"Abaied, Jamie L.",Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,"Abaied, Jamie L.",Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,"Abaied, Jamie L.",Assistant Professor,68658.0,Department of Psychological Science,CAS
...,...,...,...,...,...,...
14465,2016,"van der Vliet, Albert",Professor,163635.0,Department of Pathology&Laboratory Medicine,COM
14466,2017,"van der Vliet, Albert",Professor,175294.0,Department of Pathology&Laboratory Medicine,COM
14467,2018,"van der Vliet, Albert",Professor,191000.0,Department of Pathology&Laboratory Medicine,COM
14468,2019,"van der Vliet, Albert",Professor,196000.0,Department of Pathology&Laboratory Medicine,COM


In [24]:
def preprocess_inputs(df, random_state=42):
    """Turn the raw salary table into a shuffled feature frame and target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Must contain the 'Name' and 'Base Pay' columns. It is not
        modified; every step below returns a new frame.
    random_state : int, numpy RandomState/Generator or None, default 42
        Seed forwarded to ``DataFrame.sample`` so that the row order (and
        therefore any unshuffled K-fold split made afterwards) is the same on
        every run. Pass ``None`` for an unseeded shuffle.

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns except 'Base Pay', with a fresh 0..n-1 index.
    y : pandas.Series
        The 'Base Pay' column, row-aligned with ``X``.

    Raises
    ------
    KeyError
        If 'Name' or 'Base Pay' is missing from ``df``.
    """
    # The name identifies an individual and is not used as a predictor.
    df = df.drop(columns='Name')

    # 'Unnamed: 0' is what read_csv produces for a row index that was written
    # into the CSV. It is an export artifact, not a property of the employee,
    # so it must not reach the model. Tolerate frames that do not have it
    # (e.g. when the file was read with index_col=0).
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # Shuffle all rows reproducibly and renumber them.
    df = df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    # Split the frame into target and features.
    y = df['Base Pay']
    X = df.drop(columns='Base Pay')

    return X, y

In [25]:
# Build the shuffled feature frame X and target series y from the raw table, then display X.
X, y = preprocess_inputs(data)
X

Unnamed: 0,Year,Primary Job Title,Department,College
0,2017,Lecturer,Department of Romance Languages,CAS
1,2015,Assistant Professor,Department of Psychiatry,COM
2,2012,Assistant Professor,Department of COM Microbio & Molec Genetics,COM
3,2017,Professor,Department of Romance Languages,CAS
4,2011,Office/Prgm Support Generalist,Rubenstein Sch Env & Nat Res,RSENR
...,...,...,...,...
14465,2019,Clinical Instructor,Department of Nursing,CNHS
14466,2010,Acting Director,Department of Mathematics & Statistics,CEMS
14467,2012,Outreach Professional Sr,Department of Rehab & Movement Sci,CNHS
14468,2018,Professor,Department of Psychological Science,CAS


In [26]:
# Display the target series ('Base Pay') returned by preprocess_inputs.
y

0         48139.0
1         30000.0
2         95000.0
3        113128.0
4         30500.0
           ...   
14465     65924.0
14466     89250.0
14467     63614.0
14468    181418.0
14469     84571.0
Name: Base Pay, Length: 14470, dtype: float64

In [27]:
def build_pipeline(regressor, nominal_features=('Primary Job Title', 'Department', 'College')):
    """Wrap ``regressor`` in a one-hot-encode -> standardize -> regress pipeline.

    Parameters
    ----------
    regressor : estimator
        Final step of the pipeline. It is used as passed in (not copied).
    nominal_features : sequence of str, optional
        Names of the categorical columns to one-hot encode. Defaults to the
        three nominal columns of the salary table. Every other column is
        passed through to the scaler unchanged.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps 'preprocessor', 'scaler', 'regressor'.
    """
    # The dense-output flag of OneHotEncoder is called `sparse_output` from
    # scikit-learn 1.2 on; the older spelling `sparse` was deprecated at the
    # same time and later removed. Try the current name first and fall back so
    # the notebook runs on both old and new releases.
    # Dense output is required because the StandardScaler below centers the
    # data, which it cannot do on a sparse matrix.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    nominal_transformer = Pipeline(steps=[
        ('onehot', encoder)
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, list(nominal_features))
    ], remainder='passthrough')

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', regressor)
    ])

    return model

In [28]:
# Baseline: the preprocessing pipeline around a Ridge regressor with alpha = 10.
model = build_pipeline(Ridge(alpha = 10.0))

In [29]:
# Fit every pipeline step (encoding, scaling, regressor) on the full X, y.
model.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nominal',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Primary Job Title',
                                                   'Department',
                                                   'College'])])),
                ('scaler', StandardScaler()),
                ('regressor', Ridge(alpha=10.0))])

In [30]:
# In-sample score: evaluated on the same X, y the pipeline was fit on, not on held-out data.
model.score(X, y)

0.6512902401319441

In [31]:
# Candidate regressors, each wrapped in the same preprocessing pipeline.
# Keys are left-padded to a common width so the printed reports line up.
# The tree-, network- and ensemble-based estimators draw random numbers while
# fitting; seeding them makes repeated evaluations return the same metrics.
RANDOM_STATE = 42

models = {
    "Linear Regression (Ridge)": build_pipeline(Ridge()),
    "            Decision Tree": build_pipeline(DecisionTreeRegressor(random_state=RANDOM_STATE)),
    "           Neural Network": build_pipeline(MLPRegressor(random_state=RANDOM_STATE)),
    "            Random Forest": build_pipeline(RandomForestRegressor(random_state=RANDOM_STATE)),
    "        Gradient Boosting": build_pipeline(GradientBoostingRegressor(random_state=RANDOM_STATE))
}

In [32]:
def evaluate_model(model, X, y, n_splits=5):
    """Cross-validate ``model`` and return its mean RMSE and mean R^2.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refit in place on every training split; after the call it is left
        fitted on the last split's training rows.
    X : pandas.DataFrame
        Feature frame (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target values, row-aligned with ``X``.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    (float, float)
        Mean of the per-fold RMSE values and mean of the per-fold R^2 values.

    Notes
    -----
    ``KFold`` is used without shuffling, so the folds are contiguous row
    ranges; shuffle the rows beforehand if their order is meaningful.
    R^2 divides by the fold's target variance, so a fold whose target values
    are all equal yields a non-finite score.
    """
    kf = KFold(n_splits=n_splits)
    rmses = []
    r2s = []

    for train_idx, test_idx in kf.split(X):
        # Fit on the training rows of this fold.
        model.fit(X.iloc[train_idx, :], y.iloc[train_idx])

        # Predict the held-out rows.
        y_test = y.iloc[test_idx]
        pred = model.predict(X.iloc[test_idx, :])

        # Squared residuals are shared by both metrics; compute them once.
        squared_errors = (y_test - pred) ** 2

        # RMSE on the held-out rows.
        rmses.append(np.sqrt(np.mean(squared_errors)))

        # R^2 = 1 - SS_res / SS_tot on the held-out rows.
        total_ss = np.sum((y_test - y_test.mean()) ** 2)
        r2s.append(1 - (np.sum(squared_errors) / total_ss))

    # Average both metrics over the folds.
    return np.mean(rmses), np.mean(r2s)

In [33]:
# Print the mean cross-validated RMSE (first value returned by evaluate_model) per pipeline.
for name, model in models.items():
    rmse = evaluate_model(model, X, y)[0]
    print(name + " RMSE: {:.2f}".format(rmse))

Linear Regression (Ridge) RMSE: 28363.11
            Decision Tree RMSE: 29894.14
           Neural Network RMSE: 31082.76
            Random Forest RMSE: 28590.14
        Gradient Boosting RMSE: 31575.07


In [34]:
# Print the mean cross-validated R^2 (second value returned by evaluate_model) per pipeline.
for name, model in models.items():
    r2 = evaluate_model(model, X, y)[1]
    print(name + " R^2: {:.5f}".format(r2))

Linear Regression (Ridge) R^2: 0.63990
            Decision Tree R^2: 0.60161
           Neural Network R^2: 0.57277
            Random Forest R^2: 0.63330
        Gradient Boosting R^2: 0.55371
