# Predict Salary at the University
## *Comparing 7 regression algorithms*

![salary university](https://i.imgur.com/9xhOHFU.png)

# Overview
This data contains salaries of University of Vermont (UVM) faculty from 2009 to 2021. We present two datasets. The second dataset is richer because it contains information on faculty departments/colleges; however, it contains fewer rows due to how we chose to join this data.

salaries_without_dept.csv contains all of the data we extracted from the PDFs. The four columns are: Year, Faculty Name, Primary Job Title, and Base Pay. There are 47,479 rows.
salaries_final.csv contains the same columns as salaries_without_dept.csv, but also joins them with data about the faculty's "Department" and "College" (for a total of six columns). There are only 14,470 rows in this dataset because we removed rows for which we could not identify the Department/College of the faculty.

# Data dictionary
The column definitions are self-explanatory, but the "College" abbreviation meanings are unclear to a non-UVM-affiliate. We've included data_dictionary.csv to explain what each "College" abbreviation means. You can use this dictionary to filter out miscellaneous "colleges" (e.g. UVM Libraries) and only include colleges within the undergraduate program (e.g. filter out College of Medicine).

# Load the libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from time import perf_counter
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

def printmd(string):
    """Render ``string`` as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)

import warnings
# Install an 'ignore' filter with no category/message restriction, so every
# warning raised from here on is suppressed.
# NOTE(review): this also hides deprecation notices from the libraries used
# below — consider narrowing the filter.
warnings.filterwarnings(action='ignore')

# 1. Load and shuffle the data<a class="anchor" id="1"></a>

In [None]:
# Load the data
df = pd.read_csv('../input/university-salaries/university-salaries/salaries_final.csv')

# Shuffle the rows. DataFrame.sample returns a new frame instead of shuffling
# in place, so the result has to be assigned back. The fixed seed keeps the
# row order reproducible, and the index is rebuilt to follow the new order.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

# Display the first rows
df.head()

# 2. Data Preprocessing<a class="anchor" id="2"></a>

In [None]:
# Features: the year plus the three categorical descriptors of the position
feature_columns = ['Year', 'Primary Job Title', 'Department', 'College']
X = df[feature_columns]

# Target: the base pay to predict
y = df['Base Pay']

In [None]:
# Build the preprocessing step: min-max scale the year and one-hot encode the
# categorical columns. sparse_threshold=0 makes the output a dense array.
categorical_columns = ['Primary Job Title', 'Department', 'College']
year_scaler = (MinMaxScaler(), ['Year'])
category_encoder = (OneHotEncoder(), categorical_columns)
transf = make_column_transformer(year_scaler, category_encoder, sparse_threshold=0)

# Fit the transformer on the features and apply it in one pass
X_transf = transf.fit_transform(X)

In [None]:
# Preview the transformed features: first 5 rows, first 15 columns
pd.DataFrame(X_transf).head(5).iloc[:, :15]

In [None]:
# Hold out 20% of the samples as the test set, with a fixed seed.
# Note: there is a small data leakage for the year, because the dataset was
#       transformed before splitting it.
X_train, X_test, y_train, y_test = train_test_split(
    X_transf,
    y,
    test_size=0.2,
    random_state=0,
)

# 3. Model comparison<a class="anchor" id="3"></a>

In [None]:
# Candidate regressors keyed by display name. Each entry is extended below
# with its cross-validation results. Estimators that rely on randomness get a
# fixed seed so the comparison is reproducible from run to run.
# LinearRegression is deliberately left out: it isn't adapted in this case.
models = {
    "Lasso": {"model": Lasso()},
    "Ridge": {"model": Ridge()},
    "DecisionTreeRegressor": {"model": DecisionTreeRegressor(random_state=0)},
    "RandomForestRegressor": {"model": RandomForestRegressor(random_state=0)},
    "MLPRegressor": {"model": MLPRegressor(random_state=0)},
    "GradientBoostingRegressor": {"model": GradientBoostingRegressor(random_state=0)},
    "AdaBoostRegressor": {"model": AdaBoostRegressor(random_state=0)},
}

# Use K-fold cross validation on the training set to get, for each model,
# the mean validation RMSE and the mean training time
k = 5
for name, m in models.items():
    # Cross validation of the model
    model = m['model']
    result = cross_validate(model, X_train, y_train, cv=k, scoring='neg_mean_squared_error')

    # The scorer returns the negated MSE of each fold: flip the sign and take
    # the square root to get one RMSE per fold, then average over the folds
    fold_rmse = np.sqrt(-result['test_score'])
    mean_RMSE = round(float(fold_rmse.mean()), 4)
    mean_fit_time = round(float(np.mean(result['fit_time'])), 4)

    # Add the result to the dictionary with the models
    m['mean_RMSE'] = mean_RMSE
    m['Training time (sec)'] = mean_fit_time

    # Display the result
    print(f"{name:27} RMSE for {k}-fold CV: {mean_RMSE} - mean training time {mean_fit_time} sec")

In [None]:
# Collect one [model, RMSE, training time] row per evaluated model
models_result = [
    [name, v['mean_RMSE'], v['Training time (sec)']]
    for name, v in models.items()
]

# Tabulate the rows, lowest RMSE first, with a fresh 0..n-1 index
df_results = (
    pd.DataFrame(models_result, columns=['model', 'RMSE', 'Training time (sec)'])
    .sort_values(by='RMSE', ascending=True)
    .reset_index(drop=True)
)
df_results

In [None]:
# Bar chart of the mean cross-validated RMSE of every model
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='RMSE')

# Title and axis labels
plt.title(f'{k}-fold mean RMSE for each Model\nSmaller is better', fontsize=15)
plt.xlabel('Model', fontsize=15)
plt.ylabel('RMSE', fontsize=15)

# Rotate the model names on the x axis to a vertical orientation
plt.xticks(rotation=90, fontsize=12)
plt.show()

In [None]:
# Bar chart of the mean training time of every model
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='Training time (sec)')

# Title and axis labels
plt.title('Training time for each Model in sec\nSmaller is better', fontsize=15)
plt.xlabel('Model', fontsize=15)
plt.ylabel('Training time (sec)', fontsize=15)

# Rotate the model names on the x axis to a vertical orientation
plt.xticks(rotation=90, fontsize=12)
plt.show()

# 4. Prediction metrics of the best model using the test set<a class="anchor" id="4"></a>

In [None]:
# Pick the model with the lowest mean cross-validated RMSE: df_results is
# sorted by ascending RMSE, so it is the first row.
# The row is read by column label; integer keys on a label-indexed Series are
# deprecated as positional lookups in recent pandas versions.
best_model = df_results.iloc[0]
best_name = best_model['model']
best_fit_time = best_model['Training time (sec)']

# Fit the model on the whole training set
model = models[best_name]['model']
model.fit(X_train, y_train)

# Predict the salaries of the test set
pred = model.predict(X_test)

# Test-set error metrics, truncated to whole numbers for display
RMSE = int(mean_squared_error(y_test, pred) ** 0.5)
MAE = int(mean_absolute_error(y_test, pred))

# Display the results. The reported time is the mean fit time measured during
# cross validation, not the duration of the fit above.
printmd(f'### Best Model: {best_name}')
printmd(f'### RMSE: {RMSE}')
printmd(f'### MAE: {MAE}')
printmd(f'### Trained in: {best_fit_time:.2f} sec')

# 5. Visualization of the result<a class="anchor" id="5"></a>

In [None]:
# Put the real salaries of the test set side by side with the predicted ones.
# The index of y_test is reset so both series align on 0..n-1.
pred_s = pd.Series(pred)
y_test_s = y_test.reset_index(drop=True)
df_result = pd.concat([y_test_s, pred_s], axis=1)
df_result.columns = ['Real', 'Predicted']

# Ratio of real to predicted salary, computed before the integer conversion
df_result['Real/Pred'] = df_result['Real'].div(df_result['Predicted'])

# Convert the two salary columns to integer to facilitate reading;
# the ratio column is left untouched
df_result = df_result.astype({'Real': 'int', 'Predicted': 'int'})

# Show a fixed random sample of 10 rows
df_result.sample(n=10, random_state=0)

In [None]:
# Distribution of the real and the predicted salaries. Only the two salary
# columns are drawn: the 'Real/Pred' ratio is not a salary and would be
# squashed flat on the salary axis.
df_result[['Real', 'Predicted']].plot.box()
plt.title('Boxplot Real Salary VS Predicted Salary', fontsize = 15)
plt.show()

# Predicted salary against real salary for every sample of the test set
df_result.plot.scatter(x='Real', y='Predicted')
plt.title('Scatterplot Real Salary VS Predicted Salary', fontsize = 15)
plt.show()