# Task for Today  

***

## University Salary Prediction  
  
Given *data about university employees*, let's try to predict the **salary** (the `Base Pay` column) of a given employee.  
  
We will train a variety of regression models and compare them using K-fold cross-validation.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the salary table into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle input
# directory -- adjust it when running elsewhere (TODO confirm).
data = pd.read_csv('../input/university-salaries/university-salaries/salaries_final.csv')

In [None]:
# Bare expression: shown as the cell's output when run in a notebook.
data

In [None]:
# Print the column names, dtypes and non-null counts of the loaded table.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df, random_state=None):
    """Split the raw salary table into shuffled features and target.

    The 'Name' column is dropped, the rows are shuffled, the index is reset
    to 0..n-1, and the 'Base Pay' column is separated out as the target.

    Args:
        df: DataFrame containing at least the 'Name' and 'Base Pay' columns.
            It is not modified.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Pass a
            fixed value to get the same row order on every run, which makes
            any later order-dependent split reproducible. The default
            ``None`` keeps the original non-deterministic shuffle.

    Returns:
        Tuple ``(X, y)`` where ``X`` holds every remaining column except
        'Base Pay' and ``y`` is the 'Base Pay' Series; both share the same
        shuffled row order.

    Raises:
        KeyError: If 'Name' or 'Base Pay' is missing from ``df``.
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched
    # without needing an explicit copy first.
    df = df.drop('Name', axis=1)

    # Shuffle all rows and discard the old index so positions run 0..n-1.
    df = df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    # Split df into X and y
    y = df['Base Pay']
    X = df.drop('Base Pay', axis=1)

    return X, y

In [None]:
# Build the shuffled feature table X and the 'Base Pay' target y.
X, y = preprocess_inputs(data)

In [None]:
# Bare expression: shows the feature table as the cell's output.
X

In [None]:
# Bare expression: shows the target Series as the cell's output.
y

# Building Pipeline

In [None]:
def build_pipeline(regressor):
    """Wrap a regressor in the shared preprocessing pipeline.

    The returned pipeline one-hot encodes the 'Primary Job Title',
    'Department' and 'College' columns, passes every other column through
    unchanged, standardizes the resulting matrix, and finally applies
    ``regressor``.

    Args:
        regressor: An unfitted scikit-learn style estimator used as the
            final pipeline step.

    Returns:
        An unfitted ``Pipeline`` with the steps 'preprocessor', 'scaler'
        and 'regressor'.
    """
    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output`, and 1.4 removed the old name. Try the new spelling
    # first and fall back to the old one so the notebook runs on both.
    # Dense output is kept because the StandardScaler step below centers the
    # data, which is not supported for sparse matrices.
    try:
        onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

    nominal_transformer = Pipeline(steps=[
        ('onehot', onehot)
    ])

    # Only the nominal columns are encoded; the rest are passed through as-is.
    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, ['Primary Job Title', 'Department', 'College'])
    ], remainder='passthrough')

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', regressor)
    ])

    return model

In [None]:
# Display label -> unfitted pipeline. The labels are left-padded so that the
# printed score lines align on the colon.
models = {
    label: build_pipeline(estimator)
    for label, estimator in (
        ("Linear Regression (Ridge)", Ridge()),
        ("            Decision Tree", DecisionTreeRegressor()),
        ("           Neural Network", MLPRegressor()),
        ("            Random Forest", RandomForestRegressor()),
        ("        Gradient Boosting", GradientBoostingRegressor()),
    )
}

# Model Selection (K-Fold CV)

In [None]:
def evaluate_model(model, X, y, n_splits=5):
    """Score a model with K-fold cross-validation.

    For every fold the model is refit on the training rows and evaluated on
    the held-out rows. KFold is created without a shuffle argument, so the
    fold membership depends on the row order of ``X`` and ``y``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refit in place on every fold, so after the call it holds the fit
            from the last fold.
        X: Feature DataFrame, indexed positionally via ``iloc``.
        y: Target Series aligned row-for-row with ``X``.
        n_splits: Number of folds (default 5, the previously hard-coded
            value).

    Returns:
        Tuple ``(mean_rmse, mean_r2)`` averaged over the folds.
    """
    kf = KFold(n_splits=n_splits)
    rmses = []
    r2s = []

    for train_idx, test_idx in kf.split(X):
        # Slice each fold once instead of re-indexing for every metric.
        X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]

        # Fit model
        model.fit(X_train, y_train)

        # Make predictions
        pred = model.predict(X_test)

        # The squared residuals feed both metrics below.
        squared_errors = (y_test - pred) ** 2

        # Calculate RMSE
        rmses.append(np.sqrt(np.mean(squared_errors)))

        # Calculate R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the
        # mean of the held-out targets.
        total_ss = np.sum((y_test - y_test.mean()) ** 2)
        r2s.append(1 - (np.sum(squared_errors) / total_ss))

    # Return average RMSE and R^2
    return np.mean(rmses), np.mean(r2s)

In [None]:
# Report the cross-validated RMSE of every model (index 0 of the result).
# Each call to evaluate_model reruns the full cross-validation.
for name, model in models.items():
    print(f"{name} RMSE: {evaluate_model(model, X, y)[0]:.2f}")

In [None]:
# Report the cross-validated R^2 of every model (index 1 of the result).
# Each call to evaluate_model reruns the full cross-validation.
for name, model in models.items():
    print(f"{name} R^2: {evaluate_model(model, X, y)[1]:.5f}")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/qmPJeMvQOkE