In [31]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

## Load Datasets

In [32]:
train_features = pd.read_csv('train_features.csv')
train_target = pd.read_csv('train_salaries.csv')
test_features = pd.read_csv('test_features.csv')

raw_merged = pd.merge(train_features, train_target, how='inner', on='jobId') 

In [33]:
# From data exploration we know the categorical and numerical variables
categorical_vars = ['companyId', 'jobType', 'degree', 'major', 'industry']
numerical_vars = ['yearsExperience', 'milesFromMetropolis']
target_var = 'salary'

## Preprocess Data

In [34]:
def clean_data(df):
    clean_df = df[(df['salary'] >= 8.5)] # Dropping outliers where 8.5 was the lowerbound based on inter-quartile range
    return clean_df

def get_encoded_features_df(df, cat_vars=None, num_vars=None):
    ''' gets dummy variables for categorical variables and combines with numerical variables'''
    category_df = pd.get_dummies(df[cat_vars])
    numerical_df = df[num_vars].apply(pd.to_numeric)
    return pd.concat([category_df, numerical_df], axis=1)

def get_target_df(df, target_var):
    ''' returns target variable dataframe'''
    return df[target_var]

In [35]:
# Preprocess the data

clean_df = clean_data(raw_merged)
train_features_df = get_encoded_features_df(clean_df, categorical_vars, numerical_vars)
target_df = get_target_df(clean_df, target_var)

## Models

Will run three different models:
1) Linear Regression
2) Random Forest
3) Gradient Boosting

In [36]:
# Initialize model list and error dictionaries
models = []
mean_mse={}
cv_std={}

In [37]:
# Functions to cross-validate models. Scoring metric is mean squared error

def train_model(model, feature_df, target_df, mean_mse, cv_std):
    neg_mse = cross_val_score(model, feature_df, target_df, cv=3, scoring='neg_mean_squared_error')
    mean_mse[model] = -1.0*np.mean(neg_mse)
    cv_std[model] = np.std(neg_mse)

def print_summary(model, mean_mse, cv_std):
    print('\nModel:\n', model)
    print('Average MSE:\n', mean_mse[model])
    print('Standard deviation during CV:\n', cv_std[model])

### Baseline Model

In [38]:
from sklearn.dummy import DummyRegressor
dummy_reg = DummyRegressor(strategy='mean')
train_model(dummy_reg, train_features_df, target_df, mean_mse, cv_std)
print_summary(dummy_reg, mean_mse, cv_std)


Model:
 DummyRegressor(constant=None, quantile=None, strategy='mean')
Average MSE:
 1499.0193095475988
Standard deviation during CV:
 2.58357923724918


### Regression Models

In [39]:
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=150, max_depth=25, max_features=30) # Hyperparameters tuned by using GridSearchCV
gbm = GradientBoostingRegressor(n_estimators =150, max_depth=5, loss='ls') # Hyperparameters tuned by using GridSearchCV

models.extend([lr, rf, gbm])


In [41]:
for model in models:
    train_model(model, train_features_df, target_df, mean_mse, cv_std)
    print_summary(model, mean_mse, cv_std)


Model:
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Average MSE:
 384.45647337937385
Standard deviation during CV:
 1.2864488234922833

Model:
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
           max_features=30, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Average MSE:
 373.88899233176966
Standard deviation during CV:
 1.2442792275643169

Model:
 GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estim

**The Gradient Boosting Regression model reduced the mean squared error to 356 compared to 1499 of the base model.**