Installs and imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.metrics as skm

In [10]:
# Load the raw college ROI table and key its rows by school name.
# drop=False keeps "School Name" available as an ordinary column too.
college_dat = pd.read_csv("college_roi_dat.csv").set_index("School Name", drop=False)

In [11]:
# Predictor columns taken from the raw college table.
features = [
    "School Ownership",
    'Full-time Faculty Rate (%)',
    'Faculty Average Salary',
    'Student Enrollment Size',
    'Attendance Cost',
    '150% Completion Rate at 4 Yr (%)',
    'Admission Rate (%)',
    'RetentionRate_4yr',
    "Female_Majority",
    'SAT Average (Overall)',
]

# Feature matrix, indexed by school name.
X = college_dat[features]
X.index = college_dat['School Name']

# Two responses: mean earnings 6 and 10 years after entry.
target = college_dat['Mean Earnings (6 Yrs after Entry)']
target2 = college_dat['Mean Earnings (10 Yrs after Entry)']

# First split: 70% train / 30% hold-out (6-year response only).
Xtrain, Xtest0, ytrain, ytest0 = train_test_split(
    X, target, test_size=0.3, random_state=4015
)
# Second split: the hold-out is divided again, 70% validation / 30% final test.
Xvalid, Xtest, yvalid, ytest = train_test_split(
    Xtest0, ytest0, test_size=0.3, random_state=4015
)

In [4]:
def _read_saved(stem, as_series=False):
    """Read ``./saved_data/<stem>.csv`` with "School Name" as the index.

    Parameters
    ----------
    stem : str
        File name without the ``.csv`` extension.
    as_series : bool, default False
        When True, the loaded frame is passed through ``.squeeze()``
        (used for the response files).

    Returns
    -------
    pandas.DataFrame, or the result of ``DataFrame.squeeze()`` when
    ``as_series`` is True.
    """
    frame = pd.read_csv(f"./saved_data/{stem}.csv", index_col="School Name")
    return frame.squeeze() if as_series else frame


# Preprocessed (filled) data for the first response, plus its train/test pieces.
# These assignments overwrite any ytrain / ytest0 already defined in the session.
X_filled = _read_saved("X_filled")
Xtrain_filled = _read_saved("Xtrain_filled")
ytrain = _read_saved("ytrain", as_series=True)
Xtest_filled = _read_saved("Xtest_filled")  # note: filled test set is from Xtest0
ytest0 = _read_saved("ytest", as_series=True)

# Same layout for the 10-years-post-entry data.
X_filled10 = _read_saved("X_filled10")
Xtrain_filled10 = _read_saved("Xtrain_filled10")
ytrain10 = _read_saved("ytrain10", as_series=True)
Xtest_filled10 = _read_saved("Xtest_filled10")  # note: filled test set is from Xtest0
ytest10 = _read_saved("ytest10", as_series=True)

# Add an intercept column ("const") for the statsmodels linear regression fit
# (could also use penalized regression).
# has_constant="add" forces the column to be added to every matrix. With the
# default ("skip"), a split that happens to contain a non-zero constant column
# would be returned without "const", so the train and test design matrices
# could end up with different columns.
X_plus = sm.add_constant(X_filled, prepend=True, has_constant="add")
Xtrainplus = sm.add_constant(Xtrain_filled, prepend=True, has_constant="add")
Xtestplus = sm.add_constant(Xtest_filled, prepend=True, has_constant="add")

In [5]:
# Preview the first rows of the full design matrix, the training design
# matrix, and the training response, in that order.
display(X_plus.head(), Xtrainplus.head(), ytrain.head())

Unnamed: 0_level_0,const,Full-time Faculty Rate (%),Faculty Average Salary,Student Enrollment Size,Attendance Cost,Completion Rate,Admission Rate (%),Retention Rate,SAT Average,Ivy League Plus,x0_Private ForProfit,x0_Private NonProfit,x0_Public
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alabama A & M University,1.0,99.6,91188.0,5090.0,23445.0,28.66,89.65,54.03,959.0,0.0,0.0,0.0,1.0
University of Alabama at Birmingham,1.0,76.19,136560.0,13549.0,25542.0,61.17,80.6,86.4,1245.0,0.0,0.0,0.0,1.0
University of Alabama in Huntsville,1.0,67.02,116364.0,7825.0,24861.0,57.14,77.11,81.8,1300.0,0.0,0.0,0.0,1.0
Alabama State University,1.0,67.97,86328.0,3603.0,21892.0,31.77,98.88,62.02,938.0,0.0,0.0,0.0,1.0
The University of Alabama,1.0,77.07,124188.0,30610.0,30016.0,72.14,80.39,87.23,1262.0,0.0,0.0,0.0,1.0


Unnamed: 0_level_0,const,Full-time Faculty Rate (%),Faculty Average Salary,Student Enrollment Size,Attendance Cost,Completion Rate,Admission Rate (%),Retention Rate,SAT Average,Ivy League Plus,x0_Private ForProfit,x0_Private NonProfit,x0_Public
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Saginaw Valley State University,1.0,50.28,101148.0,6953.0,22353.0,47.88,89.46,77.37,1086.0,0.0,0.0,0.0,1.0
Centre College,1.0,93.2,99564.0,1333.0,56645.0,83.16,72.08,89.3,1325.0,0.0,0.0,1.0,0.0
SUNY Empire State College,1.0,22.53,89832.0,8221.0,19726.0,27.03,80.041536,64.91,1116.619148,0.0,0.0,0.0,1.0
Boise Bible College,1.0,90.0,54900.0,94.0,24258.0,38.64,90.38,66.67,988.0,0.0,0.0,1.0,0.0
West Coast University-Dallas,1.0,25.56,88980.0,1182.0,27991.0,100.0,73.095845,0.0,1097.854469,0.0,1.0,0.0,0.0


School Name
Saginaw Valley State University    35400.0
Centre College                     38500.0
SUNY Empire State College          42200.0
Boise Bible College                26900.0
West Coast University-Dallas       71900.0
Name: Mean Earnings (6 Yrs after Entry), dtype: float64

In [6]:
# Ordinary least squares on the training split.
# statsmodels naming: endog is the response (y), exog is the design matrix (X).
mod = sm.OLS(endog=ytrain, exog=Xtrainplus)
ols_res = mod.fit()

# Print the regression table for this fit.
print(ols_res.summary())


                                    OLS Regression Results                                   
Dep. Variable:     Mean Earnings (6 Yrs after Entry)   R-squared:                       0.437
Model:                                           OLS   Adj. R-squared:                  0.432
Method:                                Least Squares   F-statistic:                     84.82
Date:                               Mon, 18 Dec 2023   Prob (F-statistic):          1.60e-141
Time:                                       21:55:37   Log-Likelihood:                -12736.
No. Observations:                               1215   AIC:                         2.550e+04
Df Residuals:                                   1203   BIC:                         2.556e+04
Df Model:                                         11                                         
Covariance Type:                           nonrobust                                         
                                 coef    std err          t 

There definitely appears to be collinearity here, as terms like full-time faculty rate and retention rate have negative coefficients, which is not aligned with the relationships shown by the exploratory analysis.
I explore this collinearity and these relationships with the correlation matrix below.

In [7]:
# Summary statistics for the filled features alongside both responses
# (y6 / y10 are aligned to X_filled by index).
X_filled.assign(**{"y6": target, "y10": target2}).describe()

NameError: name 'target' is not defined

In [8]:
# Pairwise correlations between the filled features and both responses,
# drawn as an annotated heatmap.
sns.set(font_scale=1, rc={'figure.figsize': (10, 5)})
corr_matrix = X_filled.assign(y6=target, y10=target2).corr()
corr_map = sns.heatmap(corr_matrix, annot=True)
corr_map.set_title("Correlation matrix between main features and two response variables")

NameError: name 'target' is not defined

After examining the correlation matrix above, it appears that there is multicollinearity inherent in the features, and this is likely affecting the sign and the significance of retention rate. Full-time faculty rate may also be affected by multicollinearity, but this seems less likely after examining the correlation matrix. Given this variable's correlation coefficient with the response, the effect of full-time faculty rate on salary after college entry may in fact be negligible.

In [9]:
# Evaluate the OLS fit on the training split and on the held-out split.
# Predictions and RMSE are computed once per split and reused below.
ols_train_pred = ols_res.predict(Xtrainplus)
ols_test_pred = ols_res.predict(Xtestplus)
ols_train_rmse = np.sqrt(skm.mean_squared_error(ytrain, ols_train_pred))
ols_test_rmse = np.sqrt(skm.mean_squared_error(ytest0, ols_test_pred))

# Training metrics (this R^2 line was previously mislabeled as "Test R^2").
print("Train R^2:", skm.r2_score(ytrain, ols_train_pred))
print(f'Train RMSE: {ols_train_rmse}')
# Scaled RMSE = RMSE divided by the response's standard deviation.
print(f'Train Scaled RMSE (num sds): {ols_train_rmse / np.std(ytrain)}')

# Held-out metrics.
print("Test R^2:", skm.r2_score(ytest0, ols_test_pred))
print("Test RMSE:", ols_test_rmse)
print("Test Scaled RMSE:", ols_test_rmse / np.std(ytest0))

Test R^2: 0.43680365073100913
Train RMSE: 8632.322577101606
Train Scaled RMSE (num sds): 0.7504640892600996
Test R^2: 0.4522258680990987
Test RMSE: 8214.078101600946
Test Scaled RMSE: 0.7401176473378411


### Lasso Regression
Here I fit a lasso regression model, another supervised learning technique that requires minimal hyperparameter tuning.
This model can hopefully account for multicollinearity and possible overfitting using regularization.


In [17]:
# Lasso with the penalty strength chosen by 10-fold cross-validation,
# fit on the training features (no intercept column; the estimator fits its own).
# NOTE(review): the features are passed in on their original scales (e.g. salary
# in the tens of thousands next to 0/1 dummies), so the L1 penalty does not
# treat columns evenly — consider standardizing first; confirm intent.
lcv = LassoCV(cv=10)
lcv.fit(X=Xtrain_filled, y=ytrain)

In [18]:
# Fitted intercept of the cross-validated lasso model.
lcv.intercept_

15418.754412458533

In [19]:
# Print the training column names, then show the fitted lasso coefficients.
# The model was fit on Xtrain_filled, so the coefficients follow this column order.
print(Xtrain_filled.columns)
lcv.coef_

Index(['Full-time Faculty Rate (%)', 'Faculty Average Salary',
       'Student Enrollment Size', 'Attendance Cost', 'Completion Rate',
       'Admission Rate (%)', 'Retention Rate', 'SAT Average',
       'Ivy League Plus', 'x0_Private ForProfit', 'x0_Private NonProfit',
       'x0_Public'],
      dtype='object')


array([-0.        ,  0.2032263 , -0.06771863,  0.11784527,  0.        ,
       -0.        , -0.        ,  0.        ,  0.        ,  0.        ,
       -0.        , -0.        ])

In [20]:
# Penalty strength (alpha) selected by cross-validation.
lcv.alpha_

217782.12334472904

In [21]:
# Sanity check: the first four held-out predictions should be on the same
# scale as the earnings response.
lcv.predict(Xtest_filled)[:4]

array([36472.72893207, 41141.88934127, 44252.08390608, 37887.17292213])

In [22]:
# Lasso fit quality on the training split; predictions and RMSE computed once.
lasso_train_pred = lcv.predict(Xtrain_filled)
lasso_train_rmse = np.sqrt(skm.mean_squared_error(ytrain, lasso_train_pred))
print("Train R^2:", lcv.score(Xtrain_filled, ytrain))
print(f'Train RMSE: {lasso_train_rmse}')
print(f'Train Scaled RMSE (num sds): {lasso_train_rmse / np.std(ytrain)}')

# Same metrics on the held-out split.
lasso_test_pred = lcv.predict(Xtest_filled)
lasso_test_rmse = np.sqrt(skm.mean_squared_error(ytest0, lasso_test_pred))
print("Test R^2:", lcv.score(Xtest_filled, ytest0))
print(f'RMSE: {lasso_test_rmse}')
print(f'Scaled RMSE (num sds): {lasso_test_rmse / np.std(ytest0)}')
# Results are not horrible:
# only slightly worse than linear regression (in both fit and prediction).

Train R^2: 0.39498007221660325
Train RMSE: 8947.106051591509
Train Scaled RMSE (num sds): 0.7778302692640577
Test R^2: 0.42736728173586325
RMSE: 8398.39211217511
Scaled RMSE (num sds): 0.7567249951363683


Interesting to see how lasso, which regularized the feature set to only include 3 features, still had almost identical fit and performance to linear regression.
(It will be interesting to see how nonparametric methods perform in comparison, especially those with implicit regularization, such as a random forest.)