In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor

In [100]:
dat = pd.read_csv("./college_earnings_for_modeling.csv")

In [101]:
print(dat.columns)
print(dat.shape) # plenty of data to work with
dat.head()

Index(['School Name', 'State', 'School Ownership',
       'Full-time Faculty Rate (%)', 'Faculty Average Salary',
       'Student Enrollment Size', 'Attendance Cost',
       '150% Completion Rate at 4 Yr (%)', 'Admission Rate (%)',
       'RetentionRate_4yr', 'Female_Majority', 'SAT Average (Overall)',
       'SAT 75th Percentile Math', 'SAT 75th Percentile Reading',
       'SAT 75th Percentile Writing',
       'Percent of Students Earning >$25K (6 Yrs after Entry)',
       'Percent of Students Earning >$25K (10 Yrs after Entry)',
       'LowIncRatio_10yr', 'HighIncRatio_10yr', 'LowIncRatio_6yr',
       'HighIncRatio_6yr', 'Mean Earnings (6 Yrs after Entry)',
       'Mean Earnings (10 Yrs after Entry)'],
      dtype='object')
(1989, 23)


Unnamed: 0,School Name,State,School Ownership,Full-time Faculty Rate (%),Faculty Average Salary,Student Enrollment Size,Attendance Cost,150% Completion Rate at 4 Yr (%),Admission Rate (%),RetentionRate_4yr,...,SAT 75th Percentile Reading,SAT 75th Percentile Writing,Percent of Students Earning >$25K (6 Yrs after Entry),Percent of Students Earning >$25K (10 Yrs after Entry),LowIncRatio_10yr,HighIncRatio_10yr,LowIncRatio_6yr,HighIncRatio_6yr,Mean Earnings (6 Yrs after Entry),Mean Earnings (10 Yrs after Entry)
0,Alabama A & M University,AL,Public,99.6,91188.0,5090.0,23445.0,28.66,89.65,54.03,...,520.0,457.0,45.3,5990.0,0.090177,0.032613,0.179371,0.028291,28400.0,35500.0
1,University of Alabama at Birmingham,AL,Public,76.19,136560.0,13549.0,25542.0,61.17,80.6,86.4,...,668.0,,66.9,7470.0,0.096243,0.0479,0.108421,0.055355,39400.0,48400.0
2,University of Alabama in Huntsville,AL,Public,67.02,116364.0,7825.0,24861.0,57.14,77.11,81.8,...,700.0,,68.5,7790.0,0.087157,0.048307,0.08754,0.04524,40300.0,52000.0
3,Alabama State University,AL,Public,67.97,86328.0,3603.0,21892.0,31.77,98.88,62.02,...,531.0,,39.3,5280.0,0.421593,0.04191,0.330003,0.034416,24400.0,30600.0
4,The University of Alabama,AL,Public,77.07,124188.0,30610.0,30016.0,72.14,80.39,87.23,...,660.0,600.0,69.5,7860.0,0.046521,0.066122,0.057269,0.077328,42400.0,51600.0


In [111]:
# examine issues further
dat = dat[dat["Student Enrollment Size"]!=0] # 3 universities with no reported enrollment ---also missing many other vars
missing_values = dat.isnull().sum()
# print(missing_values.sort_values(ascending=False))
missing_values_in_rows = dat.isnull().sum(axis=1)
print(missing_values_in_rows.sort_values(ascending=False))
# important to consider: what is good threshold of missing values to constitue throwing out obs?
# could also try EM algorithm + MI for imputing these data

# drop obs with missing values in response column(s)
dat = dat[dat["Mean Earnings (6 Yrs after Entry)"].isnull()==False]



64      15
348     13
641     13
27      12
1027    12
        ..
1263     0
1264     0
1266     0
1267     0
0        0
Length: 1736, dtype: int64


In [112]:
# look for missing values/other potential issues
dat.describe()

Unnamed: 0,Full-time Faculty Rate (%),Faculty Average Salary,Student Enrollment Size,Attendance Cost,150% Completion Rate at 4 Yr (%),Admission Rate (%),RetentionRate_4yr,SAT Average (Overall),SAT 75th Percentile Math,SAT 75th Percentile Reading,SAT 75th Percentile Writing,Percent of Students Earning >$25K (6 Yrs after Entry),Percent of Students Earning >$25K (10 Yrs after Entry),LowIncRatio_10yr,HighIncRatio_10yr,LowIncRatio_6yr,HighIncRatio_6yr,Mean Earnings (6 Yrs after Entry),Mean Earnings (10 Yrs after Entry)
count,1700.0,1720.0,1735.0,1652.0,1665.0,1476.0,1657.0,1060.0,1017.0,1017.0,667.0,1717.0,1693.0,1690.0,1589.0,1701.0,1607.0,1736.0,1707.0
mean,65.420712,96844.172093,5001.518732,38032.791162,55.156354,70.078686,74.219813,1142.49717,610.47296,613.92527,572.245877,64.880955,7391.128175,2.763992,0.642358,3.742308,0.495008,39203.571429,50261.862917
std,26.933711,31461.827918,8442.298665,17114.296557,19.858825,20.19319,14.072419,130.185605,73.280119,60.746718,80.238175,14.047024,1174.523732,37.88364,7.252906,47.429555,4.914861,11389.718248,16219.323778
min,0.0,11916.0,13.0,5663.0,0.0,5.01,0.0,842.0,405.0,415.0,360.0,8.1,900.0,0.001126,0.000568,0.004929,0.002765,13300.0,18000.0
25%,45.8825,77220.0,900.0,23588.5,42.92,59.88,67.59,1053.0,560.0,570.0,520.0,58.0,6880.0,0.065933,0.05786,0.073143,0.062799,32700.0,40850.0
50%,68.275,91572.0,1894.0,34450.0,55.33,74.025,76.16,1117.0,600.0,610.0,560.0,66.7,7650.0,0.121958,0.096552,0.142178,0.101329,37600.0,47900.0
75%,89.3,112833.0,5244.0,50152.75,68.53,84.525,83.17,1207.0,650.0,650.0,610.0,73.4,8140.0,0.208883,0.149088,0.254232,0.155258,43100.0,55550.0
max,100.0,253716.0,109233.0,81531.0,100.0,100.0,100.0,1566.0,800.0,780.0,800.0,97.2,9570.0,892.514019,158.458333,1111.083333,137.083333,104500.0,171800.0


In [113]:
# define features and target
features = ["State", "School Ownership", 'Full-time Faculty Rate (%)', 'Faculty Average Salary', 'Student Enrollment Size','Attendance Cost',
            '150% Completion Rate at 4 Yr (%)', 'Admission Rate (%)', 'RetentionRate_4yr',  "Female_Majority",'SAT Average (Overall)']
X = dat.loc[:,features]
target = dat['Mean Earnings (6 Yrs after Entry)']

Xtrain, Xtest, ytrain, ytest = train_test_split(X,target,random_state=4015,test_size=0.2)

In [114]:
X.dtypes

State                                object
School Ownership                     object
Full-time Faculty Rate (%)          float64
Faculty Average Salary              float64
Student Enrollment Size             float64
Attendance Cost                     float64
150% Completion Rate at 4 Yr (%)    float64
Admission Rate (%)                  float64
RetentionRate_4yr                   float64
Female_Majority                        bool
SAT Average (Overall)               float64
dtype: object

In [122]:
numeric_features = X.select_dtypes(exclude=['object']).columns
categorical_features = ['State','School Ownership']

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    # ("poly2", PolynomialFeatures(degree=2)),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

mypipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(oob_score=True))
])

# fit pipe to the training data
mypipe.fit(Xtrain,ytrain)

In [121]:
mypipe.feature_names_in_

AttributeError: 'Pipeline' object has no attribute 'feature_importances'