In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px

import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns

# Initialise plotly for inline notebook rendering; connected=True references
# plotly.js from the CDN instead of embedding it in the notebook.
init_notebook_mode(connected=True)
# Switch cufflinks to plotly's offline mode.
cf.go_offline()
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector

# Raise pandas' display limits: frames are truncated only beyond
# 200 columns / 272 rows.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 272

from sklearn.feature_selection import SelectKBest, VarianceThreshold

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, make_scorer

from sklearn.decomposition import PCA
from joblib import dump, load

In [3]:
# Load the cleaned data set; the UnitID column becomes the row index.
df = pd.read_csv(
    "data/cleaned_df.csv",
    index_col="UnitID",
)

In [4]:
# Remove the "Unnamed: 0" column (presumably a row index left over from an
# earlier CSV export -- TODO confirm). Rebinding instead of inplace=True and
# passing errors="ignore" keeps this cell safe to re-run: a second execution
# no longer raises KeyError once the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Targets: the trailing 14 columns of df (graduation rates by group plus the
# transfer-out rate, as shown by dependent_df.head() further down).
dependent_df = df.loc[:, df.columns[-14:]]

In [6]:
# Predictors: every column of df except the trailing 14 target columns.
# NOTE(review): this is a positional slice of df, so later in-place edits of
# `features` operate on an object derived from df.
features = df.iloc[:,:-14]

#### Running pandas.get_dummies on the city of the institution would create thousands of dummy features and make the data set extremely wide, so that column is dropped (together with the institution name) before encoding

In [7]:
# Drop the two free-text identifier columns before one-hot encoding.
# `features` was created as a slice of df, so dropping with inplace=True
# mutates a derived object (pandas flags this with SettingWithCopyWarning).
# Rebinding to the returned frame gives the same columns without touching
# the slice in place.
features = features.drop(
    columns=["City location of institution (HD2019)", "Institution Name"]
)

In [8]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category so every categorical yields k-1 indicator columns.
features = pd.get_dummies(data=features, drop_first=True)

In [9]:
# Preview the encoded predictor matrix (first five rows).
features.head()

Unnamed: 0_level_0,Core_Revenues,Tuition_And_Fees,Government_Grants,Private_Gifts,Investment_Return,Sales_And_Services,Other_Revenues,Tuition_And_Fees_As_Dollar_Amount,Government_Grants_As_Dollar_Amount,Private_Gifts_As_Dollar_Amount,Investment_Return_As_Dollar_Amount,Sales_And_Services_As_Dollar_Amount,Other_Revenues_As_Dollar_Amount,Core_Expenses,Instruction_Expenses,Research_Expenses,Public_Service_Expenses,Academic_Support_Expenses,Student_Service_Expenses,Institutional_Support_Expenses,Other_Core_Expenses,Instruction_Expenses_As_Dollar_Amount,Research_Expenses_As_Dollar_Amount,Public_Service_Expenses_As_Dollar_Amount,Academic_Support_Expenses_As_Dollar_Amount,Student_Service_Expenses_As_Dollar_Amount,Institutional_Support_Expenses_As_Dollar_Amount,Other_Core_Expenses_As_Dollar_Amount,Grand total instructional_staff,Grand total men instructional_staff,Grand total women instructional_staff,American Indian or Alaska Native total instructional_staff,American Indian or Alaska Native men instructional_staff,American Indian or Alaska Native women instructional_staff,Asian total instructional_staff,Asian men instructional_staff,Asian women instructional_staff,Black or African American total instructional_staff,Black or African American men instructional_staff,Black or African American women instructional_staff,Hispanic or Latino total instructional_staff,Hispanic or Latino men instructional_staff,Hispanic or Latino women instructional_staff,Native Hawaiian or Other Pacific Islander total instructional_staff,Native Hawaiian or Other Pacific Islander men instructional_staff,Native Hawaiian or Other Pacific Islander women instructional_staff,White total instructional_staff,White men instructional_staff,White women instructional_staff,Two or more races total instructional_staff,Two or more races men instructional_staff,Two or more races women instructional_staff,Race/ethnicity unknown total instructional_staff,Race/ethnicity unknown men instructional_staff,Race/ethnicity 
unknown women instructional_staff,Nonresident alien total instructional_staff,Nonresident alien men instructional_staff,Nonresident alien women instructional_staff,Grand total men instructional_staff_as_percentage,Grand total women instructional_staff_as_percentage,American Indian or Alaska Native total instructional_staff_as_percentage,American Indian or Alaska Native men instructional_staff_as_percentage,American Indian or Alaska Native women instructional_staff_as_percentage,Asian total instructional_staff_as_percentage,Asian men instructional_staff_as_percentage,Asian women instructional_staff_as_percentage,Black or African American total instructional_staff_as_percentage,Black or African American men instructional_staff_as_percentage,Black or African American women instructional_staff_as_percentage,Hispanic or Latino total instructional_staff_as_percentage,Hispanic or Latino men instructional_staff_as_percentage,Hispanic or Latino women instructional_staff_as_percentage,Native Hawaiian or Other Pacific Islander total instructional_staff_as_percentage,Native Hawaiian or Other Pacific Islander men instructional_staff_as_percentage,Native Hawaiian or Other Pacific Islander women instructional_staff_as_percentage,White total instructional_staff_as_percentage,White men instructional_staff_as_percentage,White women instructional_staff_as_percentage,Two or more races total instructional_staff_as_percentage,Two or more races men instructional_staff_as_percentage,Two or more races women instructional_staff_as_percentage,Race/ethnicity unknown total instructional_staff_as_percentage,Race/ethnicity unknown men instructional_staff_as_percentage,Race/ethnicity unknown women instructional_staff_as_percentage,Nonresident alien total instructional_staff_as_percentage,Nonresident alien men instructional_staff_as_percentage,Nonresident alien women instructional_staff_as_percentage,Percent of full-time first-time undergraduates awarded any financial aid (SFA1819),Percent of 
full-time first-time undergraduates awarded federal state local or institutional grant aid (SFA1819),Percent of total enrollment that are American Indian or Alaska Native (DRVEF2013_RV),Percent of total enrollment that are Asian/Native Hawaiian/Pacific Islander (DRVEF2013_RV),Percent of total enrollment that are Asian (DRVEF2013_RV),Percent of total enrollment that are Native Hawaiian or Other Pacific Islander (DRVEF2013_RV),Percent of total enrollment that are Black or African American (DRVEF2013_RV),Percent of total enrollment that are Hispanic/Latino (DRVEF2013_RV),Percent of total enrollment that are White (DRVEF2013_RV),Percent of total enrollment that are Race/ethnicity unknown (DRVEF2013_RV),Percent of total enrollment that are Nonresident Alien (DRVEF2013_RV),Percent of total enrollment that are two or more races (DRVEF2013_RV),Percent of total enrollment that are women (DRVEF2013_RV),...,Carnegie Classification 2018: Enrollment Profile (HD2018)_Majority graduate,Carnegie Classification 2018: Enrollment Profile (HD2018)_Majority undergraduate,"Carnegie Classification 2018: Enrollment Profile (HD2018)_Not applicable, not in Carnegie universe (not accredited or nondegree-granting)",Carnegie Classification 2018: Enrollment Profile (HD2018)_Very high undergraduate,Carnegie Classification 2018: Enrollment Profile (HD2018)_isMissing,"Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, large, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, large, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, large, primarily residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, medium, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, medium, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, medium, primarily residential","Carnegie Classification 2018: Size and Setting 
(HD2018)_Four-year, small, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, small, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, small, primarily residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, very small, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, very small, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, very small, primarily residential","Carnegie Classification 2018: Size and Setting (HD2018)_Not applicable, not in Carnegie universe (not accredited or nondegree-granting)","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, large","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, medium","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, small","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, very large","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, very small",Carnegie Classification 2018: Size and Setting (HD2018)_isMissing,Historically Black College or University (HD2018)_Yes,Historically Black College or University (HD2018)_isMissing,"Institution size category (HD2018)_10,000 - 19,999","Institution size category (HD2018)_20,000 and above","Institution size category (HD2018)_5,000 - 9,999","Institution size category (HD2018)_Under 1,000",Institution size category (HD2018)_isMissing,Parent/child indicator - Finance (FLAGS2019)_Child record - reports partial data but other data is included with entity that is not a postsecondary institution,Parent/child indicator - Finance (FLAGS2019)_Not applicable,Parent/child indicator - Finance (FLAGS2019)_Parent record - includes data from branch campuses,Parent/child indicator - Finance (FLAGS2019)_Partial child record - reports revenues/expenses. 
Assets/liabilties reported with parent,"Sector of institution (HD2018)_Private for-profit, 4-year or above","Sector of institution (HD2018)_Private for-profit, less-than 2-year","Sector of institution (HD2018)_Private not-for-profit, 2-year","Sector of institution (HD2018)_Private not-for-profit, 4-year or above","Sector of institution (HD2018)_Public, 2-year","Sector of institution (HD2018)_Public, 4-year or above",Sector of institution (HD2018)_isMissing,State abbreviation (HD2018)_Alaska,State abbreviation (HD2018)_American Samoa,State abbreviation (HD2018)_Arizona,State abbreviation (HD2018)_Arkansas,State abbreviation (HD2018)_California,State abbreviation (HD2018)_Colorado,State abbreviation (HD2018)_Connecticut,State abbreviation (HD2018)_Delaware,State abbreviation (HD2018)_District of Columbia,State abbreviation (HD2018)_Federated States of Micronesia,State abbreviation (HD2018)_Florida,State abbreviation (HD2018)_Georgia,State abbreviation (HD2018)_Guam,State abbreviation (HD2018)_Hawaii,State abbreviation (HD2018)_Idaho,State abbreviation (HD2018)_Illinois,State abbreviation (HD2018)_Indiana,State abbreviation (HD2018)_Iowa,State abbreviation (HD2018)_Kansas,State abbreviation (HD2018)_Kentucky,State abbreviation (HD2018)_Louisiana,State abbreviation (HD2018)_Maine,State abbreviation (HD2018)_Marshall Islands,State abbreviation (HD2018)_Maryland,State abbreviation (HD2018)_Massachusetts,State abbreviation (HD2018)_Michigan,State abbreviation (HD2018)_Minnesota,State abbreviation (HD2018)_Mississippi,State abbreviation (HD2018)_Missouri,State abbreviation (HD2018)_Montana,State abbreviation (HD2018)_Nebraska,State abbreviation (HD2018)_Nevada,State abbreviation (HD2018)_New Hampshire,State abbreviation (HD2018)_New Jersey,State abbreviation (HD2018)_New Mexico,State abbreviation (HD2018)_New York,State abbreviation (HD2018)_North Carolina,State abbreviation (HD2018)_North Dakota,State abbreviation (HD2018)_Northern Marianas,State abbreviation 
(HD2018)_Ohio,State abbreviation (HD2018)_Oklahoma,State abbreviation (HD2018)_Oregon,State abbreviation (HD2018)_Pennsylvania,State abbreviation (HD2018)_Puerto Rico,State abbreviation (HD2018)_Rhode Island,State abbreviation (HD2018)_South Carolina,State abbreviation (HD2018)_South Dakota,State abbreviation (HD2018)_Tennessee,State abbreviation (HD2018)_Texas,State abbreviation (HD2018)_Utah,State abbreviation (HD2018)_Vermont,State abbreviation (HD2018)_Virgin Islands,State abbreviation (HD2018)_Virginia,State abbreviation (HD2018)_Washington,State abbreviation (HD2018)_West Virginia,State abbreviation (HD2018)_Wisconsin,State abbreviation (HD2018)_Wyoming,State abbreviation (HD2018)_isMissing
UnitID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1
180203,7078959.0,2.0,60.0,7.0,1.0,0.0,29.0,141579.2,4247375.4,495527.13,70789.59,0.0,2052898.11,7294344.0,29.0,3.0,7.0,3.0,8.0,16.0,36.0,2115360.0,218830.32,510604.08,218830.32,583547.52,1167095.04,2625963.84,13.0,7.0,6.0,8.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.85,46.15,61.54,30.77,30.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.46,23.08,15.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,82.0,88.0,0.0,0.0,0.0,0.0,1.0,10.0,0.0,0.0,0.0,57.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
222178,136379482.0,57.0,4.0,17.0,18.0,0.0,4.0,77736300.0,5455179.28,23184511.94,24548306.76,0.0,5455179.28,121513039.0,38.0,1.0,2.0,10.0,26.0,24.0,0.0,46174950.0,1215130.39,2430260.78,12151303.9,31593390.14,29163129.36,0.0,265.0,154.0,111.0,1.0,1.0,0.0,3.0,2.0,1.0,14.0,7.0,7.0,13.0,8.0,5.0,0.0,0.0,0.0,228.0,131.0,97.0,3.0,2.0,1.0,0.0,0.0,0.0,3.0,3.0,0.0,58.11,41.89,0.38,0.38,0.0,1.13,0.75,0.38,5.28,2.64,2.64,4.91,3.02,1.89,0.0,0.0,0.0,86.04,49.43,36.6,1.13,0.75,0.38,0.0,0.0,0.0,1.13,1.13,0.0,100.0,100.0,0.0,1.0,1.0,0.0,8.0,11.0,69.0,2.0,4.0,3.0,58.0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
138558,70491879.0,17.0,17.0,2.0,0.0,0.0,28.0,11983620.0,11983619.43,1409837.58,0.0,0.0,19737726.12,50201124.0,42.0,0.0,3.0,15.0,9.0,19.0,13.0,21084470.0,0.0,1506033.72,7530168.6,4518101.16,9538213.56,6526146.12,149.0,77.0,72.0,0.0,0.0,0.0,10.0,5.0,5.0,10.0,7.0,3.0,3.0,2.0,1.0,0.0,0.0,0.0,124.0,62.0,62.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,51.68,48.32,0.0,0.0,0.0,6.71,3.36,3.36,6.71,4.7,2.01,2.01,1.34,0.67,0.0,0.0,0.0,83.22,41.61,41.61,0.0,0.0,0.0,0.67,0.67,0.0,0.67,0.0,0.67,91.0,86.0,0.0,1.0,1.0,0.0,12.0,6.0,78.0,0.0,2.0,1.0,54.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
172866,3019292.0,100.0,0.0,0.0,0.0,0.0,0.0,3019292.0,0.0,0.0,0.0,0.0,0.0,2767687.0,13.0,0.0,0.0,36.0,11.0,4.0,37.0,359799.3,0.0,0.0,996367.32,304445.57,110707.48,1024044.19,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100.0,100.0,1.0,9.0,9.0,1.0,14.0,1.0,70.0,3.0,0.0,1.0,21.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
108232,217471648.0,100.0,0.0,0.0,0.0,0.0,0.0,217471600.0,0.0,0.0,0.0,0.0,0.0,211573114.0,52.0,0.0,0.0,0.0,19.0,15.0,14.0,110018000.0,0.0,0.0,0.0,40198891.66,31735967.1,29620235.96,219.0,124.0,95.0,0.0,0.0,0.0,22.0,6.0,16.0,4.0,3.0,1.0,8.0,5.0,3.0,3.0,1.0,2.0,141.0,84.0,57.0,2.0,1.0,1.0,39.0,24.0,15.0,0.0,0.0,0.0,56.62,43.38,0.0,0.0,0.0,10.05,2.74,7.31,1.83,1.37,0.46,3.65,2.28,1.37,1.37,0.46,0.91,64.38,38.36,26.03,0.91,0.46,0.46,17.81,10.96,6.85,0.0,0.0,0.0,59.0,49.0,0.0,7.0,7.0,0.0,6.0,8.0,25.0,20.0,31.0,2.0,58.0,...,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Preview the 14 target columns (first five rows).
dependent_df.head()

Unnamed: 0_level_0,Graduation rate total cohort (DRVGR2019),Graduation rate men (DRVGR2019),Graduation rate women (DRVGR2019),Graduation rate American Indian or Alaska Native (DRVGR2019),Graduation rate Asian/Native Hawaiian/Other Pacific Islander (DRVGR2019),Graduation rate Asian (DRVGR2019),Graduation rate Native Hawaiian or Other Pacific Islander (DRVGR2019),Graduation rate Black non-Hispanic (DRVGR2019),Graduation rate Hispanic (DRVGR2019),Graduation rate White non-Hispanic (DRVGR2019),Graduation rate two or more races (DRVGR2019),Graduation rate Race/ethnicity unknown (DRVGR2019),Graduation rate Nonresident alien (DRVGR2019),Transfer-out rate total cohort (DRVGR2019)
UnitID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
180203,29.0,13.0,46.0,26.0,,,,,,33.0,,,,
222178,61.0,56.0,65.0,25.0,58.0,58.0,,53.0,50.0,67.0,54.0,0.0,57.0,
138558,26.0,28.0,25.0,33.0,25.0,25.0,,12.0,32.0,29.0,20.0,100.0,33.0,39.0
172866,0.0,0.0,0.0,,,,,0.0,,0.0,,,,
108232,45.0,38.0,51.0,67.0,56.0,57.0,40.0,20.0,33.0,50.0,40.0,28.0,57.0,14.0


In [11]:
# Remove the same two identifier columns from df itself, so that frames
# sliced from df later on do not carry them either.
df = df.drop(
    columns=["City location of institution (HD2019)", "Institution Name"]
)

## train model and return results

In [12]:
def train_model(model, X, y, test_size=0.15, random_state=0):
    """Fit ``model`` on a train split of ``(X, y)`` and score the held-out split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training portion, so the caller's object is
        left in its fitted state after the call.
    X, y : features and target, forwarded unchanged to ``train_test_split``.
    test_size : forwarded to ``train_test_split``. Defaults to 0.15, the
        value that was previously hard-coded.
    random_state : seed forwarded to ``train_test_split``. Defaults to 0,
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(mae, mse)``: mean absolute error and mean squared error of the
        predictions on the held-out split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    return (
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
    )


#### A list of dictionaries: each entry holds the parameter key-value pairs and the model results recorded for one experiment

In [13]:
# Experiment log: one result dictionary is appended per run.
baseline_epoch = list()

In [14]:
# Positional indices 0..123 handed to the imputer and scaler.
# NOTE(review): assumes the first 124 columns of the predictor matrix are the
# numeric (non-dummy) ones -- confirm if the column layout changes.
num_cols = [position for position in range(124)]

In [15]:
# Predictors are the encoded feature matrix; the target is the 14th column
# from the end of df (the first of the 14 target columns).
X, y = features, df.iloc[:, -14]

# Elastic Net Regression Pipeline

In [16]:
# Two preprocessing stages followed by an elastic-net regressor. Each
# ColumnTransformer acts only on the positions listed in `num_cols` and
# passes every remaining column through unchanged.
model = Pipeline(
    steps=[
        # Stage 1: fill missing values in the numeric columns.
        (
            "Imputer",
            ColumnTransformer(
                transformers=[("Impute", SimpleImputer(), num_cols)],
                remainder="passthrough",
            ),
        ),
        # Stage 2: standardise those same numeric columns.
        (
            "Scaler",
            ColumnTransformer(
                transformers=[("Scale", StandardScaler(), num_cols)],
                remainder="passthrough",
            ),
        ),
        # Final estimator. The step keeps the name "classifier" even though
        # ElasticNet is a regressor, so existing references stay valid.
        ("classifier", ElasticNet(alpha=0.1, l1_ratio=0.5)),
    ]
)

### First predicting total graduation rates

In [17]:
# Fit on the total-cohort target and keep the held-out MAE / MSE.
mae_total, mse_total = train_model(model=model, X=X, y=y)

### predicting African American Graduation Rates

In [18]:
# Keep only the rows whose African American graduation rate (the 7th column
# from the end of df) is present; rows with a missing value are discarded.
df_black = df.loc[df.iloc[:, -7].notna()]

In [19]:
# Same split as for the full data set: everything but the trailing 14 columns
# is a predictor, and the 7th column from the end is the target.
X_black, y_black = df_black.iloc[:, :-14], df_black.iloc[:, -7]

In [20]:
# One-hot encode the categorical predictors of the filtered subset, again
# omitting the first level of every category.
X_black = pd.get_dummies(data=X_black, drop_first=True)

In [21]:
# Refit the same pipeline object on the filtered subset and score it.
mae_black, mse_black = train_model(model=model, X=X_black, y=y_black)

In [22]:
# Display the held-out (MAE, MSE) pair for the African American graduation-rate model.
mae_black, mse_black

(14.861809376780581, 411.01293077069744)

In [23]:
# Record this experiment. The hyper-parameters are read back from the
# pipeline's final step instead of being typed in again, so the log cannot
# drift from the settings the estimator was actually built with.
# Note: `model` is the single pipeline object that was fitted for both
# targets, so the stored pipeline reflects the most recent fit.
_logged_estimator = model.named_steps["classifier"]
results = {
    "model": model,
    "alpha": _logged_estimator.alpha,
    "l1_ratio": _logged_estimator.l1_ratio,
    "mae_total": mae_total,
    "mse_total": mse_total,
    "mae_black": mae_black,
    "mse_black": mse_black
}

In [24]:
# Add this run to the experiment log (re-running the cell adds it again).
baseline_epoch.extend([results])

In [25]:
# Persist the experiment log with joblib; the call returns the list of
# written file names, which the notebook displays.
dump(value=baseline_epoch, filename="data/model_logging.joblib")

['data/model_logging.joblib']

In [26]:
# Inspect the first logged experiment.
baseline_epoch[0]

{'model': Pipeline(steps=[('Imputer',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('Impute', SimpleImputer(),
                                                   [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                    10, 11, 12, 13, 14, 15, 16,
                                                    17, 18, 19, 20, 21, 22, 23,
                                                    24, 25, 26, 27, 28, 29, ...])])),
                 ('Scaler',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('Scale', StandardScaler(),
                                                   [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                    10, 11, 12, 13, 14, 15, 16,
                                                    17, 18, 19, 20, 21, 22, 23,
                                                    24, 25, 26, 27, 28, 29, ...])])),
