In [35]:
import pandas as pd
import numpy as np
import xgboost as xgb

#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px

import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns

# Initialise plotly's notebook mode and switch cufflinks to offline mode so that
# figures render inside the notebook.
init_notebook_mode(connected=True)
cf.go_offline()
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
# Widen pandas' display limits so the very wide feature frames are not truncated.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 272)

In [36]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA

In [37]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/cleaned_df.csv")

In [38]:
# Drop the leftover CSV index column. Rebinding `df` (rather than inplace=True) and
# passing errors="ignore" keeps this cell idempotent: re-running it no longer raises
# KeyError once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [39]:
# The last 14 columns of the frame are treated as the dependent (outcome) variables.
dependent_df = df.iloc[:,-14:]

In [40]:
# Preview the outcome columns.
dependent_df.head()

Unnamed: 0,Graduation rate total cohort (DRVGR2019),Graduation rate men (DRVGR2019),Graduation rate women (DRVGR2019),Graduation rate American Indian or Alaska Native (DRVGR2019),Graduation rate Asian/Native Hawaiian/Other Pacific Islander (DRVGR2019),Graduation rate Asian (DRVGR2019),Graduation rate Native Hawaiian or Other Pacific Islander (DRVGR2019),Graduation rate Black non-Hispanic (DRVGR2019),Graduation rate Hispanic (DRVGR2019),Graduation rate White non-Hispanic (DRVGR2019),Graduation rate two or more races (DRVGR2019),Graduation rate Race/ethnicity unknown (DRVGR2019),Graduation rate Nonresident alien (DRVGR2019),Transfer-out rate total cohort (DRVGR2019)
0,29.0,13.0,46.0,26.0,,,,,,33.0,,,,
1,61.0,56.0,65.0,25.0,58.0,58.0,,53.0,50.0,67.0,54.0,0.0,57.0,
2,26.0,28.0,25.0,33.0,25.0,25.0,,12.0,32.0,29.0,20.0,100.0,33.0,39.0
3,0.0,0.0,0.0,,,,,0.0,,0.0,,,,
4,45.0,38.0,51.0,67.0,56.0,57.0,40.0,20.0,33.0,50.0,40.0,28.0,57.0,14.0


In [41]:
# Every column before the last 14 (the outcome columns) is a candidate feature.
features = df.iloc[:,:-14]

In [42]:
# Remove the city and institution-name columns before encoding.
# `features` is a slice of `df`, so an in-place drop can trigger pandas'
# chained-assignment warning and raises KeyError when the cell is re-run.
# Rebinding to a new frame with errors="ignore" makes the cell idempotent.
features = features.drop(
    columns=["City location of institution (HD2019)", "Institution Name"],
    errors="ignore",
)

In [48]:
# One-hot encode the categorical columns; drop_first=True omits one level per category.
# NOTE(review): the preview of this frame still lists UnitID among the features --
# presumably an institution identifier; confirm it is meant to be modelled.
features = pd.get_dummies(features, drop_first=True)

In [49]:
# Preview the first two rows of the encoded feature matrix.
features.head(2)

Unnamed: 0,UnitID,Core_Revenues,Tuition_And_Fees,Government_Grants,Private_Gifts,Investment_Return,Sales_And_Services,Other_Revenues,Tuition_And_Fees_As_Dollar_Amount,Government_Grants_As_Dollar_Amount,Private_Gifts_As_Dollar_Amount,Investment_Return_As_Dollar_Amount,Sales_And_Services_As_Dollar_Amount,Other_Revenues_As_Dollar_Amount,Core_Expenses,Instruction_Expenses,Research_Expenses,Public_Service_Expenses,Academic_Support_Expenses,Student_Service_Expenses,Institutional_Support_Expenses,Other_Core_Expenses,Instruction_Expenses_As_Dollar_Amount,Research_Expenses_As_Dollar_Amount,Public_Service_Expenses_As_Dollar_Amount,Academic_Support_Expenses_As_Dollar_Amount,Student_Service_Expenses_As_Dollar_Amount,Institutional_Support_Expenses_As_Dollar_Amount,Other_Core_Expenses_As_Dollar_Amount,Grand total instructional_staff,Grand total men instructional_staff,Grand total women instructional_staff,American Indian or Alaska Native total instructional_staff,American Indian or Alaska Native men instructional_staff,American Indian or Alaska Native women instructional_staff,Asian total instructional_staff,Asian men instructional_staff,Asian women instructional_staff,Black or African American total instructional_staff,Black or African American men instructional_staff,Black or African American women instructional_staff,Hispanic or Latino total instructional_staff,Hispanic or Latino men instructional_staff,Hispanic or Latino women instructional_staff,Native Hawaiian or Other Pacific Islander total instructional_staff,Native Hawaiian or Other Pacific Islander men instructional_staff,Native Hawaiian or Other Pacific Islander women instructional_staff,White total instructional_staff,White men instructional_staff,White women instructional_staff,Two or more races total instructional_staff,Two or more races men instructional_staff,Two or more races women instructional_staff,Race/ethnicity unknown total instructional_staff,Race/ethnicity unknown men instructional_staff,Race/ethnicity 
unknown women instructional_staff,Nonresident alien total instructional_staff,Nonresident alien men instructional_staff,Nonresident alien women instructional_staff,Percent of full-time first-time undergraduates instructional_staff,Percent of full-time first-time undergraduates awarded federal state local instructional_staff,Grand total men instructional_staff_as_percentage,Grand total women instructional_staff_as_percentage,American Indian or Alaska Native total instructional_staff_as_percentage,American Indian or Alaska Native men instructional_staff_as_percentage,American Indian or Alaska Native women instructional_staff_as_percentage,Asian total instructional_staff_as_percentage,Asian men instructional_staff_as_percentage,Asian women instructional_staff_as_percentage,Black or African American total instructional_staff_as_percentage,Black or African American men instructional_staff_as_percentage,Black or African American women instructional_staff_as_percentage,Hispanic or Latino total instructional_staff_as_percentage,Hispanic or Latino men instructional_staff_as_percentage,Hispanic or Latino women instructional_staff_as_percentage,Native Hawaiian or Other Pacific Islander total instructional_staff_as_percentage,Native Hawaiian or Other Pacific Islander men instructional_staff_as_percentage,Native Hawaiian or Other Pacific Islander women instructional_staff_as_percentage,White total instructional_staff_as_percentage,White men instructional_staff_as_percentage,White women instructional_staff_as_percentage,Two or more races total instructional_staff_as_percentage,Two or more races men instructional_staff_as_percentage,Two or more races women instructional_staff_as_percentage,Race/ethnicity unknown total instructional_staff_as_percentage,Race/ethnicity unknown men instructional_staff_as_percentage,Race/ethnicity unknown women instructional_staff_as_percentage,Nonresident alien total instructional_staff_as_percentage,Nonresident alien men 
instructional_staff_as_percentage,Nonresident alien women instructional_staff_as_percentage,Percent of full-time first-time undergraduates instructional_staff_as_percentage,Percent of full-time first-time undergraduates awarded federal state local instructional_staff_as_percentage,Percent of full-time first-time undergraduates awarded any financial aid (SFA1819),Percent of full-time first-time undergraduates awarded federal state local or institutional grant aid (SFA1819),Percent of total enrollment that are American Indian or Alaska Native (DRVEF2013_RV),Percent of total enrollment that are Asian/Native Hawaiian/Pacific Islander (DRVEF2013_RV),Percent of total enrollment that are Asian (DRVEF2013_RV),Percent of total enrollment that are Native Hawaiian or Other Pacific Islander (DRVEF2013_RV),Percent of total enrollment that are Black or African American (DRVEF2013_RV),Percent of total enrollment that are Hispanic/Latino (DRVEF2013_RV),...,Carnegie Classification 2018: Enrollment Profile (HD2018)_Majority graduate,Carnegie Classification 2018: Enrollment Profile (HD2018)_Majority undergraduate,"Carnegie Classification 2018: Enrollment Profile (HD2018)_Not applicable, not in Carnegie universe (not accredited or nondegree-granting)",Carnegie Classification 2018: Enrollment Profile (HD2018)_Very high undergraduate,Carnegie Classification 2018: Enrollment Profile (HD2018)_isMissing,"Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, large, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, large, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, large, primarily residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, medium, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, medium, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, medium, primarily 
residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, small, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, small, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, small, primarily residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, very small, highly residential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, very small, primarily nonresidential","Carnegie Classification 2018: Size and Setting (HD2018)_Four-year, very small, primarily residential","Carnegie Classification 2018: Size and Setting (HD2018)_Not applicable, not in Carnegie universe (not accredited or nondegree-granting)","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, large","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, medium","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, small","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, very large","Carnegie Classification 2018: Size and Setting (HD2018)_Two-year, very small",Carnegie Classification 2018: Size and Setting (HD2018)_isMissing,Historically Black College or University (HD2018)_Yes,Historically Black College or University (HD2018)_isMissing,"Institution size category (HD2018)_10,000 - 19,999","Institution size category (HD2018)_20,000 and above","Institution size category (HD2018)_5,000 - 9,999","Institution size category (HD2018)_Under 1,000",Institution size category (HD2018)_isMissing,Parent/child indicator - Finance (FLAGS2019)_Child record - reports partial data but other data is included with entity that is not a postsecondary institution,Parent/child indicator - Finance (FLAGS2019)_Not applicable,Parent/child indicator - Finance (FLAGS2019)_Parent record - includes data from branch campuses,Parent/child indicator - Finance (FLAGS2019)_Partial child record - reports revenues/expenses. 
Assets/liabilties reported with parent,"Sector of institution (HD2018)_Private for-profit, 4-year or above","Sector of institution (HD2018)_Private for-profit, less-than 2-year","Sector of institution (HD2018)_Private not-for-profit, 2-year","Sector of institution (HD2018)_Private not-for-profit, 4-year or above","Sector of institution (HD2018)_Public, 2-year","Sector of institution (HD2018)_Public, 4-year or above",Sector of institution (HD2018)_isMissing,State abbreviation (HD2018)_Alaska,State abbreviation (HD2018)_American Samoa,State abbreviation (HD2018)_Arizona,State abbreviation (HD2018)_Arkansas,State abbreviation (HD2018)_California,State abbreviation (HD2018)_Colorado,State abbreviation (HD2018)_Connecticut,State abbreviation (HD2018)_Delaware,State abbreviation (HD2018)_District of Columbia,State abbreviation (HD2018)_Federated States of Micronesia,State abbreviation (HD2018)_Florida,State abbreviation (HD2018)_Georgia,State abbreviation (HD2018)_Guam,State abbreviation (HD2018)_Hawaii,State abbreviation (HD2018)_Idaho,State abbreviation (HD2018)_Illinois,State abbreviation (HD2018)_Indiana,State abbreviation (HD2018)_Iowa,State abbreviation (HD2018)_Kansas,State abbreviation (HD2018)_Kentucky,State abbreviation (HD2018)_Louisiana,State abbreviation (HD2018)_Maine,State abbreviation (HD2018)_Marshall Islands,State abbreviation (HD2018)_Maryland,State abbreviation (HD2018)_Massachusetts,State abbreviation (HD2018)_Michigan,State abbreviation (HD2018)_Minnesota,State abbreviation (HD2018)_Mississippi,State abbreviation (HD2018)_Missouri,State abbreviation (HD2018)_Montana,State abbreviation (HD2018)_Nebraska,State abbreviation (HD2018)_Nevada,State abbreviation (HD2018)_New Hampshire,State abbreviation (HD2018)_New Jersey,State abbreviation (HD2018)_New Mexico,State abbreviation (HD2018)_New York,State abbreviation (HD2018)_North Carolina,State abbreviation (HD2018)_North Dakota,State abbreviation (HD2018)_Northern Marianas,State abbreviation 
(HD2018)_Ohio,State abbreviation (HD2018)_Oklahoma,State abbreviation (HD2018)_Oregon,State abbreviation (HD2018)_Pennsylvania,State abbreviation (HD2018)_Puerto Rico,State abbreviation (HD2018)_Rhode Island,State abbreviation (HD2018)_South Carolina,State abbreviation (HD2018)_South Dakota,State abbreviation (HD2018)_Tennessee,State abbreviation (HD2018)_Texas,State abbreviation (HD2018)_Utah,State abbreviation (HD2018)_Vermont,State abbreviation (HD2018)_Virgin Islands,State abbreviation (HD2018)_Virginia,State abbreviation (HD2018)_Washington,State abbreviation (HD2018)_West Virginia,State abbreviation (HD2018)_Wisconsin,State abbreviation (HD2018)_Wyoming,State abbreviation (HD2018)_isMissing
0,180203,7078959.0,2.0,60.0,7.0,1.0,0.0,29.0,141579.18,4247375.4,495527.13,70789.59,0.0,2052898.11,7294344.0,29.0,3.0,7.0,3.0,8.0,16.0,36.0,2115359.76,218830.32,510604.08,218830.32,583547.52,1167095.04,2625963.84,13.0,7.0,6.0,8.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,82.0,53.85,46.15,61.54,30.77,30.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.46,23.08,15.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,653.85,630.77,85.0,82.0,88.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,222178,136379482.0,57.0,4.0,17.0,18.0,0.0,4.0,77736304.74,5455179.28,23184511.94,24548306.76,0.0,5455179.28,121513039.0,38.0,1.0,2.0,10.0,26.0,24.0,0.0,46174954.82,1215130.39,2430260.78,12151303.9,31593390.14,29163129.36,0.0,265.0,154.0,111.0,1.0,1.0,0.0,3.0,2.0,1.0,14.0,7.0,7.0,13.0,8.0,5.0,0.0,0.0,0.0,228.0,131.0,97.0,3.0,2.0,1.0,0.0,0.0,0.0,3.0,3.0,0.0,100.0,100.0,58.11,41.89,0.38,0.38,0.0,1.13,0.75,0.38,5.28,2.64,2.64,4.91,3.02,1.89,0.0,0.0,0.0,86.04,49.43,36.6,1.13,0.75,0.38,0.0,0.0,0.0,1.13,1.13,0.0,37.74,37.74,100.0,100.0,0.0,1.0,1.0,0.0,8.0,11.0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [50]:
# Model inputs: an independent deep copy of the encoded feature matrix.
X = features.copy(deep=True)
# Model target: the first outcome column (total-cohort graduation rate), deep-copied
# so it is independent of dependent_df.
y = dependent_df.iloc[:, 0].copy(deep=True)

In [51]:
# Baseline model: KNN imputation -> standardisation -> ElasticNet regression.
# The final step keeps its original name "classifier" even though ElasticNet is a
# regressor, because step names are part of the pipeline's parameter keys.
pipeline = Pipeline(
    steps=[
        ("imputer", KNNImputer()),
        ("scaler", StandardScaler()),
        ("classifier", ElasticNet(alpha=0.1, l1_ratio=0.5)),
    ],
)

In [52]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [53]:
# Fit the imputer, scaler and ElasticNet on the training split only.
pipeline.fit(x_train, y_train)

Pipeline(steps=[('imputer', KNNImputer()), ('scaler', StandardScaler()),
                ('classifier', ElasticNet(alpha=0.1))])

In [54]:
# Predict the target for the held-out rows.
y_pred = pipeline.predict(x_test)

In [55]:
# Mean absolute error on the held-out split, in the same units as the target.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric, so the
# value is unchanged, but the conventional order avoids silent errors if this call is
# later swapped for an asymmetric metric.
mean_absolute_error(y_test, y_pred)

10.016345505156485

In [56]:
# First five predictions.
y_pred[:5]

array([29.99800245, 77.98495129, 33.90579771, 25.8826557 , 61.80923343])

In [57]:
# First five held-out target values, for comparison with the predictions.
y_test[:5]

2157    41.0
563     81.0
789     33.0
1325    46.0
570     52.0
Name: Graduation rate  total cohort (DRVGR2019), dtype: float64

# Baseline Elastic Net model predicts graduation rate within about 10 percentage points (MAE ≈ 10.0)

## Let's do some feature engineering and model optimization to see if we can bring that error down

In [70]:
# Preprocessing-only pipeline (no estimator): KNN imputation followed by standardisation.
clean_pipeline = Pipeline(
    steps=[
        ("imputer", KNNImputer()),
        ("scaler", StandardScaler()),
    ],
)

In [71]:
# Impute and standardise the whole feature matrix for feature engineering.
# NOTE(review): this fits on every row of `features`, not just the training split --
# confirm these values are not later used to score a model on x_test.
new_features = clean_pipeline.fit_transform(features)

In [74]:
# Wrap the transformed array back into a DataFrame with the original column names.
# Passing index=features.index keeps the rows aligned with `features`; without it the
# frame silently falls back to a fresh 0..n-1 index.
new_features = pd.DataFrame(
    new_features,
    columns=features.columns,
    index=features.index,
)

### First step is to make principal components

In [75]:
# Create a PCA transformer that keeps only the first principal component
pca_transformer = PCA(n_components=1) 

In [76]:
df.columns[3:9] # Revenue distribution columns (selected by position)

Index(['Tuition_And_Fees', 'Government_Grants', 'Private_Gifts',
       'Investment_Return', 'Sales_And_Services', 'Other_Revenues'],
      dtype='object')

In [77]:
# Project the six revenue-distribution columns (imputed and standardised) onto a
# single principal component.
rev_dist_pc = pca_transformer.fit_transform(new_features[df.columns[3:9]])

In [82]:
# Target values (first outcome column) as a NumPy array.
dependent_df.iloc[:,0].to_numpy()

array([29., 61., 26., ..., 41., 54., 58.])

In [89]:
# Pair the revenue principal component with the target (first outcome column),
# row by row, in one small frame for plotting.
data = dict(
    rev_dist_pc=rev_dist_pc.flatten(),
    target=dependent_df.iloc[:, 0].to_numpy(),
)
df_pc1 = pd.DataFrame(data)

In [107]:
# Visualize the principal component against the target with an OLS trendline.
# The title typo ("Principle") is fixed and the axes get readable labels so the
# figure stands on its own.
fig = px.scatter(
    df_pc1,
    x="rev_dist_pc",
    y="target",
    height=600,
    width=800,
    trendline="ols",
    trendline_color_override="black",
    title="Revenue Distribution Principal Component",
    labels={
        "rev_dist_pc": "Revenue distribution (1st principal component)",
        "target": "Graduation rate, total cohort",
    },
)
# Retrieve the fitted trendline results from the figure; nothing is recomputed here.
results = px.get_trendline_results(fig)
fig.show()

# Regression summary for the single trendline, including its R-squared.
results.iloc[0]['px_fit_results'].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,24.62
Date:,"Sun, 05 Sep 2021",Prob (F-statistic):,7.49e-07
Time:,18:25:09,Log-Likelihood:,-10416.0
No. Observations:,2315,AIC:,20840.0
Df Residuals:,2313,BIC:,20850.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,51.5352,0.453,113.870,0.000,50.648,52.423
x1,1.6556,0.334,4.962,0.000,1.001,2.310

0,1,2,3
Omnibus:,24.6,Durbin-Watson:,1.877
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.249
Skew:,-0.088,Prob(JB):,0.00018
Kurtosis:,2.616,Cond. No.,1.36


In [None]:
# Fit a polynomial of the revenue principal component to graduation rate.
# The original cell stopped at `x = `, which is a SyntaxError and breaks Run All.
# NOTE(review): assumes df_pc1["target"] has no missing values -- confirm.
POLY_DEGREE = 2  # quadratic; raise to test higher-order curvature
x = df_pc1["rev_dist_pc"].to_numpy()
# np.polyfit returns coefficients ordered from the highest power down.
poly_coeffs = np.polyfit(x, df_pc1["target"].to_numpy(), deg=POLY_DEGREE)
# Fitted graduation rate for each observed component value.
poly_fit = np.polyval(poly_coeffs, x)
poly_coeffs

In [None]:
# TODO: calculate the R² of the polynomial fit of the principal component

In [61]:
df.columns[9:15] # Revenue dollar-amount columns (selected by position)

Index(['Tuition_And_Fees_As_Dollar_Amount',
       'Government_Grants_As_Dollar_Amount', 'Private_Gifts_As_Dollar_Amount',
       'Investment_Return_As_Dollar_Amount',
       'Sales_And_Services_As_Dollar_Amount',
       'Other_Revenues_As_Dollar_Amount'],
      dtype='object')

In [65]:
df.columns[16:23] # Expense distribution columns (selected by position)

Index(['Instruction_Expenses', 'Research_Expenses', 'Public_Service_Expenses',
       'Academic_Support_Expenses', 'Student_Service_Expenses',
       'Institutional_Support_Expenses', 'Other_Core_Expenses'],
      dtype='object')