# Predicting Graduation Rates at 4 Year Universities

In [1]:
import pandas as pd
import numpy as np

#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display
import pandas as pd 
import numpy as np
import cufflinks as cf
import chart_studio.plotly as py

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

%matplotlib inline
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns
init_notebook_mode(connected=True)
cf.go_offline()

from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
pd.options.display.max_columns = 200
pd.options.display.max_rows = 272

from joblib import dump, load
from Logger import RegressionLogger, FuncTransformer, ColSelect

In [2]:
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")
df.drop("Unnamed: 0",axis=1, inplace=True)
df.drop(["City location of institution (HD2019)", "Institution Name"], axis=1, inplace=True)
X = df.iloc[:, :-14]
y = df.iloc[:,-14]
X = pd.get_dummies(X, drop_first=True)

In [50]:
df.filter(regex='Grad').describe().loc['mean'].sort_values().index

Index(['Graduation rate  Black  non-Hispanic (DRVGR2019)',
       'Graduation rate  American Indian or Alaska Native (DRVGR2019)',
       'Graduation rate  Hispanic (DRVGR2019)',
       'Graduation rate  men (DRVGR2019)',
       'Graduation rate  two or more races (DRVGR2019)',
       'Graduation rate  Native Hawaiian or Other Pacific Islander (DRVGR2019)',
       'Graduation rate  Race/ethnicity unknown (DRVGR2019)',
       'Graduation rate  total cohort (DRVGR2019)',
       'Graduation rate  White  non-Hispanic (DRVGR2019)',
       'Graduation rate  women (DRVGR2019)',
       'Graduation rate  Asian/Native Hawaiian/Other Pacific Islander (DRVGR2019)',
       'Graduation rate  Nonresident alien (DRVGR2019)',
       'Graduation rate  Asian (DRVGR2019)'],
      dtype='object')

In [59]:
means = df.filter(regex='Grad').describe().loc['mean'].sort_values().to_numpy()
cols = ['Black', 'American Indian', 'Hispanic', 'men', 'two or more races', 
        'Native Hawaiian', 'unknown', 'total cohort', 'White',
        'women', 'Asian/Native Hawaiian', 'Nonresident alien', 'Asian']
#cols = cols.str.replace("Graduation rate", "")
#cols = cols.str.replace("DRVGR2019", "")
#cols = cols.str.replace("Other Pacific Islander", "")
fig = go.Figure()
fig.add_trace(go.Bar(
    x=cols, y=means, text=np.round(means, 2)
))
fig.update_layout(
    title='Mean Graudation Rates',
    xaxis={
        'title': "Gender/Race"
    },
    yaxis = {
        'title': "Mean Graduation Rate"
    },
    height = 600
)

fig.show()

In [62]:
# Reset Our Data Frame
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")
df.drop("Unnamed: 0",axis=1, inplace=True)

# Attributes That Are Uncontrolable vs Actionable Attributes

# Sector

In [73]:
sector = df.filter(regex="Sector").columns[0]
black = df.filter(regex="Grad.*Black").columns[0]
total = df.filter(regex="Grad").columns[0]

In [77]:
all_sectors = df[sector].unique()

In [75]:
sector_group = df.groupby(by=sector)

In [92]:
grad_rate_by_sector = sector_group.agg('mean').sort_values(by=black)[[black, total]]

In [107]:
grad_rate_by_sector.drop(['Public, 2-year', 'isMissing', 'Private not-for-profit, 2-year', 
                          'Private for-profit, 2-year', 'Private for-profit, less-than 2-year'],
                         inplace=True)

In [110]:
fig = go.Figure(
    data = [
        go.Bar(x=grad_rate_by_sector.index, y=grad_rate_by_sector[black], 
               text=np.round(grad_rate_by_sector[black], 1), name="African American"),
        go.Bar(x=grad_rate_by_sector.index, y=grad_rate_by_sector[total], 
               text=np.round(grad_rate_by_sector[total], 1), name="Total Cohort")
    ]
)

fig.update_layout(
    title="Mean Graduation Rates By Sector of Institution",
    xaxis={
        'title': "Sector"
    },
    yaxis={
        'title': 'Mean Graduation Rate'
    }
)
fig.show()

# Graduation Rates by State

In [113]:
state = df.filter(regex="State").columns[0]
all_states = df[state].unique()

In [115]:
state_group = df.groupby(by=state)

In [119]:
grad_rate_by_state = state_group.agg('mean').sort_values(by=black)[[black, total]].dropna()

In [133]:
state_yblack = grad_rate_by_state[black].to_numpy()[0:51:10]
state_y = grad_rate_by_state[total].to_numpy()[0:51:10]

fig = go.Figure(
    data = [
        go.Bar(x=grad_rate_by_state.index[0:51:10], y= state_yblack, 
               text=np.round(state_yblack, 1), name="African American"),
        go.Bar(x=grad_rate_by_state.index[0:51:10], y=state_y, 
               text=np.round(state_y, 1), name="Total Cohort")
    ]
)

fig.update_layout(
    title="Mean Graduation Rates By State",
    xaxis={
        'title': "State"
    },
    yaxis={
        'title': 'Mean Graduation Rate'
    }
)
fig.show()

# Graduation Rates (HBCU vs Non HBCU)

In [142]:
hbcu = df.filter(regex="Historically").columns[0]
hbcu_group = df.groupby(by=hbcu)
grad_rate_hbcu = hbcu_group.agg('mean').sort_values(by=black)[[black, total]]

In [145]:
grad_rate_hbcu = grad_rate_hbcu.drop("isMissing")

In [147]:
fig = go.Figure(
    data = [
        go.Bar(x=grad_rate_hbcu.index, y=grad_rate_hbcu[black], 
               text=np.round(grad_rate_hbcu[black], 1), name="African American"),
        go.Bar(x=grad_rate_hbcu.index, y=grad_rate_hbcu[total], 
               text=np.round(grad_rate_hbcu[total], 1), name="Total Cohort")
    ]
)

fig.update_layout(
    title="Mean Graduation Rates For Historically Black Colleges and Universities",
    xaxis={
        'title': "Is_HBCU"
    },
    yaxis={
        'title': 'Mean Graduation Rate'
    }
)
fig.show()

# Model Performance Total Graduation Rate

In [None]:
# load in feature engineered features
X_fe = load("data/features.joblib")

In [3]:
model_log = load("data/model_logging.joblib")

In [4]:
pd.DataFrame(model_log[4], index=[0])

Unnamed: 0,Imputer,model_name,model,alpha,l1_ratio,rsquared,rmse,mae,train_rsquared,train_rmse,train_mae,notes
0,KNNImputer,Elastic Net,"(FeatureUnion(transformer_list=[('numeric',\n ...",0.3,0.8,0.536707,14.015952,9.509088,0.634621,13.306718,9.332271,Tuned Elstic Net Model


In [5]:
model = model_log[4]['model']

In [11]:
params = {}
logger = RegressionLogger(params)
logger.train_update(model, X_fe, y)

# Residual Plots

In [16]:
residuals = logger.y_pred - logger.y_test
resid_data = {
    'y_test': logger.y_test,
    'Residuals':residuals,
}
fig = px.scatter(resid_data, x='y_test', y='residuals')
fig.update_layout(title='Residuals vs Observed Graduation Rates')
fig.show()

In [18]:
residuals = logger.y_pred - logger.y_test
resid_data = {
    'y_pred': logger.y_pred,
    'Residuals':residuals,
}
fig = px.scatter(resid_data, x='y_pred', y='Residuals')
fig.update_layout(title='Residuals vs Predicted Graduation Rates')
fig.show()

# Model Coefficients

In [6]:
coefs = model['classifier'].coef_
feature_names = load("data/feature_names.joblib")
final_coefs = sorted(list(zip(feature_names, coefs)), key= lambda x: np.abs(x[1]))
coef_dict = {k: v for k, v in sorted(dict(zip(feature_names, coefs)).items(), key= lambda x: x[1])}

In [7]:
best = final_coefs[-11:]
best = sorted(best, key=lambda x: x[1])

In [8]:
fig = make_subplots(rows=1, cols=1, subplot_titles=["Model Coeficients"])

fig.add_trace(
    go.Bar(x=[i[1] for i in best], y=[i[0] for i in best], 
           orientation='h', text=[round(i[1], 2) for i in best]),
    1, 1)

fig.update_yaxes(
    tickangle = 10
)
fig.show()

# Interpreting Coefficients With Numbers

In [9]:
# Getting Coefficient Interpretation
preprocess = model['preprocessing'].transformer_list[0][1] # Getting float preprocessing steps

# preprocess features the with the same steps as our pipeline
data = preprocess.fit_transform(X_fe)

# Set Log_Other_Revenues to 1 Unit
transformed_df = pd.DataFrame(data, columns=X_fe.select_dtypes(include='float').columns)
transformed_df['Log_Other_Revenues_As_Dollar_Amount'] = 1

# Inverse transform the preprocessed data
inverse_data = preprocess['Scale'].inverse_transform(transformed_df)
inverse_df = pd.DataFrame(inverse_data, columns=X_fe.select_dtypes(include='float').columns)

In [10]:
# View what a one unit increase in log other revenues realy means
new_line = '/n'
log_amount = inverse_df['Log_Other_Revenues_As_Dollar_Amount'].iloc[0]

f'A One Unit Increase in Log_Revenues is {log_amount}, therefore a {np.exp(log_amount)}'\
' increase in other revenues will increase the projected graduation rates for school X to increase' \
' by 4.29%'

'A One Unit Increase in Log_Revenues is 18.100074082796795, therefore a 72570864.4257857 increase in other revenues will increase the projected graduation rates for school X to increase by 4.29%'

# Model Performance African American Graduation Rate

# Controlling for Total Revenues HBCU vs non HBCU