<h1><center>Exploring Graduation Rates By Ethnicity</center></h1>

**By: Robert Campbell**

In [33]:
import cbell as cb

In [2]:
import pandas as pd
import numpy as np

#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display
import pandas as pd 
import numpy as np
import cufflinks as cf
import chart_studio.plotly as py

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

%matplotlib inline
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns
init_notebook_mode(connected=True)
cf.go_offline()

from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
pd.options.display.max_columns = 200
pd.options.display.max_rows = 272

from joblib import dump, load
from Logger import RegressionLogger, FuncTransformer, ColSelect

In [103]:
# Read the cleaned extract (indexed by UnitID) and discard the leftover CSV index column.
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID").drop(columns="Unnamed: 0")

# Keep the institution names aside, then remove the name and city columns
# before the remaining features are dummy-encoded.
institution_name = df["Institution Name"]
df = df.drop(columns=["Institution Name", "City location of institution (HD2019)"])

# The last 14 columns are split off as `grad_rates` (presumably the graduation-rate
# targets); every column before them is dummy-encoded as a feature.
grad_rates = df.iloc[:, -14:]
X = pd.get_dummies(df.iloc[:, :-14], drop_first=True)
X["Institution Name"] = institution_name

# Plotting frame: encoded features, institution name, then the `grad_rates` columns.
X = pd.concat([X, grad_rates], axis=1)

In [5]:
# Restore previously saved artifacts from the objects/ directory.
# NOTE(review): joblib.load is pickle-based, so only load artifact files you trust.
X_df = load("objects/engineered_features.joblib")
y = load("objects/y.joblib")
# The holdout file unpacks into six objects: train/test/holdout features and targets.
x_train, x_test, y_train, y_test, x_holdout, y_holdout = load("objects/holdout_total.joblib")

In [7]:
# Load the saved model log; a single entry is selected from it by position later on.
model_log = load("objects/model_logging.joblib")

In [8]:
# Use the object stored under the 'model' key of log entry 4.
# NOTE(review): the index 4 is hard-coded; confirm which logged run it refers to.
model = model_log[4]['model']

In [9]:
# Load the saved feature-engineered feature set from objects/features.joblib.
X_fe = load("objects/features.joblib")

In [10]:
# Re-read the cleaned data from disk (indexed by UnitID); no columns are dropped here.
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")

In [12]:
# Mean of every column whose name contains "Grad", sorted in ascending order.
means = df.filter(regex='Grad').describe().loc['mean'].sort_values().to_numpy()

# Short display labels for the bars.
# NOTE(review): these labels are hand-written and paired with `means` purely by
# position; they are intended to follow the ascending order of `means`, so
# re-check the pairing whenever the underlying data changes.
cols = ['Black', 'American Indian', 'Hispanic', 'men', 'two or more races', 
        'Native Hawaiian', 'unknown', 'total cohort', 'White',
        'women', 'Asian/Native Hawaiian', 'Nonresident alien', 'Asian']

# One bar per group, annotated with the mean rounded to two decimals.
fig = go.Figure()
_ = fig.add_trace(go.Bar(
    x=cols, y=means, text=np.round(means, 2)
))
_ = fig.update_layout(
    title='Mean Graduation Rates',
    xaxis={
        'title': "Gender/Race"
    },
    yaxis = {
        'title': "Mean Graduation Rate"
    },
    height = 600
)

In [13]:
# Render the figure currently bound to `fig`.
fig.show()

<ul><li>4 year institutions</li></ul>

<ul><li>2019 Cohort</li></ul>

<ul><li>6 year Graduation Rates</li></ul>

<h1><center>HBCU Graduation Rates Vs Price</center></h1>

## <ul><li>Problem</li></ul>
University X is facing pressure from the community, which claims that the university is underperforming, particularly with students of color. The university needs to respond to the community with a data-backed explanation of its past performance, along with a data-driven plan to improve its outcome measures (graduation rates).

## <ul><li>Source of Data</li></ul>

The Data Science team has access to the free, publicly available IPEDS Data Center, which has data about all 4-year public and private institutions.

<a href='https://nces.ed.gov/ipeds/use-the-data'>IPEDS Data Center</a>

In [14]:
# Reset Our Data Frame
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")
df.drop("Unnamed: 0",axis=1, inplace=True)

<h1><center>Overview of Features</center></h1>

<h3><center>Numeric Features</center></h3>
<center><ul>
    <li>Revenues and Expenses</li>
    <li>Price of School</li>
    <li>Admissions Test Scores</li>
    <li>Staff Diversity</li>
    <li>Enrollment Diversity</li>
    <li>Student-to-Faculty Ratio</li>
</ul></center>

<h3><center>Categorical Features</center></h3>
<ul>
    <li>State</li>
    <li>Sector</li>
    <li>Carnegie Classification</li>
    <li>HBCU</li>
</ul>

<h1><center>Graduation Rates By Sector</center></h1>

In [15]:
# Look up full column names by pattern: the first column matching each regex
# becomes the sector column, the Black graduation-rate column, and the first
# graduation-rate column (used as the total-cohort rate).
sector, black, total = (
    df.filter(regex=pattern).columns[0]
    for pattern in ("Sector", "Grad.*Black", "Grad")
)

In [16]:
# Distinct sector labels, in order of first appearance in the data.
all_sectors = df[sector].unique()

# Rows grouped by sector; reused below for per-sector bootstrap samples.
sector_group = df.groupby(by=sector)

# Per-sector mean of the Black and total-cohort graduation rates, sorted by the
# Black rate. Only the two needed columns are aggregated: averaging the whole
# frame raises a TypeError on pandas >= 2.0 when non-numeric columns are present.
grad_rate_by_sector = sector_group[[black, total]].mean().sort_values(by=black)

In [17]:
# Sector rows left out of the sector comparison.
excluded_sectors = ['Public, 2-year', 'isMissing', 'Private not-for-profit, 2-year', 
                    'Private for-profit, 2-year', 'Private for-profit, less-than 2-year']

# Rebind instead of dropping in place, and tolerate labels that are already gone,
# so this cell can be re-run without raising a KeyError.
grad_rate_by_sector = grad_rate_by_sector.drop(index=excluded_sectors, errors="ignore")

## Bootstrap Confidence Intervals for Black Graduation Rates Across Sectors

In [18]:
def _mean_error(values, n_resamples=10000):
    """Return cb.get_error for the mean of `values` after dropping missing entries.

    NOTE(review): cb.get_error comes from the project-local `cbell` module; it is
    presumably a bootstrap error estimate for the statistic passed in — confirm.
    """
    return cb.get_error(values.dropna(), np.mean, n_resamples)


# Take the sectors in the exact order in which they are plotted (the index of
# `grad_rate_by_sector`), so that error bar k always belongs to bar k. Picking
# sectors by their position in `df[sector].unique()` does not guarantee that.
plotted_sectors = list(grad_rate_by_sector.index)
if len(plotted_sectors) != 3:
    raise ValueError(
        "Expected exactly 3 plotted sectors for the error-bar variables, "
        f"got {len(plotted_sectors)}: {plotted_sectors}"
    )
sector_frames = [sector_group.get_group(name) for name in plotted_sectors]

# Errors 1-3: Black graduation rate; errors 4-6: total-cohort graduation rate.
sector_error1, sector_error2, sector_error3 = (
    _mean_error(frame[black]) for frame in sector_frames
)
sector_error4, sector_error5, sector_error6 = (
    _mean_error(frame[total]) for frame in sector_frames
)

# Per-institution gap (total cohort minus Black) within each plotted sector.
diff1, diff2, diff3 = (frame[total] - frame[black] for frame in sector_frames)

diff_error1, diff_error2, diff_error3 = (
    _mean_error(gap) for gap in (diff1, diff2, diff3)
)

In [19]:
# Gap between the per-sector means: total cohort minus Black students.
grad_rate_by_sector["Mean_Difference"] = grad_rate_by_sector[total].sub(grad_rate_by_sector[black])

In [20]:
# Grouped bars per sector: total cohort next to African American students,
# each annotated with its rounded mean and drawn with its error bar.
sector_labels = grad_rate_by_sector.index
sector_total_means = grad_rate_by_sector[total]
sector_black_means = grad_rate_by_sector[black]

fig = go.Figure(data=[
    go.Bar(
        x=sector_labels,
        y=sector_total_means,
        text=np.round(sector_total_means, 1),
        name="Total Cohort",
        error_y={'type': 'data', 'array': [sector_error4, sector_error5, sector_error6]},
    ),
    go.Bar(
        x=sector_labels,
        y=sector_black_means,
        text=np.round(sector_black_means, 1),
        name="African American",
        error_y={'type': 'data', 'array': [sector_error1, sector_error2, sector_error3]},
    ),
])

_ = fig.update_layout(
    title="Mean Graduation Rates By Sector of Institution",
    xaxis=dict(title="Sector"),
    yaxis=dict(title='Mean Graduation Rate'),
    font={'family': "Courier New, monospace", 'size': 16},
)
fig.show()

<h2><center>Difference Between African American Graduation Rates and Total Cohort Graduation</center></h2>

In [21]:
# Single bar series: per-sector gap between total-cohort and Black mean
# graduation rates, with the error computed for each sector's differences.
mean_gap = grad_rate_by_sector["Mean_Difference"]

gap_bars = go.Bar(
    x=grad_rate_by_sector.index,
    y=mean_gap,
    text=np.round(mean_gap, 1),
    name="Difference of Means",
    error_y={'type': 'data', 'array': [diff_error1, diff_error2, diff_error3]},
)
fig = go.Figure(data=[gap_bars])

_ = fig.update_layout(
    title="Difference Between Black and Total Graduation Rates By Sector of Institution",
    xaxis=dict(title="Difference By Sector"),
    yaxis=dict(title='Difference in Graduation Rate'),
    font={'family': "Courier New, monospace", 'size': 16},
)
fig.show()

<h1><center>Graduation Rates By State</center></h1>

In [22]:
# Name of the first column whose name contains "State".
state = df.filter(regex="State").columns[0]
# Distinct values of that column, in order of first appearance.
all_states = df[state].unique()

In [23]:
# Group the institutions by the state column for per-state aggregation.
state_group = df.groupby(by=state)

In [24]:
# Per-state mean of the Black and total-cohort graduation rates, sorted by the
# Black rate, with states lacking either mean removed. Only the two needed
# columns are aggregated: averaging the whole frame raises a TypeError on
# pandas >= 2.0 when non-numeric columns are present.
grad_rate_by_state = state_group[[black, total]].mean().sort_values(by=black).dropna()

In [25]:
# Show every tenth row of the first 51 in the ranking (sorted by Black graduation
# rate) rather than all of them, to keep the chart readable.
every_tenth = slice(0, 51, 10)
state_labels = grad_rate_by_state.index[every_tenth]
state_yblack = grad_rate_by_state[black].to_numpy()[every_tenth]
state_y = grad_rate_by_state[total].to_numpy()[every_tenth]

total_bars = go.Bar(x=state_labels, y=state_y,
                    text=np.round(state_y, 1), name="Total Cohort")
black_bars = go.Bar(x=state_labels, y=state_yblack,
                    text=np.round(state_yblack, 1), name="African American")
fig = go.Figure(data=[total_bars, black_bars])

_ = fig.update_layout(
    title="Mean Graduation Rates By State",
    xaxis=dict(title="State"),
    yaxis=dict(title='Mean Graduation Rate'),
    font={'family': "Courier New, monospace", 'size': 16},
)
fig.show()

<h1><center>HBCU Graduation Rates Vs Non HBCU Schools</center></h1>

In [26]:
# Name of the first column whose name contains "Historically" (the HBCU flag).
hbcu = df.filter(regex="Historically").columns[0]
hbcu_group = df.groupby(by=hbcu)

# Mean Black and total-cohort graduation rates per HBCU flag value, sorted by the
# Black rate. Only the two needed columns are aggregated: averaging the whole
# frame raises a TypeError on pandas >= 2.0 when non-numeric columns are present.
grad_rate_hbcu = hbcu_group[[black, total]].mean().sort_values(by=black)

In [27]:
# Leave out the row for institutions whose HBCU flag is recorded as "isMissing".
grad_rate_hbcu = grad_rate_hbcu.drop(index="isMissing")

In [28]:
# Grouped bars per HBCU flag value: total cohort next to African American students.
hbcu_labels = grad_rate_hbcu.index
hbcu_total_means = grad_rate_hbcu[total]
hbcu_black_means = grad_rate_hbcu[black]

fig = go.Figure(data=[
    go.Bar(x=hbcu_labels, y=hbcu_total_means,
           text=np.round(hbcu_total_means, 1), name="Total Cohort"),
    go.Bar(x=hbcu_labels, y=hbcu_black_means,
           text=np.round(hbcu_black_means, 1), name="African American"),
])

_ = fig.update_layout(
    title="Mean Graduation Rates For Historically Black Colleges and Universities",
    xaxis=dict(title="Is_HBCU"),
    yaxis=dict(title='Mean Graduation Rate'),
    font={'family': "Courier New, monospace", 'size': 16},
)
fig.show()

<h2><center>What's Wrong With These Comparisons?</center></h2>

In [29]:
# Split the institutions by the value of the HBCU flag column.
hbcu_status = df[hbcu]
hbcu_df = df.loc[hbcu_status.eq('Yes')]
hwcu_df = df.loc[hbcu_status.eq('No')]
missing = df.loc[hbcu_status.eq('isMissing')]

In [30]:
# Third column whose name contains "price".
# NOTE(review): the position 2 is hard-coded; confirm it selects the intended price measure.
price = df.filter(regex='price').columns[2]
# First column whose name contains "Composite" (presumably a composite test score).
test_score = df.filter(regex='Composite').columns[0]

<h1><center>HBCU Graduation Rates Vs Price</center></h1>

In [89]:
# NOTE(review): `range_` and `degree` are not used anywhere in this cell; they are
# kept only in case cells outside this view rely on them.
range_ = [12, 26]
degree = wg.IntSlider(value=1, min=1, max=3)

# Dropdowns that drive the plot: x-axis feature, graduation-rate target, colour column.
feature_picker = wg.Dropdown(
    options=X.columns,
    value=X.filter(regex="price").columns[-1],
    description='Feature:',
    disabled=False,
)
target_picker = wg.Dropdown(
    options=grad_rates.columns,
    value=grad_rates.columns[0],
    description='Target:',
    disabled=False,
)
color_picker = wg.Dropdown(
    options=X.filter(regex="Sector|Historically|State|Basic|Size and Setting|category").columns,
    value=X.filter(regex="Historically").columns[0],
    description='Color:',
    disabled=False,
)


def fe_plot(feature, target, color):
    """Scatter one feature against one graduation-rate column, coloured by `color`.

    Parameters
    ----------
    feature : column label of `X` plotted on the x-axis.
    target : column label of `grad_rates` plotted on the y-axis.
    color : column label of `X` used for colouring; values 1.0/0.0 are shown
        as 'Yes'/'No'.

    The holdout points are overlaid when `x_holdout` contains `feature`.
    NOTE(review): the overlay always uses `y_holdout`, whatever `target` is
    selected — confirm that this is intended.
    """
    fig2 = px.scatter(x=X[feature], y=grad_rates[target], color=X[color].replace({1.0: 'Yes', 0.0:'No'}))

    # Not every selectable column necessarily exists in the holdout features;
    # skip the overlay instead of failing with a KeyError.
    if feature in x_holdout:
        fig2.add_trace(go.Scatter(
            x=x_holdout[feature], y=y_holdout, mode="markers", name="Sample Universities"
        ))

    # Pad the x-axis by a tenth of a standard deviation on each side. The values
    # are read from X (the frame being plotted) so that dummy-encoded columns
    # work too; non-numeric columns keep plotly's automatic range.
    xaxis = {"title": feature}
    x_values = X[feature]
    if pd.api.types.is_numeric_dtype(x_values):
        x_values = x_values.astype(float)
        margin = x_values.std() / 10
        xaxis["range"] = [x_values.min() - margin, x_values.max() + margin]

    fig2.update_layout(
        title="Multi Variate Plots",
        xaxis=xaxis,
        yaxis={
            "title": "Graduation Rate",
            "range": [-5,105]
        },
        legend={
            'orientation': 'v',
            'title': color,
            'x': 0.5,
            'y': 1.1,
            'bgcolor': "rgba(176,196,222,0.7)",
            'bordercolor': "black",
            'borderwidth': 2,
            'itemclick': "toggle",
        },
        font=dict(
            family="Courier New, monospace",
            size=14),
        height=700
    )

    fig2.update_traces(marker=dict(size=12,
                                  line=dict(width=2,
                                  color='DarkSlateGrey')),
                      selector=dict(mode='markers'))

    # show() renders the figure itself and returns None, so it must not be
    # wrapped in display() (that would also display "None").
    fig2.show()


ui = wg.HBox([feature_picker, target_picker, color_picker])  

func_params = {
    'feature': feature_picker,
    'target': target_picker,
    'color': color_picker
}


# Re-run fe_plot whenever one of the dropdowns changes.
output = wg.interactive_output(fe_plot, func_params)   


display(ui, output)   

HBox(children=(Dropdown(description='Feature:', index=117, options=('Core_Revenues', 'Tuition_And_Fees', 'Gove…

Output()

In [86]:
# UnitIDs present in the cleaned data but absent from the loaded engineered-feature object.
set(df.index) - set(X_df.index)

{130943, 141060, 199111}

<h1><center>Human Resources (Total Instructional Staff)</center></h1>

In [112]:
# NOTE(review): `range_` and `degree` are not used anywhere in this cell; they are
# kept only in case cells outside this view rely on them.
range_ = [12, 26]
degree = wg.IntSlider(value=1, min=1, max=3)

# Dropdowns that drive the plot: x-axis feature, graduation-rate target, colour column.
feature_picker = wg.Dropdown(
    options=X.columns,
    value=X.filter(regex="Grand.*instructional_staff").columns[0],
    description='Feature:',
    disabled=False,
)
target_picker = wg.Dropdown(
    options=grad_rates.columns,
    value=grad_rates.columns[0],
    description='Target:',
    disabled=False,
)
color_picker = wg.Dropdown(
    options=X.filter(regex="Sector|Historically|State|Basic|Size and Setting|category").columns,
    value=X.filter(regex="Historically").columns[0],
    description='Color:',
    disabled=False,
)


def fe_plot(feature, target, color):
    """Scatter `feature` against `target` from `X`, coloured by the `color` column.

    Parameters
    ----------
    feature : column label of `X` plotted on the x-axis.
    target : column label of `X` plotted on the y-axis (chosen from the
        `grad_rates` columns).
    color : column label of `X` used for colouring the points.

    The holdout points are overlaid when `x_holdout` contains `feature`.
    NOTE(review): the overlay always uses `y_holdout`, whatever `target` is
    selected — confirm that this is intended.
    """
    fig2 = px.scatter(X, x=feature, y=target, color=color)

    # Not every selectable column necessarily exists in the holdout features;
    # skip the overlay instead of failing with a KeyError.
    if feature in x_holdout:
        fig2.add_trace(go.Scatter(
            x=x_holdout[feature], y=y_holdout, mode="markers", name="Sample Universities"
        ))

    # Pad the x-axis by a tenth of a standard deviation on each side. The values
    # are read from X (the frame being plotted) so that dummy-encoded columns
    # work too; non-numeric columns keep plotly's automatic range.
    xaxis = {"title": feature}
    x_values = X[feature]
    if pd.api.types.is_numeric_dtype(x_values):
        x_values = x_values.astype(float)
        margin = x_values.std() / 10
        xaxis["range"] = [x_values.min() - margin, x_values.max() + margin]

    fig2.update_layout(
        title="Multi Variate Plots",
        xaxis=xaxis,
        yaxis={
            "title": "Graduation Rate",
            "range": [-5,105]
        },
        legend={
            'orientation': 'v',
            'title': color,
            'x': 0.5,
            'y': 1.1,
            'bgcolor': "rgba(176,196,222,0.7)",
            'bordercolor': "black",
            'borderwidth': 2,
            'itemclick': "toggle",
        },
        font=dict(
            family="Courier New, monospace",
            size=14),
        height=700
    )

    fig2.update_traces(marker=dict(size=12,
                                  line=dict(width=2,
                                  color='DarkSlateGrey')),
                      selector=dict(mode='markers'))

    # show() renders the figure itself and returns None, so it must not be
    # wrapped in display() (that would also display "None").
    fig2.show()


ui = wg.HBox([feature_picker, target_picker, color_picker])  

func_params = {
    'feature': feature_picker,
    'target': target_picker,
    'color': color_picker
}


# Re-run fe_plot whenever one of the dropdowns changes.
output = wg.interactive_output(fe_plot, func_params)   


display(ui, output)   

HBox(children=(Dropdown(description='Feature:', index=28, options=('Core_Revenues', 'Tuition_And_Fees', 'Gover…

Output()