In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.linear_model import ElasticNet
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
import copy

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import re
from joblib import load, dump

In [3]:
# Column-name shortcuts reused by the cells below.
# We will be using total cohort and african american cohort graduation rates a lot in this eda
black = "graduation_rate_black_non-hispanic"
total = "graduation_rate_total_cohort"
# Price and revenue columns used for the scatter plots later in the notebook.
price = 'total_price_for_out-of-state_students_living_on_campus'
revenue = 'core_revenues_total_dollars'
# ACT 75th-percentile score and its companion `_isnan` column
# (presumably a missing-value indicator -- TODO confirm against the feature pipeline).
act_nan = 'act_composite_75th_percentile_score_isnan'
act = 'act_composite_75th_percentile_score'
# Column holding the HBCU flag; compared against "Yes"/"No" in the plots below.
hbcu = 'historically_black_college_or_university'


In [4]:
def convert_residential(x):
    """Translate a size-and-setting label into an integer residential code.

    The label is scanned for keywords in a fixed priority order and the code of
    the first keyword found is returned:

    * ``'highly'``                -> 1
    * ``'nonresidential'``        -> 2
    * ``'primarily residential'`` -> 3

    Labels containing none of the keywords fall back to 2.

    Parameters
    ----------
    x : str
        Label text to classify (must support the ``in`` operator with strings).

    Returns
    -------
    int
        The residential code described above.
    """
    # Order matters: 'nonresidential' is tested before 'primarily residential'.
    keyword_codes = (
        ('highly', 1),
        ('nonresidential', 2),
        ('primarily residential', 3),
    )
    for keyword, code in keyword_codes:
        if keyword in x:
            return code
    return 2
    

In [5]:
def size_ordinal(x):
    """Translate an institution-size label into an ordinal integer.

    The label is scanned for the markers below in order and the rank of the
    first marker found is returned:

    * ``'Under'``           -> 1
    * ``'1,000 - 4,999'``   -> 2
    * ``'5,000 - 9,999'``   -> 3
    * ``'10,000 - 19,999'`` -> 4
    * ``'above'``           -> 5

    Labels matching none of the markers fall back to the middle rank, 3.

    Parameters
    ----------
    x : str
        Size-category label to classify.

    Returns
    -------
    int
        Ordinal rank between 1 and 5.
    """
    marker_ranks = (
        ('Under', 1),
        ('1,000 - 4,999', 2),
        ('5,000 - 9,999', 3),
        ('10,000 - 19,999', 4),
        ('above', 5),
    )
    for marker, rank in marker_ranks:
        if marker in x:
            return rank
    return 3

**Retrain Model Keeping Target University as holdout**

In [6]:
# features.reset_index()[features.reset_index()["institution_name"].str.contains("Delaware")]['institution_name']

In [7]:
# define a list of universities that you want to analyze
universities = ['Spelman College']

In [8]:
# load in model_log and features
model_log = load("objects/model_log.joblib")
features = load("objects/features.joblib")
features_xgb = load("objects/engineered_features.joblib")
targets = load("objects/targets.joblib")

In [9]:
features_xgb = features_xgb.swaplevel()
features.index = features.index.swaplevel()
targets.index = targets.index.swaplevel()

**Features**

In [10]:
# defining holdout schools that we want to use for prediction
my_schools = features.loc[universities]
my_targets = targets.loc[universities]

In [11]:
# Holdout universities with advanced feature interactions
my_schools_xgb = features_xgb.loc[universities]
features_xgb.drop(universities, inplace=True)
targets.drop(universities, inplace=True)

In [12]:
xgb_model = model_log[-1]['model']

In [13]:
x_train_xgb, x_test_xgb, y_train_xgb, y_test_xgb =  train_test_split(features_xgb, targets[total], test_size=0.05, random_state=0)

**Elastic Net**

In [14]:
# Use MinMax Scaler instead of standard scaler
elasticNet = Pipeline([
    ('encoder', TargetEncoder()),
    ('scaler', MinMaxScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])


In [15]:
_ = elasticNet.fit(x_train_xgb, y_train_xgb)

  elif pd.api.types.is_categorical(cols):


In [16]:
enet_pred = elasticNet.predict(x_test_xgb)

In [17]:
# holdout predictions
my_schools_pred = elasticNet.predict(my_schools_xgb)

In [18]:
mean_absolute_error(enet_pred, y_test_xgb)

10.78303843076136

In [19]:
# Getting Coefficients
model_coefs = list(zip(elasticNet['regressor'].coef_, x_train_xgb.columns))

In [20]:
model_coefs = sorted(model_coefs)

**XGB**

In [21]:
_ = xgb_model.fit(x_train_xgb, y_train_xgb)

In [22]:
xgb_pred = xgb_model.predict(x_test_xgb)

In [23]:
mean_absolute_error(xgb_pred, y_test_xgb)

6.862257791602093

In [24]:
my_schools_pred_xgb = xgb_model.predict(my_schools_xgb)

In [25]:
my_schools_pred_xgb - my_targets[total].to_numpy()

array([-18.53157806, -17.67862701])

In [26]:
def interact(df):
    """Add the engineered and interaction columns to ``df`` and return it.

    The frame is modified in place (new columns are assigned onto ``df``) and
    the same object is returned. Columns are added in this order: three log
    transforms, two ordinal encodings (via ``convert_residential`` and
    ``size_ordinal``), the age-based totals/ratios, and three product
    interactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw columns referenced below.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the extra columns attached.
    """
    def _log_or_scaled(value):
        # Natural log for strictly positive values; anything else is scaled by 0.01.
        return np.log(value) if value > 0 else 0.01 * value

    log_targets = (
        ('log_core_revenues_total_dollars', 'core_revenues_total_dollars'),
        ('log_core_expenses_total_dollars', 'core_expenses_total_dollars'),
        ('log_staff', 'grand_total_instructional_staff'),
    )
    for new_name, source_name in log_targets:
        df[new_name] = df[source_name].apply(_log_or_scaled)

    # Ordinal encodings of the two categorical descriptors.
    df["residential_level"] = df['carnegie_classification_2018:_size_and_setting'].apply(convert_residential)
    df["ordinal_institution_size_category"] = df['institution_size_category'].apply(size_ordinal)

    # Age-group totals and ratios.
    over_25 = df['grand_total_age_25_and_over']
    under_25 = df['grand_total_age_25_and_under']
    df["approximate_incoming_class_grand_total"] = over_25 + under_25
    df["age_ratio"] = over_25 / under_25
    incoming_total = df["approximate_incoming_class_grand_total"]
    df["percent_of_women_age_25_and_over"] = df['total_women_age_25_and_over'] / incoming_total
    df["percent_of_women_age_25_and_under"] = df['total_women_age_25_and_under'] / incoming_total

    # Product interactions (column names are kept exactly as the original spelled them).
    df["price * revenue"] = (
        df['total_price_for_out-of-state_students_living_off_campus']
        * df['log_core_revenues_total_dollars']
    )
    df["log_expenses * residential leve"] = (
        df['log_core_expenses_total_dollars'] * df["residential_level"]
    )
    df['log_staff * price'] = (
        df['log_staff'] * df['total_price_for_out-of-state_students_living_on_campus']
    )
    return df

In [27]:
original_df = pd.read_csv("data/4_year/aggregate.csv", index_col=["UnitID", "institution_name"])
original_df.drop('Unnamed: 0', axis=1, inplace=True)
grad_rates = original_df.filter(regex="_rate").columns
original_df.drop(grad_rates, axis=1, inplace=True)
original_df.index = original_df.index.swaplevel()

In [28]:
index = 0
year = 2019
schools = [i[0] for i in my_schools.index.unique()]

In [29]:
university_x = schools[0]

# University Graduation Rate Prediction

- By: Robert Campbell

Finding a good real world data set is difficult at first. As a rookie data scientist, a lot of doubts come into your mind when choosing a project. Is my target variable forecastable? Does predicting X or Y actually solve a real world problem? Or will all the time spent be wasted on a failed project? All of these things I hoped to overcome by the end of this first project. Here I will tell the story of my first data science project and some of the hurdles that I overcame during the process. The data set that I found was data for over 2000 4-year universities. The most obvious target variable for a predictive model about universities is the graduation rate. In the following article, I will uncover some of the disparities between African American graduation rates and the rest of the population while demonstrating how to use the Data Science Process to solve a problem for a specific university.

In [30]:
colors = ['rgba(38, 24, 74, 0.8)', 'rgba(71, 58, 131, 0.8)',
          'rgba(122, 120, 168, 0.8)', 'rgba(164, 163, 204, 0.85)',
          'rgba(190, 192, 213, 1)', 'rgba(195, 197, 222, 1)']

In [31]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [32]:
df = pd.read_csv("data/4_year/eda.csv")

In [34]:
describe = df.filter(regex="graduation").describe().loc["mean", :].sort_values()

In [98]:
fig = px.bar(describe, y='mean', text=np.round(describe, 2))
fig.data[0].marker = {
    'color': colors[-1],
    'line': {
        'color': colors[-2],
        'width': 3
    }
}
fig.update_layout(
    title='Mean Graduation Rates by Gender and Ethnicity',
    title_x=0.5,
    height=650,
)

During my initial inspection, there were already many alarming statistics. The first is that graduation rates are very low: only 51.14% of students graduate from 4-year universities. It is worth noting that there are some 2-year colleges in this study; however, they were only included because they reported data with their parent institution and were therefore not excluded when extracting data from the [IPEDS Data Center](https://nces.ed.gov/ipeds/datacenter/SelectVariables.aspx). And if that isn't alarming enough, the two most marginalized communities in the country, African Americans and American Indians, have a significantly lower graduation rate than the total cohort. African Americans and American Indians graduate at a rate of 38.6% and 41.1% respectively. Additionally, these are completions within 150% of the normal time (6 years in the 4-year case). The plot above and the statistics it revealed really inspired how I went about shaping my problem statement for this project.

# Problem Statement

## Problem
**{{university_x}}** has very low graduation rates, and even lower rates amongst students of color. They are facing immense pressure from the community and board members, and the leadership of the school needs to respond with a data-backed explanation for their poor performance along with a data-**driven plan** for **improving** their future **graduation rates**.

## Criteria for success
<br>
If we can predict graduation rates within 5% Mean Absolute Error, then we would consider the model accurate enough to implement suggested improvements, whether it be to increase admissions standards or reallocate revenues.
<br>
<br>
<br>

### Source of Data

The data science team has access to the publicly available [IPEDS Data Center](https://nces.ed.gov/ipeds/datacenter/SelectVariables.aspx), which has all of the required information to predict graduation rates. For this project I manually selected variables from this link to incorporate into a prediction.

### Stakeholders to Provide Key Insights
The leadership at {{university_x}} will be providing these insights to the community and will be working with the data science team to ensure a sustainable solution
<br> <br> 

In [120]:
# Compare the target university's total-cohort and African American graduation
# rates, using the first row of the holdout targets.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=["Total Cohort"], y=[my_targets.iloc[0][total]], name="Total Cohort Graduation Rates",
        text=[my_targets.iloc[0][total]]
    )
)
fig.add_trace(
    go.Bar(
        x=["African American"], y=[my_targets.iloc[0][black]], name="African American Graduation Rates",
        # The label must show the same value as the bar height (the African
        # American rate), not the total-cohort rate.
        text=[my_targets.iloc[0][black]]
    )
)

fig.update_layout(
    title=university_x + " Graduation Rate (African American vs Total)",
    yaxis=dict(title='Graduation Rates')
)

## So Where Do African Americans Go to School?

In [37]:
percent_cols = df.filter(regex="percent_of_total_enrollment(?!_isnan)").columns
percent_cols

Index(['percent_of_total_enrollment_that_are_american_indian_or_alaska_native',
       'percent_of_total_enrollment_that_are_asian',
       'percent_of_total_enrollment_that_are_asian/native_hawaiian/pacific_islander',
       'percent_of_total_enrollment_that_are_black_or_african_american',
       'percent_of_total_enrollment_that_are_hispanic/latino',
       'percent_of_total_enrollment_that_are_native_hawaiian_or_other_pacific_islander',
       'percent_of_total_enrollment_that_are_nonresident_alien',
       'percent_of_total_enrollment_that_are_race/ethnicity_unknown',
       'percent_of_total_enrollment_that_are_two_or_more_races',
       'percent_of_total_enrollment_that_are_white',
       'percent_of_total_enrollment_that_are_women'],
      dtype='object')

In [38]:
grand_total_enrollment_cols = df.filter(regex="grand").columns[0:2]

In [39]:
df["total_enrollment"] = df[grand_total_enrollment_cols[0]] + df[grand_total_enrollment_cols[1]] 

In [40]:
for i in percent_cols:
    col_name = re.sub(r'percent_of_total_enrollment_that_are_', '', i)
    col_name = col_name + "_total"
    df[col_name] = df["total_enrollment"] * (df[i] / 100)


In [41]:
percentages = list(percent_cols)
_ = percentages.pop()

'percent_of_total_enrollment_that_are_women'

In [43]:
df_2019 = df[df.cohort == 2019]

In [44]:
my_schools
my_targets

Unnamed: 0_level_0,Unnamed: 1_level_0,graduation_rate_american_indian_or_alaska_native,graduation_rate_asian,graduation_rate_asian/native_hawaiian/other_pacific_islander,graduation_rate_black_non-hispanic,graduation_rate_hispanic,graduation_rate_men,graduation_rate_native_hawaiian_or_other_pacific_islander,graduation_rate_nonresident_alien,graduation_rate_race/ethnicity_unknown,graduation_rate_total_cohort,graduation_rate_two_or_more_races,graduation_rate_white_non-hispanic,graduation_rate_women,transfer-out_rate_total_cohort
institution_name,UnitID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Spelman College,141060,,100.0,100.0,75.0,67.0,,,75.0,82.0,75.0,63.0,,75.0,8.0
Spelman College,141060,100.0,,,75.0,50.0,,,91.0,67.0,75.0,71.0,,75.0,17.0


In [45]:
cat_groups = []
for i in df_2019.select_dtypes("object").columns:
    cat_groups.append(df_2019.groupby(i))

In [46]:
df_2019.select_dtypes("object").columns

Index(['carnegie_classification_2018:_basic',
       'carnegie_classification_2018:_enrollment_profile',
       'carnegie_classification_2018:_size_and_setting',
       'city_location_of_institution',
       'historically_black_college_or_university', 'institution_name',
       'institution_size_category', 'parent/child_indicator_-_finance',
       'sector_of_institution', 'state_abbreviation'],
      dtype='object')

In [47]:
# Mean enrollment share of each race/ethnicity within every institution sector.
# Group by the column name instead of a position in `cat_groups`, and select the
# percentage columns before aggregating so the object-typed columns never reach
# `mean` (which raises on non-numeric data in recent pandas versions).
enrollment_by_race = df_2019.groupby('sector_of_institution')[percentages].mean()

In [48]:
enrollment_by_race.drop(['Private for-profit, 2-year', 'Private not-for-profit, 2-year', 'Public, 2-year', 'Private for-profit, less-than 2-year','isMissing'], inplace=True)

In [49]:
enrollment_by_race.loc[university_x] = my_schools.loc[:, percentages].iloc[0, :]

In [50]:
race_cols = enrollment_by_race.columns.to_list()

In [51]:
enrollment_by_race['Other'] = 100 - (enrollment_by_race.iloc[:, [3,4,9]].sum(axis=1))

In [52]:
enrollment_by_race.drop(enrollment_by_race.iloc[:, [0,1,2,5,6,7, 8]].columns.to_list(), axis=1, inplace=True)

In [53]:
race_cols = enrollment_by_race.columns.to_list()

In [54]:
enrollment_by_race = enrollment_by_race.reindex([university_x, enrollment_by_race.index[0], enrollment_by_race.index[1], enrollment_by_race.index[2]])

In [55]:
enrollment_by_race

Unnamed: 0_level_0,percent_of_total_enrollment_that_are_black_or_african_american,percent_of_total_enrollment_that_are_hispanic/latino,percent_of_total_enrollment_that_are_white,Other
sector_of_institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Spelman College,82.0,0.0,0.0,18.0
"Private for-profit, 4-year or above",22.609412,20.145098,36.138824,21.106667
"Private not-for-profit, 4-year or above",12.855124,10.909006,58.245186,17.990683
"Public, 4-year or above",12.744799,12.376422,56.958946,17.919834


In [56]:
ai = 'percent_of_total_enrollment_that_are_american_indian_or_alaska_native'

In [127]:
# Horizontal stacked bar chart: one row per sector (plus the target university),
# one stacked segment per race column in `race_cols`.
fig = go.Figure()
half = university_x.index(" ")
races = ["Black", "Hispanic", "White", "Other"]
sectors = [university_x[:half] + '<br>' + university_x[half:],
           'Private for-profit<br>4-year or above','Private not-for-profit<br>4-year or above', 
           'Public<br>4-year or above']

for color_pos, race_col in enumerate(race_cols):
    rounded_shares = np.round(enrollment_by_race[race_col].values, 1)
    for row_label, share in zip(enrollment_by_race.index, rounded_shares):
        bar_kwargs = dict(
            x=[share],
            y=[row_label],
            orientation='h', marker={'color': colors[color_pos]}
        )
        # Only label segments wide enough to hold the text.
        if share > 3:
            bar_kwargs['text'] = [str(share) + '%']
        fig.add_trace(go.Bar(**bar_kwargs))

# Race names written above the bars at fixed x positions.
locs = [5,20, 60, 95]
for x_position, race_name in zip(locs, races):
    fig.add_annotation(text=race_name,
                      xref="x", yref="y",
                      x=x_position , y=3.6, showarrow=False)

# Row labels drawn to the left of the axis (tick labels are hidden below).
for row_pos, row_label in enumerate(enrollment_by_race.index):
    fig.add_annotation(
        text=sectors[row_pos], xref='x', yref='y', x=-14, 
        y=row_label, showarrow=False
    )

fig.update_layout(
    title='Race Distribution By Sector',
    title_x=0.5,
    height=600,
    width=700,
    font={'size': 10},
    violinmode='group',
    xaxis={'title': 'Mean race distribution'},
    barmode='stack',
    showlegend=False
)
fig.update_yaxes(showticklabels=False)

fig.update_traces(
    marker={'line': {'width': 0.5}},
)

The variables used to create this plot were the percentage of enrollment that is African American, the percentage of enrollment that is white, the percentage of enrollment that is Hispanic, and an aggregation of all of the other races. As you can see, African Americans are highly represented at private for-profit schools, which, as you will see in the following plot, have the lowest average graduation rate. On the other hand, private not-for-profit schools and public schools tend to perform much better and are the same schools where African Americans are underrepresented.

In [58]:
df_2019.select_dtypes("object").columns #view groupby groups

Index(['carnegie_classification_2018:_basic',
       'carnegie_classification_2018:_enrollment_profile',
       'carnegie_classification_2018:_size_and_setting',
       'city_location_of_institution',
       'historically_black_college_or_university', 'institution_name',
       'institution_size_category', 'parent/child_indicator_-_finance',
       'sector_of_institution', 'state_abbreviation'],
      dtype='object')

In [59]:
df_2019.filter(regex="graduation_rate") #find col names

Unnamed: 0,graduation_rate_american_indian_or_alaska_native,graduation_rate_asian,graduation_rate_asian/native_hawaiian/other_pacific_islander,graduation_rate_black_non-hispanic,graduation_rate_hispanic,graduation_rate_men,graduation_rate_native_hawaiian_or_other_pacific_islander,graduation_rate_nonresident_alien,graduation_rate_race/ethnicity_unknown,graduation_rate_total_cohort,graduation_rate_two_or_more_races,graduation_rate_white_non-hispanic,graduation_rate_women
1,100.0,100.0,67.0,30.0,27.0,23.0,50.0,11.0,100.0,30.0,0.0,33.0,36.0
2,0.0,85.0,85.0,56.0,68.0,59.0,65.2,83.0,75.0,63.0,59.0,63.0,66.0
5,36.8,71.0,77.4,0.0,51.2,52.6,33.2,63.4,46.0,33.0,41.2,50.0,33.0
7,50.0,65.0,65.0,43.0,33.0,55.0,65.0,60.0,82.0,58.0,31.0,61.0,61.0
8,50.0,25.0,20.0,31.0,53.0,27.0,0.0,63.0,25.0,33.0,41.0,47.0,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4594,23.2,0.0,0.0,0.0,0.0,5.0,70.6,69.6,60.8,12.0,0.0,13.0,18.0
4596,36.6,100.0,50.0,12.0,14.0,20.0,0.0,73.4,13.0,16.0,0.0,25.0,14.0
4598,84.6,100.0,100.0,90.0,88.0,88.0,48.8,93.6,95.4,84.0,94.4,77.0,80.0
4599,50.0,0.0,50.0,0.0,0.0,38.0,100.0,50.0,44.2,33.0,54.2,36.0,19.0


In [60]:
# Mean total-cohort and African American graduation rates per sector, sorted by
# the total-cohort mean. Grouping by column name (not a position in `cat_groups`)
# and selecting the two rate columns first keeps object-typed columns out of `mean`.
sector_grad = (
    df_2019.groupby('sector_of_institution')[[total, black]]
    .mean()
    .sort_values(by=total)
    # Keep only the 4-year sectors; also drops the missing-value placeholder group.
    .drop(['Private not-for-profit, 2-year', 'Private for-profit, less-than 2-year', 'Public, 2-year', 'Private for-profit, 2-year', 'isMissing'])
)

### Computing Confidence Intervals

In [61]:
# Bootstrap the mean total-cohort graduation rate for private for-profit schools:
# resample with replacement 10,000 times, then take half the width of the
# 2.5-97.5 percentile interval as the error-bar size.
private_for_profit = df_2019[total][df_2019.sector_of_institution == 'Private for-profit, 4-year or above']
# Fixed seed so the interval is identical on every Restart & Run All.
bootstrap_rng = np.random.RandomState(0)
replicate = np.array([
    private_for_profit.sample(len(private_for_profit), replace=True, random_state=bootstrap_rng).mean()
    for _ in range(10000)
])
private_for_profit_percentile = np.percentile(replicate, [2.5, 97.5])
private_for_profit_error = (private_for_profit_percentile[1] - private_for_profit_percentile[0]) / 2

In [62]:
# Bootstrap the mean total-cohort graduation rate for private not-for-profit
# schools: resample with replacement 10,000 times, then take half the width of
# the 2.5-97.5 percentile interval as the error-bar size.
private_non_profit = df_2019[total][df_2019.sector_of_institution == 'Private not-for-profit, 4-year or above']
# Fixed seed so the interval is identical on every Restart & Run All.
bootstrap_rng = np.random.RandomState(0)
replicate = np.array([
    private_non_profit.sample(len(private_non_profit), replace=True, random_state=bootstrap_rng).mean()
    for _ in range(10000)
])
private_non_profit_percentile = np.percentile(replicate, [2.5, 97.5])
private_non_profit_error = (private_non_profit_percentile[1] - private_non_profit_percentile[0]) / 2

In [63]:
# Bootstrap the mean total-cohort graduation rate for public schools: resample
# with replacement 10,000 times, then take half the width of the 2.5-97.5
# percentile interval as the error-bar size.
public = df_2019[total][df_2019.sector_of_institution == 'Public, 4-year or above']
# Fixed seed so the interval is identical on every Restart & Run All.
bootstrap_rng = np.random.RandomState(0)
replicate = np.array([
    public.sample(len(public), replace=True, random_state=bootstrap_rng).mean()
    for _ in range(10000)
])
public_percentile = np.percentile(replicate, [2.5, 97.5])
public_error = (public_percentile[1] - public_percentile[0]) / 2

In [94]:
# Grouped bar chart of mean graduation rate per sector, with bootstrap error
# bars on the total-cohort bars.
y_black = sector_grad[black]
y_total = sector_grad[total]

# Look the error up by sector name so each bar gets its own interval no matter
# how `sector_grad` happens to be sorted. A sector with no computed interval
# gets None, i.e. no error bar.
error_by_sector = {
    'Private for-profit, 4-year or above': private_for_profit_error,
    'Private not-for-profit, 4-year or above': private_non_profit_error,
    'Public, 4-year or above': public_error,
}
total_errors = [error_by_sector.get(sector) for sector in sector_grad.index]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=sector_grad.index, y=y_total, name="Total Cohort", 
    text=np.round(y_total, 2), 
    marker=dict(color=colors[0], line=dict(width=3, color=colors[3])),
    error_y=dict(
        type='data', color=colors[3],
        array=total_errors)
))

fig.add_trace(go.Bar(
    x=sector_grad.index, y=y_black, name="African American", 
    text=np.round(y_black, 2), marker=dict(color=colors[4], line=dict(width=3, color=colors[3])), 
))

fig.update_layout(
    title="Mean Graduation Rate By Sector",
    yaxis={
        'title': "Mean Graduation Rate"
    },
    title_x=0.5,
    height=600,
    
    font={
        'size': 10
    },
    legend=dict(xanchor='left', yanchor='top', x=0.01, y=1.0),
    barmode='group'
)

The graph above reiterates the point. African Americans are represented the most in the sector that performs the worst. So, the problem of representation is not just an issue at {{university_x}}, but a persistent problem across schools throughout the public and private non-profit sectors.

In [138]:
df_2019.select_dtypes("object")

Unnamed: 0,carnegie_classification_2018:_basic,carnegie_classification_2018:_enrollment_profile,carnegie_classification_2018:_size_and_setting,city_location_of_institution,historically_black_college_or_university,institution_name,institution_size_category,parent/child_indicator_-_finance,sector_of_institution,state_abbreviation
1,Master's Colleges & Universities: Larger Programs,High undergraduate,"Four-year, medium, highly residential",Normal,Yes,Alabama A & M University,"5,000 - 9,999",Not applicable,"Public, 4-year or above",AL
2,Doctoral Universities: Very High Research Acti...,Majority undergraduate,"Four-year, large, primarily nonresidential",Birmingham,No,University of Alabama at Birmingham,"20,000 and above",Not applicable,"Public, 4-year or above",AL
5,Master's Colleges & Universities: Small Programs,Majority undergraduate,"Four-year, very small, primarily nonresidential",Montgomery,No,Amridge University,"Under 1,000",Not applicable,"Private not-for-profit, 4-year or above",AL
7,Doctoral Universities: High Research Activity\r\n,High undergraduate,"Four-year, medium, primarily nonresidential",Huntsville,No,University of Alabama in Huntsville,"5,000 - 9,999",Not applicable,"Public, 4-year or above",AL
8,Master's Colleges & Universities: Medium Programs,Very high undergraduate,"Four-year, medium, highly residential",Montgomery,Yes,Alabama State University,"1,000 - 4,999",Not applicable,"Public, 4-year or above",AL
...,...,...,...,...,...,...,...,...,...,...
4594,"Not applicable, not in Carnegie universe (not ...","Not applicable, not in Carnegie universe (not ...","Not applicable, not in Carnegie universe (not ...",Springfield,No,Drury University-College of Continuing Profess...,"1,000 - 4,999",Partial child record - reports revenues/expens...,"Private not-for-profit, 4-year or above",MO
4596,"Not applicable, not in Carnegie universe (not ...","Not applicable, not in Carnegie universe (not ...","Not applicable, not in Carnegie universe (not ...",Fort Wayne,No,Indiana Institute of Technology-College of Pro...,"1,000 - 4,999",Child record - data reported with parent campus,"Private not-for-profit, 4-year or above",IN
4598,isMissing,isMissing,isMissing,Tampa,isMissing,Faith Theological Seminary and Christian College,isMissing,Not applicable,isMissing,ismissing
4599,isMissing,isMissing,isMissing,Fort Worth,isMissing,Southwestern Baptist Theological Seminary,isMissing,Not applicable,isMissing,ismissing


In [140]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=df_2019["percent_of_total_enrollment_that_are_black_or_african_american"][df_2019['carnegie_classification_2018:_size_and_setting'].str.contains("non")]

))

# Do African Americans Perform Better at HBCU Schools?

In [65]:
# Mean total-cohort and African American graduation rates for HBCU vs non-HBCU
# schools. Group by the HBCU column name (not a position in `cat_groups`) and
# pick the two rate columns before averaging so object-typed columns never reach
# `mean`; the "isMissing" placeholder group is removed.
hbcu_grad = df_2019.groupby(hbcu)[[total, black]].mean().drop("isMissing")

In [66]:

fig = go.Figure()

fig.add_trace(go.Bar(
    x=hbcu_grad.index, y=hbcu_grad[total],name="Total Cohort", 
    text=np.round(hbcu_grad[total], 2), marker=dict(color=colors[0], line=dict(width=3, color=colors[3]))
))
fig.add_trace(go.Bar(
    x=hbcu_grad.index, y=hbcu_grad[black], 
    name="African American", text=np.round(hbcu_grad[black], 2),
    marker=dict(color=colors[4], line=dict(width=3, color=colors[3]))
))

fig.update_layout(
    
    title="HBCU vs Non-HBCU: Mean Graduation Rate",
    title_x= 0.5,
    xaxis={
        'title': 'HBCU'
    },
    yaxis={
        'title': 'Mean Graduation Rate'
    },
    legend={
        'title': 'Legend',
        'xanchor': 'right',
        'yanchor':'top'
    },
    height=500,
    width=600
)

If you view HBCU vs non-HBCU graduation rates, you might get the impression that HBCUs perform worse than non-HBCUs, even for African Americans. However, viewing univariate plots like this can be quite misleading. If you don't control for other variables like revenues, price, SAT scores, and other things that have a proven impact on graduation rates, then you can't say much about the true effect of being an HBCU. To demonstrate this, let's first control for price.

# Controlling For Price

In [67]:
my_schools = my_schools.reset_index()

In [68]:
# Scatter of price against total-cohort graduation rate, split by HBCU status,
# with the target university overlaid as a third trace.
fig = go.Figure()

# Apply the same row mask to x, y and hovertext so every point's hover label is
# the institution it actually represents.
for hbcu_value, trace_name in (("No", "Non-HBCU"), ("Yes", "HBCU")):
    group_rows = df_2019[df_2019[hbcu] == hbcu_value]
    fig.add_trace(go.Scatter(
        x=group_rows[price], y=group_rows[total],
        mode='markers', name=trace_name, hovertext=group_rows["institution_name"]
    ))

fig.add_trace(go.Scatter(
    x=my_schools[price], y=my_targets[total], 
    mode='markers', name=university_x, 
    hovertext=my_schools["institution_name"]
))


fig.update_layout(
    title='Price vs Graduation Rate',
    title_x=0.5,
    xaxis={
        'title': 'Price'
    },
    yaxis={
        'title': 'Graduation Rate Total Cohort',
    },
    height=600
)

fig.update_traces(
    marker=dict(size=12, line=dict(width=2, color=colors[0]))
)

As you can see, if you control for price, HBCUs don't perform any worse or better than other institutions. This also prompts one to think about the socio-economic class of the students who enroll. This plot shows that higher-priced schools tend to have higher graduation rates. Perhaps students of higher socio-economic class are able to afford higher-priced schools. I would tell my stakeholders at this point that socio-economic class might be something worth collecting to improve our data-driven explanation for {{university_x}}'s graduation rates. Now let's have a look at how revenues affect graduation rates.

## Revenues Vs Graduation Rate

In [69]:
fig = px.scatter(df_2019.sample(2315, random_state=42), x=revenue, y=total, color=hbcu)



fig.update_layout(
    title='Revenue vs Graduation Rate',
    title_x=0.5,
    xaxis={
        'title': 'Revenues',
        'range': [-300000000, 8000000000]
    },
    yaxis={
        'title': 'Graduation Rate Total Cohort',
        'range': [-10, 110]
    },
    height=700,
    
    legend={
        'xanchor':'right',
        'yanchor':'top',
        'bgcolor': 'rgba(176,196,222,0.7)',
        'borderwidth': 2,
        'bordercolor': 'black'
    }
    
)

fig.update_traces(
    marker=dict(size=10,line=dict(width=1.5))
)

In [129]:
# Show the columns whose name contains "log". A bare positional string is
# interpreted as `items` (exact labels), which matches nothing here; `like=`
# performs the intended substring match.
features.filter(like='log')

institution_name,UnitID
Alabama A & M University,100654
Alabama A & M University,100654
University of Alabama at Birmingham,100663
University of Alabama at Birmingham,100663
Amridge University,100690
...,...
Indiana Institute of Technology-College of Professional Studies,492962
Indiana Institute of Technology-College of Professional Studies,492962
Faith Theological Seminary and Christian College,494597
Southwestern Baptist Theological Seminary,494603


## Elastic Net Model

In [70]:
bottom = [i[0] for i in model_coefs][0:5]
bottom_labels = [i[1] for i in model_coefs][0:5]

top = [i[0] for i in model_coefs][-5:]
top_labels = [i[1] for i in model_coefs][-5:]

In [71]:
# add <br> tags to the labels

def _wrap_label(label, break_fractions):
    """Turn a snake_case column name into a spaced label with ``<br>`` breaks.

    Parameters
    ----------
    label : str
        Column name whose words are separated by underscores.
    break_fractions : sequence of (int, int)
        ``(numerator, denominator)`` pairs; a ``<br>`` is inserted at position
        ``numerator * n_words // denominator`` for each pair, in order, where
        ``n_words`` is the word count before any insertion.

    Returns
    -------
    str
        The words (and inserted ``<br>`` tokens) joined by single spaces.
    """
    words = label.split('_')
    n_words = len(words)
    for numerator, denominator in break_fractions:
        words.insert(numerator * n_words // denominator, '<br>')
    return " ".join(words)


# Long labels (> 50 characters) get two breaks, everything else one at the midpoint.
_TWO_BREAKS = ((1, 3), (4, 5))
_ONE_BREAK = ((1, 2),)

new_bottom = [
    _wrap_label(label, _TWO_BREAKS if len(label) > 50 else _ONE_BREAK)
    for label in bottom_labels
]

# Top labels are always split once at the midpoint, regardless of length.
new_top = [_wrap_label(label, _ONE_BREAK) for label in top_labels]

In [72]:
new_bottom

['percent of total <br> enrollment that are black <br> or african american',
 'percent of women <br> age 25 and over',
 'student service expenses <br> as a percent of <br> total core expenses',
 'carnegie classification <br> 2018: size <br> and setting Four-year, small, primarily nonresidential',
 'carnegie classification <br> 2018: size <br> and setting Four-year, medium, primarily nonresidential']

In [73]:
new_top

['log <br> staff * price',
 'total price for out-of-state <br> students living on campus',
 'sat critical reading <br> 75th percentile score',
 'sat writing <br> 75th percentile score',
 'sat math <br> 75th percentile score']

In [74]:
fig = go.Figure()

fig.add_trace(go.Bar(
    x=bottom + top,
    y=new_bottom + new_top,
    text = new_bottom + new_top,
    orientation='h'
))
fig.update_layout(
    height=800,
    yaxis={
        'showticklabels': False
    },
    title="Elastic Net Coefficients",
    title_x=0.5
)

**<center>Let's interpret {{feature}}</center>**

In [75]:
spelman = my_schools_xgb.loc[universities[0]].iloc[[1], :]
feature = model_coefs[-1][1]

In [76]:
model_coefs[-1][1]

'sat_math_75th_percentile_score'

In [77]:
# 1 % change in graduation rate calculation

#the amount any feature needs to change to have a 1% increase in graduation rates
delta = (x_train_xgb[feature].max() - x_train_xgb[feature].min()) / model_coefs[-1][0]

pred1 = elasticNet.predict(spelman)

spelman[feature] += delta

pred2 = elasticNet.predict(spelman)

In [78]:
delta = np.round(delta, 2)

In [144]:
# Scatter of the selected feature against total-cohort graduation rate. When
# the data has a companion "<feature>_isnan" column, rows flagged 0 and rows
# flagged 1 are drawn as two separate traces (blue "SAT scores", red "Imputed").
fig = go.Figure()
nan_flag_col = feature + "_isnan"

# Check for the flag column explicitly instead of wrapping the plot in a bare
# `except:`, which would also hide unrelated errors.
if nan_flag_col in df_2019.columns:
    observed_rows = df_2019[df_2019[nan_flag_col] == 0]
    imputed_rows = df_2019[df_2019[nan_flag_col] == 1]
    fig.add_trace(go.Scatter(
        x=observed_rows[feature],
        y=observed_rows[total], mode='markers',
        marker=dict(size=10, color='rgba(70, 70, 250, 1.0)', line=dict(width=1)),
        name="SAT scores"
    ))
    fig.add_trace(go.Scatter(
        x=imputed_rows[feature],
        y=imputed_rows[total],
        marker=dict(size=10, color='rgba(255, 40, 40, 0.6)', line=dict(width=1)),
        mode='markers', name="Imputed"
    ))
else:
    # No flag column for this feature: plot every row once.
    fig.add_trace(go.Scatter(
        x=df_2019[feature], y=df_2019[total], mode='markers'
    ))

# NOTE(review): the x-axis range and the "SAT scores" trace name are specific to
# SAT-scale features, while `feature` is chosen dynamically -- confirm before
# reusing this cell for another feature.
fig.update_layout(
    xaxis=dict(
        range=[375, 810],
        title=feature
    ),
    yaxis=dict(
        title='Graduation Rate',
        range=[-10, 110]
    ),
    legend=dict(xanchor="left", x=0.01),
    height=450,
    width=550
)
fig.show()
    

- The points in red are the missing values that were imputed

<ul><li> A {{delta}} increase in {{model_coefs[-1][1]}} will cause a 1% increase in graduation rates</li></ul>

- The coefficient should be less steep than the observed trend because we are controlling for all other variables

- Some of the effect can be explained by other features

## Advantages and Disadvantages of ElasticNet

## Advantages

## Disadvantages

- Simple Linear Interpretation

- No Feature Interactions

<ul><li> A {{delta}} increase in {{model_coefs[-1][1]}} will cause a 1% increase in graduation rates</li></ul>

- This statement can be misleading if you have many highly correlated features

In [80]:
residential = df_2019.filter(regex='carnegie').columns[2]

In [81]:
highly_res = df_2019[df_2019[residential].str.contains("highly")]
primarily = df_2019[df_2019[residential].str.contains("primarily residential")]
non = df_2019[df_2019[residential].str.contains("nonresidential")]

In [82]:
p_high = np.polyfit(np.log(highly_res[revenue]), highly_res[total], 1)
p_prim = np.polyfit(np.log(primarily[revenue]), primarily[total], 1)
p_non = np.polyfit(np.log(non[revenue]), non[total], 1)

## <center>Elastic Net Feature Interactions Example</center>

In [83]:
# For each residential level, scatter log(revenue) against graduation rate and
# overlay that level's fitted straight line (coefficients from the polyfit cell).
fig = go.Figure()
log_revenue_grid = np.linspace(12, 24, 100)

# (legend label, rows for that level, polyfit coefficients, line colour)
residential_levels = [
    ("Highly Residential", highly_res, p_high, 'blue'),
    ("Primarily Residential", primarily, p_prim, 'green'),
    ("Non Residential", non, p_non, 'orange'),
]

for level_name, level_rows, line_coefs, line_color in residential_levels:
    # Markers: x and y must come from the same subset so the points pair up.
    fig.add_trace(go.Scatter(
        x=np.log(level_rows[revenue]),
        y=level_rows[total], mode='markers',
        legendgroup=level_name, name=level_name
    ))
    # Line of best fit, placed in the same legend group as its markers.
    fig.add_trace(go.Scatter(
        x=log_revenue_grid,
        y=np.polyval(line_coefs, log_revenue_grid), mode='lines',
        legendgroup=level_name, name=level_name,
        line=dict(color=line_color)
    ))

fig.update_layout(
    height=600,
    yaxis={
        'showticklabels': False,
        'range': [-10, 110],
        'title': 'Graduation Rate'
    },
    title="Residential Level and Revenues Feature Interaction",
    # The x values are natural logs of revenue, so label the axis accordingly.
    xaxis=dict(title='Log Revenues', range=[12,24]),
    font=dict(size=20),
    title_x=0.5
)

## Model Performance

# <center>Elastic Net Model Performance</center>

In [84]:
# Elastic Net: enet_pred on x against y_test_xgb on y, with the y = x
# reference line a perfect model would fall on.
identity_line = np.linspace(0,100,100)

fig = go.Figure(data=[
    go.Scatter(x=enet_pred, y=y_test_xgb, mode='markers', name='(y_pred, y_test)'),
    go.Scatter(x=identity_line, y=identity_line, name="100% Accuracy"),
])
fig.update_layout(
    title='Elastic Net Accuracy',
    xaxis=dict(title="Predicted"),
    yaxis=dict(title="Observed"),
    font=dict(size=16),
    height=550
)

In [85]:
# Elastic Net residuals (prediction minus observation) plotted against the
# observed graduation rate, so the x data matches the axis title and the
# figure is directly comparable with the XGB residual plot in this notebook.
fig = go.Figure()

fig.add_trace(go.Scatter(x=y_test_xgb, y=enet_pred - y_test_xgb, mode='markers', name='(y_pred, y_test)'))
fig.update_layout(
    title= 'Elastic Net Residuals',
    xaxis={'title':"Observed Graduation Rate"},
    yaxis={'title':"Residuals"},
    title_x=0.5,
    font=dict(size=20),
    height=550
)

# <center> XGB Model Performance </center>

In [86]:
# Move the index levels of y_test_xgb into ordinary columns (drop=False is the
# default) so the plot below can read "institution_name" for hover text.
y_test = y_test_xgb.reset_index(drop=False)

In [87]:
# XGB: predicted vs. observed graduation rate, with institution names as
# hover text and the y = x reference line a perfect model would fall on.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=xgb_pred, y=y_test[total], mode='markers', 
    hovertext=y_test["institution_name"], name='(y_pred, y_test)'
))
fig.add_trace(go.Scatter(x=np.linspace(0,100,100), y=np.linspace(0,100,100), name="100% Accuracy"))
fig.update_layout(
    title= 'XGB Model Performance',
    title_x=0.5,
    # Same axis titles as the Elastic Net accuracy figure so the two
    # model-performance plots read the same way.
    xaxis={'title':"Predicted"},
    yaxis={'title':"Observed"},
    font=dict(size=16),
    height=700,
    
    # Semi-transparent legend pinned inside the top-left corner of the plot.
    legend={
        'xanchor': 'left',
        'x': 0.01,
        'yanchor': 'top',
        'y': 0.99, 
        'bgcolor': 'rgba(150, 160, 160, 0.6)'
    }
)

In [88]:
# XGB residuals (prediction minus observation) against the observed rate.
xgb_residuals = xgb_pred - y_test_xgb

fig = go.Figure(data=[
    go.Scatter(x=y_test_xgb, y=xgb_residuals, mode='markers', name='(y_pred, y_test)'),
])
fig.update_layout(
    title='XGB Residuals',
    title_x=0.5,
    xaxis=dict(title="Observed Graduation Rate"),
    yaxis=dict(title="Residuals"),
    font=dict(size=20),
    height=700
)

In [89]:
# Predict graduation rates for the selected schools.
# The 'predicted' / 'actual' columns are added to my_schools_xgb by this cell
# and the next one; drop them (if present) before predicting so that re-running
# the cell does not feed those extra columns to the model.
my_schools_features = my_schools_xgb.drop(columns=['predicted', 'actual'], errors='ignore')
my_schools_xgb['predicted'] = xgb_model.predict(my_schools_features)

In [90]:
# Store the observed total-cohort graduation rate next to the predictions.
# NOTE(review): presumably my_targets shares my_schools_xgb's index so the
# assignment lines up row by row — confirm.
my_schools_xgb['actual'] = my_targets[total]

In [91]:
# Use the first index entry of my_schools_xgb as the school to plot.
school = my_schools_xgb.index.to_list()[0]

In [92]:
# Display the selected index key.
school

('Spelman College', 141060)

In [93]:
# Predicted vs. actual graduation rates by cohort for the selected school
# (`school` is the index key chosen above; its first element is the name).
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=my_schools_xgb.loc[school, 'cohort'],
    y=my_schools_xgb.loc[school, 'predicted'],
    name="Predicted Graduation Rates",
))
fig.add_trace(go.Scatter(
    x=my_schools_xgb.loc[school, 'cohort'],
    y=my_schools_xgb.loc[school, 'actual'],
    name="Actual Graduation Rates",
))

fig.update_layout(
    title=school[0],
    title_x=0.5,
    xaxis={'title':"Year"},
    # Both traces are graduation rates, not residuals.
    yaxis={'title':"Graduation Rate"},
    font=dict(size=20),
    height=600
)

## <center>Future Work</center>

- #### Collect all historical graduation rates and create a time series forecast for graduation rates

- #### Deploy a web application where you can pick your school and get a dashboard report