In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator

import matplotlib.pyplot as plt
import seaborn as sns
import re
from joblib import load, dump
import plotly.figure_factory as ff

In [63]:
# Column names that later cells use to index the dataframe.
# Share of total enrollment that is Black or African American.
percent_black = 'percent_of_total_enrollment_that_are_black_or_african_american'
# Graduation rate of the total cohort.
total = 'graduation_rate_total_cohort'
# Graduation rate of Black, non-Hispanic students.
black = 'graduation_rate_black_non-hispanic'

In [64]:
# grabbing model and train_test_split from disk
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
model_log = load("objects/model_log.joblib")
# Use the 'model' entry of the last record in the log.
# NOTE(review): presumably the last record is the most recent model run — confirm.
model = model_log[-1]['model']
# The saved object unpacks into four parts, in this order.
x_train, x_test, y_train, y_test = load("objects/train_test_split.joblib")

In [4]:
# colors for plots
# Three five-colour palettes given as hex strings; the plotting cells index them
# positionally (e.g. qualitative[1], sequential[4]), so the order of the entries matters.
sequential = ["#edf8fb", "#b2e2e2", "#66c2a4", "#2ca25f", "#006d2c"]
diverging = ["#a6611a", "#dfc27d", "#f5f5f5", "#80cdc1", "#018571"]
qualitative = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00"]

In [5]:
# Load the dataset every later cell works from (path is relative to the working directory).
df = pd.read_csv("data/4_year/eda.csv")

In [6]:
university_x = 'Georgia State University'

# Median of every graduation-rate column over all rows of df. The '50%' row of
# describe() is the median; transposing gives one row per graduation-rate measure.
grad_rate_summary = df.filter(regex="graduation_rate").describe()
median_grad_rates = grad_rate_summary.loc[['50%']].T.sort_index()

# The same statistic restricted to the selected university's 2019 cohort.
is_our_2019_row = (df["institution_name"] == university_x) & (df["cohort"] == 2019)
our_median = (
    df[is_our_2019_row]
    .filter(regex="graduation_rate")
    .describe()
    .loc['50%']
    .sort_index()
)

# Added as a second column; values are aligned on the shared column-name index.
median_grad_rates[university_x] = our_median

# Shorten the row labels: drop the common prefix, then trim the long ethnicity suffixes.
# The patterns are applied one after another, in this order.
for label_pattern in ("graduation_rate_", "_or.*native", "_or.*islander", "/other_pacific_islander"):
    median_grad_rates.index = median_grad_rates.index.str.replace(label_pattern, "", regex=True)

median_grad_rates.columns = ['all_universities', university_x]
median_grad_rates = median_grad_rates.sort_values(by='all_universities')
# Positive values mean the selected university is above the all-universities median.
median_grad_rates['differences'] = (
    median_grad_rates[university_x] - median_grad_rates['all_universities']
)

In [7]:
# Rows of the 2019 cohort only. Later cells add columns to this frame, so take an
# explicit copy: assigning into a filtered slice of `df` triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
df_2019 = df[df.cohort == 2019].copy()

In [8]:
# Column-name lookups on the 2019 frame: filter(regex=...) keeps the columns whose
# name matches the pattern, and .columns returns their names.
revenue_cols = df_2019.filter(regex='revenue').columns
price_cols = df_2019.filter(regex='price').columns
# Names of the Carnegie size-and-setting column and of the sector column.
res_col = 'carnegie_classification_2018:_size_and_setting'
sector_col = 'sector_of_institution'
# NOTE(review): 'sat|act' matches every column whose name contains "sat" or "act",
# not only test-score columns, and a later cell indexes this positionally
# (test_cols[7]) — confirm the selection and its order.
test_cols = df_2019.filter(regex='sat|act').columns

In [9]:
# Show which column revenue_cols[0] refers to (used as the revenue measure below).
revenue_cols[0]

'core_revenues_total_dollars'

In [24]:
# Rows of the 2019 frame that belong to the selected university. Every sample_*
# value below is read from the first matching row.
sample_rows = df_2019[df_2019.institution_name == university_x]

# sector
sample_sector = sample_rows[sector_col].values[0]

# revenue (first revenue column)
sample_revenue = sample_rows[revenue_cols[0]].values[0]

# price (first price column)
sample_price = sample_rows[price_cols[0]].values[0]

# residential level (Carnegie size-and-setting label)
sample_residential = sample_rows[res_col].values[0]

# graduation rate of the total cohort
sample_total = sample_rows[total].values[0]

# sat math scores
# NOTE(review): test_cols[7] is a positional pick — confirm it is the SAT math column.
sample_test = sample_rows[test_cols[7]].values[0]

# percentage of black enrollment
sample_black = sample_rows[percent_black].values[0]



## <center>A Data Driven Study for Higher Education Performance</center>

These Are Notes

- by: Rob Campbell

- Data Science

- Jupyter Rise

<center><h1>Problem Statement</h1></center>

# Problem
**{{university_x}}** has very low graduation rates, and they are even lower among students of color. The school is facing immense pressure from the community and board members, and its leadership needs to respond with a data-backed explanation for the poor performance along with a data-**driven plan** for **improving** its future **graduation rates**.

# Source of Data

The Data Science team has access to the publicly available [IPEDS Data Center](https://nces.ed.gov/ipeds/datacenter/SelectVariables.aspx), which has all of the information required to predict graduation rates. For this project I manually selected variables from this link to incorporate into my analysis.
<br>
<br>

# <center>Data Driven Response</center>

In [25]:
# Grouped bar chart: median graduation rate per group, all universities next to the
# selected university. Rates are divided by 100 so the percent formats below render
# them as percentages.
fig = go.Figure()

fig.add_trace(go.Bar(
    x=median_grad_rates.index,
    y=median_grad_rates['all_universities'] / 100,
    # d3 percent format, so the bar labels agree with the percent-formatted y axis
    # instead of showing the raw fraction.
    texttemplate="%{y:.0%}",
    textposition="inside", name="All Universities"
))

fig.add_trace(go.Bar(
    x=median_grad_rates.index,
    y=median_grad_rates[university_x] / 100,
    texttemplate="%{y:.0%}",
    textposition="inside",
    opacity=1, name=university_x
))
fig.update_layout(
    title='Median Graduation Rates By Gender and Ethnicity',
    title_x=0.5,
    yaxis=dict(tickformat=',.0%'),
    height=750,
    barmode='group'
)

In [26]:
# Load the saved feature importances and keep the last ten entries.
# NOTE(review): presumably the saved sequence is sorted by ascending importance, so
# the last ten are the ten most important features — confirm against the producer.
feature_importances = load('objects/feature_importances.joblib')
top_ten = feature_importances[-10:]

In [27]:
# Split each top-ten entry into two parallel lists: element 0 goes to top_ten_cat
# and element 1 to top_ten_val.
top_ten_cat, top_ten_val = [], []
for entry in top_ten:
    top_ten_cat.append(entry[0])
    top_ten_val.append(entry[1])

In [28]:
# Display labels for the bars: names longer than 30 characters are cut to their
# first 30 characters and given a trailing ellipsis; shorter names are kept as-is.
y_names = [
    name[:30] + str('...') if len(name) > 30 else name
    for name in top_ten_cat
]

## <center>Major Influences on Graduation Rate</center>

In [29]:
# Horizontal bar chart of the ten feature importances, drawn as two legend groups.
fig = go.Figure()

first_five = slice(None, 5)
last_five = slice(5, None)

# First five entries of the top-ten list; hidden until toggled from the legend.
fig.add_trace(go.Bar(
    name='5-10', legendrank=2,
    visible='legendonly',
    orientation='h',
    marker_color=sequential[3],
    x=top_ten_val[first_five],
    y=top_ten_cat[first_five],
    text=y_names[first_five],
))

# Last five entries of the top-ten list; shown by default.
fig.add_trace(go.Bar(
    name='Top 5', legendrank=1,
    orientation='h',
    marker_color=sequential[4],
    x=top_ten_val[last_five],
    y=top_ten_cat[last_five],
    text=y_names[last_five],
))

# The (shortened) feature names are written on the bars, so the y tick labels are hidden.
fig.update_layout(
    title='Feature Importances',
    title_x=0.5,
    height=750,
    font=dict(size=14),
    yaxis=dict(showticklabels=False),
)

**Total Graduation Rate vs African American Graduation Rates**

1. SAT Scores

2. Sector

3. Residential Level

4. Price

5. Revenue

<center><h2>SAT Scores</h2></center>

In [31]:
# Values accepted by plotly's scatter-marker symbol validator.
# NOTE(review): raw_symbols is not referenced anywhere else in the visible notebook —
# confirm whether this cell is still needed.
raw_symbols = SymbolValidator().values

In [32]:
# Scatter of SAT math 75th-percentile score against total graduation rate, with the
# rows flagged by the *_isnan column drawn as a separate "Imputed Data" trace.
sat_col = 'sat_math_75th_percentile_score'
grad_col = 'graduation_rate_total_cohort'
isnan_flag = df_2019['sat_math_75th_percentile_score_isnan']

observed_rows = df_2019[isnan_flag == 0]
imputed_rows = df_2019[isnan_flag == 1]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=observed_rows[sat_col],
    y=observed_rows[grad_col],
    mode='markers', name='sat_scores', marker_color=qualitative[1]
))

fig.add_trace(go.Scatter(
    x=imputed_rows[sat_col],
    y=imputed_rows[grad_col],
    mode='markers', name='Imputed Data', opacity=0.8, marker_color=qualitative[0]
))

# Applied before the highlight trace is added, so only the two traces above get
# this marker size and outline.
fig.update_traces(marker=dict(size=10, line=dict(width=2)))

# Single highlighted point for the selected university.
fig.add_trace(go.Scatter(
    x=[sample_test],
    y=[sample_total],
    mode='markers',
    marker=dict(color=qualitative[4], symbol='x', size=14),
    name=university_x, opacity=1
))

fig.update_layout(
    title='SAT Math 75th Percentile Scores vs. Graduation Rate',
    xaxis=dict(title='SAT Math Scores', range=[300, 820]),
    yaxis=dict(title='Graduation Rate', range=[-15, 115]),
    height=700,
)

### Stakeholders Insights

- 1140 Missing Values of 2315 observations

- KNNImputer was used to impute the missing values instead of just filling in the average or median

- SAT math scores are highly correlated with Graduation Rates

In [33]:
# Group the 2019 rows by sector so each sector's rows can be pulled out with get_group().
sector_group = df_2019.groupby('sector_of_institution')

In [34]:
# List the sector labels present in the 2019 data.
sector_group.groups.keys()

dict_keys(['Private for-profit, 2-year', 'Private for-profit, 4-year or above', 'Private for-profit, less-than 2-year', 'Private not-for-profit, 2-year', 'Private not-for-profit, 4-year or above', 'Public, 2-year', 'Public, 4-year or above', 'isMissing'])

In [35]:
# The three four-year sectors compared in the plots below. Later cells index this list
# positionally (sectors[0], sectors[1], sectors[2]), so the order matters.
sectors = ['Private not-for-profit, 4-year or above', 'Private for-profit, 4-year or above', 'Public, 4-year or above']

In [36]:
# For every sector in `sectors`, keep just the three columns the sector plots need.
sector_columns = [total, black, percent_black]

# Same order as `sectors`: not-for-profit, for-profit, public.
sector_data = [sector_group.get_group(sector_name)[sector_columns] for sector_name in sectors]

# Named handles for the individual frames.
private_non_profit_sector, private_for_profit_sector, public_sector = sector_data

In [37]:
# Inspect the sector columns of the test split (columns whose name matches 'sector').
x_test.filter(regex='sector')

Unnamed: 0_level_0,Unnamed: 1_level_0,"sector_of_institution_Public, 2-year","sector_of_institution_Public, 4-year or above","sector_of_institution_Private for-profit, 4-year or above","sector_of_institution_Private not-for-profit, 4-year or above"
institution_name,cohort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Anna Maria College,2018,0.0,0.0,0.0,1.0
Southeastern Baptist College,2018,0.0,0.0,0.0,1.0
Capitol Technology University,2018,0.0,0.0,0.0,1.0
Keuka College,2019,0.0,0.0,0.0,1.0
Southern Technical College,2018,0.0,0.0,1.0,0.0
...,...,...,...,...,...
Spelman College,2018,0.0,0.0,0.0,1.0
University of North Carolina at Asheville,2019,0.0,1.0,0.0,0.0
University of North Carolina at Asheville,2018,0.0,1.0,0.0,0.0
Spring Hill College,2018,0.0,0.0,0.0,1.0


## <center>Sector - ({{sample_sector}})</center> 

In [41]:

# Violin plot of total graduation rate per sector, with a horizontal line at the
# selected university's graduation rate.
fig = go.Figure()

# Draw order: not-for-profit (0), public (2), for-profit (1). Each violin takes the
# palette colour with the same index as its sector.
for sector_idx in (0, 2, 1):
    fig.add_trace(go.Violin(
        y=sector_data[sector_idx][total],
        name=sectors[sector_idx],
        fillcolor=qualitative[sector_idx],
        line_color='black',
        box_visible=True,
        opacity=0.7,
    ))

# Set axes ranges
fig.update_xaxes(range=[-0.5, 2.5])
fig.update_yaxes(range=[-19, 119])

# Reference line spanning the full x range at the university's graduation rate.
fig.add_shape(
    type="line",
    x0=-0.5, x1=2.5,
    y0=sample_total, y1=sample_total,
    line=dict(color=qualitative[4], width=3),
)

fig.update_layout(
    title=f'Graduation Rates By Sector ({sample_total}%)',
    yaxis=dict(title='Graduation Rate'),
    height=750,
)

In [42]:
# Violin plot of the percentage of Black / African American enrollment per sector,
# with a horizontal line at the selected university's percentage.
fig = go.Figure()

# Draw order: for-profit (1), public (2), not-for-profit (0). Each violin takes the
# palette colour with the same index as its sector.
for sector_idx in (1, 2, 0):
    fig.add_trace(go.Violin(
        y=sector_data[sector_idx][percent_black],
        name=sectors[sector_idx],
        fillcolor=qualitative[sector_idx],
        line_color='black',
        box_visible=True,
        opacity=0.7,
    ))

# Set axes ranges
fig.update_xaxes(range=[-0.5, 2.5])
fig.update_yaxes(range=[-19, 119])

# Reference line spanning the full x range at the university's enrollment percentage.
fig.add_shape(
    type="line",
    x0=-0.5, x1=2.5,
    y0=sample_black, y1=sample_black,
    line=dict(color=qualitative[4], width=3),
)

# Last expression of the cell: update_layout returns the figure, which renders it.
fig.update_layout(
    title=f'Percentage of African American Enrollment By Sector ({sample_black}%)',
    yaxis=dict(title='% of enrollment that are black'),
    height=750,
)

In [43]:
# Bin the percentage of enrollment that is Black / African American into five bands.
# Bin edges are right-inclusive, and the lower edge of -1 puts 0% into the first band.
black_bin = 'black_enrollment_binned'
# assign() returns a new frame, so this cell does not write into a slice of `df`
# (no SettingWithCopyWarning) and can be re-run safely.
df_2019 = df_2019.assign(**{
    black_bin: pd.cut(
        df_2019[percent_black],
        [-1, 5, 10, 15, 20, 100],
        labels=['0-5', '5-10', '10-15', '15-20', '20 <'],
    )
})

## <center>Price</center>

In [45]:
# Scatter of price against total graduation rate; marker size is proportional to the
# percentage of enrollment that is Black / African American (0.6 px per percentage point).
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_2019[price_cols[0]],
    y=df_2019[total],
    mode='markers',
    marker=dict(
        color=qualitative[1], size=0.6 * df_2019[percent_black]
    )
))

fig.update_layout(
    title='Price Impact on Graduation Rate and African American Enrollment',
    # Label the x axis with the plotted column so the figure can be read on its own.
    xaxis=dict(title=price_cols[0]),
    yaxis=dict(title='Graduation Rate'),
    height=750
)
fig.show()

In [46]:
# The three residential levels, spelled exactly like the labels map_residential returns
# (including 'primarilly').
# NOTE(review): this list is not referenced elsewhere in the visible notebook — confirm it is needed.
carneigie_size_setting = ['highly residential', 'primarilly residential', 'nonresidential']

In [47]:
# Name of the first column matching 'carnegie.*size' (the Carnegie size-and-setting column).
carnegie_size = df_2019.filter(regex='carnegie.*size').columns[0]

In [48]:
def map_residential(x):
    """Collapse a Carnegie size-and-setting label into a residential level.

    Parameters
    ----------
    x : str
        A label such as ``'Four-year, large, primarily nonresidential'``.
        Anything that is not a string (e.g. NaN) is treated as missing.

    Returns
    -------
    str
        ``'highly residential'``, ``'primarilly residential'``,
        ``'nonresidential'`` or ``'missing'``. The spelling ``'primarilly'`` is
        kept on purpose so the result matches the labels used elsewhere in the
        notebook.
    """
    if not isinstance(x, str):
        return 'missing'

    label = x.lower()
    # Test 'nonresidential' first: "primarily nonresidential" also contains
    # "primarily" and would otherwise be labelled residential. Matching the whole
    # word also keeps "nondegree-granting" from being counted as nonresidential.
    if 'nonresidential' in label:
        return 'nonresidential'
    if 'highly residential' in label:
        return 'highly residential'
    if 'primarily residential' in label:
        return 'primarilly residential'
    return 'missing'

In [49]:
# Derive the residential level from the 2019 rows themselves (not from the full `df`),
# passing the function directly instead of wrapping it in a lambda. assign() returns
# a new frame, so nothing is written into a slice of `df`.
df_2019 = df_2019.assign(residential_level=df_2019[carnegie_size].map(map_residential))

In [50]:
# Group the 2019 rows by their Carnegie size-and-setting label.
res_group = df_2019.groupby(carnegie_size)

In [51]:
# Copy the label -> row-index mapping. Later cells delete entries from res_levels,
# and without the copy they would be deleting from the groupby object's own mapping.
res_levels = dict(res_group.groups)

In [52]:
# Size-and-setting labels to drop from res_levels before plotting: graduate-only,
# missing, two-year and not-in-Carnegie-universe categories.
to_pop = [
    'Exclusively graduate/professional','isMissing',
    'Two-year, very small','Two-year, very large',
    'Two-year, small','Two-year, medium',
    'Two-year, large', 'Not applicable, not in Carnegie universe (not accredited or nondegree-granting)'
]

In [53]:
# Drop the unwanted labels. pop() with a default tolerates labels that are absent from
# the data (or already removed), where `del` would raise KeyError.
for unwanted_label in to_pop:
    res_levels.pop(unwanted_label, None)

In [54]:
# Turn the remaining labels into a pandas Index so the .str accessor can be used.
res_levels = pd.Index(list(res_levels.keys()))

# Put a letter in front of each residential level so a plain alphabetical sort orders
# them highly -> primarily -> nonresidential. Replacements run in this order.
for plain_level, prefixed_level in (
    ('highly residential', 'A highly residential'),
    ('primarily residential', 'B primarily residential'),
    ('primarily nonresidential', 'C primarily nonresidential'),
):
    res_levels = res_levels.str.replace(plain_level, prefixed_level)

In [55]:
# Alphabetical sort of the labels; the A/B/C prefixes added in the previous cell take part in the ordering.
res_levels = res_levels.sort_values()

In [56]:
# Remove the "A ", "B " and "C " sort prefixes again. A raw string is used so that
# `\s` reaches the regex engine without Python's invalid-escape-sequence warning.
res_levels = res_levels.str.replace(r'A\s|B\s|C\s', '', regex=True)

<center><h3> Residential Level - {{sample_residential}}</h3></center>

In [57]:
def _size_legend_group(label):
    """Return the legend group for a size-and-setting label.

    The keywords are tested in a fixed order ('large', 'medium', 'very small');
    any label matching none of them falls into 'small'.
    """
    for size_keyword in ('large', 'medium', 'very small'):
        if size_keyword in label:
            return size_keyword
    return 'small'


# One violin of total graduation rate per remaining size-and-setting label.
fig = go.Figure()

for position, level in enumerate(res_levels):
    fig.add_trace(go.Violin(
        y=res_group.get_group(level)[total],
        name=level,
        legendgroup=_size_legend_group(level),
        # Colours cycle through the first three palette entries.
        fillcolor=qualitative[position % 3],
        line_color='black',
        box_visible=True,
        opacity=0.6,
    ))

fig.update_layout(
    title='Carnegie Size and Setting',
    height=750
)
fig.show()

# <center>Revenue</center>

In [58]:
# Scatter of revenue (first revenue column) against total graduation rate, with the
# selected university highlighted as an 'x'.
all_universities_trace = go.Scatter(
    name='All Universities',
    mode='markers',
    x=df_2019[revenue_cols[0]],
    y=df_2019[total],
    marker=dict(color=qualitative[1], size=9, line=dict(width=2)),
)
selected_university_trace = go.Scatter(
    name=university_x,
    x=[sample_revenue],
    y=[sample_total],
    marker=dict(color=qualitative[-1], size=16, symbol='x', line=dict(width=2)),
)

fig = go.Figure()
fig.add_trace(all_universities_trace)
fig.add_trace(selected_university_trace)

# Last expression of the cell: update_layout returns the figure, which renders it.
fig.update_layout(
    title="Revenues vs Graduation Rate",
    title_x=0.5,
    xaxis=dict(title="Revenues"),
    yaxis=dict(title="Graduation Rate"),
    font=dict(size=16),
)

In [60]:
# Same plot as the previous cell, with revenue on a log scale.
# Both traces use log(1 + revenue) via np.log1p, so the highlighted university sits on
# the same transform as the cloud of points and a revenue of zero maps to 0 rather
# than -inf.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=np.log1p(df_2019[revenue_cols[0]]),
    y=df_2019[total],
    marker=dict(color=qualitative[1], size=9, line=dict(width=2)),
    mode='markers',
    name='All Universities'
))
fig.add_trace(go.Scatter(
    x=[np.log1p(sample_revenue)],
    y=[sample_total],
    marker=dict(color=qualitative[-1], size=16, symbol='x', line=dict(width=2)),
    name=university_x
))
fig.update_layout(
    title="Revenues vs Graduation Rate",
    title_x=0.5,
    xaxis=dict(title="log revenues"),
    yaxis=dict(title="Graduation Rate"),
    font=dict(size=16)
)
fig.show()

# Elastic Net Model

![elastic net](Images/elasticNetAccuracy.png)

# <center> XGB Model </center>

![xgb](Images/xgbAccuracy.png)

**Evaluation Metrics**

- RMSE = 

- MAE = 7.16