In [1]:
import ipywidgets as widgets
import pandas as pd
import random
import itertools
from matplotlib import colors

import os
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML

import matplotlib.pyplot as plt
import json
import datetime
import re

import plotly.graph_objects as go
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model

import crosswalks
import s3fs


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def split_data(df):
    """Separate a scorecard frame into model predictors and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the eight predictor columns named below and the
        ``EARN_MDN_HI_2YR`` column.

    Returns
    -------
    list
        ``[X, y]`` where ``X`` holds the predictor columns (in the order
        listed) and ``y`` is a one-column DataFrame with ``EARN_MDN_HI_2YR``.
    """
    feature_cols = [
        'SAT_AVG_ALL', 'CREDDESC', 'CIPDESC_new', 'CONTROL',
        'REGION', 'tuition', 'LOCALE', 'ADM_RATE_ALL',
    ]
    target_cols = ['EARN_MDN_HI_2YR']

    predictors = df[feature_cols]
    target = df[target_cols]
    return [predictors, target]

def trainmodel(X, y, test_size=0.15, random_state=42):
    """Fit the earnings regression pipeline and score it on a held-out split.

    The pipeline one-hot encodes the categorical columns, iteratively imputes
    the two columns listed in ``imputed_cols`` (adding missing-value indicator
    columns), passes every remaining column through unchanged, and fits an
    ordinary least-squares regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; must contain the columns named in ``categorical_cols`` and
        ``imputed_cols`` below.
    y : pandas.DataFrame or array-like
        Regression target, row-aligned with ``X``.
    test_size : float, optional
        Fraction of rows held out for scoring (default 0.15).
    random_state : int, optional
        Seed for the train/test split (default 42).

    Returns
    -------
    tuple
        ``(pipe, score)``: the fitted ``Pipeline`` and the value of
        ``pipe.score`` on the held-out rows.
    """
    categorical_cols = ['CONTROL', 'CREDDESC', 'CIPDESC_new', 'REGION', 'LOCALE']
    imputed_cols = ['SAT_AVG_ALL', 'ADM_RATE_ALL']

    # handle_unknown='ignore' lets prediction proceed for categories that were
    # not present in the training rows.
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output` -- confirm against the pinned version.
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    imp = IterativeImputer(max_iter=10, random_state=0,
                           initial_strategy='mean', add_indicator=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    ct = ColumnTransformer(
        [('onehot', enc, categorical_cols),
         ('impute', imp, imputed_cols)],
        remainder='passthrough'
    )

    pipe = Pipeline(steps=[('coltrans', ct),
                           ('linear', linear_model.LinearRegression())])
    pipe.fit(X_train, y_train)

    return (pipe, pipe.score(X_test, y_test))

In [4]:
# Load data to get feature values
s3 = s3fs.S3FileSystem(anon=True)
s3fpath = 's3://saturn-public-data/college-scorecard/cleaned_merged.csv'

# Read through the anonymous filesystem handle so the bucket is accessed
# without relying on whatever AWS credentials the environment provides.
# Every column is loaded as text; 'PrivacySuppressed' cells become missing.
with s3.open(s3fpath, 'rb') as s3file:
    major = pd.read_csv(
        s3file,
        dtype='object',
        na_values=['PrivacySuppressed']
    )

# Option lists for the credential / major dropdowns. Missing values are
# dropped first because sorting a mix of strings and NaN raises TypeError.
creds = sorted(major['CREDDESC'].dropna().unique())
cips = sorted(major['CIPDESC_new'].dropna().unique())

In [6]:
# Bind the lookup tables from the project's crosswalks module to short names;
# reg, coltype and locs are used as dropdown options further down.
reg, coltype, locs, cipcats = (
    crosswalks.reg,
    crosswalks.coltype,
    crosswalks.locs,
    crosswalks.cipcats,
)

In [7]:
# Split the loaded data into predictors/target and fit the pipeline once.
# modobj is the fitted pipeline used for predictions below; modscore is the
# held-out score that the app header reports as the model's R^2.
X, y = split_data(major)
modobj, modscore = trainmodel(X, y)

In [8]:
# Numeric views of the text columns that bound the sliders. Series.min()/.max()
# skip missing values, whereas the builtin min()/max() return order-dependent
# results (possibly NaN) when the column contains NaN.
_tuition_values = major['tuition'].astype(float)
_adm_values = major['ADM_RATE_ALL'].astype(float)


def _control_opts(description):
    """Return the keyword arguments shared by every input control.

    A fresh style dict and Layout are created on each call so that no two
    widgets share the same layout object.
    """
    return dict(
        description=description,
        disabled=False,
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='80%'),
    )


sat_widget = widgets.IntSlider(
    value=800,
    min=0,
    max=1600,
    step=1,
    **_control_opts('Admitted Avg SAT Score:')
)

# Slider range spans the observed tuition values, rounded to whole dollars.
tuit_widget = widgets.IntSlider(
    value=15000,
    min=int(round(_tuition_values.min())),
    max=int(round(_tuition_values.max())),
    step=1,
    **_control_opts('Cost of attendance, USD:')
)

# Slider range spans the observed (non-missing) admission rates.
adm_widget = widgets.FloatSlider(
    value=.3,
    min=float(_adm_values.min()),
    max=float(_adm_values.max()),
    step=.01,
    **_control_opts('Admission Rate:')
)

coltype_widget = widgets.Dropdown(
    options=coltype,
    value='Public',
    **_control_opts('College Type:')
)

credential_widget = widgets.Dropdown(
    options=creds,
    value="Associate's Degree",
    **_control_opts('Credential:')
)

cippick_widget = widgets.Dropdown(
    options=cips,
    value='Accounting and Related Services.',
    **_control_opts('Major:')
)

region_widget = widgets.Dropdown(
    options=reg,
    value='8',
    **_control_opts('Region:')
)

locale_widget = widgets.Dropdown(
    options=locs,
    value='11',
    **_control_opts('Locale:')
)

In [35]:
def plotly_hist(df=major, prediction=1, degreetype='All', majorfield = "All Fields"):
    """Overlay earnings histograms for a credential and for one major within it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``CREDDESC``, ``CIPDESC_new`` and ``EARN_MDN_HI_2YR``
        columns; earnings values must be convertible with ``astype(int)``.
    prediction : float
        Predicted earnings, drawn as a dashed vertical line. When it is
        missing (NaN/None) no line is drawn.
    degreetype : str
        ``CREDDESC`` value selecting the rows for the first histogram.
    majorfield : str
        ``CIPDESC_new`` value that, combined with ``degreetype``, selects the
        rows for the second histogram.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    earnings_col = 'EARN_MDN_HI_2YR'
    in_degree = df.CREDDESC == degreetype
    in_major = in_degree & (df.CIPDESC_new == majorfield)

    plot_series1 = df.loc[in_degree, earnings_col].astype(int)
    plot_series2 = df.loc[in_major, earnings_col].astype(int)

    fig = go.Figure()
    # Both traces use 1000-wide bins and percent normalisation so that the
    # two differently sized groups are comparable.
    fig.add_trace(go.Histogram(x=plot_series1, name="All Fields",
                               histnorm='percent', xbins=dict(size=1000)))
    fig.add_trace(go.Histogram(x=plot_series2, name=majorfield,
                               histnorm='percent', xbins=dict(size=1000)))

    fig.update_layout(
        barmode='overlay',
        title_text=f'Median Earnings, {degreetype}', # title of plot
        xaxis_title_text='USD ($)', # xaxis label
        yaxis_title_text='Percent of Total', # yaxis label
        bargap=0.2, # gap between bars of adjacent location coordinates
        bargroupgap=0.1, # gap between bars of the same location coordinates
        template="plotly_white",
        font=dict(
            family="Helvetica",
            size=12
        ),
        legend=dict(yanchor='top', y=1,
                    xanchor='right', x=1)
    )

    # A missing prediction has no position on the axis; skip the marker
    # instead of drawing a line at NaN labelled "Predicted: $nan".
    if pd.notna(prediction):
        fig.add_vline(x=prediction, line_dash="dash",
                      annotation_text=f"Predicted: ${round(prediction, 2):,}")

    fig.update_traces(opacity=0.55)
    return fig
    

In [37]:
def results(sat, cred, cip, col, reg, tuit, loc, adm, modscore=modscore):
    """Display the user's current selections as an HTML bullet list.

    No prediction is computed here: this callback only echoes the inputs, so
    it does not need to run the model on every widget change.

    Parameters
    ----------
    sat, cred, cip, col, reg, tuit, loc, adm
        Current values of the SAT, credential, major, college type, region,
        cost, locale and admission-rate controls.
    modscore
        Unused; kept so the signature stays backward-compatible.
    """
    selections_html = f""" <h4>Your Selections:</h4>
        <ul style="line-height: 1.5">
            <li>Locale: {loc}</li>
            <li>Region: {reg}</li>
            <li>Major Field: {cip}</li>
            <li>Credential: {cred}</li>
            <li>College Type: {col}</li>
            <li>Avg SAT Score: {sat}</li>
            <li>Cost: ${round(tuit, 0):,}</li>
            <li>Admit Rate: {adm}</li>
        </ul>
        <BR>
        """

    display(HTML(selections_html))
        
def results2(sat, cred, cip, col, reg, tuit, loc, adm, modscore=modscore):
    """Predict earnings for the current selections and display the result.

    Builds a one-row frame from the control values, runs it through the
    fitted pipeline ``modobj``, shows the predicted figure as HTML and then
    the comparison histogram from ``plotly_hist``.

    Parameters
    ----------
    sat, cred, cip, col, reg, tuit, loc, adm
        Current values of the SAT, credential, major, college type, region,
        cost, locale and admission-rate controls.
    modscore
        Unused; kept so the signature stays backward-compatible.
    """
    newdf = pd.DataFrame(
        [[sat, cred, cip, col, reg, tuit, loc, adm]],
        columns=['SAT_AVG_ALL', 'CREDDESC', 'CIPDESC_new', 'CONTROL',
                 'REGION', 'tuition', 'LOCALE', 'ADM_RATE_ALL'])

    # The pipeline was fitted on a one-column target, so predict() yields a
    # 1x1 array for the single input row.
    [[prediction]] = modobj.predict(newdf)
    has_estimate = prediction > 0
    # Non-positive earnings are not meaningful; treat them as "no estimate".
    pred_final = prediction if has_estimate else np.nan

    if has_estimate:
        summary = f"Two years after graduating, median earnings should be roughly <b> ${round(pred_final, 2):,} </b>per year."
    else:
        # Say so explicitly instead of rendering "roughly $nan per year".
        summary = "The model did not produce a positive earnings estimate for these selections."

    pred_html = f"""  
      <hr>
      <h2> Model Predicts... </h2>
{summary}

    """

    display(HTML(pred_html))
    p3 = plotly_hist(df=major, degreetype=cred, prediction=pred_final, majorfield=cip)
    display(go.FigureWidget(p3))

# Keyword name -> control mapping shared by both callbacks; each key matches
# a parameter name of results / results2.
_controls = {
    'loc': locale_widget,
    'reg': region_widget,
    'cip': cippick_widget,
    'cred': credential_widget,
    'col': coltype_widget,
    'sat': sat_widget,
    'tuit': tuit_widget,
    'adm': adm_widget,
}

# out echoes the selections; out2 shows the prediction and histogram.
out = widgets.interactive_output(results, _controls)

out2 = widgets.interactive_output(results2, _controls)


In [38]:

# Static header for the app: describes the model inputs and reports the
# held-out score (modscore) rounded to three decimals.
title_html = f"""
<h2>Predict Earnings by College Choices</h2>
In this model, several characteristics are used to predict median earnings two years after degree completion.

<ul style="line-height: 1.5">
    <li>Geographic location of college</li>
    <li>Type of college (public/private/for-profit, size)</li>
    <li>Major field of study (CIP code)</li>
    <li>Credential (Bachelor, Master, etc)</li>
    <li>Admission rate</li>
    <li>Average SAT score for admitted students (proxy for college prestige)</li>
</ul>

Using these features, a linear regression is able to very closely predict the median earnings for a graduate. Try it and see!
<BR><BR>
Model's R^2 value: {round(modscore, 3)}
<BR>
<hr>
<h2> Enter your choices... </h2>
        """

In [39]:
from ipywidgets import Button, GridBox, Layout, ButtonStyle

# Each container is assigned to the grid area of the same name explicitly,
# rather than relying on auto-placement for the first row of the grid.

# All input controls, stacked vertically.
vb = widgets.VBox(
    children=[
        locale_widget,
        region_widget,
        cippick_widget,
        credential_widget,
        coltype_widget,
        sat_widget,
        tuit_widget,
        adm_widget,
    ],
    layout=widgets.Layout(grid_area='vb'),
)

# Echo of the current selections.
vb2 = widgets.VBox(
    children=[out],
    layout=widgets.Layout(grid_area='vb2'),
)

# Prediction text and histogram.
vb3 = widgets.HBox(
    children=[out2],
    layout=widgets.Layout(width='auto', grid_area='vb3'),
)

In [40]:
# Two-column grid: controls and selections side by side on the first row,
# the prediction output spanning both columns on the second.
gb = widgets.GridBox(
    children=[vb, vb2, vb3],
    layout=widgets.Layout(
        width='80%',
        grid_template_rows='auto auto',
        grid_template_columns='50% 50%',
        grid_template_areas='''"vb vb2"
                                "vb3 vb3"''',
    ),
)

In [41]:
# Page content, top to bottom: the HTML header, then the grid of controls
# and outputs.
_title_widget = widgets.HTML(
    value=title_html,
    layout=widgets.Layout(margin='0 0 3em 0', max_width='1000px'),
)
appcont = [_title_widget, gb]

In [42]:
# Root container holding the header and the grid.
app = widgets.VBox(children=appcont)

In [43]:
# Render the assembled app in the notebook output.
display(app)

VBox(children=(HTML(value='\n<h2>Predict Earnings by College Choices</h2>\nIn this model, several characterist…