In [1]:

import ipywidgets as widgets
import matplotlib as mpl
import pandas as pd
import random
import itertools
from matplotlib import colors

import os
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.distributed import wait 
import dask
from dask_saturn import SaturnCluster
from dask.distributed import Client

import matplotlib.pyplot as plt
import json
import datetime
import re

import bokeh as bk
from bokeh.io import show, output_notebook
from bokeh.plotting import figure

from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def split_data(df):
    """Separate the model's feature columns from the earnings target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the eight feature columns named below and the
        ``EARN_MDN_HI_2YR`` column.

    Returns
    -------
    list
        ``[X, y]``: ``X`` holds the feature columns, ``y`` is a one-column
        frame holding ``EARN_MDN_HI_2YR``.
    """
    feature_columns = [
        'SAT_AVG_ALL', 'CREDDESC', 'CIPDESC_new', 'CONTROL',
        'REGION', 'tuition', 'LOCALE', 'ADM_RATE_ALL',
    ]
    target_columns = ['EARN_MDN_HI_2YR']

    features = df[feature_columns]
    # Selecting with a list (not a bare label) keeps the target two-dimensional.
    target = df[target_columns]
    return [features, target]

def trainmodel(X, y):
    """Fit a preprocessing + linear-regression pipeline and score it on held-out rows.

    The categorical columns are one-hot encoded, ``SAT_AVG_ALL`` and
    ``ADM_RATE_ALL`` are imputed (with missing-value indicator columns added),
    and every other column of ``X`` is passed through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain the columns named in the transformer below.
    y : pandas.DataFrame
        Target values aligned with ``X``.

    Returns
    -------
    tuple
        ``(pipe, score)`` where ``pipe`` is the fitted ``Pipeline`` and
        ``score`` is ``pipe.score`` evaluated on the 15% test split.
    """
    # scikit-learn 1.2 renamed OneHotEncoder's ``sparse`` argument to
    # ``sparse_output`` and later releases dropped the old name; try the new
    # spelling first and fall back so the notebook runs on either side.
    try:
        enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    imp = IterativeImputer(max_iter=10, random_state=0, initial_strategy='mean', add_indicator=True)

    # Fixed random_state keeps the train/test split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    ct = ColumnTransformer(
        [('onehot', enc, ['CONTROL', 'CREDDESC', 'CIPDESC_new', 'REGION', 'LOCALE']),
         ('impute', imp, ['SAT_AVG_ALL', 'ADM_RATE_ALL'])],
        remainder='passthrough'
    )

    pipe = Pipeline(steps=[('coltrans', ct), ('linear', linear_model.LinearRegression())])
    pipe = pipe.fit(X_train, y_train)

    return pipe, pipe.score(X_test, y_test)

In [3]:
# Load data to get feature values
import s3fs
s3 = s3fs.S3FileSystem(anon=True)
s3fpath = 's3://saturn-public-data/college-scorecard/cleaned_merged.csv'

# Every column is read as a string (dtype='object'); cells containing
# 'PrivacySuppressed' are treated as missing.
major = pd.read_csv(
    s3fpath,
    storage_options={'anon': True},
    dtype='object',
    na_values=['PrivacySuppressed']
)

# Sorted choices for the credential and major dropdowns. Missing values are
# dropped first: a NaN (a float) next to the string labels would make sorted()
# raise TypeError, and NaN is not a meaningful dropdown choice.
creds = sorted(major['CREDDESC'].dropna().unique())
cips = sorted(major['CIPDESC_new'].dropna().unique())

In [5]:
# (label, value) pairs offered by the Region dropdown; the value is the
# string code that is handed to the model as REGION.
reg = [
    ('U.S. Service Schools', '0'),
    ('New England (CT, ME, MA, NH, RI, VT)', '1'),
    ('Mid East (DE, DC, MD, NJ, NY, PA)', '2'),
    ('Great Lakes (IL, IN, MI, OH, WI)', '3'),
    ('Plains (IA, KS, MN, MO, NE, ND, SD)', '4'),
    ('Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)', '5'),
    ('Southwest (AZ, NM, OK, TX)', '6'),
    ('Rocky Mountains (CO, ID, MT, UT, WY)', '7'),
    ('Far West (AK, CA, HI, NV, OR, WA)', '8'),
    ('Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)', '9'),
]

# Choices offered by the College Type dropdown.
coltype = ['Public', 'Private, nonprofit', 'Private, for-profit']


In [6]:
# (label, value) pairs offered by the Locale dropdown; the value is the
# string code that is handed to the model as LOCALE.
locs = [
    ('City: Large (population of 250,000 or more)', '11'),
    ('City: Midsize (population of at least 100,000 but less than 250,000)', '12'),
    ('City: Small (population less than 100,000)', '13'),
    ('Suburb: Large (outside principal city, in urbanized area with population of 250,000 or more)', '21'),
    ('Suburb: Midsize (outside principal city, in urbanized area with population of at least 100,000 but less than 250,000)', '22'),
    ('Suburb: Small (outside principal city, in urbanized area with population less than 100,000)', '23'),
    ('Town: Fringe (in urban cluster up to 10 miles from an urbanized area)', '31'),
    ('Town: Distant (in urban cluster more than 10 miles and up to 35 miles from an urbanized area)', '32'),
    ('Town: Remote (in urban cluster more than 35 miles from an urbanized area)', '33'),
    ('Rural: Fringe (rural territory up to 5 miles from an urbanized area or up to 2.5 miles from an urban cluster)', '41'),
    ('Rural: Distant (rural territory more than 5 miles but up to 25 miles from an urbanized area or more than 2.5 and up to 10 miles from an urban cluster)', '42'),
    ('Rural: Remote (rural territory more than 25 miles from an urbanized area and more than 10 miles from an urban cluster)', '43'),
]

In [7]:
# (label, two-digit code) pairs for broad CIP subject families. In this
# notebook they are only referenced by the commented-out `cipfields` dropdown.
cipcats = [
    ('AGRICULTURE, AGRICULTURE OPERATIONS, AND RELATED SCIENCES', '01'),
    ('NATURAL RESOURCES AND CONSERVATION', '03'),
    ('ARCHITECTURE AND RELATED SERVICES', '04'),
    ('AREA, ETHNIC, CULTURAL, AND GENDER STUDIES', '05'),
    ('COMMUNICATION, JOURNALISM, AND RELATED PROGRAMS', '09'),
    ('COMMUNICATIONS TECHNOLOGIES/TECHNICIANS AND SUPPORT SERVICES', '10'),
    ('COMPUTER AND INFORMATION SCIENCES AND SUPPORT SERVICES', '11'),
    ('PERSONAL AND CULINARY SERVICES', '12'),
    ('EDUCATION', '13'),
    ('ENGINEERING', '14'),
    ('ENGINEERING TECHNOLOGIES/TECHNICIANS', '15'),
    ('FOREIGN LANGUAGES, LITERATURES, AND LINGUISTICS', '16'),
    ('FAMILY AND CONSUMER SCIENCES/HUMAN SCIENCES', '19'),
    ('LEGAL PROFESSIONS AND STUDIES', '22'),
    ('ENGLISH LANGUAGE AND LITERATURE/LETTERS', '23'),
    ('LIBERAL ARTS AND SCIENCES, GENERAL STUDIES AND HUMANITIES', '24'),
    ('LIBRARY SCIENCE', '25'),
    ('BIOLOGICAL AND BIOMEDICAL SCIENCES', '26'),
    ('MATHEMATICS AND STATISTICS', '27'),
    # Closing parenthesis restored; the label was previously cut off.
    ('RESERVE OFFICER TRAINING CORPS (JROTC, ROTC)', '28'),
    ('MILITARY TECHNOLOGIES', '29'),
    ('MULTI/INTERDISCIPLINARY STUDIES', '30'),
    ('PARKS, RECREATION, LEISURE, AND FITNESS STUDIES', '31'),
    ('BASIC SKILLS', '32'),
    ('CITIZENSHIP ACTIVITIES', '33'),
    ('HEALTH-RELATED KNOWLEDGE AND SKILLS', '34'),
    ('INTERPERSONAL AND SOCIAL SKILLS', '35'),
    ('LEISURE AND RECREATIONAL ACTIVITIES', '36'),
    ('PERSONAL AWARENESS AND SELF-IMPROVEMENT', '37'),
    ('PHILOSOPHY AND RELIGIOUS STUDIES', '38'),
    ('THEOLOGY AND RELIGIOUS VOCATIONS', '39'),
    ('PHYSICAL SCIENCES', '40'),
    ('SCIENCE TECHNOLOGIES/TECHNICIANS', '41'),
    ('PSYCHOLOGY', '42'),
    ('SECURITY AND PROTECTIVE SERVICES', '43'),
    ('PUBLIC ADMINISTRATION AND SOCIAL SERVICE PROFESSIONS', '44'),
    ('SOCIAL SCIENCES', '45'),
    ('CONSTRUCTION TRADES', '46'),
    ('MECHANIC AND REPAIR TECHNOLOGIES/TECHNICIANS', '47'),
    ('PRECISION PRODUCTION', '48'),
    ('TRANSPORTATION AND MATERIALS MOVING', '49'),
    ('VISUAL AND PERFORMING ARTS', '50'),
    ('HEALTH PROFESSIONS AND RELATED CLINICAL SCIENCES', '51'),
    ('BUSINESS, MANAGEMENT, MARKETING, AND RELATED SUPPORT SERVICES', '52'),
    ('HIGH SCHOOL/SECONDARY DIPLOMAS AND CERTIFICATES', '53'),
    ('HISTORY', '54'),
    ('Residency Programs', '60'),
]

In [8]:
# Split the loaded frame into features and target, then fit the pipeline.
# `modobj` is the fitted pipeline used for predictions; `modscore` is its
# score on the held-out test split.
X, y = split_data(major)
modobj, modscore = trainmodel(X, y)


In [9]:
# Slider for the SAT score input: whole numbers from 0 to 1600, starting at 800.
sat_widget = widgets.IntSlider(
    description='SAT Score:',
    value=800, min=0, max=1600, step=1,
    disabled=False,
)

# The CSV is loaded with dtype='object', so convert each column to float once
# instead of once per bound.
tuition_usd = major['tuition'].astype(float)
admission_rate = major['ADM_RATE_ALL'].astype(float)

# Series.min()/max() are vectorised and skip NaN; the builtin min()/max()
# iterate element by element and give order-dependent results when NaN is
# present.
tuit_widget = widgets.IntSlider(
    value=15000,
    min=int(round(tuition_usd.min())),
    max=int(round(tuition_usd.max())),
    step=1,
    description='Cost of attendance, USD:',
    disabled=False,
    style={'description_width': 'initial'}
)

adm_widget = widgets.BoundedFloatText(
    value=.3,
    min=float(admission_rate.min()),
    max=float(admission_rate.max()),
    step=.01,
    description='Admission rate:',
    disabled=False,
    style={'description_width': 'initial'}
)

# cipfields = widgets.Dropdown(
#     options=cipcats,
#     value='01',
#     description='Major:',
#     disabled=False
# )

def _dropdown(description, options, value, **extra):
    """Build an enabled Dropdown; ``extra`` is forwarded to the constructor."""
    return widgets.Dropdown(
        options=options,
        value=value,
        description=description,
        disabled=False,
        **extra
    )


# One dropdown per categorical model input, each with its initial selection.
coltype_widget = _dropdown(
    'College Type:', coltype, 'Public',
    style={'description_width': 'initial'},
)
credential_widget = _dropdown('Credential:', creds, 'Bachelors Degree')
cippick_widget = _dropdown(
    'Major:', cips, 'English Language and Literature, General.',
)
# `reg` and `locs` hold (label, value) pairs, so the initial value is a code.
region_widget = _dropdown('Region:', reg, '1')
locale_widget = _dropdown('Locale:', locs, '11')


In [11]:

def tester1(sat, tuit, adm):
    
    print(f'Selected SAT Score: {sat}')
    print(f'Cost: {tuit}')
    print(f'Admit Rate:{adm}')

    
def tester2(cippick, credpick, coltype,
           region, locale):
    
    print(f'Selected Major Field: {cippick}')
    print(f'Region: {region}')
    print(f'Selected Credential: {credpick}')
    print(f'Locale: {locale}')
    print(f'College Type: {coltype}')

def results(sat, cippick, credpick, coltype,
           region, tuit, modscore, locale, adm):
    """Print the model score and the predicted earnings for one set of choices.

    The selections are assembled into a single-row frame with the same column
    names the model was trained on and passed to the module-level ``modobj``.
    A prediction that is not greater than zero is shown as ``nan``.

    Parameters
    ----------
    sat, tuit, adm : numeric selections (SAT score, cost, admission rate).
    cippick, credpick, coltype, region, locale : categorical selections.
    modscore : value printed as the model rating.
    """
    print(f"Model Rating: {modscore}")

    feature_names = ['SAT_AVG_ALL', 'CREDDESC', 'CIPDESC_new', 'CONTROL',
                     'REGION', 'tuition', 'LOCALE', 'ADM_RATE_ALL']
    feature_values = [sat, credpick, cippick, coltype, region, tuit, locale, adm]
    single_row = pd.DataFrame([feature_values], columns=feature_names)

    # The nested unpacking requires predict() to return exactly one row with
    # one value, and pulls that scalar out.
    [[estimate]] = modobj.predict(single_row)
    if estimate > 0:
        shown = estimate
    else:
        shown = np.nan
    print(f'Predicted Median Earnings 2 years after grad: ${round(shown, 3)}')

    
    
# Bind every input widget to `results`, the only callback in this notebook
# whose signature accepts all of these keywords (the original referenced an
# undefined name, `tester`).
# `modscore` is wrapped in `fixed` so it is passed through as a constant;
# a bare float would be turned into an extra user-editable control.
sat_view = widgets.interactive(
    results,
    sat=sat_widget,
    region=region_widget,
    cippick=cippick_widget,
    coltype=coltype_widget,
    credpick=credential_widget,
    locale=locale_widget,
    tuit=tuit_widget,
    adm=adm_widget,
    modscore=widgets.fixed(modscore))

# `rt_view` used to be a second copy of the view built from names that are
# never defined (sat, region, credential, ...). It is kept only as an alias so
# any later reference to the name still resolves.
rt_view = sat_view

# Single-cell grid that centres the interactive view.
grid = widgets.GridspecLayout(1, 1,
    layout=widgets.Layout(justify_content='center'))
grid[0, 0] = sat_view

In [11]:

# Intro HTML shown above the widgets. The bullet list mirrors the model's
# inputs: region, locale, college type, major, credential, cost of
# attendance, admission rate and average SAT score.
title_html = """
<h2>Predict Earnings by College Choices</h2>
In this model, several characteristics are used to predict median earnings two years after degree completion.

<ul style="line-height: 1.5">
    <li>Geographic location of college (region and locale)</li>
    <li>Type of college (public, private nonprofit, private for-profit)</li>
    <li>Major field of study (CIP code)</li>
    <li>Credential (Bachelor, Master, etc)</li>
    <li>Cost of attendance</li>
    <li>Admission rate</li>
    <li>Average SAT score for admitted students (proxy for college prestige)</li>
</ul>

Using these features, a linear regression is able to very closely predict the median earnings for a graduate. Try it and see!
"""

In [12]:
# Title block on top, interactive grid underneath, stacked in a centred column.
title_layout = widgets.Layout(margin='0 0 3em 0', max_width='1000px')
title_block = widgets.HTML(title_html, layout=title_layout)
app_contents = [title_block, grid]

app_layout = widgets.Layout(max_width='1024px', margin='0 auto 0 auto')
app = widgets.VBox(app_contents, layout=app_layout)

In [13]:
# Render the assembled VBox (`app`) as this cell's output.
display(app)

VBox(children=(HTML(value='\n<h2>Predict Earnings by College Choices</h2>\nIn this model, several characterist…