In [1]:
import os
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.distributed import wait 
import dask
from dask_saturn import SaturnCluster
from dask.distributed import Client

import matplotlib.pyplot as plt
import json
import datetime
import re

import bokeh as bk
from bokeh.io import show, output_notebook
from bokeh.plotting import figure

from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model


In [2]:
# Attach to the Saturn Cloud Dask cluster and connect a distributed client to it.
cluster = SaturnCluster()
client = Client(cluster)
# Restart the workers so the session starts from a clean worker state.
client.restart()

INFO:dask-saturn:Cluster is ready
INFO:dask-saturn:Registering default plugins
INFO:dask-saturn:{'tcp://192.168.109.194:41925': {'status': 'repeat'}, 'tcp://192.168.160.195:36103': {'status': 'repeat'}, 'tcp://192.168.184.195:39745': {'status': 'repeat'}, 'tcp://192.168.59.67:39161': {'status': 'repeat'}, 'tcp://192.168.69.3:37869': {'status': 'repeat'}}


0,1
Client  Scheduler: tcp://d-steph-college-scorecard-proj-f647d66a9d5341e1bf4679ed4dc68db3.main-namespace:8786  Dashboard: https://d-steph-college-scorecard-proj-f647d66a9d5341e1bf4679ed4dc68db3.internal.saturnenterprise.io,Cluster  Workers: 5  Cores: 80  Memory: 637.50 GB


In [3]:

%%time

import s3fs
# Anonymous (public-bucket) access to the College Scorecard extracts on S3.
s3 = s3fs.S3FileSystem(anon=True)
s3fpath = 's3://saturn-public-data/college-scorecard/Most-Recent-Cohorts-Field-of-Study.csv'
s3fpath2 = 's3://saturn-public-data/college-scorecard/Most-Recent-Cohorts-All-Data-Elements.csv'

# Field-of-study (program-level) records, restricted to the columns used below.
# Everything is read as strings; 'PrivacySuppressed' cells are treated as missing.
major = dd.read_csv(
    s3fpath,
    usecols=[
        'UNITID', 'OPEID6', 'INSTNM', 'CONTROL', 'MAIN',
        'CIPCODE', 'CIPDESC', 'CREDLEV', 'CREDDESC', 'EARN_MDN_HI_2YR',
    ],
    dtype='object',
    na_values=['PrivacySuppressed'],
    assume_missing=False,
    storage_options={'anon': True},
)

# Institution-level records: every column is kept here and narrowed down later.
inst = dd.read_csv(
    s3fpath2,
    dtype='object',
    na_values=['PrivacySuppressed'],
    assume_missing=False,
    storage_options={'anon': True},
)

# Candidate institution-level target: MD_EARN_WNE_P8
# (the models below are fit on EARN_MDN_HI_2YR from the field-of-study file).

CPU times: user 413 ms, sys: 55.6 ms, total: 469 ms
Wall time: 1.79 s


# Model concept

Can we predict graduates' income x years into the future?
Target: median earnings 10 years after matriculation.

Given a set of college features and a chosen major, return the predicted median earnings 10 years out.

Feature ideas:
* incoming sat/act
* admission rate
* college type (private/public/nonprofit)
* annual tuition


Deploy the model, then provide an interpretability dashboard?


In [4]:
# Materialize the Dask frame as an in-memory pandas DataFrame, then preview it.
m2 = major.compute()
m2.head()

Unnamed: 0,UNITID,OPEID6,INSTNM,CONTROL,MAIN,CIPCODE,CIPDESC,CREDLEV,CREDDESC,EARN_MDN_HI_2YR
0,100654,1002,Alabama A & M University,Public,1,100,"Agriculture, General.",3,Bachelors Degree,
1,100654,1002,Alabama A & M University,Public,1,109,Animal Sciences.,3,Bachelors Degree,
2,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,3,Bachelors Degree,
3,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,5,Master's Degree,
4,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,6,Doctoral Degree,


## Minimum Viable Model (MVP)

In [5]:
# Keep only the rows that have a reported earnings value (the model target).
m3 = m2.dropna(subset=['EARN_MDN_HI_2YR'])

In [6]:
# One-hot encoder for the categorical features; categories not seen during
# fitting are ignored at transform time instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')

In [7]:
# Baseline design matrix: three categorical descriptors of each program.
X_train = m3.loc[:, ['CONTROL', 'CIPDESC', 'CREDDESC']]
# Target is kept as a single-column DataFrame (not a Series).
y_train = m3.loc[:, ['EARN_MDN_HI_2YR']]

In [8]:
from sklearn.pipeline import Pipeline
# Baseline model: one-hot encode the categoricals, then ordinary least squares.
# `fit` returns the fitted pipeline, so construction and fitting are chained.
pipe = Pipeline(
    steps=[
        ('onehot', enc),
        ('linear', linear_model.LinearRegression()),
    ]
).fit(X_train, y_train)

In [9]:
# In-sample predictions: the model is scored on the same rows it was fit on.
predictions = pipe.predict(X_train)
print(predictions)

[[54724.12941446]
 [32663.22014603]
 [54850.3062548 ]
 ...
 [31725.53698993]
 [24279.56738532]
 [16356.58665372]]


In [10]:
# R^2 on the training data itself; no held-out rows are involved at this point.
pipe.score(X_train,y_train)

0.7037035588677161

In [11]:
# Side-by-side view of the features, the observed target and the prediction.
# `assign` works on a copy, so X_train itself is left untouched.
newdata = X_train.assign(truth=y_train, pred=predictions)

In [12]:
# Preview the first rows of truth vs. prediction for the baseline model.
newdata.head()

Unnamed: 0,CONTROL,CIPDESC,CREDDESC,truth,pred
11,Public,"City/Urban, Community and Regional Planning.",Master's Degree,47260,54724.129414
13,Public,Audiovisual Communications Technologies/Techni...,Bachelors Degree,20102,32663.220146
14,Public,"Computer and Information Sciences, General.",Bachelors Degree,52107,54850.306255
18,Public,Educational Administration and Supervision.,Master's Degree,50231,58571.407551
21,Public,Teacher Education and Professional Development...,Bachelors Degree,37208,31959.074239


## Add more institution data

In [13]:
# Select the needed columns while the frame is still a lazy Dask DataFrame, so
# only these columns are gathered to the client instead of the full, very wide
# institution table. The resulting pandas frame is the same as computing first
# and subsetting afterwards.
inst2 = inst[['STABBR','HIGHDEG','REGION','LOCALE','CCSIZSET','ADM_RATE',
      'ADM_RATE_ALL','SAT_AVG','SAT_AVG_ALL','UGDS','NPT4_PUB','NPT4_PRIV',
      'TUITIONFEE_IN','TUITIONFEE_OUT','PFTFAC','MEDIAN_HH_INC',
     'UNITID','OPEID','OPEID6','INSTNM','CITY','ZIP']].compute()

# Check non-null counts; every column is still string-typed ('object') here.
inst2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6806 entries, 0 to 2187
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   STABBR          6806 non-null   object
 1   HIGHDEG         6806 non-null   object
 2   REGION          6806 non-null   object
 3   LOCALE          6331 non-null   object
 4   CCSIZSET        6331 non-null   object
 5   ADM_RATE        2006 non-null   object
 6   ADM_RATE_ALL    2242 non-null   object
 7   SAT_AVG         1298 non-null   object
 8   SAT_AVG_ALL     1426 non-null   object
 9   UGDS            6041 non-null   object
 10  NPT4_PUB        1878 non-null   object
 11  NPT4_PRIV       3727 non-null   object
 12  TUITIONFEE_IN   3865 non-null   object
 13  TUITIONFEE_OUT  3621 non-null   object
 14  PFTFAC          3623 non-null   object
 15  MEDIAN_HH_INC   4670 non-null   object
 16  UNITID          6806 non-null   object
 17  OPEID           6806 non-null   object
 18  OPEID6  

In [14]:
# Columns available on the institution side of the upcoming merge.
inst2.columns

Index(['STABBR', 'HIGHDEG', 'REGION', 'LOCALE', 'CCSIZSET', 'ADM_RATE',
       'ADM_RATE_ALL', 'SAT_AVG', 'SAT_AVG_ALL', 'UGDS', 'NPT4_PUB',
       'NPT4_PRIV', 'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'PFTFAC',
       'MEDIAN_HH_INC', 'UNITID', 'OPEID', 'OPEID6', 'INSTNM', 'CITY', 'ZIP'],
      dtype='object')

In [15]:
# Columns available on the field-of-study side of the upcoming merge.
m3.columns

Index(['UNITID', 'OPEID6', 'INSTNM', 'CONTROL', 'MAIN', 'CIPCODE', 'CIPDESC',
       'CREDLEV', 'CREDDESC', 'EARN_MDN_HI_2YR'],
      dtype='object')

In [16]:
# Attach institution attributes to every program row. A left join keeps all
# program rows, so programs without a matching institution get NaN attributes.
newdf2 = m3.merge(
    inst2,
    on=['UNITID', 'OPEID6', 'INSTNM'],
    how="left",
    suffixes=("_maj", "_inst"),
    copy=True,
)

In [17]:
# Confirm which columns the merged frame ended up with.
newdf2.columns

Index(['UNITID', 'OPEID6', 'INSTNM', 'CONTROL', 'MAIN', 'CIPCODE', 'CIPDESC',
       'CREDLEV', 'CREDDESC', 'EARN_MDN_HI_2YR', 'STABBR', 'HIGHDEG', 'REGION',
       'LOCALE', 'CCSIZSET', 'ADM_RATE', 'ADM_RATE_ALL', 'SAT_AVG',
       'SAT_AVG_ALL', 'UGDS', 'NPT4_PUB', 'NPT4_PRIV', 'TUITIONFEE_IN',
       'TUITIONFEE_OUT', 'PFTFAC', 'MEDIAN_HH_INC', 'OPEID', 'CITY', 'ZIP'],
      dtype='object')

In [18]:
# Collapse the credential description into two coarse levels: the four
# credentials listed here count as 'ug'; everything else (including missing
# values) is labelled 'grad'.
newdf2['deg_level'] = np.where(
    newdf2['CREDDESC'].isin([
        "Bachelors Degree",
        "Associate's Degree",
        'Undergraduate Certificate or Diploma',
        'Post-baccalaureate Certificate',
    ]),
    'ug',
    'grad',
)


In [19]:
# Combine the public and private net-price columns into one 'tuition' feature.
# Missing values count as 0, so a row with neither value gets tuition == 0.
newdf2['tuition'] = (
    newdf2[['NPT4_PUB', 'NPT4_PRIV']]
    .fillna(0)
    .astype('int')
    .sum(axis=1)
)

In [103]:
# Feature matrix for the richer model (column order is preserved).
X = newdf2.loc[:, [
    'CONTROL', 'CIPDESC', 'STABBR', 'CREDDESC',
    'HIGHDEG', 'REGION', 'LOCALE', 'CCSIZSET',
    'ADM_RATE_ALL', 'SAT_AVG_ALL', 'UGDS', 'tuition',
    'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'PFTFAC', 'MEDIAN_HH_INC',
    'deg_level',
]]
# Target stays a single-column DataFrame.
y = newdf2.loc[:, ['EARN_MDN_HI_2YR']]

# 'INSTNM' was considered as a feature but is currently left out.

In [104]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the rows for evaluation; the fixed random_state keeps the split
# reproducible. Note: this rebinds X_train / y_train from the baseline model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [105]:
# Inspect non-null counts to see which features will need imputation.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58690 entries, 48745 to 56422
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CONTROL         58690 non-null  object
 1   CIPDESC         58690 non-null  object
 2   STABBR          54902 non-null  object
 3   CREDDESC        58690 non-null  object
 4   HIGHDEG         54902 non-null  object
 5   REGION          54902 non-null  object
 6   LOCALE          54902 non-null  object
 7   CCSIZSET        54902 non-null  object
 8   ADM_RATE_ALL    37333 non-null  object
 9   SAT_AVG_ALL     32163 non-null  object
 10  UGDS            54507 non-null  object
 11  tuition         58690 non-null  int64 
 12  TUITIONFEE_IN   49534 non-null  object
 13  TUITIONFEE_OUT  49534 non-null  object
 14  PFTFAC          49449 non-null  object
 15  MEDIAN_HH_INC   50013 non-null  object
 16  deg_level       58690 non-null  object
dtypes: int64(1), object(16)
memory usage: 8.1+ MB


In [106]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

# Dense one-hot encoding; categories unseen during fitting are ignored.
# NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2; switch the
# keyword when upgrading past the release that removes the old name.
enc = OneHotEncoder(handle_unknown='ignore', sparse = False)
# Model-based imputation of the numeric features, seeded for reproducibility and
# with missing-indicator columns appended. The SimpleImputer that used to be
# bound to `imp` just before this line was overwritten immediately and never
# used, so that dead assignment has been removed.
imp = IterativeImputer(max_iter=10, random_state=0, initial_strategy='mean', add_indicator = True)


In [107]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Route each feature to its preprocessing step: categoricals are one-hot
# encoded, numeric-like columns are imputed, and any column not listed
# (here: 'tuition') is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('onehot', enc, [
            'CONTROL', 'CIPDESC', 'CREDDESC', 'deg_level',
            'HIGHDEG', 'REGION', 'LOCALE', 'CCSIZSET', 'STABBR',
        ]),
        ('impute', imp, [
            'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'PFTFAC', 'MEDIAN_HH_INC',
            'UGDS', 'SAT_AVG_ALL', 'ADM_RATE_ALL',
        ]),
    ],
    remainder='passthrough',
)

# Preprocess, then fit ordinary least squares; `fit` returns the fitted pipeline.
pipe = Pipeline(
    steps=[
        ('coltrans', ct),
        ('linear', linear_model.LinearRegression()),
    ]
).fit(X_train, y_train)



In [108]:
# Predict on the held-out rows (rebinds `predictions` from the baseline model).
predictions = pipe.predict(X_test)
print(predictions)

[[59366.42877931]
 [55626.92460521]
 [56047.35490618]
 ...
 [36400.37398783]
 [48830.3137935 ]
 [35808.20762294]]


In [109]:
# R^2 on the training split.
pipe.score(X_train,y_train)

0.7476261781154907

In [110]:
# R^2 on the held-out 5% split.
pipe.score(X_test,y_test)

0.7624068987813214

In [111]:
# Evaluation frame built from the held-out split (to inspect the training split
# instead, predict on X_train and use X_train / y_train here).
newdata3 = X_test.copy()
newdata3['truth'] = y_test.astype('int')
newdata3['pred'] = predictions

# Earnings cannot be negative, so floor the linear model's predictions at zero.
newdata3['pred'] = newdata3['pred'].clip(lower=0)


In [112]:
# Preview held-out rows with the observed and predicted earnings side by side.
newdata3.head()

Unnamed: 0,CONTROL,CIPDESC,STABBR,CREDDESC,HIGHDEG,REGION,LOCALE,CCSIZSET,ADM_RATE_ALL,SAT_AVG_ALL,UGDS,tuition,TUITIONFEE_IN,TUITIONFEE_OUT,PFTFAC,MEDIAN_HH_INC,deg_level,truth,pred
31062,"Private, nonprofit",Law.,NC,First Professional Degree,4,5,31,13,0.763,1120.0,4199,21434,32500.0,32500.0,0.4613,56906.03,grad,44508,59366.428779
12994,Public,"Educational Assessment, Evaluation, and Research.",IN,Graduate/Professional Certificate,4,3,13,16,0.6478,,15529,15020,9896.0,26468.0,0.8101,62817.43,grad,47260,55626.924605
12990,Public,"Computer and Information Sciences, General.",IN,Bachelors Degree,4,3,13,16,0.6478,,15529,15020,9896.0,26468.0,0.8101,62817.43,ug,54296,56047.354906
52066,Public,Health and Medical Administrative Services.,OH,Undergraduate Certificate or Diploma,1,3,21,-2,,,153,12544,,,,58212.55,ug,29685,23630.483727
50830,Public,Architectural Sciences and Technology.,PA,Bachelors Degree,3,2,13,13,,,5289,23435,16740.0,23880.0,0.6112,54926.29,ug,37568,40284.656667


In [113]:
from bokeh.models import NumeralTickFormatter
from bokeh.transform import factor_cmap, factor_mark
from bokeh.palettes import Spectral6
import time
from datetime import datetime as dt
from bokeh.plotting import figure
from bokeh.models import Span, Label
from bokeh.palettes import Viridis256, Cividis256, Turbo256


In [114]:
def plot_earn(df, groupvar):
    """Scatter true vs. predicted earnings inline, colored by a grouping column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'truth' (plotted on y), 'pred' (plotted on x)
        and the column named by ``groupvar``.
    groupvar : str
        Name of the categorical column used for point colors and legend entries.
        Missing values in this column are shown as the group "NA".

    Returns
    -------
    None
        The figure is rendered in the notebook via ``show``.
    """
    output_notebook()

    p = figure(title="Predicted Earnings",
               y_axis_label='True',
               x_axis_label='Predicted',
               width=750,
               height=400)

    # Fill missing group labels in the plotted data itself. Previously only the
    # factor list was filled, so the "NA" factor never matched any row and the
    # rows with a missing label had no factor to map to.
    sourcedt = df.assign(**{groupvar: df[groupvar].fillna("NA")})
    states = sourcedt[groupvar].unique().tolist()

    # Spectral6 only has six colors, which is not enough to give every group
    # its own color once there are more than six (e.g. grouping by state).
    # In that case take evenly spaced colors from the 256-color Turbo palette.
    # With six or fewer groups the original palette is used unchanged.
    if len(states) <= len(Spectral6):
        palette = Spectral6
    else:
        step = (len(Turbo256) - 1) / (len(states) - 1)
        palette = [Turbo256[round(i * step)] for i in range(len(states))]

    p.circle(y='truth', x='pred', size=5,
             fill_color=factor_cmap(groupvar,
                                    palette=palette,
                                    factors=states),
             legend_group=groupvar,
             source=sourcedt)

    # Same numeric tick format on both axes.
    p.yaxis[0].formatter = NumeralTickFormatter(format="0.0")
    p.xaxis[0].formatter = NumeralTickFormatter(format="0.0")

    show(p)

In [118]:
# Restrict the evaluation rows to graduate-level programs.
dt1 = newdata3[(newdata3['deg_level'] == 'grad')]

In [119]:
# Column used to color the points in the plot below.
gv = 'CONTROL'

In [120]:
# True vs. predicted earnings for graduate programs, colored by `gv`.
plot_earn(dt1, gv)