In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import copy
#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns

from matplotlib import pyplot as plt
init_notebook_mode(connected=True)
cf.go_offline()
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer, PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import TransformerMixin, BaseEstimator
pd.options.display.max_columns = 200
pd.options.display.max_rows = 272

In [2]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold, RFE, SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from joblib import dump, load
from Logger import RegressionLogger, FuncTransformer, ColSelect

In [3]:
import inspect

In [4]:
# Flag initialised to 0; presumably marks whether the target was log-transformed — TODO confirm.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
log_transformed = 0

In [5]:
# Load the saved model log from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
model_log = load("data/model_logging.joblib")

In [6]:
# Deep-copy the model stored under entry 4 of the log, so later use of
# current_model cannot modify the object kept in model_log.
# NOTE(review): the index 4 is a magic number — presumably the selected model; confirm.
current_model = copy.deepcopy(model_log[4]["model"])

# Now Let's Load Our Original Data

In [7]:
# Read the cleaned dataset, using the "UnitID" column as the row index.
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")

In [8]:
# Drop the leftover "Unnamed: 0" column (presumably a stale index written by an
# earlier to_csv — confirm).
# Rebinding with errors="ignore" instead of inplace=True keeps the cell idempotent:
# re-running it no longer raises KeyError once the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [9]:
# Remove the two free-text identifier columns before modelling.
# Rebinding with errors="ignore" instead of inplace=True keeps the cell idempotent:
# re-running it no longer raises KeyError once the columns are already gone.
df = df.drop(
    columns=["City location of institution (HD2019)", "Institution Name"],
    errors="ignore",
)

In [10]:
# Keep only the rows that have a value in the 7th column from the end; that same
# column is used as the target (y_black) below.
# NOTE(review): the column is chosen by position, so a change in the CSV's column
# order would silently select a different target — presumably this is the
# African American graduation rate; confirm.
df_black = df.dropna(subset=[df.columns[-7]])

In [11]:
# Features: every column except the last 14.
X_black = df_black.iloc[:,:-14]
# Target: the 7th column from the end (the column whose missing rows were dropped above).
y_black = df_black.iloc[:, -7]

In [12]:
# One-hot encode the categorical feature columns; drop_first=True keeps k-1 dummy
# columns per k-level feature.
# NOTE(review): the cell overwrites X_black with its own transformation, so the
# un-encoded frame is only recoverable by re-running the previous cell.
X_black = pd.get_dummies(X_black, drop_first=True)

# Create Holdout Set

In [13]:
# Load the previously saved holdout split; the stored object must unpack into six items.
# NOTE(review): sample_x and sample_y are not used by any visible cell. joblib.load
# unpickles arbitrary objects, so only load trusted files.
x_train, x_test, y_train, y_test, sample_x, sample_y = load("objects/holdout_total.joblib")

In [46]:
# Load the saved feature matrix; it is the input to the 5-component PCA further down.
# NOTE(review): how this file was built is not shown here — presumably the engineered
# feature set; confirm its rows line up with y_black.
X_fe = load("objects/features.joblib")

## Model performance on African American Graduation Rates

In [47]:
# Run metadata handed to RegressionLogger in the next cell.
# NOTE(review): alpha and l1_ratio are hard-coded here rather than read from
# current_model — confirm they match the loaded model's actual hyperparameters.
params = {
    "Imputer": "SimpleImputer",
    "Scaler": "Standard Scaler",
    "model_name": "Elastic Net",
    "model": current_model,
    "alpha": 0.1,
    "l1_ratio": 0.5,
}

In [48]:
# Build the project-local RegressionLogger (imported from Logger) with the run metadata.
logger = RegressionLogger(params)

In [49]:
# Run the logger's holdout evaluation for current_model on the loaded split; the
# next cell reads logger.train_mae and logger.mae afterwards.
# NOTE(review): whether train_holdout refits the model is not visible here — confirm.
logger.train_holdout(current_model, x_train, x_test, y_train,  y_test)

In [50]:
# Report the (train, test) mean absolute errors recorded on the logger as one string.
"train and test mae: {}".format((logger.train_mae, logger.mae))

'train and test mae: (9.26018194045588, 10.647651386135134)'

# Feature Engineering for African American Graduation Rates

In [51]:
# Name of the first column whose label contains "Historically".
# NOTE(review): presumably the HBCU indicator dummy; raises IndexError if no column
# matches, and hbcu is not used by any visible cell.
hbcu = X_black.filter(regex="Historically").columns[0]

## Log Core Expenses vs African American Graduation Rates

In [52]:
# Scatter of log-scaled core expenses against the graduation-rate target.
fig = go.Figure(
    data=[
        go.Scatter(
            x=np.log(X_black["Core_Expenses"]),
            y=y_black,
            mode="markers",
        )
    ]
)
fig.update_layout(
    title="Log Core Expenses vs. African American Graduation Rates",
    xaxis=dict(title="Log Core Expenses"),
    yaxis=dict(title="African American Graduation Rates"),
    height=750,
)
fig.show()

## PCA on Staff Diversity

In [53]:
# Select the columns whose labels match the regex "instructional.*percentage".
# NOTE(review): presumably the instructional-staff demographic shares — confirm.
staff_diversity = X_black.filter(regex="instructional.*percentage")

In [54]:
# Impute missing values (KNN), standardise, then project onto the first principal
# component. The same pipeline object is refit on other column groups further down.
# random_state pins the PCA solver: scikit-learn's automatic solver selection may
# pick the randomized SVD, which would otherwise let the component vary between runs.
pca_pipeline = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=1, random_state=0))
])
component = pca_pipeline.fit_transform(staff_diversity)

In [55]:
# Scatter of the staff-diversity principal component against the target.
fig = go.Figure(
    data=[
        go.Scatter(
            x=component.flatten(),
            y=y_black.to_numpy(),
            mode="markers",
        )
    ]
)
fig.update_layout(
    title="Staff Diversity Component vs. African American Graduation Rates",
    xaxis=dict(title="Staff Diversity Component"),
    yaxis=dict(title="African American Graduation Rates"),
    width=1200,
    height=750,
)
fig.show()

## PCA on Enrollment Diversity

In [56]:
# Select the columns whose labels match the regex "Percent of total enrollment".
# NOTE(review): presumably the enrollment demographic shares — confirm.
enrollment_diversity = X_black.filter(regex="Percent of total enrollment")

In [57]:
# Refit the shared pca_pipeline on the enrollment columns and keep its output.
# NOTE: fit_transform replaces whatever the pipeline had fitted on the previous column group.
component1 = pca_pipeline.fit_transform(enrollment_diversity)

In [58]:
# Scatter of the enrollment-diversity principal component against the target.
fig = go.Figure(
    data=[
        go.Scatter(
            x=component1.flatten(),
            y=y_black.to_numpy(),
            mode="markers",
        )
    ]
)
fig.update_layout(
    title="Enrollment Diversity Component vs. African American Graduation Rates",
    xaxis=dict(title="Enrollment Diversity Component"),
    yaxis=dict(title="African American Graduation Rates"),
    width=1200,
    height=750,
)
fig.show()

## PCA on Revenue Distribution

In [59]:
# Take the columns at positions 1 through 6 as the revenue-distribution features.
# NOTE(review): positional slice — assumes the revenue columns sit exactly there
# after get_dummies; confirm against X_black.columns.
revenues_df = X_black.iloc[:,1:7]

In [60]:
# Refit the shared pca_pipeline on the revenue columns and keep its output.
# NOTE: fit_transform replaces whatever the pipeline had fitted on the previous column group.
component2 = pca_pipeline.fit_transform(revenues_df)

In [61]:
# Scatter of the revenue-distribution principal component (component2) against the
# target. The axis label and title previously read "Enrollment Diversity Component",
# a copy-paste leftover from the enrollment plot; they now name the data shown.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=component2.flatten(), y=y_black.to_numpy(), mode='markers'
    )
)

fig.update_layout(
    height=650,
    width=1000,
    xaxis={
        "title": "Revenue Distribution Component",
    },
    yaxis={
        "title": "African American Graduation Rates"
    },
    title="Revenue Distribution Component vs. African American Graduation Rates"
)

fig.show()

## PCA on Expenses Distribution

In [62]:
# Take the columns at positions 14 through 20 as the expenses-distribution features.
# NOTE(review): positional slice — assumes the expense columns sit exactly there
# after get_dummies; confirm against X_black.columns.
expenses_distribution = X_black.iloc[:,14:21]

In [63]:
# Refit the shared pca_pipeline on the expense columns and keep its output.
# NOTE: fit_transform replaces whatever the pipeline had fitted on the previous column group.
component3 = pca_pipeline.fit_transform(expenses_distribution)

In [64]:
# Share of variance captured by the PCA step from the most recent fit.
pca_pipeline.named_steps['pca'].explained_variance_ratio_

array([0.30030328])

In [65]:
# Scatter of the expenses-distribution principal component (component3) against the
# target. The title previously read "Enrollment Diversity Component", a copy-paste
# leftover that contradicted the x-axis label; it now matches the data shown.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=component3.flatten(), y=y_black.to_numpy(), mode='markers'
    )
)

fig.update_layout(
    height=650,
    width=1000,
    xaxis={
        "title": "Expenses Distribution Component",
    },
    yaxis={
        "title": "African American Graduation Rates"
    },
    title="Expenses Distribution Component vs. African American Graduation Rates"
)

fig.show()

## Overall PCA Trends

In [66]:
# Swap the PCA step for a 5-component one through the public set_params API, which
# addresses the step by name instead of by its position in the steps list.
# random_state keeps the projection reproducible if the randomized solver is chosen.
# NOTE: this mutates the shared pipeline, so re-running the earlier single-component
# cells afterwards would yield five components. The trailing ";" suppresses the
# pipeline repr that set_params returns.
pca_pipeline.set_params(pca=PCA(n_components=5, random_state=0));

In [70]:
# Fit the pipeline (whose PCA step now keeps 5 components) on the loaded feature
# matrix; the result has one column per component.
# NOTE(review): the later plots pair these rows with y_black — confirm X_fe has
# the same rows in the same order.
principle_components = pca_pipeline.fit_transform(X_fe)

In [71]:
# Show both values. A bare expression that is not the last line of a cell is
# evaluated and discarded, so the target's shape was never actually displayed.
display(y_black.to_numpy().shape)
principle_components[:,1]

array([ 8.20120643, -4.85081903,  2.89020866, ...,  2.545542  ,
        3.64679289,  1.70818302])

In [72]:
# Slider choosing which principal component (0-based column) to plot. The upper
# bound follows the number of fitted components instead of a hard-coded 4.
picker = wg.IntSlider(value=0, max=principle_components.shape[1] - 1)


def scatter(component):
    """Plot one principal component against African American graduation rates.

    Parameters
    ----------
    component : int
        Zero-based column index into ``principle_components``.
    """
    # Label the plot with the component actually selected; the title used to say
    # "First" regardless of the slider position.
    label = f"Principal Component {component + 1}"

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=principle_components[:, component],
            y=y_black.to_numpy(),
            mode='markers',
        )
    )
    fig.update_layout(
        height=525,
        width=800,
        xaxis={
            "title": label,
            # Fixed window so the view is comparable across components; points
            # outside [-10, 10] are hidden until the reader zooms out.
            "range": [-10, 10],
        },
        yaxis={
            "title": "African American Graduation Rates"
        },
        title=f"{label} vs. African American Graduation Rates",
    )

    # fig.show() renders the figure itself and returns None, so wrapping it in
    # display() only emitted a stray None.
    fig.show()


ui = wg.HBox([picker])
out = wg.interactive_output(scatter, {'component': picker})
out.layout.height = '700px'
display(ui, out)

HBox(children=(IntSlider(value=0, max=4),))

Output(layout=Layout(height='700px'))

In [43]:
# Wrap the component matrix in a DataFrame with columns PC1..PCn. The names are
# derived from the matrix width, so changing n_components no longer raises a
# column-count mismatch here.
components_df = pd.DataFrame(
    principle_components,
    columns=[f"PC{i + 1}" for i in range(principle_components.shape[1])],
)

In [44]:
# Display the component table (pandas truncates the middle rows of long frames).
# NOTE(review): components_df.head() would give a smaller, more stable output.
components_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-5.423356,3.151113,2.232848,-4.167132,-0.584118
1,-5.896564,6.432013,5.924172,-2.267339,-0.468453
2,-2.186617,-2.419037,-0.552543,2.865848,-0.131630
3,-0.619987,3.943380,-4.247357,1.906793,-0.148496
4,-2.910306,0.501737,1.849377,-2.459682,0.070518
...,...,...,...,...,...
2075,-3.245314,-0.287545,-0.927582,0.358011,1.483925
2076,-5.263771,5.412507,3.183121,-1.193943,-0.093535
2077,-0.978317,2.299798,-3.956508,2.370685,-0.457610
2078,13.738177,4.195906,-1.424597,0.247661,-0.425861


## Interactive Line Fit Plot

# Multivariate Plots With Principal Components