In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import copy
#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns

from matplotlib import pyplot as plt
# Enable plotly's offline notebook rendering and put cufflinks in offline mode
# so figures render inside the notebook.
init_notebook_mode(connected=True)
cf.go_offline()
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer, PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import TransformerMixin, BaseEstimator
# Raise pandas' display limits so wide and long frames are not truncated
# when rendered in the notebook.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 272)

In [2]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold, RFE, SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from joblib import dump, load
from Logger import RegressionLogger, FuncTransformer, ColSelect

In [3]:
import inspect

In [4]:
# Flag initialised to 0. It is not read by any of the cells visible here;
# presumably it records whether the target was log-transformed — TODO confirm.
log_transformed = 0

In [5]:
# Load the persisted model log from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts from a trusted source.
model_log = load("data/model_logging.joblib")

In [6]:
# Deep-copy the estimator stored under entry 4 of the model log, so that
# working with it below does not modify the object held in `model_log`.
# NOTE(review): 4 is a magic index — confirm it still points at the intended
# model if the log file is regenerated.
current_model = copy.deepcopy(model_log[4]["model"])

# Now Let's Load Our Original Data

In [7]:
# Read the cleaned dataset, using the "UnitID" column as the row index.
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")

In [8]:
# Remove the "Unnamed: 0" column.
# The frame is rebound instead of mutated in place, and errors="ignore" makes
# the cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [9]:
# Remove the free-text city and institution-name columns.
# Rebinding `df` with errors="ignore" keeps the cell safe to re-run: a second
# execution finds the columns already absent and is a no-op instead of
# raising KeyError.
df = df.drop(
    columns=["City location of institution (HD2019)", "Institution Name"],
    errors="ignore",
)

In [10]:
# Keep only the rows where the target column (7th from the end) is present.
df_black = df.loc[df.iloc[:, -7].notna()]

In [11]:
# Features: every column except the last 14.
# Target: the 7th column from the end (the same column used to filter rows).
X_black, y_black = df_black.iloc[:, :-14], df_black.iloc[:, -7]

In [12]:
# One-hot encode the categorical feature columns, dropping the first level of
# each so the dummy columns are not perfectly collinear.
X_black = pd.get_dummies(data=X_black, drop_first=True)

## Model performance on African American Graduation Rates

In [13]:
# Metadata handed to the RegressionLogger describing this run.
# NOTE(review): alpha and l1_ratio are hard-coded here and are not read back
# from `current_model` — confirm they match the loaded estimator.
params = dict(
    Imputer="SimpleImputer",
    Scaler="Standard Scaler",
    model_name="Elastic Net",
    model=current_model,
    alpha=0.1,
    l1_ratio=0.5,
)

In [14]:
# Create the project-local logger for this run from the metadata dict above.
logger = RegressionLogger(params)

In [15]:
# Hand the model and the feature/target data to the logger.
# Presumably this fits the model and records train/test metrics (the next cell
# reads logger.train_mae and logger.mae) — confirm in Logger.py.
logger.train_update(current_model, X_black, y_black)

In [16]:
# Show the train and test mean absolute error recorded by the logger as a tuple.
"train and test mae: {}".format((logger.train_mae, logger.mae))

'train and test mae: (12.839175383602452, 14.951887830483603)'

# feature engineering for African American Graduation Rates

In [17]:
# Name of the first feature column whose label matches "Historically".
# Raises IndexError if no column matches. Not used by any cell visible here.
hbcu = X_black.filter(regex="Historically").columns[0]

## Log Core Expenses vs African American Graduation Rates

In [18]:
# Scatter of log(core expenses) against the graduation-rate target.
fig = go.Figure(
    data=go.Scatter(
        x=np.log(X_black["Core_Expenses"]),
        y=y_black,
        mode="markers",
    )
)
fig.update_layout(
    title="Log Core Expenses vs. African American Graduation Rates",
    xaxis=dict(title="Log Core Expenses"),
    yaxis=dict(title="African American Graduation Rates"),
    height=750,
)
fig.show()

## PCA on Staff Diversity

In [19]:
# Select the feature columns whose names match "instructional.*percentage".
staff_diversity = X_black.filter(regex="instructional.*percentage")

In [20]:
# Impute missing values, standardise, then project onto one principal component.
# random_state is pinned because PCA's default svd_solver="auto" may pick a
# randomized solver depending on the data shape and scikit-learn version;
# without a seed the component could differ between runs.
pca_pipeline = Pipeline(
    steps=[
        ("imputer", KNNImputer()),
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=1, random_state=0)),
    ]
)
# Fit on the staff-diversity columns; the result has a single column.
component = pca_pipeline.fit_transform(staff_diversity)

In [21]:
# Scatter of the staff-diversity principal component against the target.
fig = go.Figure(
    data=go.Scatter(
        x=component.flatten(),
        y=y_black.to_numpy(),
        mode="markers",
    )
)
fig.update_layout(
    title="Staff Diversity Component vs. African American Graduation Rates",
    xaxis=dict(title="Staff Diversity Component"),
    yaxis=dict(title="African American Graduation Rates"),
    height=750,
    width=1200,
)
fig.show()

## PCA on Enrollment Diversity

In [22]:
# Select the feature columns whose names match "Percent of total enrollment".
enrollment_diversity = X_black.filter(regex="Percent of total enrollment")

In [23]:
# Re-fit the shared `pca_pipeline` on the enrollment columns. After this cell
# the pipeline's fitted state reflects `enrollment_diversity`, not the data it
# was fitted on earlier.
component1 = pca_pipeline.fit_transform(enrollment_diversity)

In [24]:
# Scatter of the enrollment-diversity principal component against the target.
fig = go.Figure(
    data=go.Scatter(
        x=component1.flatten(),
        y=y_black.to_numpy(),
        mode="markers",
    )
)
fig.update_layout(
    title="Enrollment Diversity Component vs. African American Graduation Rates",
    xaxis=dict(title="Enrollment Diversity Component"),
    yaxis=dict(title="African American Graduation Rates"),
    height=750,
    width=1200,
)
fig.show()

## PCA on Revenue Distribution

In [25]:
# Take feature columns at positions 1-6.
# NOTE(review): positional slice — presumably the revenue-share columns;
# verify against X_black.columns, since the positions depend on column order.
revenues_df = X_black.iloc[:,1:7]

In [26]:
# Re-fit the shared `pca_pipeline` on the revenue columns and keep the
# projected values.
component2 = pca_pipeline.fit_transform(revenues_df)

In [27]:
# Scatter of the revenue-distribution principal component against the target.
# The axis label and title previously read "Enrollment Diversity Component"
# (copied from the earlier figure) although `component2` is fitted on
# `revenues_df`; they now name the quantity actually plotted.
fig = go.Figure(
    data=go.Scatter(
        x=component2.flatten(),
        y=y_black.to_numpy(),
        mode="markers",
    )
)
fig.update_layout(
    title="Revenue Distribution Component vs. African American Graduation Rates",
    xaxis=dict(title="Revenue Distribution Component"),
    yaxis=dict(title="African American Graduation Rates"),
    height=650,
    width=1000,
)
fig.show()

## PCA on Expenses Distribution

In [28]:
# Take feature columns at positions 14-20.
# NOTE(review): positional slice — presumably the expense-share columns;
# verify against X_black.columns, since the positions depend on column order.
expenses_distribution = X_black.iloc[:,14:21]

In [29]:
# Re-fit the shared `pca_pipeline` on the expense columns and keep the
# projected values.
component3 = pca_pipeline.fit_transform(expenses_distribution)

In [30]:
# Share of variance captured by the fitted PCA step (most recently fitted on
# the expense columns).
pca_pipeline.named_steps["pca"].explained_variance_ratio_

array([0.30030328])

In [31]:
# Scatter of the expenses-distribution principal component against the target.
# The figure title previously read "Enrollment Diversity Component ..."
# (copied from an earlier figure); it now matches the x-axis quantity.
fig = go.Figure(
    data=go.Scatter(
        x=component3.flatten(),
        y=y_black.to_numpy(),
        mode="markers",
    )
)
fig.update_layout(
    title="Expenses Distribution Component vs. African American Graduation Rates",
    xaxis=dict(title="Expenses Distribution Component"),
    yaxis=dict(title="African American Graduation Rates"),
    height=650,
    width=1000,
)
fig.show()

## Overall PCA Trends

In [32]:
# Swap the pipeline's "pca" step for a 5-component PCA.
# set_params replaces the step by name rather than by list position, and
# random_state is pinned so a randomized SVD solver (which svd_solver="auto"
# may select) gives the same components on every run.
pca_pipeline.set_params(pca=PCA(n_components=5, random_state=0))

In [33]:
# Fit the pipeline (impute, scale, PCA) on the full feature matrix and keep
# the projected values, one column per principal component.
principle_components = pca_pipeline.fit_transform(X_black)

In [34]:
# Show the scores on the second principal component (column index 1).
# The bare `y_black.to_numpy().shape` expression that used to precede this
# line was dropped: only a cell's last expression is displayed, so it had no
# visible effect.
principle_components[:,1]

array([-3.9866369 ,  3.84760429,  2.50739903, ...,  3.6556763 ,
        4.48231181,  0.55131411])

In [35]:
# Scatter of the first principal component (column 0) against the target.
# Labels corrected from "Pricipled"/"Principled" to "Principal".
fig = go.Figure(
    data=go.Scatter(
        x=principle_components[:, 0],
        y=y_black.to_numpy(),
        mode="markers",
    )
)
fig.update_layout(
    title="First Principal Component vs. African American Graduation Rates",
    xaxis=dict(title="First Principal Component", range=[-10, 30]),
    yaxis=dict(title="African American Graduation Rates"),
    height=525,
    width=800,
)
fig.show()

In [36]:
# Scatter of the second principal component (column 1) against the target.
# Labels corrected from "Principled" to "Principal".
fig = go.Figure(
    data=go.Scatter(
        x=principle_components[:, 1],
        y=y_black.to_numpy(),
        mode="markers",
    )
)
fig.update_layout(
    title="Second Principal Component vs. African American Graduation Rates",
    xaxis=dict(title="Second Principal Component", range=[-10, 20]),
    yaxis=dict(title="African American Graduation Rates"),
    height=525,
    width=800,
)
fig.show()

In [37]:
# Scatter of the third principal component (column 2) against the target.
# The title previously read "Enrollment Diversity Component ..." (copied from
# an earlier figure) and the axis label said "Principled"; both now describe
# the third principal component.
fig = go.Figure(
    data=go.Scatter(
        x=principle_components[:, 2],
        y=y_black.to_numpy(),
        mode="markers",
    )
)
fig.update_layout(
    title="Third Principal Component vs. African American Graduation Rates",
    xaxis=dict(title="Third Principal Component", range=[-10, 20]),
    yaxis=dict(title="African American Graduation Rates"),
    height=525,
    width=800,
)
fig.show()

In [38]:
# principal component 3: e^-x
# principal component 4: e^-x

## Interactive Line Fit Plot

# Multivariate Plots With Principal Components