In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import copy
#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns

init_notebook_mode(connected=True)
cf.go_offline()
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer, PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import TransformerMixin, BaseEstimator
pd.options.display.max_columns = 200
pd.options.display.max_rows = 272

In [2]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from joblib import dump, load
from Logger import RegressionLogger, FuncTransformer, ColSelect

In [3]:
log_transformed = 0      

In [4]:
model_log = load("data/model_logging.joblib")

In [5]:
current_model = copy.deepcopy(model_log[0]["model"])

# Now Lets Load our original Data

In [6]:
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")

In [7]:
df.drop("Unnamed: 0",axis=1, inplace=True)

In [8]:
df.drop(["City location of institution (HD2019)", "Institution Name"], axis=1, inplace=True)

In [9]:
X = df.iloc[:, :-14]
y = df.iloc[:,-14]

In [10]:
df_black = df.dropna(subset=[df.columns[-7]])

In [11]:
X_black = df_black.iloc[:,:-14]
y_black = df_black.iloc[:, -7]

In [12]:
X_black = pd.get_dummies(X_black, drop_first=True)

In [13]:
X = pd.get_dummies(X, drop_first=True)

In [14]:
X.shape

(2315, 260)

In [15]:
X_black.shape

(2027, 255)

# Trend Identification

In [16]:
# log of core revenues
transformer = FuncTransformer()

In [17]:
fe_x = transformer.log_transform(X.to_numpy(), 0)

In [18]:
X["log_core_revenues"] = fe_x

In [19]:
num_ilocs = [X.columns.get_loc(i) for i in X.select_dtypes(include="float").columns]

In [20]:
scaler = ('Scaler', ColumnTransformer(
    [
        ('Scale', StandardScaler(), num_ilocs)
    ], remainder='passthrough'
))

# Before we continue, which of these features Actually improve the model performance

In [21]:
num_pipe = Pipeline([
    ('Impute', ColumnTransformer([
        ('Impute', SimpleImputer(), make_column_selector(dtype_include='float'))
    ], remainder='drop'
    )),
    ('Scale', StandardScaler())
])

cat_pipe = Pipeline([
    ('Passthrough', ColumnTransformer([
        ('Pass', 'passthrough', make_column_selector(dtype_exclude='float'))
    ], remainder='drop'
    ))
])

preprocessor = FeatureUnion([
    ('numeric', num_pipe),
    ('categorical', cat_pipe)
])

pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('polynomial', PolynomialFeatures()),
    ('classifier', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

In [22]:
params = {
    "Imputer": "SimpleImputer",
    "Scaler": "Standard Scaler",
    "model_name": "Elastic Net",
    "model": pipeline,
    "alpha": 0.1,
    "l1_ratio": 0.5,
}

In [23]:
logger = RegressionLogger(params)

In [24]:
logger.train_update(pipeline, X, y)


Objective did not converge. You might want to increase the number of iterations. Duality gap: 31395.085842468223, tolerance: 93.54013126588714



In [32]:
logger.train_mae, logger.mae

(5.573710716909098, 11.03782760078316)

# The Model Does Not Converge (as expected), but it still gives us very valuable feature combinations that we can use for feature engineering

In [26]:
feature_names = list(pipeline['polynomial'].get_feature_names(np.array(X.columns)))

In [27]:
coefs = list(pipeline['classifier'].coef_)

In [40]:
coef_map = sorted(list(zip(feature_names, coefs)), key=lambda x: x[1])

In [43]:
[i[0] for i in coef_map[:5]]

['Percent of total enrollment that are women (DRVEF2013_RV) SAT Critical Reading 75th percentile score (IC2012_RV)_isNaN',
 'Percent of total enrollment that are Black or African American (DRVEF2013_RV) Parent/child indicator - Finance (FLAGS2019)_Parent record - includes data from branch campuses',
 'Government_Grants SAT Critical Reading 75th percentile score (IC2012_RV)_isNaN',
 'SAT Critical Reading 75th percentile score (IC2012_RV)_isNaN State abbreviation (HD2018)_Montana',
 'Government_Grants Student-to-faculty ratio (EF2019D)']

In [42]:
[i[0] for i in coef_map[-5:]]

['Percent of total enrollment that are Black or African American (DRVEF2013_RV) Percent of total enrollment that are women (DRVEF2013_RV)',
 'Sales_And_Services Total men (EF2013B_RV  Undergraduate  Age 25 and over total)_as_percentage',
 'Percent of total enrollment that are White (DRVEF2013_RV) Parent/child indicator - Finance (FLAGS2019)_Parent record - includes data from branch campuses',
 'Academic_Support_Expenses Total price for out-of-state students living on campus 2012-13 (DRVIC2012_RV)',
 'Academic_Support_Expenses Sector of institution (HD2018)_Public, 2-year']

## Lets Add the Top 5 features and Bottom 5 features to our model and see if we can get a model that converges

# Visualizing Features against target

In [31]:

picker1 = wg.Dropdown(
    options=X.columns,
    value=X.columns[0],
    description='Feature:',
    disabled=False,
)

picker2 = wg.Dropdown(
    options=pd.DataFrame(y).columns,
    description='Target:',
    disabled=False,
)

ui = wg.HBox([picker1, picker2])

def scatter_residuals(feature, target):
    fig = go.Figure()
    
    _ = fig.add_trace(go.Scatter(
        x = X[feature], y=pd.DataFrame(y)[target], mode='markers'
    ))

    
    _ = display(fig.show())
    
out = wg.interactive_output(scatter_residuals, {'feature': picker1, 'target':picker2})
out.layout.height = '700px'
_ = display(ui, out)

HBox(children=(Dropdown(description='Feature:', options=('Core_Revenues', 'Tuition_And_Fees', 'Government_Gran…

Output(layout=Layout(height='700px'))

# Visualizing Features Against eachother