In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import copy
#conda install -c conda-forge cufflinks-py
#conda install plotly
import ipywidgets as wg
from IPython.display import display

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px

import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import inspect
import seaborn as sns

init_notebook_mode(connected=True)
cf.go_offline()
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
pd.options.display.max_columns = 200
pd.options.display.max_rows = 272

In [2]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from joblib import dump, load

In [3]:
model_log = load("data/model_logging.joblib")

In [4]:
model_log_df = pd.DataFrame(model_log)

In [5]:
current_model = copy.deepcopy(model_log_df["model"].iloc[0])

In [6]:
def train_model(model, X, y):
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    tmae = mean_absolute_error(y_train, y_pred_train)
    tmse = mean_squared_error(y_train, y_pred_train)
    trmse = np.sqrt(tmse)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    
    return tmae, trmse, mae, rmse

# Now Lets Load our original Data

In [7]:
df = pd.read_csv("data/cleaned_df.csv", index_col="UnitID")

In [8]:
df.drop("Unnamed: 0",axis=1, inplace=True)

In [9]:
df.drop(["City location of institution (HD2019)", "Institution Name"], axis=1, inplace=True)

In [10]:
X = df.iloc[:, :-14]
y = df.iloc[:,-14]

In [11]:
X = pd.get_dummies(X, drop_first=True)

In [12]:
model_log

[{'model': Pipeline(steps=[('Imputer',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Impute', SimpleImputer(),
                                                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                     10, 11, 12, 13, 14, 15, 16,
                                                     17, 18, 19, 20, 21, 22, 23,
                                                     24, 25, 26, 27, 28, 29, ...])])),
                  ('Scaler',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Scale', StandardScaler(),
                                                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                     10, 11, 12, 13, 14, 15, 16,
                                                     17, 18, 19, 20, 21, 22, 23,
                                                     24, 25, 26, 27, 28, 

In [13]:
current_model

Pipeline(steps=[('Imputer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('Impute', SimpleImputer(),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 21, 22, 23,
                                                   24, 25, 26, 27, 28, 29, ...])])),
                ('Scaler',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('Scale', StandardScaler(),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 21, 22, 23,
                                                   24, 25, 26, 27, 28, 29, ...])])),
                ('class

In [14]:
ilocs = [ X.columns.get_loc(i) for i in X.filter(regex="aid").columns]

## I want no features to be a linear combination of other features.

<ol>
    <li> Tuition_and_Fees_as_dollar_amount = Core Revenues * Tuition and Fees. This goes for all dollar amounts</li>
    <li> Black or African American Total Instructional Staff = Grand Total Instructional Staff * Black total staff_as_percentage</li>
    <li>Grand total Undergrad under 25 = Imcoming_Class_Grand_total * Grand total Undergrad Under 25 as percentage</li>
    <li>Total men over 25 as percentage = 100 - total men under 25 as percentage. Same goes for women and grand total percentages</li>
</ol>

We may want to eliminate these perfect collinear columns if it helps our model

# 1.) Dropping dollar amounts

In [15]:
for i in X.filter(regex="_As_Dollar").columns:
    X.drop(i, axis=1, inplace=True)

In [16]:
len(X.select_dtypes(include="float").columns)

111

In [17]:
numeric_iloc = list(range(111))

In [18]:
current_model.steps[0] = ('Imputer', ColumnTransformer(
    [
        ('Impute', SimpleImputer(), numeric_iloc)
    ]
)) 

current_model.steps[1] = ('Scaler', ColumnTransformer(
    [
        ('Scale', StandardScaler(), numeric_iloc)
    ]
)) 


In [19]:
current_model

Pipeline(steps=[('Imputer',
                 ColumnTransformer(transformers=[('Impute', SimpleImputer(),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 21, 22, 23,
                                                   24, 25, 26, 27, 28, 29, ...])])),
                ('Scaler',
                 ColumnTransformer(transformers=[('Scale', StandardScaler(),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 21, 22, 23,
                                                   24, 25, 26, 27, 28, 29, ...])])),
                ('classifier', ElasticNet(alpha=0.1))])

In [20]:
tmae, trmse, mae, rmse = train_model(current_model, X, y)

# Dropping these columns did not help

Dropping these columns did not help the model even though they are perfect linear combinations of other features. Lets undo this step and save it for when our model gets extremely complex. Lets save the model and note it in the experiment log

In [21]:
def print_results(tmae, trmse, mae, rmse):
    print(
        f'Training MAE: {tmae}, \n Training RMSE {trmse},' 
        f'\n Test MAE: {mae}, \n Test RMSE: {rmse}'
    )

In [22]:
print_results(tmae, trmse, mae, rmse)

Training MAE: 10.657154304539688, 
 Training RMSE 14.951172762357368,
 Test MAE: 10.32975392768195, 
 Test RMSE: 14.706748841178987


### Lets run it one more time for African American Graduation Rates just to be sure

In [23]:
df_black = df.dropna(subset=[df.columns[-7]])

In [24]:
X_black = df_black.iloc[:,:-14]
y_black = df_black.iloc[:, -7]

In [25]:
X_black = pd.get_dummies(X_black, drop_first=True)

In [26]:
tmae_black, trmse_black, mae_black, rmse_black = train_model(current_model, X_black, y_black)

In [47]:
print_results(tmae, trmse_black, mae_black, rmse_black)

Training MAE: 10.657154304539688, 
 Training RMSE 19.718070607696024,
 Test MAE: 16.202677780859513, 
 Test RMSE: 24.5074842118486


In [28]:
log1 = {
    "model_name": "Elastic Net",
    "model": current_model,
    "alpha": 0.1,
    "l1 ratio": 0.5,
    
    "tmae": tmae,
    "trmse": trmse,
    
    "mae_total": mae,
    "rmse_total": rmse,
    
    "tmae_black": tmae_black,
    "trmse_black": trmse_black,
    
    "mae_black": mae_black,
    "rmse_black": rmse_black,
    
    "notes": "Dropped dollar amount columns"
}

In [29]:
model_log.append(log1)

# Lets test against different imputation strategies: 
<ol>
    <li>KNNImputer()</li>
    <li>IterativeImputer()</li>
    <li>Let XG Boost handle imputation</li>
<ol>

In [30]:
# create a deep copy of our base line model for easy manipulation
print(id(current_model))
current_model = copy.deepcopy(model_log_df["model"].iloc[0])
print(id(current_model))
# if you do not create deep copy then it will reference the same object in memory

140529482848816
140529514248368


In [31]:
num_cols = list(range(124))

In [32]:
current_model.steps[0][1].transformers_

[('Impute',
  SimpleImputer(),
  [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   75,
   76,
   77,
   78,
   79,
   80,
   81,
   82,
   83,
   84,
   85,
   86,
   87,
   88,
   89,
   90,
   91,
   92,
   93,
   94,
   95,
   96,
   97,
   98,
   99,
   100,
   101,
   102,
   103,
   104,
   105,
   106,
   107,
   108,
   109,
   110,
   111,
   112,
   113,
   114,
   115,
   116,
   117,
   118,
   119,
   120,
   121,
   122,
   123]),
 ('remainder',
  'passthrough',
  [124,
   125,
   126,
   127,
   128,
   129,
   13

In [33]:
current_model.steps[0][1].transformers_ = ('Impute', KNNImputer(), num_cols)

In [34]:
tmae1, trmse1, mae1, rmse1 = train_model(current_model, X, y)

In [35]:
print_results(tmae, trmse, mae, rmse)

Training MAE: 10.657154304539688, 
 Training RMSE 14.951172762357368,
 Test MAE: 10.32975392768195, 
 Test RMSE: 14.706748841178987


In [36]:
tmae_black1, trmse_black1, mae_black1, rmse_black1 = train_model(current_model, X_black, y_black)

In [37]:
print_results(tmae_black1, trmse_black1, mae_black1, rmse_black1)

Training MAE: 13.443723049417862, 
 Training RMSE 18.537007904687666,
 Test MAE: 14.861809376780581, 
 Test RMSE: 20.273453844145486


In [38]:
log2 = {
    "model_name": "Elastic Net",
    
    "model": current_model,
    "alpha": 0.1,
    "l1 ratio": 0.5,
    
    "tmae": tmae1,
    "trmse": trmse1,
    
    "mae_total": mae1,
    "rmse_total": rmse1,
    
    "tmae_black": tmae_black1,
    "trmse_black": trmse_black1,
    
    "mae_black": mae_black1,
    "rmse_black": rmse_black1,
    
    "notes": "KNN Imputation strategy"
}

In [39]:
model_log.append(log2)

### KNN imputation shows improvement so lets try iterative

In [40]:
current_model = copy.deepcopy(model_log_df["model"].iloc[0])

In [41]:
current_model.steps[0][1].transformers_ = ('Impute', IterativeImputer(), num_cols)

In [43]:
tmae_2, trmse_2, mae2, mse2 = train_model(current_model, X, y)

In [44]:
tmae_black2, trmse_black2, mae_black2, rmse_black2 = train_model(current_model, X_black, y_black)

In [46]:
print_results(tmae_black2, trmse_black2, mae_black2, rmse_black2)

Training MAE: 13.443723049417862, 
 Training RMSE 18.537007904687666,
 Test MAE: 14.861809376780581, 
 Test RMSE: 20.273453844145486


### Same Results as KNN now lets try XGBoost

In [48]:
#viewing default XGB parameters
xgb.XGBRegressor()

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

In [49]:
current_model = copy.deepcopy(model_log_df["model"].iloc[0])

In [50]:
current_model.steps.pop(0)

('Imputer',
 ColumnTransformer(remainder='passthrough',
                   transformers=[('Impute', SimpleImputer(),
                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                   25, 26, 27, 28, 29, ...])]))

In [51]:
params = {
    "objective": "reg:squarederror",
    "booster": "gbtree",
    "learning_rate": 0.1,
    "subsample": 1.0,
    "min_child_weight": 1
}

In [52]:
current_model.steps[1] = ('classifier', xgb.XGBRegressor(**params))

In [54]:
tmae3, trmse3, mae3, mse3 = train_model(current_model, X, y)

In [56]:
tmae_black3, trmse_black3, mae_black3, mse_black3 = train_model(current_model, X_black, y_black)

In [57]:
print_results(tmae_black3, tmae_black3, mae_black3, mse_black3)

Training MAE: 4.641660551472408, 
 Training RMSE 4.641660551472408,
 Test MAE: 13.931648124632288, 
 Test RMSE: 19.021821315387857


In [None]:
pd.DataFrame(model_log)