# Preprocessing and Modeling

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import KNNImputer
import category_encoders as ce

In [3]:
from joblib import load, dump
from Logger import RegressionLogger

In [4]:
import plotly.express as px

In [5]:
black = 'graduation_rate_black_non-hispanic'
total = 'graduation_rate_total_cohort'

In [6]:
df = pd.read_csv("data/4_year/aggregate.csv", index_col=["UnitID", "institution_name"])
df.drop('Unnamed: 0', axis=1, inplace=True)

# Define X and Y

In [7]:
df['cohort'] = df['cohort'].astype(str)
categorical = df.select_dtypes('object').copy(deep=True)
numeric = df.select_dtypes(float).copy(deep=True)
ints = df.select_dtypes(int).copy(deep=True)

In [8]:
grad_rates = numeric.filter(regex="_rate").columns

In [9]:
y = numeric.loc[:,grad_rates]
numeric.drop(grad_rates, axis=1, inplace=True)

In [10]:
X = pd.concat([categorical, numeric, ints], axis=1)

# Train Test Split

In [11]:
model_log = list()

# Categorical Pipeline

In [12]:
categorical_preprocess = Pipeline([
    ('target_encode', ColumnTransformer([
        ("Encode", ce.TargetEncoder(), make_column_selector(dtype_include="object")),
    ], remainder='drop')),
    
    ('Scale', ColumnTransformer([
        ("Scaler", StandardScaler(), slice(200))
    ], remainder='passthrough'))
])

# Float Pipeline

In [13]:
numeric_preprocess = Pipeline([
    ('Impute', ColumnTransformer([
        ("Impute", KNNImputer(), make_column_selector(dtype_include=float)),
        
    ], remainder='drop')),
    
    ('Scale', ColumnTransformer([
        ("Scaler", StandardScaler(), slice(200))
    ], remainder='passthrough'),)
])

# Int Pipeline

In [14]:
int_preprocess = ColumnTransformer([
        ("pass", "passthrough", make_column_selector(dtype_include=int)),
    ], remainder='drop')

In [15]:
feature_union = FeatureUnion([
    ('categorical_preprocess', categorical_preprocess),
    ('numeric_preprocess', numeric_preprocess),
])

feature_union2 = FeatureUnion([
    ('int', int_preprocess),
    ('union1', feature_union),
    
])

## Final Pipelines

In [16]:
logger = RegressionLogger({
    "model_name": "Elastic Net",
    "alpha": 0.1,
    "l1_ratio": 0.5,
})

### ElasticNet

In [17]:
pipeline = Pipeline([
    ('preprocess', feature_union2),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

In [18]:
logger.train_update(pipeline, X, y[total])
logger.save_log("Elastic Net Baseline")
model_log.append(logger.record())

  elif pd.api.types.is_categorical(cols):


In [19]:
logger.mae

11.602174414998046

### Random Forest

In [20]:
logger.__init__({
    'model_name': 'Random Forest',
    'n_estimators': 100,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_depth': None
})

In [21]:
pipeline = Pipeline([
    ('preprocess', feature_union2),
    ('regressor', RandomForestRegressor())
])

In [22]:
logger.train_update(pipeline, X, y[total])
logger.save_log("Random Forest Baseline")
model_log.append(logger.record())

  elif pd.api.types.is_categorical(cols):


In [23]:
logger.mae

8.753506493506494

### XGB Regressor

In [24]:
logger.__init__({
    'model_name': 'XGBRegressor',
    'objective': 'reg:squarrederror',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_depth': None
})

In [25]:
pipeline = Pipeline([
    ('preprocess', feature_union2),
    ('regressor', xgb.XGBRegressor())
])

In [26]:
logger.train_update(pipeline, X, y[total])
logger.save_log("Random Forest Baseline")
model_log.append(logger.record())

  elif pd.api.types.is_categorical(cols):


In [27]:
logger.mae

8.299643784890444

# Dump Objects

## Dump Model Log

In [28]:
pd.DataFrame.from_dict(model_log)

Unnamed: 0,model_name,alpha,l1_ratio,rsquared,rmse,mae,train_rsquared,train_rmse,train_mae,model,notes,n_estimators,min_samples_split,min_samples_leaf,max_depth,objective
0,Elastic Net,0.1,0.5,0.524093,15.302038,11.602174,0.656159,12.793345,9.022771,"(FeatureUnion(transformer_list=[('int',\n ...",Elastic Net Baseline,,,,,
1,Random Forest,,,0.65958,12.941831,8.753506,0.959222,4.405721,2.788089,"(FeatureUnion(transformer_list=[('int',\n ...",Random Forest Baseline,100.0,2.0,1.0,,
2,XGBRegressor,,,0.694745,12.25518,8.299644,0.989975,2.184504,1.598105,"(FeatureUnion(transformer_list=[('int',\n ...",Random Forest Baseline,,2.0,1.0,,reg:squarrederror


In [29]:
dump(model_log, "objects/model_log.joblib")

['objects/model_log.joblib']