In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from joblib import Parallel, delayed


In [2]:
training_data = pd.read_parquet('assets/training_set_v2.parquet')
validation_data = pd.read_parquet('assets/validation_set_v2.parquet')

In [3]:
# Step 2: Standardize the data
scaler = StandardScaler()
columns_to_drop = ['ItemKey', 'RWB_EFFECTIVE_DATE']

X_train = training_data.drop(columns=['events'] + columns_to_drop, axis=1)
X_val = validation_data.drop(columns=['events'] + columns_to_drop, axis=1)

X_train_std = scaler.fit_transform(X_train)
X_val_std = scaler.transform(X_val)

In [4]:
# Step 3: Instantiate dummy regressors
dummy_regressor_mean = DummyRegressor(strategy='mean')
dummy_regressor_median = DummyRegressor(strategy='median')
dummy_regressor_quantile = DummyRegressor(strategy='quantile', quantile=0.25)

In [5]:
# Step 4: Evaluate model architectures
models = {
    'Dummy Mean': dummy_regressor_mean,
    'Dummy Median': dummy_regressor_median,
    'Dummy Quantile': dummy_regressor_quantile,
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Ridge Regression': Ridge(),
    'Elastic Net Regression': ElasticNet(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor()
}

In [6]:
# Step 5: Evaluate the performance of each model
test_results = []

# Accumulate feature importance scores in dataframe 
feat_import_df = pd.DataFrame()

for model_name, model in models.items():
    model.fit(X_train_std, training_data['events'])
    predictions_test = model.predict(X_val_std)
    mse_test = mean_squared_error(validation_data['events'], predictions_test)
    rmse_test = np.sqrt(mse_test)
    mae_test = mean_absolute_error(validation_data['events'], predictions_test)

    test_results.append([model_name, mse_test, rmse_test, mae_test])

    if model_name in ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression', 'Decision Tree Regression', 'Random Forest Regression', 'Gradient Boosting Regression']:
        if hasattr(model, 'coef_'):
            feature_importances = model.coef_
            sorted_indices = np.argsort(np.abs(feature_importances))[::-1]
        elif hasattr(model, 'feature_importances_'):  
            feature_importances = model.feature_importances_
            sorted_indices = np.argsort(feature_importances)[::-1]
        else:
            sorted_indices = None

        if sorted_indices is not None:

            model_name_outfeat = [model_name for idx in sorted_indices]
            features_outfeat = [X_train.columns[idx] for idx in sorted_indices]
            importance_outfeat = [feature_importances[idx] for idx in sorted_indices]
            append_df = pd.DataFrame({"model": model_name_outfeat,
                                      "feature": features_outfeat,
                                       "importance": importance_outfeat })
            feat_import_df = pd.concat([feat_import_df, append_df], axis=0)

In [8]:
# Step 6: Aggregate test performance results into a data frame
test_metrics_df = pd.DataFrame(test_results, columns=['Model', 'Val MSE', 'Val RMSE', 'Val MAE'])
test_metrics_df

Unnamed: 0,Model,Val MSE,Val RMSE,Val MAE
0,Dummy Mean,3.781558,1.944623,1.80256
1,Dummy Median,2.503833,1.58235,1.402347
2,Dummy Quantile,1.241666,1.114301,0.7847199
3,Linear Regression,6.497603e+19,8060771000.0,53311910.0
4,Lasso Regression,3.781558,1.944623,1.80256
5,Ridge Regression,2.928832,1.711383,1.443659
6,Elastic Net Regression,3.763181,1.939892,1.798243
7,Decision Tree Regression,5.977626,2.444918,1.147716
8,Random Forest Regression,2.469702,1.571528,1.124071
9,Gradient Boosting Regression,2.534045,1.591869,1.344729


In [14]:
# Include 5 fold cross val
test_results2 = []
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train_std, training_data['events'], scoring='neg_mean_squared_error', cv=5)
    mse_cv = -np.mean(cv_scores)
    rmse_cv = np.sqrt(mse_cv)
    mae_cv = np.mean(cross_val_score(model, X_train_std, training_data['events'], scoring='neg_mean_absolute_error', cv=5))

    model.fit(X_train_std, training_data['events'])
    predictions_test = model.predict(X_val_std)
    mse_test = mean_squared_error(validation_data['events'], predictions_test)
    rmse_test = np.sqrt(mse_test)
    mae_test = mean_absolute_error(validation_data['events'], predictions_test)

    if model_name in ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression', 'Decision Tree Regression', 'Random Forest Regression', 'Gradient Boosting Regression']:
        if hasattr(model, 'coef_'):  
            feature_importances = model.coef_
            sorted_indices = np.argsort(np.abs(feature_importances))[::-1][:5]  
        elif hasattr(model, 'feature_importances_'):  
            feature_importances = model.feature_importances_
            sorted_indices = np.argsort(feature_importances)[::-1][:5]
        else:
            sorted_indices = None

        if sorted_indices is not None:
            print(f'Feature importances for {model_name}:')
            for idx in sorted_indices:
                feature_name = X_train.columns[idx]
                importance = feature_importances[idx]
                print(f'{feature_name}: {importance}')

    test_results2.append([model_name, mse_test, rmse_test, mae_test, mse_cv, rmse_cv, mae_cv])

test_metrics2_df = pd.DataFrame(test_results2, columns=['Model', 'Val MSE', 'Val RMSE', 'Val MAE', 'CV MSE', 'CV RMSE', 'CV MAE'])
test_metrics2_df


Feature importances for Linear Regression:
Has64BitMacro: 5066460192337.521
Has32BitMacro: -5066460192337.402
Has64BitOffice97_2003Files: -3024624668039.1733
Has32BitOffice97_2003Files: 3024624668039.1445
Active0: -1218790162026.2551
Feature importances for Lasso Regression:
CaseType_Laptop: 0.0
Outlookx64_addin_filesize: 0.0
PowerPointx64_addin_filesize: 0.0
PowerPointx86_addin_filesize: 0.0
Publisherx86_addin_filesize: 0.0
Feature importances for Ridge Regression:
AvgProcessorNormSpeed: 12.173767396745472
Model_Latitude 9510: 7.88950289294358
Model_Latitude 7300: 4.147931681284737
Model_Latitude 7320: -3.597301302719127
Model_Latitude 9520: -3.500260237627614
Feature importances for Elastic Net Regression:
has_factset_add: 0.012465254702852915
OneNotex86_addin_filesize: 0.011558929769715197
OneNotex64_addin_filesize: 0.011556864612741162
Battery_Power: 0.0
PowerPointx86_addin_filesize: 0.0
Feature importances for Decision Tree Regression:
Days Since Creation: 0.13345664374353564
avg_

Unnamed: 0,Model,Val MSE,Val RMSE,Val MAE,CV MSE,CV RMSE,CV MAE
0,Dummy Mean,3.781558,1.944623,1.80256,7.848407,2.801501,-1.720659
1,Dummy Median,2.503833,1.58235,1.402347,8.462545,2.909045,-1.844118
2,Dummy Quantile,1.241666,1.114301,0.7847199,10.3507,3.217251,-2.044093
3,Linear Regression,6.497603e+19,8060771000.0,53311910.0,2.677244e+22,163622800000.0,-910481900.0
4,Lasso Regression,3.781558,1.944623,1.80256,7.848407,2.801501,-1.720659
5,Ridge Regression,2.928832,1.711383,1.443659,6.951728,2.636613,-1.584257
6,Elastic Net Regression,3.763181,1.939892,1.798243,7.827091,2.797694,-1.71495
7,Decision Tree Regression,5.827067,2.413932,1.140236,10.3587,3.218494,-1.67389
8,Random Forest Regression,2.483134,1.575796,1.125403,5.342409,2.311365,-1.35663
9,Gradient Boosting Regression,2.533405,1.591667,1.344672,6.174771,2.484909,-1.491555
