In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import numpy as np


In [2]:
# Read the two parquet files: one frame for fitting, one held out for validation.
training_data, validation_data = map(
    pd.read_parquet,
    (
        'assets/training_set_office.parquet',
        'assets/validation_set_office.parquet',
    ),
)

In [3]:
# Step 2: Standardize the data
# Columns excluded from the feature matrix in addition to the 'events' column
# (presumably an identifier and a date — confirm against the data dictionary).
columns_to_drop = ['ItemKey', 'RWB_EFFECTIVE_DATE']

# Same column removal for both splits; 'events' is taken out as well.
X_train = training_data.drop(columns=['events', *columns_to_drop])
X_val = validation_data.drop(columns=['events', *columns_to_drop])

# Scaling statistics are learned from the training features only and then
# re-used, unchanged, to transform the validation features.
scaler = StandardScaler()
X_train_std = scaler.fit(X_train).transform(X_train)
X_val_std = scaler.transform(X_val)

In [4]:
# Step 3: Instantiate dummy regressors
# Constant-prediction baselines: the mean and the median of the training target ...
dummy_regressor_mean, dummy_regressor_median = (
    DummyRegressor(strategy=central_stat) for central_stat in ('mean', 'median')
)
# ... and its lower quartile (25th percentile).
dummy_regressor_quantile = DummyRegressor(
    strategy='quantile',
    quantile=0.25,
)

In [5]:
# Step 4: Evaluate model architectures
# Fixed seed for the stochastic (tree-based) estimators so that a
# Restart & Run All reproduces the same fits and therefore the same metrics.
RANDOM_STATE = 42

# Display name -> unfitted estimator. Insertion order is the order in which the
# models are fitted and reported.
# NOTE(review): Lasso() and ElasticNet() run with the library-default penalty;
# the recorded outputs show all of their listed coefficients at 0 and metrics
# identical to 'Dummy Mean', so the penalty strength probably needs tuning.
models = {
    'Dummy Mean': dummy_regressor_mean,
    'Dummy Median': dummy_regressor_median,
    'Dummy Quantile': dummy_regressor_quantile,
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Ridge Regression': Ridge(),
    'Elastic Net Regression': ElasticNet(),
    'Decision Tree Regression': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest Regression': RandomForestRegressor(criterion='poisson', random_state=RANDOM_STATE),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=RANDOM_STATE)
}

In [14]:
# Step 5: Evaluate the performance of each model
# Rows of per-model validation metrics.
test_results = []

# Feature importance scores, accumulated across models in one dataframe.
feat_import_df = pd.DataFrame()

# Three parallel lists (prediction, model name, true value) that are later
# combined into a long-format predictions table.
out_preds, out_models, out_true = [], [], []


# Define the custom scoring function
def weighted_mae_fun(y_true, y_pred):
    """Return a weighted mean absolute error that emphasises high-event samples.

    Each sample's absolute error is weighted by its true value:

    * true value == 0  -> weight 0.5
    * true value == 1  -> weight 1
    * any other value  -> weight 3 (i.e. 2 or more events for count targets)

    The result is ``sum(weight * |y_true - y_pred|) / sum(weight)``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same shape as ``y_true``.

    Returns
    -------
    float
        The weighted mean absolute error.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # Convert to plain arrays so the comparison is strictly positional. Without
    # this, two pandas Series with different indexes are aligned by label and
    # produce NaN, and plain lists fail on the subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("weighted_mae_fun requires at least one sample")

    errors = np.abs(y_true - y_pred)

    # Errors for 0 num events are 0.5 times as important
    # Errors for 1 num events are 1 times as important
    # Errors for 2 or more num events are 3 times as important
    sample_weights = np.where(y_true == 0, 0.5, np.where(y_true == 1, 1.0, 3.0))

    return float(np.sum(sample_weights * errors) / np.sum(sample_weights))

# Fit every candidate on the standardized training features, score it on the
# validation split, and record predictions and feature importances.
y_train = training_data['events']
y_val = validation_data['events']

# Per-model importance tables are collected here and concatenated once after
# the loop, instead of re-copying a growing DataFrame on every iteration.
feat_import_frames = []

for model_name, model in models.items():
    model.fit(X_train_std, y_train)

    # Round predictions (can't have 0.5 number of incidents)
    predictions_test = np.round(model.predict(X_val_std))

    mse_test = mean_squared_error(y_val, predictions_test)
    rmse_test = np.sqrt(mse_test)
    mae_test = mean_absolute_error(y_val, predictions_test)
    weighted_mae = weighted_mae_fun(y_val, predictions_test)

    test_results.append([model_name, mse_test, rmse_test, mae_test, weighted_mae])

    # Extend in place: one entry per validation row in each of the three
    # parallel lists.
    out_preds.extend(predictions_test)
    out_models.extend([model_name] * len(predictions_test))
    out_true.extend(y_val)

    # Fitted estimators exposing `coef_` are ranked by absolute coefficient
    # (the signed value is what gets stored); those exposing
    # `feature_importances_` are ranked by the importance itself. Estimators
    # with neither attribute contribute no importance rows.
    if hasattr(model, 'coef_'):
        feature_importances = model.coef_
        sorted_indices = np.argsort(np.abs(feature_importances))[::-1]
    elif hasattr(model, 'feature_importances_'):
        feature_importances = model.feature_importances_
        sorted_indices = np.argsort(feature_importances)[::-1]
    else:
        continue

    feat_import_frames.append(pd.DataFrame({
        "model": [model_name] * len(sorted_indices),
        "feature": [X_train.columns[idx] for idx in sorted_indices],
        "importance": [feature_importances[idx] for idx in sorted_indices],
    }))

# Each model keeps its own 0..k-1 row index, most important feature first.
if feat_import_frames:
    feat_import_df = pd.concat(feat_import_frames, axis=0)

# Wide table (one row per model x validation sample) and its long-format melt
# with a 'LabelType' column distinguishing 'Predictions' from 'True'.
predictions_dat = pd.DataFrame({"Model_Name": out_models, "Predictions": out_preds, "True": out_true})
predictions_dat2 = pd.melt(predictions_dat, id_vars=['Model_Name'], value_vars=['Predictions', 'True'], var_name ='LabelType')

In [15]:
# Step 6: Aggregate test performance results into a data frame
# One row per entry of `test_results`; the labels below name its five fields in order.
test_metrics_df = pd.DataFrame(
    data=test_results,
    columns=['Model', 'Val MSE', 'Val RMSE', 'Val MAE', 'Val Weighted MAE'],
)
test_metrics_df

Unnamed: 0,Model,Val MSE,Val RMSE,Val MAE,Val Weighted MAE
0,Dummy Mean,3.023757,1.738895,1.618318,1.348023
1,Dummy Median,3.023757,1.738895,1.618318,1.348023
2,Dummy Quantile,1.04095,1.02027,0.761488,0.8655367
3,Linear Regression,2.078174e+22,144158700000.0,3692737000.0,2224683000.0
4,Lasso Regression,3.023757,1.738895,1.618318,1.348023
5,Ridge Regression,1.646765,1.283263,1.027821,0.9736347
6,Elastic Net Regression,3.023757,1.738895,1.618318,1.348023
7,Decision Tree Regression,3.292591,1.81455,1.145045,1.259699
8,Random Forest Regression,1.520163,1.232949,0.9512348,0.9225989
9,Gradient Boosting Regression,1.467959,1.211593,0.9715536,0.9109228


In [12]:
import plotly.express as px

# Keep only the long-format rows whose value is below 1000.
# NOTE(review): presumably this hides runaway predictions from the histogram;
# the bound is one-sided, so very large negative values still pass — confirm intent.
predictions_dat3 = predictions_dat2.loc[predictions_dat2['value'] < 1000]

# Model-name groupings (baselines, linear family, tree family).
skip_models = [
    'Dummy Mean',
    'Dummy Median',
    'Dummy Quantile',
]
model_groups = [
    'Linear Regression',
    'Lasso Regression',
    'Ridge Regression',
    'Elastic Net Regression',
]
model_groups2 = [
    'Decision Tree Regression',
    'Random Forest Regression',
    'Gradient Boosting Regression',
]

def eval_predictions(in_model, pred_df):
    """Build a grouped histogram of the 'value' column for a single model.

    Parameters
    ----------
    in_model : str
        Value of 'Model_Name' to keep.
    pred_df : pandas.DataFrame
        Long-format frame with 'Model_Name', 'LabelType' and 'value' columns.

    Returns
    -------
    The plotly figure, with bars coloured by 'LabelType', a box-plot marginal,
    and grouped (side-by-side) bar mode.
    """
    model_rows = pred_df[pred_df['Model_Name'] == in_model]

    histogram = px.histogram(
        model_rows,
        x="value",
        color="LabelType",
        marginal="box",  # or violin, rug
        hover_data=model_rows.columns,
        labels={'value': "Number of Events"},
        title=f"{in_model} target label predictions versus ground truth",
    )
    # Draw the two label types next to each other rather than stacked.
    histogram.update_layout(barmode="group")
    return histogram


eval_predictions('Random Forest Regression', predictions_dat3)

In [13]:
import plotly.graph_objects as go

def eval_predictions2(in_model, pred_df):
    """Plot one model's predictions as a line over its true values as markers.

    Rows for ``in_model`` are sorted by the 'True' column and plotted against
    their position in that sorted order.

    Parameters
    ----------
    in_model : str
        Value of 'Model_Name' to keep.
    pred_df : pandas.DataFrame
        Wide frame with 'Model_Name', 'Predictions' and 'True' columns.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with a thin orange "Predicted values" line and blue
        "True values" markers.
    """
    # Filter, sort and number the rows in one chain; `indx` is the 0-based
    # position after sorting and serves as the x coordinate.
    model_rows = (
        pred_df[pred_df['Model_Name'] == in_model]
        .sort_values('True')
        .assign(indx=lambda frame: range(len(frame)))
    )

    out = go.Figure()
    # Add scatter points and line to the legend
    out.add_trace(go.Scatter(x=model_rows["indx"], y=model_rows["Predictions"], mode="lines", name="Predicted values",
                    line=dict(color="orange", width=0.3)))
    out.add_trace(go.Scatter(x=model_rows["indx"], y=model_rows["True"], mode="markers", name="True values",
                    marker=dict(color="blue")))

    # Model names that already end in "Regression" used to render as
    # "... Regression Regression Results"; only append the word when missing.
    title_subject = in_model if in_model.endswith("Regression") else f"{in_model} Regression"

    # Update the overall layout with title and axis labels
    out.update_layout(title=f"{title_subject} Results",
                    xaxis_title="Index",  # position of the sample after sorting by true value
                    yaxis_title="Number of Events",
                    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))  # Adjust legend position
    out.update_traces(opacity=0.9, selector=dict(name="Predicted values"))

    return out

#eval_predictions2('Gradient Boosting Regression', predictions_dat)
eval_predictions2('Random Forest Regression', predictions_dat)

In [10]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

def eval_predictions3(in_model, pred_df):
    """Scatter one model's predictions against the true values.

    A dashed line of equality (y = x) spanning the range of the true values is
    drawn for reference.

    Parameters
    ----------
    in_model : str
        Value of 'Model_Name' to keep.
    pred_df : pandas.DataFrame
        Wide frame with 'Model_Name', 'Predictions' and 'True' columns.

    Returns
    -------
    The plotly figure, with the legend hidden.
    """
    # The previously computed positional `indx` column was never used by this
    # plot, so it is no longer created.
    model_rows = pred_df[pred_df['Model_Name'] == in_model].sort_values('True')

    # Create scatter plot; the title names the model so the figure stands on
    # its own, like the other evaluation plots.
    fig = px.scatter(model_rows, x='True', y='Predictions', title=f'{in_model}: Actual vs. Predicted',
                 labels={'True': 'Actual Values', 'Predictions': 'Predicted Values'},
                 hover_name=model_rows.index)

    # Add line of equality
    true_min, true_max = model_rows['True'].min(), model_rows['True'].max()
    fig.add_shape(type='line', line=dict(dash='dash'), x0=true_min, x1=true_max,
                y0=true_min, y1=true_max)

    # Update layout
    fig.update_layout(showlegend=False)

    return fig

#eval_predictions3('Gradient Boosting Regression', predictions_dat)
eval_predictions3('Random Forest Regression', predictions_dat)


In [37]:
# Include 5 fold cross val
# Make a scorer and enable weighted scoring

# The scorer does not depend on the model, so it is built once. The deprecated
# `needs_proba` argument is omitted: plain `predict` output is already the
# default input to the score function. With greater_is_better=False the scorer
# reports the negated metric, which is undone when printing below.
scorer = make_scorer(weighted_mae_fun, greater_is_better=False)

# NOTE(review): these scores use raw (unrounded) predictions, whereas the
# validation metrics round predictions first, so the two are not directly
# comparable. X_train_std was also scaled with statistics from the full
# training set, so each held-out fold has influenced its own scaling.

# Iterate through the models and perform 5-fold cross-validation
for model_name, model in models.items():
    print(f"Model: {model_name}")

    # Perform cross-validation
    scores = cross_val_score(model, X_train_std, training_data['events'], cv=5, scoring=scorer)

    # Display the cross-validation results
    print(f"Mean weighted MAE score for {model_name}:", np.mean(-scores))  # Convert back to positive values

Feature importances for Linear Regression:
Model_Latitude 7320: -2840999745355.6465
Model_Latitude 9520: -2692373572086.736
Model_Latitude 7300: -1240807220635.0552
Model_Latitude 9510: -1239288993019.3174
Model_Precision Tower 3430: -730801888629.5428
Feature importances for Lasso Regression:
CaseType_Others: -0.0
Has64BitMacro: 0.0
PowerPointx64_addin_filesize: 0.0
PowerPointx86_addin_filesize: 0.0
Publisherx86_addin_filesize: 0.0
Feature importances for Ridge Regression:
Wordx64_addin_filesize: 2.4280224727906763
PowerPointx86_addin_filesize: 2.2507945249404524
PowerPointx64_addin_filesize: -1.787368497143452
has_bloomberg_add: -1.5581349386432264
Wordx86_addin_filesize: -0.8263435928067453
Feature importances for Elastic Net Regression:
CaseType_Others: -0.0
Has64BitMacro: 0.0
PowerPointx64_addin_filesize: 0.0
PowerPointx86_addin_filesize: 0.0
Publisherx86_addin_filesize: 0.0
Feature importances for Decision Tree Regression:
Days Since Creation: 0.09071501052830011
FreeSpace_GB: 0.

Unnamed: 0,Model,Val MSE,Val RMSE,Val MAE,CV MSE,CV RMSE,CV MAE
0,Dummy Mean,2.161816,1.470312,1.340526,3.702727,1.924247,1.424596
1,Dummy Median,1.782354,1.335048,1.189903,4.162712,2.040272,1.575895
2,Dummy Quantile,0.8577485,0.9261471,0.6982651,5.875204,2.423882,1.775769
3,Linear Regression,2.078174e+22,144158700000.0,3692737000.0,2.884565e+26,16984010000000.0,213291000000.0
4,Lasso Regression,2.161816,1.470312,1.340526,3.702727,1.924247,1.424596
5,Ridge Regression,1.627891,1.275888,1.069436,3.197337,1.78811,1.252794
6,Elastic Net Regression,2.161816,1.470312,1.340526,3.702727,1.924247,1.424596
7,Decision Tree Regression,3.250391,1.802884,1.127852,4.861492,2.204879,1.501395
8,Random Forest Regression,1.419796,1.191552,0.9595512,2.669542,1.633873,1.177481
9,Gradient Boosting Regression,1.423983,1.193308,0.9991107,2.8309,1.682528,1.195353


In [16]:
# First 20 feature-importance rows recorded for the random forest.
feat_import_df.loc[feat_import_df['model'].eq('Random Forest Regression')].head(20)

Unnamed: 0,model,feature,importance
0,Random Forest Regression,Days Since Creation,0.086645
1,Random Forest Regression,FreeSpace_GB,0.074325
2,Random Forest Regression,avg_software_age,0.070524
3,Random Forest Regression,has_cap_iq_add,0.068236
4,Random Forest Regression,num_installed_programs,0.056337
5,Random Forest Regression,Days Since Last Logon,0.053267
6,Random Forest Regression,Outlookx86_addin_filesize,0.053131
7,Random Forest Regression,BIOSReleaseAge,0.046339
8,Random Forest Regression,InstallAge,0.045525
9,Random Forest Regression,Outlookx64_addin_filesize,0.040533
