In [1]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, mean_poisson_deviance, 
    brier_score_loss, roc_auc_score, roc_curve, RocCurveDisplay
)

import pickle
from datetime import datetime

import wandb
# Authenticate with Weights & Biases. relogin=True requests a fresh credential
# prompt instead of silently reusing a previously stored API key.
wandb.login(relogin = True)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
# Open a tracked W&B run. Hyperparameters plus the feature ('x') and target ('y')
# column names are stored in `config`; later cells read them back via `run.config`.
run = wandb.init(
    project="claims_modeling",
    group='demo',
    name=f'W&B Tutorial - {datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}',
    notes="Just Playing Around!",
    tags=["gbm"],
    save_code=True,
    config={
        "n_estimators": 500,
        "learning_rate": 0.05,
        "max_depth": 3,
        "subsample": 0.5,
        "n_iter_no_change": 20,
        "max_features": 'log2',
        # Model input columns, in the order they are fed to the estimator.
        'x': [
            'vehicle_age',
            'annual_mileage',
            'max_driver_age',
            'min_driver_age',
            'mean_driver_age',
            'min_driver_tenure',
            'youthful_driver_count',
            'credit_score',
            'household_tenure',
            'multiline_houses',
            'multiline_personal_article_policy',
            'multiline_personal_liability_umbrella',
            'multiline_rental',
            'vehicle_count',
            'vehicle_claim_time_since_all',
            'driver_count',
            'coverage_bi',
            'coverage_coll',
            'coverage_comp',
            'coverage_ers',
            'coverage_mpc',
            'coverage_pd',
            'coverage_ubi',
            'vehicle_type',
            'garaging_location',
        ],
        # Target column.
        'y': 'vehicle_claim_cnt_pd_0',
    },
)

#

[34m[1mwandb[0m: Currently logged in as: [33mtylerrosacker2022[0m ([33mmsds_498_claims_modeling[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Build Datasets and Feature Prep

In [4]:
# Declare the v5 dataset artifact as an input of this run and download it into ./datasets.
datas = run.use_artifact('msds_498_claims_modeling/claims_modeling/sythetic_data:v5')
directory = datas.download(root='datasets')

# Read the three splits in a fixed order: train, test, validation.
train_df, test_df, val_df = (
    pd.read_parquet(split_path)
    for split_path in (
        'datasets/split=train',
        'datasets/split=test',
        'datasets/split=validation',
    )
)

[34m[1mwandb[0m: Downloading large artifact sythetic_data:v5, 153.76MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.5


In [9]:
# Declare the v5 dataset artifact as an input of this run and download it into ./datasets.
datas = run.use_artifact('msds_498_claims_modeling/claims_modeling/sythetic_data:v5')
directory = datas.download(root='datasets')

# Integer codes substituted for the labels found in the two listed columns.
replace_vals = {
    'vehicle_type': {'van': 1, 'sports car': 2, 'pickup': 3, 'sedan': 4, 'suv': 5},
    'garaging_location': {'country': 1, 'downtown': 2, 'suburb': 3},
}

# Read each split (train, test, validation) and apply the label -> code substitution.
train_df, test_df, val_df = (
    pd.read_parquet(split_path).replace(replace_vals)
    for split_path in (
        'datasets/split=train',
        'datasets/split=test',
        'datasets/split=validation',
    )
)

# Feature matrices: the columns named in the run config under 'x'.
train_x, test_x, val_x = (
    frame[run.config['x']] for frame in (train_df, test_df, val_df)
)

# Targets: the single column named in the run config under 'y'.
train_y, test_y, val_y = (
    frame[run.config['y']] for frame in (train_df, test_df, val_df)
)



[34m[1mwandb[0m: Downloading large artifact sythetic_data:v5, 153.76MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.4


# Build Model Object

In [12]:
# Seed for the estimator. With early stopping active, a random validation_fraction
# of the training rows is held out, so without a fixed seed the fitted model
# differs from run to run.
SEED = 42
# Record the seed alongside the other hyperparameters of this run.
run.config.update({'random_state': SEED}, allow_val_change=True)

# Poisson-loss gradient boosting configured from the run config.
# NOTE: config['subsample'] and config['max_features'] are logged with the run
# but are not passed to the estimator here.
model = HistGradientBoostingRegressor(
    max_iter=run.config['n_estimators'],
    learning_rate=run.config['learning_rate'],
    max_depth=run.config['max_depth'],
    validation_fraction=0.1,
    n_iter_no_change=run.config['n_iter_no_change'],
    loss="poisson",
    # Only flag the categorical columns that are actually among the selected features.
    categorical_features=[x for x in run.config['x'] if x in ['vehicle_type', 'garaging_location']],
    random_state=SEED,
    verbose=1)

# Train Model

In [13]:
# Fit on the training split; per-iteration progress is printed because the
# estimator was built with verbose=1.
model.fit(train_x, train_y)

Binning 0.040 GB of training data: 0.486 s
Binning 0.004 GB of validation data: 0.015 s
Fitting gradient boosted rounds:
[1/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.28133, val loss: 0.28013, in 0.036s
[2/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.28089, val loss: 0.27974, in 0.033s
[3/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.28048, val loss: 0.27940, in 0.035s
[4/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.28012, val loss: 0.27908, in 0.044s
[5/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.27979, val loss: 0.27882, in 0.031s
[6/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.27946, val loss: 0.27855, in 0.036s
[7/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.27918, val loss: 0.27829, in 0.042s
[8/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.27890, val loss: 0.27806, in 0.045s
[9/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.27866, val loss: 0.27788, in 0.033s
[10/500] 1 tree, 8 leaves, max depth = 3, train loss: 0.2784

# Validation

In [14]:
# Score every split with the fitted model, in the order train, test, validation.
train_pred, test_pred, val_pred = (
    model.predict(features) for features in (train_x, test_x, val_x)
)

In [15]:
def log_stats(dataset_name, prediction, truth):
  """Compute evaluation metrics for one data split and log them to W&B.

  Args:
    dataset_name: Prefix for every logged key (for example 'train').
    prediction: Model predictions for the split. They are floored at 0.001
      before any metric is computed.
    truth: Observed target values for the split.

  Two families of metrics are logged in a single ``wandb.log`` call:
    * regression metrics (MSE, MAE, mean Poisson deviance) on the floored
      predictions against the raw truth, plus a histogram of the predictions;
    * classification-style metrics (Brier loss, ROC AUC, ROC curve figure) on
      ``1 - exp(-prediction)`` against the truth clipped to [0, 1]. Under a
      Poisson assumption that expression is the probability of a non-zero count.
  """
  # Floor the predictions so that every metric below receives strictly positive values.
  floored_pred = np.clip(prediction, 0.001, np.inf)
  # Probability-like score in [0, 1] derived from the floored prediction.
  prob_nonzero = np.clip(1 - np.exp(-floored_pred), 0, 1)
  # Truth limited to the [0, 1] range for the classification-style metrics.
  truth_clipped = np.clip(truth, 0, 1)

  false_pos_rate, true_pos_rate, _ = roc_curve(truth_clipped, prob_nonzero)
  roc_plot = RocCurveDisplay(fpr=false_pos_rate, tpr=true_pos_rate).plot()

  wandb.log({
      f"{dataset_name}_prediction_dist": wandb.Histogram(floored_pred),
      f"{dataset_name}_mse": mean_squared_error(truth, floored_pred),
      f"{dataset_name}_mae": mean_absolute_error(truth, floored_pred),
      f"{dataset_name}_mean_poisson_deviance": mean_poisson_deviance(truth, floored_pred),
      f"{dataset_name}_brier_loss": brier_score_loss(truth_clipped, prob_nonzero),
      f"{dataset_name}_auc_score": roc_auc_score(truth_clipped, prob_nonzero),
      f"{dataset_name}_roc": roc_plot.figure_,
  })
  

# Log the same set of metrics for each split, in the order train, test, val.
for split_name, split_pred, split_truth in (
    ('train', train_pred, train_y),
    ('test', test_pred, test_y),
    ('val', val_pred, val_y),
):
  log_stats(split_name, split_pred, split_truth)

# Save Model and Close Out

In [16]:
# Serialize the fitted model. The context manager guarantees the file is flushed
# and closed before W&B is asked to upload it.
# NOTE(review): the '.plk' extension looks like a typo for '.pkl'; it is kept
# unchanged here because the uploaded file name may be relied on elsewhere.
with open('model.plk', 'wb') as model_file:
    pickle.dump(model, model_file)
wandb.save('model.plk')

['/content/wandb/run-20230514_001119-g3bcfy74/files/model.plk']

In [17]:
# Close out the active W&B run; the run summary is printed as the cell output.
wandb.finish()

0,1
test_auc_score,▁
test_brier_loss,▁
test_mae,▁
test_mean_poisson_deviance,▁
test_mse,▁
train_auc_score,▁
train_brier_loss,▁
train_mae,▁
train_mean_poisson_deviance,▁
train_mse,▁

0,1
test_auc_score,0.61301
test_brier_loss,0.07055
test_mae,0.14606
test_mean_poisson_deviance,0.40136
test_mse,0.07898
train_auc_score,0.62635
train_brier_loss,0.06963
train_mae,0.14534
train_mean_poisson_deviance,0.397
train_mse,0.07893
