<a href="https://colab.research.google.com/github/owensappington2/msds_498_insurance_loss_modeling/blob/main/W%26B_GBM_with_Grid_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.23.1-py2.py3-none-any.whl (205 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.1/205.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, mean_poisson_deviance, 
    brier_score_loss, roc_auc_score, roc_curve, RocCurveDisplay
)

import pickle
from datetime import datetime

import wandb
wandb.login(relogin = True)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
run = wandb.init(
  project="claims_modeling",
  group = 'demo',
  name = f'GBM Model - {datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}',
  notes="Parameter Tuning",
  tags=["gbm"],
  save_code = True,
  config = {
        "n_estimators": 1000,
        "learning_rate": 0.01,
        "max_depth": 5,
        "subsample": 0.5,
        "n_iter_no_change": 20,
        "min_samples_split":3,
        "min_samples_leaf":2,
        "max_features": 'log2',
        'x': ['vehicle_age', 'annual_mileage', 'max_driver_age', 'min_driver_age',\
        'mean_driver_age', 'min_driver_tenure', 'youthful_driver_count', 'credit_score',\
        'household_tenure','multiline_houses', 'multiline_personal_article_policy', \
        'multiline_personal_liability_umbrella', 'multiline_rental', 'vehicle_count', 'vehicle_claim_time_since_all', \
        'driver_count','coverage_bi','coverage_coll','coverage_comp','coverage_ers','coverage_mpc','coverage_pd','coverage_ubi', \
        'vehicle_type',  'garaging_location'
      ],
        'y': 'vehicle_claim_cnt_pd_0'
  }
)

#

[34m[1mwandb[0m: Currently logged in as: [33mowensappington2022[0m ([33mmsds_498_claims_modeling[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Build Datasets and Feature Prep

In [4]:
datas = run.use_artifact('msds_498_claims_modeling/claims_modeling/sythetic_data:v5')
directory = datas.download(root = 'datasets')

train_df = pd.read_parquet('datasets/split=train')
test_df = pd.read_parquet('datasets/split=test')
val_df = pd.read_parquet('datasets/split=validation')

[34m[1mwandb[0m: Downloading large artifact sythetic_data:v5, 153.76MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:5.5


In [5]:
datas = run.use_artifact('msds_498_claims_modeling/claims_modeling/sythetic_data:v5')
directory = datas.download(root = 'datasets')

replace_vals = {
    'vehicle_type': {'van': 1, 'sports car': 2, 'pickup': 3, 'sedan': 4, 'suv': 5},
    'garaging_location': {'country': 1, 'downtown': 2, 'suburb': 3}
    }

train_df = pd.read_parquet('datasets/split=train').replace(replace_vals)
test_df = pd.read_parquet('datasets/split=test').replace(replace_vals)
val_df = pd.read_parquet('datasets/split=validation').replace(replace_vals)

train_x = train_df[run.config['x']]
test_x = test_df[run.config['x']]
val_x = val_df[run.config['x']]

train_y = train_df[run.config['y']]
test_y = test_df[run.config['y']]
val_y = val_df[run.config['y']]


[34m[1mwandb[0m: Downloading large artifact sythetic_data:v5, 153.76MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.3


# Build Model Object

In [6]:
model = HistGradientBoostingRegressor(
    max_iter=run.config['n_estimators'],
    learning_rate=run.config['learning_rate'],
    max_depth=run.config['max_depth'],
    min_samples_leaf=run.config['min_samples_leaf'],
    validation_fraction=0.1,
    n_iter_no_change=run.config['n_iter_no_change'],
    # max_features=run.config['max_features'],
    loss="poisson",
    categorical_features = [x for x in run.config['x'] if x in ['vehicle_type', 'garaging_location']],
    verbose=1)

Grid Search

In [7]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_test1 = {'max_depth':range(1,5,1),'n_iter_no_change':range(5,20,1), 'max_iter':range(100,1000,100)}
gsearch1 = GridSearchCV(estimator = model, param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X = train_x, y = train_y)

In [None]:
gsearch1.best_params_

# Train Model

In [None]:
model.fit(X = train_x, 
          y = train_y
          )

# Validation

In [None]:
train_pred = model.predict(train_x)
test_pred = model.predict(test_x)
val_pred = model.predict(val_x)

In [None]:
def log_stats(dataset_name, prediction, truth):
  prediction = np.clip(prediction, a_min = 0.001, a_max = np.inf)
  predicted_p_gt_0 = np.clip(1 - np.exp(-prediction), a_min = 0, a_max = 1)
  truth_capped = np.clip(truth, a_min = 0, a_max = 1)

  fpr, tpr, _ = roc_curve(truth_capped, predicted_p_gt_0)
  roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
  
  metrics = {
      f"{dataset_name}_prediction_dist": wandb.Histogram(prediction),
      f"{dataset_name}_mse": mean_squared_error(truth, prediction), 
      f"{dataset_name}_mae": mean_absolute_error(truth, prediction),
      f"{dataset_name}_mean_poisson_deviance": mean_poisson_deviance(truth, prediction),
      f"{dataset_name}_brier_loss": brier_score_loss(truth_capped, predicted_p_gt_0),
      f"{dataset_name}_auc_score": roc_auc_score(truth_capped, predicted_p_gt_0),
      f"{dataset_name}_roc": roc_display.figure_
    }
  wandb.log(metrics)
  

log_stats('train', train_pred, train_y)
log_stats('test', test_pred, test_y)
log_stats('val', val_pred, val_y)

# Save Model and Close Out

In [None]:
pickle.dump(model, open('model.plk', 'wb'))
wandb.save('model.plk')

['/content/wandb/run-20230517_011940-182sigv5/files/model.plk']

In [None]:
wandb.finish()

VBox(children=(Label(value='4.047 MB of 4.047 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test_auc_score,▁
test_brier_loss,▁
test_mae,▁
test_mean_poisson_deviance,▁
test_mse,▁
train_auc_score,▁
train_brier_loss,▁
train_mae,▁
train_mean_poisson_deviance,▁
train_mse,▁

0,1
test_auc_score,0.61282
test_brier_loss,0.07058
test_mae,0.14593
test_mean_poisson_deviance,0.40161
test_mse,0.07901
train_auc_score,0.63362
train_brier_loss,0.0695
train_mae,0.14506
train_mean_poisson_deviance,0.39526
train_mse,0.07874
