In [1]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, mean_poisson_deviance, 
    brier_score_loss, roc_auc_score, roc_curve, RocCurveDisplay
)

import pickle
from datetime import datetime

import wandb
# Authenticate this session with Weights & Biases before any run is created.
# NOTE(review): relogin=True presumably forces a fresh credential prompt on
# every execution — confirm that is intended for unattended "Run All" reruns.
wandb.login(relogin = True)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
# Open the tracked W&B run. Everything under `config` is read back through
# run.config by the cells below (estimator hyperparameters plus the feature
# and target column names), so this is the single place to change them.
run = wandb.init(
    project="claims_modeling",
    group='demo',
    name=f'W&B Tutorial - {datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}',
    notes="Just Playing Around!",
    tags=["glm"],
    save_code=True,
    config=dict(
        alpha=0.1,
        fit_intercept=False,
        solver='lbfgs',
        x=['credit_score', 'annual_mileage'],
        y='vehicle_claim_cnt_pd_0',
    ),
)

[34m[1mwandb[0m: Currently logged in as: [33mtylerrosacker2022[0m ([33mmsds_498_claims_modeling[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Build Datasets and Feature Prep

In [4]:
# Declare the dataset artifact as an input of this run and fetch it locally.
datas = run.use_artifact('msds_498_claims_modeling/claims_modeling/sythetic_data:v5')
directory = datas.download(root = 'datasets')

# Read each split from the directory the artifact was actually downloaded to,
# so the read location cannot drift from the `root` passed to download().
train_df = pd.read_parquet(f'{directory}/split=train')
test_df = pd.read_parquet(f'{directory}/split=test')
val_df = pd.read_parquet(f'{directory}/split=validation')

# Feature and target column names are taken from the run config so the logged
# configuration always matches the columns the model is trained on.
feature_cols = run.config['x']
target_col = run.config['y']

train_x = train_df[feature_cols]
test_x = test_df[feature_cols]
val_x = val_df[feature_cols]

train_y = train_df[target_col]
test_y = test_df[target_col]
val_y = val_df[target_col]


[34m[1mwandb[0m: Downloading large artifact sythetic_data:v5, 153.76MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.3


# Build Model Object

In [5]:
# Build the estimator from the hyperparameters recorded in the run config,
# forwarding each one under the keyword of the same name.
model = PoissonRegressor(
    **{param: run.config[param] for param in ('alpha', 'fit_intercept', 'solver')}
)

# Train Model

In [6]:
# Fit on the training split only; test/validation are kept for evaluation.
# Left as the cell's last expression so the fitted estimator is displayed.
model.fit(train_x, train_y)

# Validation

In [7]:
# Score all three splits with the fitted model, in train, test, val order.
train_pred, test_pred, val_pred = (
    model.predict(split_x) for split_x in (train_x, test_x, val_x)
)

In [8]:
def log_stats(dataset_name, prediction, truth):
  """Compute evaluation metrics for one data split and log them to W&B.

  The predictions are scored twice: directly against the observed target
  (MSE, MAE, mean Poisson deviance) and as a binary "target greater than
  zero" classifier (Brier loss, ROC AUC, ROC curve), using
  P(N > 0) = 1 - exp(-rate) as the predicted probability.

  Args:
    dataset_name: Prefix for every logged metric key (e.g. 'train').
    prediction: Array-like of predicted Poisson rates, one per observation.
    truth: Array-like of observed target values aligned with `prediction`
      (a pandas Series, NumPy array or plain sequence).

  Returns:
    None. All values are sent to W&B in a single wandb.log call.
  """
  # Normalise both inputs to arrays so the function works for any array-like,
  # not only a pandas Series / ndarray pair.
  prediction = np.asarray(prediction, dtype=float)
  truth = np.asarray(truth)

  # -expm1(-x) equals 1 - exp(-x) but keeps precision when the rate is tiny;
  # the clip guards against values drifting outside [0, 1].
  predicted_p_gt_0 = np.clip(-np.expm1(-prediction), a_min = 0, a_max = 1)
  # Binary target matching the probability above: 1 when the target is > 0.
  truth_gt_0 = (truth > 0).astype(int)

  fpr, tpr, _ = roc_curve(truth_gt_0, predicted_p_gt_0)
  roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

  metrics = {
      f"{dataset_name}_prediction_dist": wandb.Histogram(prediction),
      f"{dataset_name}_mse": mean_squared_error(truth, prediction),
      f"{dataset_name}_mae": mean_absolute_error(truth, prediction),
      f"{dataset_name}_mean_poisson_deviance": mean_poisson_deviance(truth, prediction),
      f"{dataset_name}_brier_loss": brier_score_loss(truth_gt_0, predicted_p_gt_0),
      f"{dataset_name}_auc_score": roc_auc_score(truth_gt_0, predicted_p_gt_0),
      f"{dataset_name}_roc": roc_display.figure_
    }
  wandb.log(metrics)

# Log the same metric set for every split, in train, test, val order.
for split_name, split_pred, split_truth in (
    ('train', train_pred, train_y),
    ('test', test_pred, test_y),
    ('val', val_pred, val_y),
):
  log_stats(split_name, split_pred, split_truth)

# Save Model and Close Out

In [9]:
# Serialize the fitted model. The context manager guarantees the file is
# flushed and closed before it is handed to W&B for upload, instead of
# relying on garbage collection of an anonymous file handle.
with open('model.plk', 'wb') as model_file:
  pickle.dump(model, model_file)

# Attach the pickle to the active run; kept as the cell's last expression so
# the saved path is still displayed as the cell output.
wandb.save('model.plk')

['/content/wandb/run-20230429_134339-6yhy9yqw/files/model.plk']

In [10]:
# End the W&B run; the captured output below shows the remaining files being
# uploaded followed by the run's metric summary.
wandb.finish()

VBox(children=(Label(value='0.618 MB of 0.844 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.731838…

0,1
test_auc_score,▁
test_brier_loss,▁
test_mae,▁
test_mean_poisson_deviance,▁
test_mse,▁
train_auc_score,▁
train_brier_loss,▁
train_mae,▁
train_mean_poisson_deviance,▁
train_mse,▁

0,1
test_auc_score,0.56767
test_brier_loss,0.07167
test_mae,0.15008
test_mean_poisson_deviance,0.41509
test_mse,0.08036
train_auc_score,0.56752
train_brier_loss,0.07093
train_mae,0.14957
train_mean_poisson_deviance,0.41403
train_mse,0.08058
