In [1]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.21.1-py2.py3-none-any.whl (201 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.7/201.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 k

In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow.keras as ks

from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, mean_poisson_deviance, 
    brier_score_loss, roc_auc_score, roc_curve, RocCurveDisplay
)

import pickle
from datetime import datetime

import wandb
# Authenticate with Weights & Biases; relogin=True asks for the API key again
# instead of silently reusing one that is already stored.
wandb.login(relogin=True)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Prep Data Once

In [3]:
def filter_nested_array(x, keep):
  """Project every record in ``x`` onto the fields listed in ``keep``.

  Args:
    x: Iterable of records, each indexable by the names in ``keep``.
    keep: Ordered field names to pull out of every record.

  Returns:
    Object-dtype NumPy array with one row per record and one column per
    entry of ``keep``. An empty ``x`` yields shape ``(0, len(keep))`` so
    that callers can still slice columns.

  Raises:
    KeyError: If a record is a dict that lacks one of the ``keep`` fields.
  """
  rows = [[record[field] for field in keep] for record in x]

  if not rows:
    # np.array([]) would be 1-D with shape (0,), which breaks `arr[:, i]`
    return np.empty((0, len(keep)), dtype = 'object')

  return np.array(rows, dtype = 'object')

# Integer codes for the known vehicle types; any other value encodes as 0.
VEHICLE_TYPE_CODES = {'van': 1, 'sedan': 2, 'sports car': 3, 'suv': 4}

def _encode_vehicle_type(vehicle_type):
  """Translate a vehicle type (one string or an array of them) into integer code(s)."""
  # np.where handles both the scalar and the array case, and at most one term
  # is non-zero for any value, so summing the terms yields the matching code.
  return sum(np.where(vehicle_type == name, code, 0)
             for name, code in VEHICLE_TYPE_CODES.items())

def prepare_row(row):
  """Convert one raw policy record into the tensors consumed by the network.

  Args:
    row: Mapping-like record (e.g. a DataFrame row) providing the nested
      fields 'other_claims', 'vehicle_claims', 'driver_info' and
      'household_vehicles_info', the scalar fields read when building
      ``other`` below, and the label 'vehicle_claim_cnt_pd_0'.

  Returns:
    dict with keys:
      'driver_info':  ragged float16 tensor holding one group of 3-wide rows.
      'vehicle_info': ragged float16 tensor holding one group of 4-wide rows.
      'claims_info':  ragged float16 tensor holding one group of 15-wide rows
                      (14 indicator fields plus a trailing flag that is 1 for
                      rows taken from 'vehicle_claims' and 0 for rows taken
                      from 'other_claims').
      'other_data':   float16 tensor with the 12 scalar features.
      'target':       row['vehicle_claim_cnt_pd_0'], passed through untouched.
  """
  # Figure out Claims
  claims_keep_key = ['bi_ind', 'coll_ind', 'comp_ind', 'ers_ind', 'mpc_ind', 'pd_ind', 'ubi_ind',
                     'veh_had_bi_cov_ind', 'veh_had_coll_cov_ind', 'veh_had_comp_cov_ind', 'veh_had_ers_cov_ind',
                     'veh_had_mpc_cov_ind', 'veh_had_pd_cov_ind', 'veh_had_ubi_cov_ind']
  # Kept fields plus the trailing "claim belongs to this vehicle" flag column
  claim_width = len(claims_keep_key) + 1

  other_claims = filter_nested_array(row['other_claims'], claims_keep_key)
  other_claim_cnt = len(other_claims)
  if other_claim_cnt > 0:
    # Flag 0: claim recorded under 'other_claims'
    other_claims = np.append(other_claims, np.zeros([other_claim_cnt, 1]), 1)

  veh_claims = filter_nested_array(row['vehicle_claims'], claims_keep_key)
  claim_cnt = len(veh_claims)
  if claim_cnt > 0:
    # Flag 1: claim recorded under 'vehicle_claims'
    veh_claims = np.append(veh_claims, np.ones([claim_cnt, 1]), 1)

  if claim_cnt + other_claim_cnt == 0:
    # No claims at all: emit a single all-zero placeholder row
    all_claims = np.zeros([1, claim_width])
  elif claim_cnt == 0:
    all_claims = other_claims
  elif other_claim_cnt == 0:
    all_claims = veh_claims
  else:
    all_claims = np.append(veh_claims, other_claims, axis = 0)

  all_claims = tf.ragged.constant([all_claims.astype('float16')], ragged_rank = 1, inner_shape = (claim_width,))

  # Figure Out Drivers
  drivers = filter_nested_array(row['driver_info'], ['driver_age', 'driver_gender', 'driver_tenure'])
  drivers[:, 0] = (drivers[:, 0] - 50)/50
  drivers[:, 1] = np.where(drivers[:, 1] == 'm', 1, 0)   # 'm' -> 1, anything else -> 0
  drivers[:, 2] = (drivers[:, 2] - 10)/10
  drivers = tf.ragged.constant([drivers.astype('float16')], ragged_rank = 1, inner_shape = (3,))

  # Figure out Vehicles
  vehicles = filter_nested_array(row['household_vehicles_info'], ['this_vehicle_ind', 'vehicle_age', 'vehicle_type', 'vehicle_years_owned'])
  vehicles[:, 1] = (vehicles[:, 1] - 15)/15
  vehicles[:, 3] = (vehicles[:, 3] - 15)/15
  vehicles[:, 2] = _encode_vehicle_type(vehicles[:, 2])
  vehicles = tf.ragged.constant([vehicles.astype('float16')], ragged_rank = 1, inner_shape = (4,))

  # Scalar features, each shifted and scaled by fixed constants
  other = [(row['credit_score'] - 600)/500,
           # 'country' -> 1, 'downtown' -> 2, anything else -> 0
           (row['garaging_location'] == 'country') * 1 + (row['garaging_location'] == 'downtown') * 2,
           (row['household_tenure'] - 15)/10,
           (row['multiline_houses']/2),
           row['multiline_rental'],
           row['multiline_personal_article_policy'],
           row['multiline_personal_liability_umbrella'],
           (row['vehicle_count']-3)/3,
           (row['annual_mileage'] - 10000)/10000,
           (row['vehicle_age'] - 15)/15,
           _encode_vehicle_type(row['vehicle_type']),
           # NOTE(review): centred on 10 here but on 15 for the per-vehicle
           # 'vehicle_years_owned' column above -- confirm which is intended.
           (row['vehicle_years_owned'] - 10)/15
           ]
  other = tf.constant(value = np.array(other, dtype = 'float16'))

  result = {
      'driver_info': drivers,
      'vehicle_info': vehicles,
      'claims_info': all_claims,
      'other_data': other,
      'target': row['vehicle_claim_cnt_pd_0']
  }

  return result

def ragged(y):
  """Concatenate the tensors held in ``y`` along axis 0.

  Args:
    y: Object exposing ``to_list()`` (e.g. a pandas Series) whose elements
      are tensors accepted by ``tf.concat``.

  Returns:
    The result of ``tf.concat`` over all elements of ``y``.
  """
  return tf.concat(y.to_list(), axis = 0)

def prep_one_datas(features):
  """Turn a frame of prepared rows into the ``(x, y)`` pair given to Keras.

  Args:
    features: DataFrame with the columns 'driver_info', 'vehicle_info',
      'claims_info', 'other_data' and 'target'.

  Returns:
    Tuple ``(x, y)``: ``x`` is a list of four tensors in the order
    driver, vehicle, claims, other; ``y`` is the target tensor.
  """
  # The three variable-length inputs are each concatenated by ragged()
  variable_length_columns = ('driver_info', 'vehicle_info', 'claims_info')
  x = [ragged(features[column]) for column in variable_length_columns]

  # The fixed-width feature vector is converted to a tensor directly
  x.append(tf.convert_to_tensor(features['other_data'].to_list()))

  y = tf.convert_to_tensor(features['target'].values)

  return x, y

In [4]:
def load_split_features(directory, split):
  """Read one parquet split under ``directory`` and run ``prepare_row`` on each row.

  Args:
    directory: Local folder containing the ``split=<name>`` sub-folders.
    split: Split name, e.g. 'train', 'test' or 'validation'.

  Returns:
    DataFrame with the ``prepare_row`` result dict expanded into columns.
  """
  return pd.read_parquet(f'{directory}/split={split}') \
    .apply(prepare_row, axis = 1, result_type = 'expand')

# Download the dataset artifact inside a tracked W&B run and build the
# train / test / validation tensors once; later cells reuse them as globals.
with wandb.init(
      project="claims_modeling",
      group = 'Data Prep',
      name = f'Data Prep for NN - {datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}',
      notes="Prep for NN Training, No Model Build!",
      tags=["data"]) as run:
    datas = run.use_artifact('msds_498_claims_modeling/claims_modeling/sythetic_data:v5')
    # download() returns the local path holding the artifact contents; read
    # the splits from there instead of hard-coding the folder a second time.
    directory = datas.download(root = 'datasets')

    train_features = load_split_features(directory, 'train')
    test_features = load_split_features(directory, 'test')
    val_features = load_split_features(directory, 'validation')

    train_x, train_y = prep_one_datas(train_features)
    test_x, test_y = prep_one_datas(test_features)
    val_x, val_y = prep_one_datas(val_features)

[34m[1mwandb[0m: Currently logged in as: [33mtylerrosacker2022[0m ([33mmsds_498_claims_modeling[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact sythetic_data:v5, 153.76MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:7.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Model Training Code

In [5]:
def build_model(run):
  """Assemble and compile the claim-count network described by ``run.config``.

  Three variable-length inputs (drivers, vehicles, claims) each pass through
  their own Dense layer and are summed over axis 1. The sums are concatenated
  with the 12 fixed features, batch-normalised, and fed through two
  Dense+Dropout stages whose outputs are added together. A single-unit Dense
  layer with an exponential activation produces the output, and the model is
  compiled with AdamW and a Poisson loss.

  Args:
    run: Object whose ``config`` supports item lookup for 'driver_dense',
      'veh_dense', 'claim_dense', 'relu_leakiness', 'dense_res_block_width',
      'dropout', 'learning_rate' and 'weight_decay'.

  Returns:
    The compiled Keras model.
  """
  cfg = run.config

  def leaky_relu():
    # Each Dense layer receives its own LeakyReLU instance
    return tf.keras.layers.LeakyReLU(alpha=cfg['relu_leakiness'])

  def summed_branch(input_name, n_features, units_key, dense_name):
    # Input of shape (None, n_features) -> Dense -> sum over axis 1
    branch_input = ks.Input(shape = (None, n_features), name = input_name)
    projected = tf.keras.layers.Dense(cfg[units_key],
                                      activation = leaky_relu(),
                                      name = dense_name
                                      )(branch_input)
    return branch_input, tf.math.reduce_sum(projected, 1)

  # Branches are built in driver, vehicle, claims order
  driver_input, driver_agged = summed_branch('driver_info', 3, 'driver_dense', "Driver_Info_Dense")
  vehicle_input, vehicle_agged = summed_branch('vehicle_info', 4, 'veh_dense', "Vehicle_Info_Dense")
  claims_input, claims_agged = summed_branch('claims_info', 15, 'claim_dense', "Claim_Info_Dense")

  # Fixed-width scalar features
  other_input = ks.Input(shape = (12,), name = 'other_data')

  merged = tf.keras.layers.Concatenate()([driver_agged, vehicle_agged, claims_agged, other_input])
  merged_norm = tf.keras.layers.BatchNormalization()(merged)

  # First Dense + Dropout stage
  hidden_1 = tf.keras.layers.Dense(cfg['dense_res_block_width'],
                                   activation = leaky_relu(),
                                   name = "Res_Layer_1"
                                   )(merged_norm)
  hidden_1_dropped = tf.keras.layers.Dropout(cfg['dropout'])(hidden_1)

  # Second Dense + Dropout stage, fed by the first
  hidden_2 = tf.keras.layers.Dense(cfg['dense_res_block_width'],
                                   activation = leaky_relu(),
                                   name = "Res_Layer_2"
                                   )(hidden_1_dropped)
  hidden_2_dropped = tf.keras.layers.Dropout(cfg['dropout'])(hidden_2)

  # Skip connection: add the outputs of both stages
  residual = tf.keras.layers.Add(name = "Combined_Res_Result")([hidden_1_dropped, hidden_2_dropped])

  # Exponential activation keeps the output positive for the Poisson loss
  output = tf.keras.layers.Dense(1,
                                 activation = tf.keras.activations.exponential,
                                 name = 'target')(residual)

  model = ks.Model(inputs = [driver_input, vehicle_input, claims_input, other_input],
                   outputs = [output])

  optimizer = tf.keras.optimizers.experimental.AdamW(
      learning_rate = cfg['learning_rate'],
      weight_decay = cfg['weight_decay']
  )
  model.compile(optimizer = optimizer, loss = tf.keras.losses.Poisson())

  return model

In [6]:
def log_stats(dataset_name, prediction, truth):
  """Compute evaluation metrics for one dataset split and log them to W&B.

  Args:
    dataset_name: Prefix for every logged metric key (e.g. 'train').
    prediction: Predicted claim counts; any array-like, flattened to 1-D.
    truth: Observed claim counts; any array-like (including a tensor),
      flattened to 1-D.

  Logs, under ``<dataset_name>_*`` keys: a histogram of the clipped
  predictions, MSE, MAE, mean Poisson deviance, Brier loss, ROC AUC and the
  ROC curve figure. The last three treat the problem as "at least one
  claim" versus "no claim".
  """
  # Predictions from a single-unit output layer arrive as (n, 1) while the
  # targets are (n,); flatten both so every metric sees plain 1-D arrays.
  prediction = np.asarray(prediction).ravel()
  truth = np.asarray(truth).ravel()

  # Poisson deviance needs strictly positive predictions
  prediction = np.clip(prediction, a_min = 0.001, a_max = np.inf)
  # Reading the prediction as a Poisson rate: P(count > 0) = 1 - exp(-rate)
  predicted_p_gt_0 = np.clip(1 - np.exp(-prediction), a_min = 0, a_max = 1)
  # Collapse counts to a binary "had any claim" indicator
  truth_capped = np.clip(truth, a_min = 0, a_max = 1)

  fpr, tpr, _ = roc_curve(truth_capped, predicted_p_gt_0)
  # NOTE(review): the figure created by plot() is not closed here; if many
  # runs execute in one kernel, consider closing it after logging.
  roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

  metrics = {
      f"{dataset_name}_prediction_dist": wandb.Histogram(prediction),
      f"{dataset_name}_mse": mean_squared_error(truth, prediction),
      f"{dataset_name}_mae": mean_absolute_error(truth, prediction),
      f"{dataset_name}_mean_poisson_deviance": mean_poisson_deviance(truth, prediction),
      f"{dataset_name}_brier_loss": brier_score_loss(truth_capped, predicted_p_gt_0),
      f"{dataset_name}_auc_score": roc_auc_score(truth_capped, predicted_p_gt_0),
      f"{dataset_name}_roc": roc_display.figure_
    }
  wandb.log(metrics)
  


In [7]:
def main(config = None):
  """Train and evaluate one model inside its own W&B run.

  Builds the model from the run's config, logs an architecture diagram as an
  artifact, fits on the module-level ``train_x``/``train_y`` tensors, logs
  metrics for the train, test and validation splits, and saves the model.

  Args:
    config: Optional hyperparameter dict handed to ``wandb.init``. It is left
      as None when this function is invoked by ``wandb.agent``.
  """
  run_name = f'NN Train - {datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'

  with wandb.init(
      project="claims_modeling",
      group = 'NN Template V1',
      name = run_name,
      notes="Architecture 1 NN",
      tags=["nn"],
      save_code = True,
      sync_tensorboard=True,
      config=config) as run:
    # Register the dataset artifact as used by this run; the returned handle
    # is not needed because the tensors were prepared in an earlier cell.
    run.use_artifact('msds_498_claims_modeling/claims_modeling/sythetic_data:v5')
    model = build_model(run)

    # Render the architecture to model.png and attach it to the run
    tf.keras.utils.plot_model(
      model,
      to_file='model.png',
      show_shapes=True,
      show_layer_names=True,
      show_layer_activations=True,
      show_trainable=True
    )
    arch_artifact = wandb.Artifact(name='model_arch_graph', type='image')
    arch_artifact.add_file(local_path='model.png')
    run.log_artifact(arch_artifact)

    # NOTE(review): the test split is what gets monitored during fitting, and
    # the validation split is only scored afterwards -- confirm this is intended.
    model.fit(train_x,
              train_y,
              epochs = run.config['epochs'],
              batch_size = run.config['batch_size'],
              validation_data=(test_x, test_y),
              callbacks=[tf.keras.callbacks.TensorBoard(histogram_freq=1)])

    # Score every split first, then log the metrics split by split
    splits = (
        ('train', train_x, train_y),
        ('test', test_x, test_y),
        ('val', val_x, val_y),
    )
    predictions = [model.predict(inputs, batch_size = 1024) for _, inputs, _ in splits]
    for (split_name, _, targets), split_pred in zip(splits, predictions):
      log_stats(split_name, split_pred, targets)

    model.save('model')
    # NOTE(review): 'model' may be a directory rather than a single file;
    # confirm wandb.save uploads its contents as expected.
    wandb.save('model')

# Hyperparameter Sweep

In [None]:
# Example: to train one hand-picked configuration without the sweep,
# uncomment the call below.
# main(config = {
#         "epochs": 100,
#         "learning_rate": 1e-3,
#         "weight_decay": 4e-3,
#         "relu_leakiness": 0.01,
#         "driver_dense": 4,
#         "veh_dense": 5,
#         "claim_dense": 10,
#         "dense_res_block_width": 25,
#         "dropout": 0.1,
#         "batch_size": 1024
#   })

# Attach this kernel to the existing sweep and run up to 15 trials, each one
# executed by main() with the hyperparameters chosen by the sweep.
wandb.agent(
    entity = "msds_498_claims_modeling",
    project = "claims_modeling",
    sweep_id = "koeqejgd",
    function = main,
    count = 15,
)

[34m[1mwandb[0m: Agent Starting Run: oyc5fksr with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	claim_dense: 10
[34m[1mwandb[0m: 	dense_res_block_width: 43
[34m[1mwandb[0m: 	driver_dense: 4
[34m[1mwandb[0m: 	dropout: 0.29374955518520646
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	learning_rate: 0.0018974163167356484
[34m[1mwandb[0m: 	relu_leakiness: 0.019171223690129184
[34m[1mwandb[0m: 	veh_dense: 3
[34m[1mwandb[0m: 	weight_decay: 1.2496176548696622e-05




Epoch 1/25



Epoch 2/25
Epoch 3/25
Epoch 4/25