# Pickled Model using ipywidgets
> This notebook loads a previously trained model and uses it to predict quote success rate, letting the user change field values. User input is collected with ipywidgets.

- toc: true 
- badges: true
- comments: true
- categories: [jupyter]
- author: Tim Cummings

In [1]:
import logging
import random

import ipywidgets as widgets
import pandas as pd
import numpy as np

from fastai.tabular.all import *
from IPython.display import display
from IPython.utils import io  # using io.capture_output
from sklearn.metrics import roc_auc_score

## Set up
Specify the folder which contains the original kaggle data (train.csv and test.csv) and the trained model (learn_0708.pkl)

In [2]:
# Logging: configure the root handler, then create this notebook's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("load_pickled_model")
# Folder holding the Kaggle CSV files and the pickled learner.
path = Path('data/homesite-quote')
# Switch off pandas' chained-assignment warning (pandas default is 'warn').
pd.options.mode.chained_assignment = None

In [3]:
# File name (inside `path`) of the previously trained, pickled learner.
trained_dl_pkl = "learn_0708.pkl"
# NOTE(review): unpickling can execute arbitrary code - only load model files from a trusted source.
learn = load_pickle(path/trained_dl_pkl)
# get_preds() is called without a dataloader; presumably it scores the learner's own
# validation data - TODO confirm against the fastai documentation.
preds, targs = learn.get_preds()
# Emitted at DEBUG level, so it only appears if the logging level is lowered to DEBUG.
# Column 1 of preds is used as the score passed to roc_auc_score.
logger.debug(f"Trained deep learning model {trained_dl_pkl} has a roc_auc_score of {roc_auc_score(to_np(targs), to_np(preds[:,1]))}")

In [4]:
# Read both Kaggle files indexed by quote number, parsing the quote date as a datetime.
df_train = pd.read_csv(path/'train.csv', index_col="QuoteNumber", parse_dates=['Original_Quote_Date'], low_memory=False)
df_test = pd.read_csv(path/'test.csv', index_col="QuoteNumber", parse_dates=['Original_Quote_Date'], low_memory=False)
# Split the known outcome off the training frame; pop() returns the column and removes it in place.
sr_conv = df_train.pop('QuoteConversion_Flag')
# One frame of independent variables for every quote (train and test), with the
# quote date expanded by fastai's add_datepart.
df = add_datepart(pd.concat([df_train, df_test]), 'Original_Quote_Date')
logger.debug(f"{df.shape} {df_train.shape} {df_test.shape} {sr_conv.shape}")
# Drop the references to the per-file frames; only df and sr_conv are used from here on.
df_train = df_test = None

## Create a sensitivity analysis tool
A field is sensitive if changing its value, with every other field left unchanged, can change the predicted outcome of the quote.

While the logging level is INFO, some messages are logged during a normal run. Setting the level to WARNING logs only when an unknown dtype is encountered. See the set-up section above to change the level.


In [5]:
def sensitivity_analysis(qn):
    """Using data from quote number qn do a sensitivity analysis on all independent variables.

    Each column of the module-level ``df`` is varied on its own while every other
    field of the quote keeps its original value. A field is reported as sensitive
    when at least one tried value changes the predicted quote conversion flag.

    Values tried per field:
      * fewer than 30 unique values, or object dtype: every unique value in ``df``;
      * int64 / float64 with 30 or more unique values: 11 evenly spaced values from
        the column minimum to the column maximum;
      * any other dtype: nothing is tried and a warning is logged.

    Parameters
    ----------
    qn : index label of the quote in ``df`` (the quote number).

    Returns
    -------
    set of str
        Names of the fields which individually changed the prediction.
    """
    # Independent variables
    ind_original = df.loc[qn]
    prd = learn.predict(ind_original)
    # Predicted quote conversion flag
    qcf_original = prd[1].item()
    # Probability that quote conversion flag is as predicted
    prb_original = prd[2][qcf_original].item()
    logger.info(f"Sensitivity Analysis for Quote {qn}")
    # Check if we actually know the correct answer
    if qn in sr_conv.index:
        logger.info(f"Actual QuoteConversion_Flag {sr_conv[qn]}")

    def tf_sensitive(f, v_original, lst_v, p_original):
        """Predict quote success after changing field f from v_original to each value in lst_v.

        p_original is the probability of the original prediction; it and v_original
        are only used in the log message. Returns True if any value in lst_v changes
        the predicted quote conversion flag, i.e. the quote is sensitive to field f.
        """
        # Create a DataFrame which has every row identical except for field in question
        # Field f iterates through every value provided
        ind_other = df.loc[qn:qn].copy().drop(f, axis=1)  # fields other than f
        ind_f = pd.DataFrame(data={f: lst_v}, index=[qn] * len(lst_v))
        # Merge these two DataFrames to create one with all rows identical except field f
        ind = pd.merge(ind_other, ind_f, right_index=True, left_index=True)
        # Copy lines from learn.predict() because we want to predict several rows at once (faster than one at a time)
        dl = learn.dls.test_dl(ind)
        dl.dataset.conts = dl.dataset.conts.astype(np.float32)
        # stop learn.get_preds() printing blank lines
        with io.capture_output():
            # using get_preds() rather than predict() because get_preds can do multiple rows at once
            _, preds, _, dec_preds = learn.get_preds(dl=dl, with_input=True, with_decoded=True)
        tf = False
        # Check if any predictions changed
        for i, dp in enumerate(dec_preds):
            qcf = dp.item()
            if qcf != qcf_original:
                prb = preds[i][qcf].item()
                # Report the values passed in as arguments rather than reaching into
                # the enclosing loop's variables.
                logger.info(f"Changing {f} from {v_original} to {lst_v[i]} changes predicted quote conversion flag "
                            f"from {p_original:.2%} {qcf_original} to {prb:.2%} {qcf}")
                tf = True
        return tf

    set_sensitive = set()
    # Loop through all fields. Check different values of each field to see if result is sensitive to it.
    for field in df.columns:
        val_original = ind_original[field]
        num_unique = df[field].nunique()
        dtype = df.dtypes[field]
        # If number of unique values is under 30 then try every value (or for objects try every value)
        if num_unique < 30 or dtype == 'O':
            lst_unique = df[field].unique()
            if tf_sensitive(field, val_original, lst_unique, prb_original):
                logger.info(f"Possible values of {field} are {lst_unique}")
                set_sensitive.add(field)
            continue
        # Many distinct values: only numeric columns can be sampled on a grid
        if dtype != "int64" and dtype != "float64":
            logger.warning(f"Unknown type {field} {num_unique} {dtype!r}")
            continue
        vmin = df[field].min()
        vmax = df[field].max()
        if dtype == "int64":
            # floor division keeps the trial values integral
            lst_val = [vmin + (vmax - vmin) * i // 10 for i in range(11)]
        else:
            lst_val = [vmin + (vmax - vmin) * i / 10 for i in range(11)]
        logger.debug(f"{field} {num_unique} {dtype!r} {vmin} {vmax} {lst_val}")
        if tf_sensitive(field, val_original, lst_val, prb_original):
            set_sensitive.add(field)
    # return the set of fields which had individual effects on the prediction
    return set_sensitive


# Application: Step 1 - Ask user for quote number
Try quote 325710 for a quote with many fields which could be changed

In [6]:
# Smallest and largest quote number among the quotes with a recorded outcome.
qn_min, qn_max = sr_conv.index.min(), sr_conv.index.max()
# Start the application on a randomly chosen quote number within that range.
qn = random.randint(qn_min, qn_max)

In [14]:
# try 325710 58% 0
wdg_quote_success = widgets.Label(value="")


def _update_quote_success(qn):
    """Predict quote number qn and show actual and predicted outcome in the label."""
    # learn.predict() prints while it runs; capture that so only the label changes
    with io.capture_output():
        prd = learn.predict(df.loc[qn])
    qcf = prd[1].item()
    prb = prd[2][qcf].item()
    # The actual outcome is only known for quotes that appear in sr_conv
    act = sr_conv[qn] if qn in sr_conv else "unknown"
    wdg_quote_success.value = f"Quote {qn} actual {act} predicted {prb:.2%} {qcf}"


def handle_quote_number_change(change):
    """Observer for the quote-number slider: refresh the label for the new value."""
    _update_quote_success(change.new)


style = {'description_width': 'initial', 'width': '500px'}
wdg_quote_number = widgets.IntSlider(description="Quote number", min=qn_min, max=qn_max, value=qn, style=style, layout={'width': '800px'})
wdg_quote_number.observe(handle_quote_number_change, names='value')
# observe() only fires when the value changes, so fill in the label once for the
# initial slider value; otherwise it stays empty until the slider is moved.
_update_quote_success(wdg_quote_number.value)
display(wdg_quote_number)
display(wdg_quote_success)

IntSlider(value=9049, description='Quote number', layout=Layout(width='800px'), max=434588, min=1, style=Slide…

Label(value='')

# Application: Step 2 - Do sensitivity analysis
Normally we would hide the logging output, but it helps when experimenting with the data later.

In [15]:
# Output widget with a green border; the analysis runs inside it so that its
# logging output is meant to be collected there instead of below the cell.
out = widgets.Output(layout={'border': '1px solid green'})
with out:
    # Analyse the quote currently selected on the slider
    set_field = sensitivity_analysis(wdg_quote_number.value)
display(out)
# list of sensitive fields
L(set_field)

Output(layout=Layout(border='1px solid green'))

(#26) ['PersonalField11','CoverageField9','PropertyField37','PersonalField80','SalesField10','PersonalField83','PersonalField4A','PersonalField81','PropertyField39A','PropertyField29'...]

# Application: Step 3 - Try altering values of sensitive fields
You can enter more than one to try to improve probability of quote success

Example CoverageField9 from E to B and SalesField10 from 0 to 6

In [16]:
# Shared widget style: let each description take as much width as it needs.
style = {'description_width': 'initial'}
# Containers for the two kinds of input widget built for the sensitive fields.
lst_radio, lst_dropdown = [], []
# Quote currently selected on the slider.
qn = wdg_quote_number.value
def nan_if_nan(n):
    """Return the str "nan" in place of a NaN value; return anything else unchanged.

    np.nan can't be used as a dropdown option because np.nan != np.nan, so a
    str stands in for it.
    """
    try:
        is_missing = bool(np.isnan(n))
    except TypeError:
        # np.isnan rejects non-numeric input such as str; those are never NaN
        is_missing = False
    return "nan" if is_missing else n

# One input widget per sensitive field, pre-selected to the quote's current value.
for f in set_field:
    # (label, value) option pairs, with NaN swapped for the str "nan", sorted by label
    lst_unique = sorted((str(u), u) for u in map(nan_if_nan, df[f].unique()))
    v = nan_if_nan(df.loc[qn,f])
    num_unique = df[f].nunique()
    if num_unique >= 5:
        # five or more choices: a dropdown keeps the layout compact
        wdg = widgets.Dropdown(options=lst_unique, description=f, style=style, value=v)
        lst_dropdown.append(wdg)
    else:
        # only a few choices: show them all as radio buttons
        wdg = widgets.RadioButtons(options=lst_unique, description=f, style=style, value=v)
        lst_radio.append(wdg)
# Radio buttons side by side, dropdowns stacked underneath
display(widgets.HBox(children=lst_radio))
display(widgets.VBox(children=lst_dropdown))

HBox(children=(RadioButtons(description='PropertyField37', index=1, options=((' ', ' '), ('N', 'N'), ('Y', 'Y'…

VBox(children=(Dropdown(description='PersonalField11', options=(('0', 0), ('1', 1), ('2', 2), ('3', 3), ('4', …

# Application: Step 4 - Calculate new probability of success
Example CoverageField9 from E to B and SalesField10 from 0 to 6

Quote went from 58% unsuccessful to 78% successful

In [17]:
# Start from the selected quote's own values, then overlay whatever the widgets show.
qn = wdg_quote_number.value
ind = df.loc[qn].copy()
for w in [*lst_radio, *lst_dropdown]:
    # the str "nan" is the widgets' stand-in for a missing value
    v = np.nan if w.value == "nan" else w.value
    ind[w.description] = v
# Predict the edited quote, keeping learn.predict()'s own output out of the cell
with io.capture_output() as captured:
    prd = learn.predict(ind)
qcf = prd[1].item()
prb = prd[2][qcf].item()
# The actual outcome is only known for quotes that appear in sr_conv
act = "unknown" if qn not in sr_conv else sr_conv[qn]
print(f"Quote {qn} actual {act} predicted {prb:.2%} {qcf}")


Quote 325710 actual unknown predicted 77.76% 1
