In [3]:
# import required libraries
import os
import ibis
from dotenv import load_dotenv
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import vetiver
import pins
import rsconnect
from datetime import datetime, timedelta

In [4]:
# setup local environment
load_dotenv()

True

In [5]:
# read inspection data from the database
con = ibis.postgres.connect(
    user="posit",
    password=os.environ["CONF23_DB_PASSWORD"],
    host=os.environ["CONF23_DB_HOST"],
    port=5432,
    database="conf23_python",
)

inspection_data = con.table("food_inspection_validated").to_pandas()

In [6]:
inspection_data

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,zip,inspection_date,inspection_type,results,violations
0,2028690,CHINA CAFE,CHINA CAFE,2419388,RESTAURANT,RISK 1 (HIGH),60616,2017-04-20,CANVASS RE-INSPECTION,PASS,"{""33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTEN..."
1,2028768,NEW STONY SUMBARINE INC.,NEW STONY SUBMARINE,2060138,RESTAURANT,RISK 1 (HIGH),60617,2017-04-21,CANVASS RE-INSPECTION,PASS,
2,2028736,"BK'S FOOD, INC",BK'S FOOD,2522198,GROCERY STORE,RISK 2 (MEDIUM),60644,2017-04-21,LICENSE,PASS,
3,2028718,"LA UNICA FOOD MART, INC.",LA UNICA FOOD MART,491,RESTAURANT/GROCERY STORE,RISK 1 (HIGH),60660,2017-04-21,CANVASS,FAIL,"{""3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERAT..."
4,2028712,GT MARGIES MAXWELL STREET POLISH,GT MARGIES MAXWELL STREET POLISH,2523680,RESTAURANT,RISK 1 (HIGH),60651,2017-04-21,LICENSE,PASS,
...,...,...,...,...,...,...,...,...,...,...,...
255135,2028642,SAIGON LOTUS,SAIGON LOTUS,2523640,RESTAURANT,RISK 1 (HIGH),60657,2017-04-20,LICENSE,NOT READY,
255136,2028658,PIZZA VINO,PIZZA VINO,2511390,RESTAURANT,RISK 3 (LOW),60638,2017-04-20,LICENSE,FAIL,
255137,2028677,"BEREKE, INC..",BEREKE RESTAURANT,2241067,RESTAURANT,RISK 1 (HIGH),60625,2017-04-20,CANVASS,OUT OF BUSINESS,
255138,2028674,NORTH WEST REGIONAL CENTER,NORTH WEST REGIONAL CENTER,1878473,GOLDEN DINER,RISK 1 (HIGH),60618,2017-04-20,COMPLAINT,FAIL,"{""18. NO EVIDENCE OF RODENT OR INSECT OUTER OP..."


#### Exploratory Data Analysis

In [75]:
# (Uncomment to run some analysis)
#
# inspection_data.columns
# inspection_data.dtypes
# inspection_data.groupby("facility_type").count()["inspection_id"].sort_values(ascending=False)
# inspection_data.groupby("results").count()["inspection_id"]
# inspection_data['results'].unique()
# inspection_data.groupby("risk").count()["inspection_id"]
# inspection_data['violations']

#### Feature Engineering

In [56]:
inspection_data

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,zip,inspection_date,inspection_type,results,violations
0,2028690,CHINA CAFE,CHINA CAFE,2419388,RESTAURANT,RISK 1 (HIGH),60616,2017-04-20,CANVASS RE-INSPECTION,PASS,"{""33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTEN..."
1,2028768,NEW STONY SUMBARINE INC.,NEW STONY SUBMARINE,2060138,RESTAURANT,RISK 1 (HIGH),60617,2017-04-21,CANVASS RE-INSPECTION,PASS,
2,2028736,"BK'S FOOD, INC",BK'S FOOD,2522198,GROCERY STORE,RISK 2 (MEDIUM),60644,2017-04-21,LICENSE,PASS,
3,2028718,"LA UNICA FOOD MART, INC.",LA UNICA FOOD MART,491,RESTAURANT/GROCERY STORE,RISK 1 (HIGH),60660,2017-04-21,CANVASS,FAIL,"{""3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERAT..."
4,2028712,GT MARGIES MAXWELL STREET POLISH,GT MARGIES MAXWELL STREET POLISH,2523680,RESTAURANT,RISK 1 (HIGH),60651,2017-04-21,LICENSE,PASS,
...,...,...,...,...,...,...,...,...,...,...,...
255135,2028642,SAIGON LOTUS,SAIGON LOTUS,2523640,RESTAURANT,RISK 1 (HIGH),60657,2017-04-20,LICENSE,NOT READY,
255136,2028658,PIZZA VINO,PIZZA VINO,2511390,RESTAURANT,RISK 3 (LOW),60638,2017-04-20,LICENSE,FAIL,
255137,2028677,"BEREKE, INC..",BEREKE RESTAURANT,2241067,RESTAURANT,RISK 1 (HIGH),60625,2017-04-20,CANVASS,OUT OF BUSINESS,
255138,2028674,NORTH WEST REGIONAL CENTER,NORTH WEST REGIONAL CENTER,1878473,GOLDEN DINER,RISK 1 (HIGH),60618,2017-04-20,COMPLAINT,FAIL,"{""18. NO EVIDENCE OF RODENT OR INSECT OUTER OP..."


In [7]:
# clean up input data for modelling

inspection_data_for_training = (
    inspection_data
    # remove NA licenses
    .loc[inspection_data["license_"] != 0]
    # select only Restaurant, Bakery, Grocery Store
    .loc[
        inspection_data["facility_type"].isin(["RESTAURANT", "BAKERY", "GROCERY STORE"])
    ]
    .pipe(
        lambda inspection_data: pd.get_dummies(
            inspection_data, columns=["facility_type"], prefix=[""], dtype=int
        )
    )
    .rename(
        columns={
            "_BAKERY": "BAKERY",
            "_RESTAURANT": "RESTAURANT",
            "_GROCERY STORE": "GROCERY_STORE",
        }
    )
    # filter out relavant inspection results
    .loc[inspection_data["results"].isin(["FAIL", "PASS", "PASS W/ CONDITIONS"])]
    # make Pass with Conditions results as Fail, since they are not completely pass
    .assign(results=(lambda x: x["results"].replace(["PASS W/ CONDITIONS"], "FAIL")))
    .assign(RESULTS=(lambda x: x["results"].map({"PASS": 1, "FAIL": 0}).astype(int)))
    .drop(columns=["results"])
    # filter out valid risk entries
    .loc[
        inspection_data["risk"].isin(
            ["RISK 1 (HIGH)", "RISK 2 (MEDIUM)", "RISK 3 (LOW)"]
        )
    ]
    # create dummy variables for risk
    .pipe(
        lambda inspection_data: pd.get_dummies(
            inspection_data, columns=["risk"], prefix=[""], dtype=int
        )
    )
    .rename(
        columns={
            "_RISK 1 (HIGH)": "HIGH_RISK",
            "_RISK 2 (MEDIUM)": "MEDIUM_RISK",
            "_RISK 3 (LOW)": "LOW_RISK",
        }
    )
    # sort results by business and inspection date
    .sort_values(by=["license_", "inspection_date"])
)

In [8]:
# count violations for each inspection
inspection_data_for_training["count_violations"] = (
    inspection_data_for_training["violations"]
    .apply(lambda x: len(x[1:-1].split('","')) if x is not None else None)
    .fillna(0)
)
# count cumilative violations for each date for a license
inspection_data_for_training["CUM_VIOLATIONS"] = inspection_data_for_training.groupby(
    ["license_"]
)["count_violations"].cumsum()

In [9]:
inspection_data_for_training

Unnamed: 0,inspection_id,dba_name,aka_name,license_,zip,inspection_date,inspection_type,violations,BAKERY,GROCERY_STORE,RESTAURANT,RESULTS,HIGH_RISK,MEDIUM_RISK,LOW_RISK,count_violations,CUM_VIOLATIONS
107199,120273,"QUITEFRANKLY,LTD.",UPS CAFETERIA,0,60607,2010-01-06,CANVASS,"{""33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTEN...",0,0,1,1,1,0,0,4.0,4.0
109955,68320,TACOS REYNA,,0,60617,2010-03-02,CONSULTATION,,0,0,1,0,1,0,0,0.0,4.0
112903,74431,MICHAEL'S FRESH MARKET,MICHAEL'S FRESH MARKET,0,60615,2010-05-03,CONSULTATION,"{""6. HANDS WASHED AND CLEANED, GOOD HYGIENIC P...",0,1,0,0,1,0,0,8.0,12.0
121726,335339,VRJ FOOD MART CO.,VRJ FOOD MART CO.,0,60637,2010-10-13,CANVASS,"{""9. WATER SOURCE: SAFE, HOT & COLD UNDER CITY...",0,1,0,0,0,1,0,13.0,25.0
122804,428224,IRISH AMERICAN HERITAGE CENTER,IRISH AMERICAN HERITAGE CENTER,0,60630,2010-10-28,TAG REMOVAL,"{""32. FOOD AND NON-FOOD CONTACT SURFACES PROPE...",0,0,1,0,1,0,0,6.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4126,2064867,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2017-06-23,CANVASS RE-INSPECTION,"{""32. FOOD AND NON-FOOD CONTACT SURFACES PROPE...",0,0,1,1,1,0,0,3.0,100.0
38461,2285869,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2019-04-26,CANVASS,"{""3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL...",0,0,1,0,1,0,0,12.0,112.0
56540,2401917,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2020-10-01,COMPLAINT,"{""51. PLUMBING INSTALLED; PROPER BACKFLOW DEVI...",0,0,1,1,1,0,0,3.0,115.0
76496,2528308,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2021-08-20,CANVASS,"{""16. FOOD-CONTACT SURFACES: CLEANED & SANITIZ...",0,0,1,0,1,0,0,4.0,119.0


##### Modelling

In [10]:
# Create training and test split
X = inspection_data_for_training.drop(
    columns=[
        "license_",
        "RESULTS",
        "inspection_id",
        "dba_name",
        "aka_name",
        "inspection_type",
        "violations",
        "count_violations",
        "inspection_date",
        "zip",
    ]
)
y = inspection_data_for_training[["RESULTS"]]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
# Train a random forest model
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, np.ravel(y_train))

In [12]:
# test predictions
y_pred = clf.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)

print(np.sqrt(mse))

0.6348408414527528


### Model deployment using Vetiver

In [189]:
# Create a vetiver model object


<vetiver.vetiver_model.VetiverModel at 0x7fc456af7dc0>

In [190]:
# Write the vetiver model as a pin for versioning
model_board = pins.board_connect(
    os.getenv("CONNECT_SERVER"),
    api_key=os.getenv("CONNECT_API_KEY"),
    allow_pickle_read=True
)
vetiver.vetiver_pin_write(model_board, model=v)

Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
Writing pin:
Name: 'gagan/inspection_results'
Version: 20230802T155036Z-8ce58


In [43]:
# Deploy the vetiver model as an API on Posit Connect
rsc_server = os.getenv("CONNECT_SERVER")
rsc_key = os.getenv("CONNECT_API_KEY")
connect_server = rsconnect.api.RSConnectServer(url=rsc_server, api_key=rsc_key)


vetiver.deploy_rsconnect(
    connect_server=connect_server,
    board=model_board,
    pin_name="gagan/inspection_results",
)

             Consider creating a requirements.txt file instead.[0m


              Do you need to check your pinned model?
              Using version 77878
[33;20mConnect detected CLI commands and/or environment variables that overlap with stored credential.
[0m[33;20mCheck your environment variables (e.g. CONNECT_API_KEY) to make sure you want them to be used.
[0m[33;20mCredential paremeters are taken with the following precedence: stored > CLI > environment.
[0m[33;20mTo ignore an environment variable, override it in the CLI with an empty string (e.g. -k '').
[0m[0mValidating server...[0m[32;20m 	[OK]
[0m[0mValidating app mode...[0m[32;20m 	[OK]
[0m[0mMaking bundle ...[0m[32;20m 	[OK]
[0m[0mDeploying bundle ...[0m[32;20m 	[OK]
[0m[0mSaving deployed information...[0m[32;20m 	[OK]
[0m[0mBuilding FastAPI application...[0m
[0mBundle created with Python version 3.10.11 is compatible with environment Kubernetes::ghcr.io/rstudio/content-pro:r4.1.3-py3.10.11-ubuntu2204 with Python version 3.10.11 from /opt/python/3.10.11/bin/py

#### Model monitoring

In [13]:
# Simulate multiple days of input data
data = X_test.copy()
data["RESULTS"] = y_test
data["date_obs"] = ""
data

Unnamed: 0,BAKERY,GROCERY_STORE,RESTAURANT,HIGH_RISK,MEDIUM_RISK,LOW_RISK,CUM_VIOLATIONS,RESULTS,date_obs
42587,0,0,1,1,0,0,18.0,0,
247812,0,0,1,1,0,0,39.0,1,
190834,0,0,1,1,0,0,3.0,1,
83804,0,0,1,1,0,0,129.0,0,
234767,0,0,1,1,0,0,64.0,1,
...,...,...,...,...,...,...,...,...,...
46013,0,0,1,1,0,0,16.0,0,
147108,0,0,1,0,1,0,4.0,1,
25961,0,0,1,1,0,0,89.0,0,
157777,0,0,1,0,1,0,5.0,1,


In [15]:
endpoint = "https://colorado.posit.co/rsc/inspection_results/predict"
data["preds"] = vetiver.predict(endpoint, data).iloc[:, 0].values

In [16]:
# add last three dates
day_list = [
    (pd.Timestamp.today() - timedelta(days=2)).strftime("%Y-%m-%d"),
    (pd.Timestamp.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
    (pd.Timestamp.today().strftime("%Y-%m-%d")),
]
day_list

['2023-08-02', '2023-08-03', '2023-08-04']

In [17]:
data["date_obs"] = np.random.choice(day_list, size=len(data))

In [18]:
data

Unnamed: 0,BAKERY,GROCERY_STORE,RESTAURANT,HIGH_RISK,MEDIUM_RISK,LOW_RISK,CUM_VIOLATIONS,RESULTS,date_obs,preds
42587,0,0,1,1,0,0,18.0,0,2023-08-02,1
247812,0,0,1,1,0,0,39.0,1,2023-08-03,1
190834,0,0,1,1,0,0,3.0,1,2023-08-02,1
83804,0,0,1,1,0,0,129.0,0,2023-08-04,0
234767,0,0,1,1,0,0,64.0,1,2023-08-02,0
...,...,...,...,...,...,...,...,...,...,...
46013,0,0,1,1,0,0,16.0,0,2023-08-04,1
147108,0,0,1,0,1,0,4.0,1,2023-08-04,1
25961,0,0,1,1,0,0,89.0,0,2023-08-02,0
157777,0,0,1,0,1,0,5.0,1,2023-08-03,1


In [20]:
# pin this dataset for use in model card
board.pin_write(data,"gagan/inspection_results_monitoring",type="parquet")

Writing pin:
Name: 'gagan/inspection_results_monitoring'
Version: 20230804T162655Z-80f1d


Meta(title='inspection_results_monitoring: a pinned 44990 x 10 DataFrame', description=None, created='20230804T162655Z', pin_hash='80f1d6ba8242885c', file='inspection_results_monitoring.parquet', file_size=422651, type='parquet', api_version=1, version=VersionRaw(version='78050'), tags=None, name='gagan/inspection_results_monitoring', user={}, local={})

In [220]:
# define which metrics to track
metric_set = [metrics.mean_absolute_error, metrics.mean_squared_error, metrics.r2_score]

# choose time period to track over
td = timedelta(days=1)

# calculate metrics
original_metrics = vetiver.compute_metrics(
    data=data,
    date_var="date_obs",
    period=td,
    metric_set=metric_set,
    truth="RESULTS",
    estimate="preds",
)

In [221]:
original_metrics

Unnamed: 0,index,n,metric,estimate
0,2023-07-31,14980,mean_absolute_error,0.397463
1,2023-07-31,14980,mean_squared_error,0.397463
2,2023-07-31,14980,r2_score,-0.638377
3,2023-08-01,15019,mean_absolute_error,0.403289
4,2023-08-01,15019,mean_squared_error,0.403289
5,2023-08-01,15019,r2_score,-0.658787


In [222]:
# Write metrics as a pin
model_board.pin_write(original_metrics, "gagan/inspection_model_metrics", type="csv")

Writing pin:
Name: 'gagan/inspection_model_metrics'
Version: 20230802T173601Z-ec9a9


Meta(title='inspection_model_metrics: a pinned 6 x 4 DataFrame', description=None, created='20230802T173601Z', pin_hash='ec9a94f9d83fc22e', file='inspection_model_metrics.csv', file_size=340, type='csv', api_version=1, version=VersionRaw(version='77969'), tags=None, name='gagan/inspection_model_metrics', user={}, local={})

In [72]:
# Model card
vetiver.model_card(path=".")

'./model_card.qmd'