In [1]:
# import required libraries
import os
from datetime import datetime, timedelta

import ibis
import numpy as np
import pandas as pd
import pins
import requests
import rsconnect
import vetiver
import xgboost as xgb
from dotenv import load_dotenv
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine


In [2]:
# setup local environment
load_dotenv()

False

In [3]:
connect_username = requests.get(
    f"{os.environ['CONNECT_SERVER']}/__api__/v1/user",
    headers={"Authorization": f"Key {os.environ['CONNECT_API_KEY']}"}
).json()["username"]

connect_username

'sam.edwardes'

In [4]:
# read inspection data from the database
con = ibis.postgres.connect(
    user="posit",
    password=os.environ["CONF23_DB_PASSWORD"],
    host=os.environ["CONF23_DB_HOST"],
    port=5432,
    database="conf23_python",
)

inspection_data = con.table("food_inspection_validated").to_pandas()

In [5]:
inspection_data

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,zip,inspection_date,inspection_type,results,violations
0,52234,CAFE 608,CAFE 608,2013328,RESTAURANT,RISK 1 (HIGH),60657,2010-01-04,LICENSE RE-INSPECTION,PASS,
1,70269,MR.DANIEL'S,MR.DANIEL'S,1899292,RESTAURANT,RISK 1 (HIGH),60634,2010-01-04,LICENSE RE-INSPECTION,PASS,
2,67733,WOLCOTT'S,TROQUET,1992040,RESTAURANT,RISK 1 (HIGH),60613,2010-01-04,LICENSE RE-INSPECTION,PASS,
3,67732,WOLCOTT'S,TROQUET,1992039,RESTAURANT,RISK 1 (HIGH),60613,2010-01-04,LICENSE RE-INSPECTION,PASS,
4,104236,TEMPO CAFE,TEMPO CAFE,80916,RESTAURANT,RISK 1 (HIGH),60611,2010-01-04,CANVASS,FAIL,"{""18. NO EVIDENCE OF RODENT OR INSECT OUTER OP..."
...,...,...,...,...,...,...,...,...,...,...,...
257513,2579937,LA BAGUETTE,ALEX CAFE/TORTA FUTBOLERAS,2008113,RESTAURANT,RISK 1 (HIGH),60623,2023-08-11,CANVASS RE-INSPECTION,PASS,"{""53. TOILET FACILITIES: PROPERLY CONSTRUCTED,..."
257514,2579958,SOUTH SHORE FOOD MART INC.,SOUTH SHORE FOOD MART,2340833,GROCERY STORE,RISK 1 (HIGH),60617,2023-08-11,CANVASS,FAIL,"{""3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL..."
257515,2579939,LA FIESTA BAKERY,LA FIESTA BAKERY/TAQUERIA,1488177,RESTAURANT,RISK 1 (HIGH),60629,2023-08-11,CANVASS,PASS,"{""37. FOOD PROPERLY LABELED; ORIGINAL CONTAINE..."
257516,2579946,GROTA RESTAURANT,GROTA RESTAURANT,6753,RESTAURANT,RISK 1 (HIGH),60634,2023-08-11,CANVASS RE-INSPECTION,PASS,


#### Exploratory Data Analysis

In [6]:
# (Uncomment to run some analysis)
#
# inspection_data.columns
# inspection_data.dtypes
# inspection_data.groupby("facility_type").count()["inspection_id"].sort_values(ascending=False)
# inspection_data.groupby("results").count()["inspection_id"]
# inspection_data['results'].unique()
# inspection_data.groupby("risk").count()["inspection_id"]
# inspection_data['violations']

#### Feature Engineering

In [7]:
inspection_data

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,zip,inspection_date,inspection_type,results,violations
0,52234,CAFE 608,CAFE 608,2013328,RESTAURANT,RISK 1 (HIGH),60657,2010-01-04,LICENSE RE-INSPECTION,PASS,
1,70269,MR.DANIEL'S,MR.DANIEL'S,1899292,RESTAURANT,RISK 1 (HIGH),60634,2010-01-04,LICENSE RE-INSPECTION,PASS,
2,67733,WOLCOTT'S,TROQUET,1992040,RESTAURANT,RISK 1 (HIGH),60613,2010-01-04,LICENSE RE-INSPECTION,PASS,
3,67732,WOLCOTT'S,TROQUET,1992039,RESTAURANT,RISK 1 (HIGH),60613,2010-01-04,LICENSE RE-INSPECTION,PASS,
4,104236,TEMPO CAFE,TEMPO CAFE,80916,RESTAURANT,RISK 1 (HIGH),60611,2010-01-04,CANVASS,FAIL,"{""18. NO EVIDENCE OF RODENT OR INSECT OUTER OP..."
...,...,...,...,...,...,...,...,...,...,...,...
257513,2579937,LA BAGUETTE,ALEX CAFE/TORTA FUTBOLERAS,2008113,RESTAURANT,RISK 1 (HIGH),60623,2023-08-11,CANVASS RE-INSPECTION,PASS,"{""53. TOILET FACILITIES: PROPERLY CONSTRUCTED,..."
257514,2579958,SOUTH SHORE FOOD MART INC.,SOUTH SHORE FOOD MART,2340833,GROCERY STORE,RISK 1 (HIGH),60617,2023-08-11,CANVASS,FAIL,"{""3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL..."
257515,2579939,LA FIESTA BAKERY,LA FIESTA BAKERY/TAQUERIA,1488177,RESTAURANT,RISK 1 (HIGH),60629,2023-08-11,CANVASS,PASS,"{""37. FOOD PROPERLY LABELED; ORIGINAL CONTAINE..."
257516,2579946,GROTA RESTAURANT,GROTA RESTAURANT,6753,RESTAURANT,RISK 1 (HIGH),60634,2023-08-11,CANVASS RE-INSPECTION,PASS,


In [8]:
# clean up input data for modelling

inspection_data_for_training = (
    inspection_data
    # remove NA licenses
    .loc[inspection_data["license_"] != 0]
    # select only Restaurant, Bakery, Grocery Store
    .loc[
        inspection_data["facility_type"].isin(["RESTAURANT", "BAKERY", "GROCERY STORE"])
    ]
    .pipe(
        lambda inspection_data: pd.get_dummies(
            inspection_data, columns=["facility_type"], prefix=[""], dtype=int
        )
    )
    .rename(
        columns={
            "_BAKERY": "BAKERY",
            "_RESTAURANT": "RESTAURANT",
            "_GROCERY STORE": "GROCERY_STORE",
        }
    )
    # filter out relavant inspection results
    .loc[inspection_data["results"].isin(["FAIL", "PASS", "PASS W/ CONDITIONS"])]
    # make Pass with Conditions results as Fail, since they are not completely pass
    .assign(results=(lambda x: x["results"].replace(["PASS W/ CONDITIONS"], "FAIL")))
    .assign(RESULTS=(lambda x: x["results"].map({"PASS": 1, "FAIL": 0}).astype(int)))
    .drop(columns=["results"])
    # filter out valid risk entries
    .loc[
        inspection_data["risk"].isin(
            ["RISK 1 (HIGH)", "RISK 2 (MEDIUM)", "RISK 3 (LOW)"]
        )
    ]
    # create dummy variables for risk
    .pipe(
        lambda inspection_data: pd.get_dummies(
            inspection_data, columns=["risk"], prefix=[""], dtype=int
        )
    )
    .rename(
        columns={
            "_RISK 1 (HIGH)": "HIGH_RISK",
            "_RISK 2 (MEDIUM)": "MEDIUM_RISK",
            "_RISK 3 (LOW)": "LOW_RISK",
        }
    )
    # sort results by business and inspection date
    .sort_values(by=["license_", "inspection_date"])
)

In [9]:
# count violations for each inspection
inspection_data_for_training["count_violations"] = (
    inspection_data_for_training["violations"]
    .apply(lambda x: len(x[1:-1].split('","')) if x is not None else None)
    .fillna(0)
)
# count cumilative violations for each date for a license
inspection_data_for_training["CUM_VIOLATIONS"] = inspection_data_for_training.groupby(
    ["license_"]
)["count_violations"].cumsum()

In [10]:
inspection_data_for_training

Unnamed: 0,inspection_id,dba_name,aka_name,license_,zip,inspection_date,inspection_type,violations,BAKERY,GROCERY_STORE,RESTAURANT,RESULTS,HIGH_RISK,MEDIUM_RISK,LOW_RISK,count_violations,CUM_VIOLATIONS
101,120273,"QUITEFRANKLY,LTD.",UPS CAFETERIA,0,60607,2010-01-06,CANVASS,"{""33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTEN...",0,0,1,1,1,0,0,4.0,4.0
2987,68320,TACOS REYNA,,0,60617,2010-03-02,CONSULTATION,,0,0,1,0,1,0,0,0.0,4.0
5896,74431,MICHAEL'S FRESH MARKET,MICHAEL'S FRESH MARKET,0,60615,2010-05-03,CONSULTATION,"{""6. HANDS WASHED AND CLEANED, GOOD HYGIENIC P...",0,1,0,0,1,0,0,8.0,12.0
14764,335339,VRJ FOOD MART CO.,VRJ FOOD MART CO.,0,60637,2010-10-13,CANVASS,"{""9. WATER SOURCE: SAFE, HOT & COLD UNDER CITY...",0,1,0,0,0,1,0,13.0,25.0
15834,428224,IRISH AMERICAN HERITAGE CENTER,IRISH AMERICAN HERITAGE CENTER,0,60630,2010-10-28,TAG REMOVAL,"{""32. FOOD AND NON-FOOD CONTACT SURFACES PROPE...",0,0,1,0,1,0,0,6.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152795,2064867,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2017-06-23,CANVASS RE-INSPECTION,"{""32. FOOD AND NON-FOOD CONTACT SURFACES PROPE...",0,0,1,1,1,0,0,3.0,100.0
186652,2285869,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2019-04-26,CANVASS,"{""3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL...",0,0,1,0,1,0,0,12.0,112.0
211803,2401917,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2020-10-01,COMPLAINT,"{""51. PLUMBING INSTALLED; PROPER BACKFLOW DEVI...",0,0,1,1,1,0,0,3.0,115.0
225041,2528308,FALCO'S PIZZA,FALCO'S PIZZA,990,60632,2021-08-20,CANVASS,"{""16. FOOD-CONTACT SURFACES: CLEANED & SANITIZ...",0,0,1,0,1,0,0,4.0,119.0


In [11]:
# Save the inspection data for training to SQL so that it can be
# used by the Shiny App.
db_user = "posit"
db_password = os.environ["CONF23_DB_PASSWORD"]
db_host = os.environ["CONF23_DB_HOST"]
db_port = 5432
db_database = "conf23_python"
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}/{db_database}")
print(f"{engine=}")

# Determine the table name
if connect_username == "sam.edwardes":
    table_name_prefix = ""
else:
    table_name_prefix = re.sub('[^0-9a-zA-Z]+', '_', connect_username) + "_"

table_name = f"{table_name_prefix}model_features"
print(f"{table_name=}")


# Insert the data into postgres. Inserting large amounts of data can be slow, so
# iterate over 10,000 rows at a time.
n_rows = inspection_data_for_training.shape[0]
step_size = 10_000

for i in range(0, n_rows, step_size):
    index_start = i
    index_end = min(n_rows, i + step_size - 1)
    
    if i == 0:
        if_exists = "replace"
    else:
        if_exists = "append"

    print(f"Inserting rows: {index_start:,} - {index_end:,}")

    inspection_data_for_training \
        .reset_index(drop=True) \
        .loc[index_start:index_end, :] \
        .to_sql(table_name, engine, if_exists=if_exists, index=False)


engine=Engine(postgresql+psycopg2://posit:***@database.conf23workflows.training.posit.co/conf23_python)
table_name='model_features'
Inserting rows: 0 - 9,999
Inserting rows: 10,000 - 19,999
Inserting rows: 20,000 - 29,999
Inserting rows: 30,000 - 39,999
Inserting rows: 40,000 - 49,999
Inserting rows: 50,000 - 59,999
Inserting rows: 60,000 - 69,999
Inserting rows: 70,000 - 79,999
Inserting rows: 80,000 - 89,999
Inserting rows: 90,000 - 99,999
Inserting rows: 100,000 - 109,999
Inserting rows: 110,000 - 119,999
Inserting rows: 120,000 - 129,999
Inserting rows: 130,000 - 139,999
Inserting rows: 140,000 - 149,999
Inserting rows: 150,000 - 159,999
Inserting rows: 160,000 - 169,999
Inserting rows: 170,000 - 179,999
Inserting rows: 180,000 - 181,570


##### Modelling

In [12]:
# Create training and test split
X = inspection_data_for_training.drop(
    columns=[
        "license_",
        "RESULTS",
        "inspection_id",
        "dba_name",
        "aka_name",
        "inspection_type",
        "violations",
        "count_violations",
        "inspection_date",
        "zip",
    ]
)
y = inspection_data_for_training[["RESULTS"]]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
# Train a random forest model
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, np.ravel(y_train))

In [14]:
# test predictions
y_pred = clf.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)

print(np.sqrt(mse))

0.6365490032462582


### Model deployment using Vetiver

In [15]:
# Create a vetiver model object
v = vetiver.VetiverModel(
    model=clf, 
    model_name=f"{connect_username}/inspection_results", 
    prototype_data=X_train[:1]
)

v

<vetiver.vetiver_model.VetiverModel at 0x7f49857459d0>

In [16]:
# Write the vetiver model as a pin for versioning
model_board = pins.board_connect(
    os.getenv("CONNECT_SERVER"),
    api_key=os.getenv("CONNECT_API_KEY"),
    allow_pickle_read=True
)
vetiver.vetiver_pin_write(model_board, model=v)

Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
Writing pin:
Name: 'sam.edwardes/inspection_results'
Version: 20230815T194307Z-2a4f4


In [17]:
# Deploy the vetiver model as an API on Posit Connect
rsc_server = os.getenv("CONNECT_SERVER")
rsc_key = os.getenv("CONNECT_API_KEY")
connect_server = rsconnect.api.RSConnectServer(url=rsc_server, api_key=rsc_key)


vetiver.deploy_rsconnect(
    connect_server=connect_server,
    board=model_board,
    pin_name=f"{connect_username}/inspection_results",
)

             Consider creating a requirements.txt file instead.[0m


              Do you need to check your pinned model?
              Using version 2
[0mValidating server...[0m[32;20m 	[OK]
[0m[0mValidating app mode...[0m[32;20m 	[OK]
[0m[0mMaking bundle ...[0m[32;20m 	[OK]
[0m[0mDeploying bundle ...[0m[32;20m 	[OK]
[0m[0mSaving deployed information...[0m[32;20m 	[OK]
[0m[0mBuilding FastAPI application...[0m
[0mBundle created with Python version 3.11.3 is compatible with environment Local with Python version 3.11.3 from /opt/python/3.11.3/bin/python3.11 [0m
[0mBundle requested Python version 3.11.3; using /opt/python/3.11.3/bin/python3.11 which has version 3.11.3[0m
[0m2023/08/15 19:43:15.894316717 [rsc-session] Content GUID: 21431834-84e4-48d9-a850-4c6521e03ec2[0m
[0m2023/08/15 19:43:15.894362884 [rsc-session] Content ID: 12[0m
[0m2023/08/15 19:43:15.894373189 [rsc-session] Bundle ID: 18[0m
[0m2023/08/15 19:43:15.897930009 arguments: /opt/python/3.11.3/bin/python3.11 /opt/rstudio-connect/python/build_environment.py[

#### Model monitoring

In [18]:
# Simulate multiple days of input data
data = X_test.copy()
data["RESULTS"] = y_test
data["date_obs"] = ""
data

Unnamed: 0,BAKERY,GROCERY_STORE,RESTAURANT,HIGH_RISK,MEDIUM_RISK,LOW_RISK,CUM_VIOLATIONS,RESULTS,date_obs
194128,0,0,1,1,0,0,73.0,1,
13480,0,0,1,1,0,0,4.0,1,
212770,0,0,1,0,1,0,34.0,1,
35496,0,0,1,1,0,0,13.0,1,
63397,0,0,1,0,1,0,0.0,0,
...,...,...,...,...,...,...,...,...,...
150357,0,0,1,1,0,0,7.0,1,
193651,0,0,1,1,0,0,9.0,1,
76412,0,0,1,1,0,0,3.0,1,
88299,0,0,1,1,0,0,35.0,0,


In [19]:
endpoint = "https://colorado.posit.co/rsc/inspection_results/predict"
data["preds"] = vetiver.predict(endpoint, data).iloc[:, 0].values

In [20]:
# add last three dates
day_list = [
    (pd.Timestamp.today() - timedelta(days=2)).strftime("%Y-%m-%d"),
    (pd.Timestamp.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
    (pd.Timestamp.today().strftime("%Y-%m-%d")),
]
day_list

['2023-08-13', '2023-08-14', '2023-08-15']

In [21]:
data["date_obs"] = np.random.choice(day_list, size=len(data))

In [22]:
data

Unnamed: 0,BAKERY,GROCERY_STORE,RESTAURANT,HIGH_RISK,MEDIUM_RISK,LOW_RISK,CUM_VIOLATIONS,RESULTS,date_obs,preds
194128,0,0,1,1,0,0,73.0,1,2023-08-15,0
13480,0,0,1,1,0,0,4.0,1,2023-08-14,1
212770,0,0,1,0,1,0,34.0,1,2023-08-13,1
35496,0,0,1,1,0,0,13.0,1,2023-08-15,1
63397,0,0,1,0,1,0,0.0,0,2023-08-15,1
...,...,...,...,...,...,...,...,...,...,...
150357,0,0,1,1,0,0,7.0,1,2023-08-14,1
193651,0,0,1,1,0,0,9.0,1,2023-08-14,1
76412,0,0,1,1,0,0,3.0,1,2023-08-15,1
88299,0,0,1,1,0,0,35.0,0,2023-08-15,1


In [23]:
# pin this dataset for use in model card
model_board.pin_write(data, f"{connect_username}/inspection_results_monitoring",type="parquet")

Writing pin:
Name: 'sam.edwardes/inspection_results_monitoring'
Version: 20230815T194324Z-571d5


Meta(title='inspection_results_monitoring: a pinned 45393 x 10 DataFrame', description=None, created='20230815T194324Z', pin_hash='571d5dab390ea6e4', file='inspection_results_monitoring.parquet', file_size=420705, type='parquet', api_version=1, version=VersionRaw(version='19'), tags=None, name='sam.edwardes/inspection_results_monitoring', user={}, local={})

In [24]:
# define which metrics to track
metric_set = [metrics.mean_absolute_error, metrics.mean_squared_error, metrics.r2_score]

# choose time period to track over
td = timedelta(days=1)

# calculate metrics
original_metrics = vetiver.compute_metrics(
    data=data,
    date_var="date_obs",
    period=td,
    metric_set=metric_set,
    truth="RESULTS",
    estimate="preds",
)

In [25]:
original_metrics

Unnamed: 0,index,n,metric,estimate
0,2023-08-13,15117,mean_absolute_error,0.403188
1,2023-08-13,15117,mean_squared_error,0.403188
2,2023-08-13,15117,r2_score,-0.655279
3,2023-08-14,15096,mean_absolute_error,0.3958
4,2023-08-14,15096,mean_squared_error,0.3958
5,2023-08-14,15096,r2_score,-0.636891


In [26]:
# Write metrics as a pin
model_board.pin_write(original_metrics, f"{connect_username}/inspection_model_metrics", type="csv")

Writing pin:
Name: 'sam.edwardes/inspection_model_metrics'
Version: 20230815T194326Z-da2fc


Meta(title='inspection_model_metrics: a pinned 6 x 4 DataFrame', description=None, created='20230815T194326Z', pin_hash='da2fca6e7cfb802b', file='inspection_model_metrics.csv', file_size=338, type='csv', api_version=1, version=VersionRaw(version='20'), tags=None, name='sam.edwardes/inspection_model_metrics', user={}, local={})

In [27]:
# Model card
vetiver.model_card(path=".")

'./model_card.qmd'