In [41]:
# import required libraries
import os
from datetime import datetime, timedelta

import ibis
import numpy as np
import pandas as pd
import pins
import requests
import rsconnect
import vetiver
import xgboost as xgb
from dotenv import load_dotenv
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
import re

In [2]:
# setup local environment
load_dotenv()

True

In [3]:
# get username on Connect
connect_username = requests.get(
    f"{os.environ['CONNECT_SERVER']}/__api__/v1/user",
    headers={"Authorization": f"Key {os.environ['CONNECT_API_KEY']}"}
).json()["username"]

connect_username

'gagan'

In [4]:
# read inspection data from the database
con = ibis.postgres.connect(
    user="posit",
    password=os.environ["CONF23_DB_PASSWORD"],
    host=os.environ["CONF23_DB_HOST"],
    port=5432,
    database="conf23_python",
)

inspection_data = con.table("food_inspection_validated").to_pandas()

In [5]:
inspection_data

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,zip,inspection_date,inspection_type,results,violations
0,52234,CAFE 608,CAFE 608,2013328,RESTAURANT,RISK 1 (HIGH),60657,2010-01-04,LICENSE RE-INSPECTION,PASS,
1,104236,TEMPO CAFE,TEMPO CAFE,80916,RESTAURANT,RISK 1 (HIGH),60611,2010-01-04,CANVASS,FAIL,"{""18. NO EVIDENCE OF RODENT OR INSECT OUTER OP..."
2,67757,DUNKIN DONUTS/BASKIN-ROBBINS,DUNKIN DONUTS/BASKIN-ROBBINS,1380279,RESTAURANT,RISK 2 (MEDIUM),60601,2010-01-04,TAG REMOVAL,PASS,
3,67732,WOLCOTT'S,TROQUET,1992039,RESTAURANT,RISK 1 (HIGH),60613,2010-01-04,LICENSE RE-INSPECTION,PASS,
4,67733,WOLCOTT'S,TROQUET,1992040,RESTAURANT,RISK 1 (HIGH),60613,2010-01-04,LICENSE RE-INSPECTION,PASS,
...,...,...,...,...,...,...,...,...,...,...,...
258140,2580700,HERITAGE OUTPOST LAWRENCE HOUSE LLC,HERITAGE OUTPOST,2476946,RESTAURANT,RISK 2 (MEDIUM),60640,2023-08-25,CANVASS,FAIL,"{""2. CITY OF CHICAGO FOOD SERVICE SANITATION C..."
258141,2580727,MR CHILAQUIL,MR CHILAQUIL,2911975,RESTAURANT,RISK 1 (HIGH),60632,2023-08-25,CANVASS RE-INSPECTION,PASS,"{""51. PLUMBING INSTALLED; PROPER BACKFLOW DEVI..."
258142,2580738,HANCOCK HIGH SCHOOL,HANCOCK HIGH SCHOOL,4025868,SCHOOL,RISK 1 (HIGH),60638,2023-08-25,CANVASS,PASS,"{""10. ADEQUATE HANDWASHING SINKS PROPERLY SUPP..."
258143,2580799,TERRA E MARE,TERRA E MARE,2817542,RESTAURANT,RISK 1 (HIGH),60607,2023-08-28,LICENSE RE-INSPECTION,PASS,


#### Exploratory Data Analysis

In [9]:
# (Uncomment to run some analysis)
#
# inspection_data.columns
# inspection_data.dtypes
# inspection_data.groupby("facility_type").count()["inspection_id"].sort_values(ascending=False)
# inspection_data.groupby("results").count()["inspection_id"]
# inspection_data['results'].unique()
# inspection_data.groupby("risk").count()["inspection_id"]
# inspection_data['violations']

#### Feature Engineering

In [35]:
# clean up input data for modelling

inspection_data_for_training = (
    inspection_data
    # remove NA licenses
    .loc[inspection_data["license_"] != 0]
    # only use inspections in the last year
    .loc[inspection_data["inspection_date"]>=(inspection_data["inspection_date"].max() - timedelta(days=365))]
    # select only Restaurant, Bakery, Grocery Store
    .loc[
        inspection_data["facility_type"].isin(["RESTAURANT", "BAKERY", "GROCERY STORE"])
    ]
    .pipe(
        lambda inspection_data: pd.get_dummies(
            inspection_data, columns=["facility_type"], prefix=[""], dtype=int
        )
    )
    .rename(
        columns={
            "_BAKERY": "BAKERY",
            "_RESTAURANT": "RESTAURANT",
            "_GROCERY STORE": "GROCERY_STORE",
        }
    )
    # filter out relavant inspection results
    .loc[inspection_data["results"].isin(["FAIL", "PASS", "PASS W/ CONDITIONS"])]
    # make Pass with Conditions results as Fail, since they are not completely pass
    .assign(results=(lambda x: x["results"].replace(["PASS W/ CONDITIONS"], "FAIL")))
    .assign(RESULTS=(lambda x: x["results"].map({"PASS": 1, "FAIL": 0}).astype(int)))
    .drop(columns=["results"])
    # filter out valid risk entries
    .loc[
        inspection_data["risk"].isin(
            ["RISK 1 (HIGH)", "RISK 2 (MEDIUM)", "RISK 3 (LOW)"]
        )
    ]
    # create dummy variables for risk
    .pipe(
        lambda inspection_data: pd.get_dummies(
            inspection_data, columns=["risk"], prefix=[""], dtype=int
        )
    )
    .rename(
        columns={
            "_RISK 1 (HIGH)": "HIGH_RISK",
            "_RISK 2 (MEDIUM)": "MEDIUM_RISK",
            "_RISK 3 (LOW)": "LOW_RISK",
        }
    )
    # sort results by business and inspection date
    .sort_values(by=["license_", "inspection_date"])
)

In [38]:
# count violations for each inspection
inspection_data_for_training["count_violations"] = (
    inspection_data_for_training["violations"]
    .apply(lambda x: len(x[1:-1].split('","')) if x is not None else None)
    .fillna(0)
)
# count cumilative violations for each date for a license
inspection_data_for_training["CUM_VIOLATIONS"] = inspection_data_for_training.groupby(
    ["license_"]
)["count_violations"].cumsum()

In [39]:
inspection_data_for_training

Unnamed: 0,inspection_id,dba_name,aka_name,license_,zip,inspection_date,inspection_type,violations,BAKERY,GROCERY_STORE,RESTAURANT,RESULTS,HIGH_RISK,MEDIUM_RISK,LOW_RISK,count_violations,CUM_VIOLATIONS
245090,2566654,FAB'S NUTRITION,FAB'S NUTRITION,0,60639,2022-11-07,CANVASS,"{""2. CITY OF CHICAGO FOOD SERVICE SANITATION C...",0,0,1,0,0,1,0,8.0,8.0
245410,2566988,FAB'S NUTRITION,FAB'S NUTRITION,0,60639,2022-11-14,CANVASS RE-INSPECTION,"{""45. SINGLE-USE/SINGLE-SERVICE ARTICLES: PROP...",0,0,1,1,0,1,0,4.0,12.0
248027,2569691,SBARRO'S,SBARRO'S,0,60612,2023-01-13,CANVASS,"{""1. PERSON IN CHARGE PRESENT, DEMONSTRATES KN...",0,0,1,0,1,0,0,5.0,17.0
246429,2570167,JOE'S BARBEQ AND FISH #1,JOE'S BARBEQ AND FISH #1,0,60644,2023-01-24,COMPLAINT,"{""2. CITY OF CHICAGO FOOD SERVICE SANITATION C...",0,0,1,0,1,0,0,19.0,36.0
243480,2573601,PENNINGTON'S FOOD AND SNACKS,PENNINGTON'S FOOD AND SNACKS,0,60619,2023-04-03,COMPLAINT,"{""2. CITY OF CHICAGO FOOD SERVICE SANITATION C...",0,0,1,0,1,0,0,16.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240461,2575179,COLLETTI'S RESTAURANT,COLLETTI'S RESTAURANT,9567,60646,2023-05-01,CANVASS RE-INSPECTION,"{""58. ALLERGEN TRAINING AS REQUIRED - COMMENTS...",0,0,1,1,1,0,0,1.0,5.0
251759,2573480,FURAMA RESTAURANT INC,FURAMA RESTAURANT,9821,60640,2023-03-30,CANVASS,"{""10. ADEQUATE HANDWASHING SINKS PROPERLY SUPP...",0,0,1,0,1,0,0,8.0,8.0
257922,2580332,FURAMA RESTAURANT INC,FURAMA RESTAURANT,9821,60640,2023-08-18,COMPLAINT,"{""47. FOOD & NON-FOOD CONTACT SURFACES CLEANAB...",0,0,1,1,1,0,0,3.0,11.0
251705,2573397,L & M STARLIGHT RESTAURANT INC,L & M STARLIGHT RESTAURANT INC,9890,60652,2023-03-29,CANVASS,"{""9. NO BARE HAND CONTACT WITH RTE FOOD OR A P...",0,0,1,0,1,0,0,5.0,5.0


In [42]:
# Save the inspection data for training to SQL so that it can be
# used by the Shiny App.
db_user = "posit"
db_password = os.environ["CONF23_DB_PASSWORD"]
db_host = os.environ["CONF23_DB_HOST"]
db_port = 5432
db_database = "conf23_python"
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}/{db_database}")
print(f"{engine=}")

# Determine the table name
if connect_username == "sam.edwardes":
    table_name_prefix = ""
else:
    table_name_prefix = re.sub('[^0-9a-zA-Z]+', '_', connect_username) + "_"

table_name = f"{table_name_prefix}model_features"
print(f"{table_name=}")


# Insert the data into postgres. Inserting large amounts of data can be slow, so
# iterate over 10,000 rows at a time.
n_rows = inspection_data_for_training.shape[0]
step_size = 10_000

for i in range(0, n_rows, step_size):
    index_start = i
    index_end = min(n_rows, i + step_size - 1)
    
    if i == 0:
        if_exists = "replace"
    else:
        if_exists = "append"

    print(f"Inserting rows: {index_start:,} - {index_end:,}")

    inspection_data_for_training \
        .reset_index(drop=True) \
        .loc[index_start:index_end, :] \
        .to_sql(table_name, engine, if_exists=if_exists, index=False)


engine=Engine(postgresql+psycopg2://posit:***@database.conf23workflows.training.posit.co/conf23_python)
table_name='gagan_model_features'
Inserting rows: 0 - 9,999
Inserting rows: 10,000 - 11,416


##### Modelling

In [43]:
# Create training and test split
X = inspection_data_for_training.drop(
    columns=[
        "license_",
        "RESULTS",
        "inspection_id",
        "dba_name",
        "aka_name",
        "inspection_type",
        "violations",
        "count_violations",
        "inspection_date",
        "zip",
    ]
)
y = inspection_data_for_training[["RESULTS"]]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [44]:
# Train a random forest model
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, np.ravel(y_train))

In [45]:
# test predictions
y_pred = clf.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)

print(np.sqrt(mse))

0.6256041228847741


### Model deployment using Vetiver

In [46]:
# Create a vetiver model object
v = vetiver.VetiverModel(
    model=clf, 
    model_name=f"{connect_username}/inspection_results", 
    prototype_data=X_train[:1]
)

v

<vetiver.vetiver_model.VetiverModel at 0x7f6d05286850>

In [47]:
# Write the vetiver model as a pin for versioning
model_board = pins.board_connect(
    os.getenv("CONNECT_SERVER"),
    api_key=os.getenv("CONNECT_API_KEY"),
    allow_pickle_read=True
)
vetiver.vetiver_pin_write(model_board, model=v)

Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
Writing pin:
Name: 'gagan/inspection_results'
Version: 20230907T192848Z-da4f4


In [48]:
# Deploy the vetiver model as an API on Posit Connect
rsc_server = os.getenv("CONNECT_SERVER")
rsc_key = os.getenv("CONNECT_API_KEY")
connect_server = rsconnect.api.RSConnectServer(url=rsc_server, api_key=rsc_key)


vetiver.deploy_rsconnect(
    connect_server=connect_server,
    board=model_board,
    pin_name=f"{connect_username}/inspection_results",
)

             Consider creating a requirements.txt file instead.[0m


              Do you need to check your pinned model?
              Using version 178
[0mValidating server...[0m[32;20m 	[OK]
[0m[0mValidating app mode...[0m[32;20m 	[OK]
[0m[0mMaking bundle ...[0m[32;20m 	[OK]
[0m[0mDeploying bundle ...[0m[32;20m 	[OK]
[0m[0mSaving deployed information...[0m[32;20m 	[OK]
[0m[0mBuilding FastAPI application...[0m
[0mBundle created with Python version 3.11.3 is compatible with environment Local with Python version 3.11.3 from /opt/python/3.11.3/bin/python3.11 [0m
[0mBundle requested Python version 3.11.3; using /opt/python/3.11.3/bin/python3.11 which has version 3.11.3[0m
[0m2023/09/07 19:29:21.251717339 [rsc-session] Content GUID: b9fe48dd-4e2d-4b92-8be7-331da686bd9e[0m
[0m2023/09/07 19:29:21.251764523 [rsc-session] Content ID: 82[0m
[0m2023/09/07 19:29:21.251773248 [rsc-session] Bundle ID: 179[0m
[0m2023/09/07 19:29:21.254711168 arguments: /opt/python/3.11.3/bin/python3.11 /opt/rstudio-connect/python/build_environment.p