# Data Validation

In the previous notebook, two tables were saved:

- City of Chicago - Business License Data (RAW): `chicago-business-license-data`
- City of Chicago - Food Inspection Data (RAW): `chicago-food-inspection-data`

## Setup

In [None]:
import os
import re

import ibis
import numpy as np
import pandas as pd
import pandera as pa
import requests
from sqlalchemy import create_engine, text

In [None]:
# Render up to 999 columns when displaying wide data frames.
pd.set_option("display.max_columns", 999)

In [None]:
# Database details. The password and host are read from the environment so
# that no secrets are stored in the notebook.
db_user = "posit"
db_password = os.environ["CONF23_DB_PASSWORD"]
db_host = os.environ["CONF23_DB_HOST"]
db_port = 5432
db_database = "conf23_python"

# Set up sqlalchemy for writing data. The port is passed explicitly so that
# this engine and the ibis connection below are configured from the same
# host/port/database settings.
engine = create_engine(
    f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_database}"
)

# Set up ibis for reading data
con = ibis.postgres.connect(
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
    database=db_database
)

Set dynamic variables. To ensure that we do not overload the database or the server, only the instructor's scripts will run on the full data set.

In [None]:
# Ask the Connect server which user owns the API key in the environment.
connect_response = requests.get(
    f"{os.environ['CONNECT_SERVER']}/__api__/v1/user",
    headers={"Authorization": f"Key {os.environ['CONNECT_API_KEY']}"},
    timeout=30,  # do not let the notebook hang forever on an unresponsive server
)

# Fail with a clear HTTP error (e.g. invalid API key) instead of an opaque
# KeyError when "username" is missing from an error payload.
connect_response.raise_for_status()
connect_username = connect_response.json()["username"]

connect_username

In [None]:
# The instructor account reads (effectively) every row; all other users are
# capped at 10,000 rows.
max_rows = 99_999_999 if connect_username == "sam.edwardes" else 10_000

max_rows

## Load raw data

Use `ibis` to read the data from Postgres.

In [None]:
# Read at most `max_rows` rows of the raw business license table into pandas.
business_license_raw = (
    con.table("business_license_raw")
    .limit(max_rows)
    .to_pandas()
)

In [None]:
# Read at most `max_rows` rows of the raw food inspection table into pandas.
food_inspection_raw = (
    con.table("food_inspection_raw")
    .limit(max_rows)
    .to_pandas()
)

## Data Exploration

Does each license have only one row in the table?

In [None]:
# value_counts() already returns counts sorted from most to least frequent,
# so any license_id that appears more than once is listed first.
business_license_raw["license_id"].value_counts()

Does all the data relate to Chicago?

In [None]:
# value_counts() already sorts by frequency (descending); no extra sort needed.
business_license_raw["city"].value_counts()

In [None]:
# value_counts() already sorts by frequency (descending); no extra sort needed.
business_license_raw["state"].value_counts()

### Food inspection data

What are the different risk levels?

In [None]:
# Horizontal bar chart of how often each risk level occurs.
food_inspection_raw["risk"].value_counts().plot.barh()

What are the most common violations?

In [None]:
# value_counts() already sorts by frequency (descending); no extra sort needed.
food_inspection_raw["violations"].value_counts()

What are the most common outcomes?

In [None]:
# Horizontal bar chart of how often each inspection result occurs.
food_inspection_raw["results"].value_counts().plot.barh()

What are the most common facility types?

In [None]:
# value_counts() already sorts by frequency (descending), so head(25) gives
# the 25 most common facility types.
food_inspection_raw["facility_type"].value_counts().head(25)

## Data set (1): Business License Data

<https://data.cityofchicago.org/Community-Economic-Development/Business-Licenses/r5kz-chrr>

In [None]:
# Display the raw business license data as loaded from Postgres.
business_license_raw

**Data tidying**

Apply some basic tidying steps to the data.

In [None]:
# Tidy a separate (deep) copy so the raw data frame is left untouched.
business_license_tidy = business_license_raw.copy(deep=True)
business_license_tidy

In [None]:
# Filter to only keep state of IL. The state is upper-cased first so that
# differently-cased spellings of "IL" are kept as well.
business_license_tidy = (
    business_license_tidy
    .assign(state=lambda df: df["state"].str.upper())
    .loc[lambda df: df["state"] == "IL"]
)

In [None]:
# Filter to only keep city of Chicago. The city is upper-cased first so that
# differently-cased spellings of "CHICAGO" are kept as well.
business_license_tidy = (
    business_license_tidy
    .assign(city=lambda df: df["city"].str.upper())
    .loc[lambda df: df["city"] == "CHICAGO"]
)

In [None]:
# Convert conditional approval to a boolean value: True when the upper-cased
# value is "Y", False for anything else (including missing values). The
# comparison is vectorized with `.eq` instead of a Python-level `apply`.
business_license_tidy = business_license_tidy.assign(
    conditional_approval=business_license_tidy["conditional_approval"].str.upper().eq("Y")
)

In [None]:
# The "location" column duplicates what "latitude" and "longitude" already
# hold, so remove it.
business_license_tidy = business_license_tidy.drop("location", axis="columns")

In [None]:
# Reset the index. The row filters above removed rows, leaving gaps in the
# index; drop=True discards the old index and renumbers the rows from 0.
business_license_tidy = business_license_tidy.reset_index(drop=True)

**Data validation**

Use pandera to validate the data and convert each column to the correct type.

In [None]:
# Schema for the tidied business license data. Every column is coerced to the
# declared type; columns marked nullable=True may contain missing values.
business_license_schema = pa.DataFrameSchema(
    columns={
        "id": pa.Column(str, coerce=True),
        "license_id": pa.Column(str, coerce=True, unique=True), # Primary Key
        "account_number": pa.Column(str, coerce=True),
        "site_number": pa.Column(str, coerce=True),
        "legal_name": pa.Column(str, coerce=True),
        "doing_business_as_name": pa.Column(str, coerce=True, nullable=True),
        "address": pa.Column(str, coerce=True),
        "city": pa.Column(str, coerce=True, nullable=True, checks=[
            pa.Check.eq("CHICAGO")
        ]),
        "state": pa.Column(str, coerce=True, nullable=True, checks=[
            pa.Check.eq("IL")
        ]),
        # Zip codes must be exactly five digits. The built-in element-wise
        # check reports the offending values when validation fails, rather
        # than a single pass/fail result for the whole column.
        "zip_code": pa.Column(str, coerce=True, nullable=True, checks=[
            pa.Check.str_matches(r'^\d{5}$')
        ]),
        "ward": pa.Column(str, coerce=True, nullable=True),
        "precinct": pa.Column(str, coerce=True, nullable=True),
        "ward_precinct": pa.Column(str, coerce=True, nullable=True),
        "police_district": pa.Column(pa.Category, coerce=True, nullable=True),
        "license_code": pa.Column(pa.Category, coerce=True),
        "license_description": pa.Column(str, coerce=True),
        "business_activity_id": pa.Column(str, coerce=True, nullable=True),
        "business_activity": pa.Column(pa.Category, coerce=True, nullable=True),
        "license_number": pa.Column(str, coerce=True),
        "application_type": pa.Column(pa.Category, coerce=True),
        # NOTE(review): this is the only date-like column kept as str; the
        # others are coerced to DateTime. Confirm whether that is intentional.
        "application_created_date": pa.Column(str, coerce=True, nullable=True),
        "application_requirements_complete": pa.Column(pa.DateTime, coerce=True, nullable=True),
        "payment_date": pa.Column(pa.DateTime, coerce=True, nullable=True),
        "conditional_approval": pa.Column(bool, coerce=True),
        "license_start_date": pa.Column(pa.DateTime, coerce=True, nullable=True),
        "expiration_date": pa.Column(pa.DateTime, coerce=True, nullable=True),
        "license_approved_for_issuance": pa.Column(pa.DateTime, coerce=True, nullable=True),
        "date_issued": pa.Column(pa.DateTime, coerce=True),
        "license_status": pa.Column(pa.Category, coerce=True),
        "license_status_change_date": pa.Column(pa.DateTime, coerce=True, nullable=True),
        "ssa": pa.Column(str, coerce=True, nullable=True),
        # Coordinates are restricted to a bounding box of 38..44 degrees
        # latitude and -89..-84 degrees longitude.
        "latitude": pa.Column(pa.Float, coerce=True, nullable=True, checks=[
            pa.Check.between(38, 44)
        ]),
        "longitude": pa.Column(pa.Float, coerce=True, nullable=True, checks=[
            pa.Check.between(-89, -84)
        ]),
    }
)


# Validate (and coerce) the tidy data; raises if any check fails.
business_license_validated = business_license_schema.validate(business_license_tidy)
business_license_validated

Insert the data into postgresql.

In [None]:
# Determine the table name prefix. The instructor writes to the un-prefixed
# table; every other user gets a prefix derived from their username so that
# users do not overwrite each other's tables.
if connect_username == "sam.edwardes":
    table_name_prefix = ""
else:
    # Collapse each run of non-alphanumeric characters into a single "_" so
    # the username is safe to embed in a table name. Requires `re`, which is
    # imported in the setup cell at the top of the notebook.
    table_name_prefix = re.sub(r"[^0-9a-zA-Z]+", "_", connect_username) + "_"

In [None]:
# Full name of the destination table: the per-user prefix plus a fixed suffix.
table_name = table_name_prefix + "business_license_validated"
table_name

In [None]:
# Insert the data into postgres. Inserting large amounts of data can be slow, so
# iterate over 10,000 rows at a time.

n_rows = business_license_validated.shape[0]
step_size = 10_000

for i in range(0, n_rows, step_size):
    index_start = i
    # Exclusive end position of this chunk, clamped to the size of the frame.
    index_end = min(n_rows, i + step_size)

    # The first chunk replaces any existing table; later chunks are appended.
    if_exists = "replace" if i == 0 else "append"

    # Report the inclusive range of row positions being written.
    print(f"Inserting rows: {index_start:,} - {index_end - 1:,}")

    # Slice by position (iloc) rather than by label (loc) so each chunk holds
    # exactly the intended rows regardless of what the frame's index looks like.
    business_license_validated \
        .iloc[index_start:index_end, :] \
        .to_sql(table_name, engine, if_exists=if_exists, index=False)

In [None]:
# Confirm number of rows by counting what is now stored in the table.
query = text(f"SELECT COUNT(*) FROM {table_name}")

with engine.begin() as conn:
    number_of_rows = pd.read_sql_query(sql=query, con=conn)

print(number_of_rows)

## Data set (2): Food inspections

<https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5>

In [None]:
# Display the raw food inspection data as loaded from Postgres.
food_inspection_raw

**Data cleaning**

Apply some basic cleaning steps to the data.

In [None]:
# Tidy a separate (deep) copy so the raw data frame is left untouched.
food_inspection_tidy = food_inspection_raw.copy(deep=True)

In [None]:
# Filter to only keep state of IL. The state is upper-cased first so that
# differently-cased spellings of "IL" are kept as well.
food_inspection_tidy = (
    food_inspection_tidy
    .assign(state=lambda df: df["state"].str.upper())
    .loc[lambda df: df["state"] == "IL"]
)

In [None]:
# Filter to only keep city of Chicago. The city is upper-cased first so that
# differently-cased spellings of "CHICAGO" are kept as well.
food_inspection_tidy = (
    food_inspection_tidy
    .assign(city=lambda df: df["city"].str.upper())
    .loc[lambda df: df["city"] == "CHICAGO"]
)

In [None]:
# Drop columns that also exist in the business license data.
duplicated_columns = ["address", "city", "state", "latitude", "longitude", "location"]
food_inspection_tidy = food_inspection_tidy.drop(duplicated_columns, axis="columns")

In [None]:
# Convert categorical columns to be all upper case for consistency
upper_case_columns = [
    "dba_name",
    "aka_name",
    "facility_type",
    "risk",
    "inspection_type",
    "results",
    "violations",
]

food_inspection_tidy = food_inspection_tidy.assign(
    **{column: food_inspection_tidy[column].str.upper() for column in upper_case_columns}
)

In [None]:
# Specify the order of categorical columns. Values outside the listed
# categories become missing.
food_inspection_tidy = food_inspection_tidy.assign(
    risk=lambda x: x["risk"].astype(
        pd.CategoricalDtype(
            categories=["ALL", "RISK 1 (HIGH)", "RISK 2 (MEDIUM)", "RISK 3 (LOW)"],
            ordered=True,
        )
    )
)

In [None]:
# The "violations" can have multiple violations separated by a "|". E.g.
# "32. FOOD AND NON-FOOD ... REPLACED. | 33. FOOD AND NON-FOOD CONTACT E"
# To make the data easier to work with split each violation into its own item.
# The result is the violations column will contain a list of strings.
# The pattern is a raw string: "\|" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
food_inspection_tidy = food_inspection_tidy.assign(
    violations=lambda x: x["violations"].str.split(pat=r" \| ")
)

In [None]:
# The row filters above removed rows, leaving gaps in the index; drop=True
# discards the old index and renumbers the rows from 0.
food_inspection_tidy = food_inspection_tidy.reset_index(drop=True)

**Data validation**

Use pandera to validate the data and convert each column to the correct type.

In [None]:
# Schema for the tidied food inspection data. Every column is coerced to the
# declared type; columns marked nullable=True may contain missing values.
food_inspection_schema = pa.DataFrameSchema(
    columns={
        "inspection_id": pa.Column(str, coerce=True, unique=True),  # Primary Key
        "dba_name": pa.Column(str, coerce=True),
        "aka_name": pa.Column(str, coerce=True, nullable=True),
        "license_": pa.Column(str, coerce=True, nullable=True),  # Foreign Key
        "facility_type": pa.Column(pa.Category, coerce=True, nullable=True),
        # Risk must be one of the known levels (or missing).
        "risk": pa.Column(
            str,
            coerce=True,
            nullable=True,
            checks=[
                pa.Check.isin(["ALL", "RISK 1 (HIGH)", "RISK 2 (MEDIUM)", "RISK 3 (LOW)"])
            ],
        ),
        "zip": pa.Column(str, coerce=True, nullable=True),
        "inspection_date": pa.Column(pa.DateTime, coerce=True),
        "inspection_type": pa.Column(pa.Category, coerce=True, nullable=True),
        "results": pa.Column(pa.Category, coerce=True),
        "violations": pa.Column(pa.Object, coerce=True, nullable=True),
    }
)

# Validate (and coerce) the tidy data; raises if any check fails.
food_inspection_validated = food_inspection_schema.validate(food_inspection_tidy)
food_inspection_validated

Insert the data into postgresql.

In [None]:
# Determine the table name. The instructor writes to the un-prefixed table;
# every other user gets a prefix derived from their own username so that
# users do not overwrite each other's tables.
if connect_username == "sam.edwardes":
    table_name_prefix = ""
else:
    # Use the current user's name (not a hard-coded one) and collapse each run
    # of non-alphanumeric characters into "_" so it is safe in a table name.
    # Requires `re`, imported in the setup cell at the top of the notebook.
    table_name_prefix = re.sub(r"[^0-9a-zA-Z]+", "_", connect_username) + "_"

table_name = f"{table_name_prefix}food_inspection_validated"

table_name

In [None]:

# Insert the data into postgres. Inserting large amounts of data can be slow, so
# iterate over 10,000 rows at a time.

n_rows = food_inspection_validated.shape[0]
step_size = 10_000

for i in range(0, n_rows, step_size):
    index_start = i
    # Exclusive end position of this chunk, clamped to the size of the frame.
    index_end = min(n_rows, i + step_size)

    # The first chunk replaces any existing table; later chunks are appended.
    if_exists = "replace" if i == 0 else "append"

    # Report the inclusive range of row positions being written.
    print(f"Inserting rows: {index_start:,} - {index_end - 1:,}")

    # Slice by position (iloc) rather than by label (loc) so each chunk holds
    # exactly the intended rows regardless of what the frame's index looks like.
    food_inspection_validated \
        .iloc[index_start:index_end, :] \
        .to_sql(table_name, engine, if_exists=if_exists, index=False)

In [None]:
# Confirm number of rows by counting what is now stored in the table.
query = text(f"SELECT COUNT(*) FROM {table_name}")

with engine.begin() as conn:
    number_of_rows = pd.read_sql_query(sql=query, con=conn)

print(number_of_rows)