# Data Validation

In the previous notebook, two pins were saved:

- City of Chicago - Business License Data (RAW): `chicago-business-license-data`
- City of Chicago - Food Inspection Data (RAW): `chicago-food-inspection-data`

## Setup

In [None]:
import pandas as pd
import pandera as pa
import pins
# from ydata_profiling import ProfileReport

In [None]:
# Let pandas render up to 999 columns so wide frames are not elided when displayed.
pd.set_option("display.max_columns", 999)

In [None]:
# Account namespace for the pins: every pin name in this notebook is built
# as "<user_name>/<pin-name>".
user_name = "sam.edwardes"

# Board object used for every pin read and write below.
# NOTE(review): called without arguments, so the connection details are
# presumably taken from the environment — confirm for your deployment.
board = pins.board_connect()

## Data set (1): Business License Data

<https://data.cityofchicago.org/Community-Economic-Development/Business-Licenses/r5kz-chrr>

In [None]:
# Read the raw business license pin from the board.
# NOTE(review): the introduction lists this pin as `chicago-business-license-data`
# (no `-raw` suffix) — confirm which name the previous notebook actually wrote.
pin_name = f"{user_name}/chicago-business-license-data-raw"
business_license_raw = board.pin_read(pin_name)
# Bare last expression so the notebook renders the loaded object.
business_license_raw

In [None]:
# Basic cleaning: keep only the rows whose `state` value is exactly "IL".
# The raw frame is left untouched; the filtered rows get their own name.
business_license_tidy = business_license_raw.loc[
    lambda licenses: licenses["state"].eq("IL")
]

**Tip:** Use multiple cursors in VS Code to easily edit many lines at the same time (<https://code.visualstudio.com/docs/getstarted/tips-and-tricks#_column-box-selection>).

In [None]:
# Expected shape of the (Illinois-only) business license data.
# Every column is coerced to its declared dtype; columns marked
# `nullable=True` may contain missing values, all others may not.
business_license_schema = pa.DataFrameSchema({
    "id": pa.Column(str, coerce=True),
    # License ids must not repeat.
    "license_id": pa.Column(str, coerce=True, unique=True),
    "account_number": pa.Column(str, coerce=True),
    "site_number": pa.Column(str, coerce=True),
    "legal_name": pa.Column(str, coerce=True),
    "doing_business_as_name": pa.Column(str, coerce=True, nullable=True),
    "address": pa.Column(str, coerce=True),
    "city": pa.Column(str, coerce=True, nullable=True),
    # The tidy step already filters on state == "IL"; this guards that assumption.
    "state": pa.Column(str, coerce=True, nullable=True, checks=[
        pa.Check.eq("IL")
    ]),
    # Exactly five digits. The built-in check is evaluated element-wise, so a
    # failure reports the offending values instead of a single False for the
    # whole column.
    "zip_code": pa.Column(str, coerce=True, nullable=True, checks=[
        pa.Check.str_matches(r'^\d{5}$')
    ]),
    "ward": pa.Column(str, coerce=True, nullable=True),
    "precinct": pa.Column(str, coerce=True, nullable=True),
    "ward_precinct": pa.Column(str, coerce=True, nullable=True),
    "police_district": pa.Column(pa.Category, coerce=True, nullable=True),
    "license_code": pa.Column(pa.Category, coerce=True),
    "license_description": pa.Column(str, coerce=True),
    "business_activity_id": pa.Column(str, coerce=True, nullable=True),
    "business_activity": pa.Column(pa.Category, coerce=True, nullable=True),
    "license_number": pa.Column(str, coerce=True),
    "application_type": pa.Column(pa.Category, coerce=True),
    # NOTE(review): kept as str while every other date column is DateTime —
    # confirm this is intentional.
    "application_created_date": pa.Column(str, coerce=True, nullable=True),
    "application_requirements_complete": pa.Column(pa.DateTime, coerce=True),
    "payment_date": pa.Column(pa.DateTime, coerce=True),
    "conditional_approval": pa.Column(str, coerce=True),
    "license_start_date": pa.Column(pa.DateTime, coerce=True),
    "expiration_date": pa.Column(pa.DateTime, coerce=True),
    "license_approved_for_issuance": pa.Column(str, coerce=True),
    "date_issued": pa.Column(pa.DateTime, coerce=True),
    "license_status": pa.Column(pa.Category, coerce=True),
    "license_status_change_date": pa.Column(pa.DateTime, coerce=True, nullable=True),
    "ssa": pa.Column(str, coerce=True, nullable=True),
    # Coarse sanity bounds on the coordinates.
    "latitude": pa.Column(pa.Float, coerce=True, nullable=True, checks=[
        pa.Check.between(38, 44)
    ]),
    "longitude": pa.Column(pa.Float, coerce=True, nullable=True, checks=[
        pa.Check.between(-89, -84)
    ]),
    "location": pa.Column(str, coerce=True),
})

# Validate (and coerce) the tidy frame; raises if any column breaks the schema.
business_license_validated = business_license_schema.validate(business_license_tidy)

In [None]:
# Count how many raw rows do (True) and do not (False) carry this specific id.
# NOTE(review): looks like a one-off spot check of a single record — confirm it is still needed.
business_license_raw["id"].eq("1000929-20021116").value_counts()

In [None]:
# Show the raw row(s) that carry this specific id.
business_license_raw.loc[lambda licenses: licenses["id"].eq("1000929-20021116")]

In [None]:
# Publish the validated business license data to Connect as a versioned pin.
# NOTE(review): the pin is stored as CSV, so the category/datetime dtypes
# coerced during validation presumably will not survive a read-back —
# confirm that downstream readers re-apply them.
pin_name = f"{user_name}/chicago-business-license-data-validated"
board.pin_write(
    business_license_validated,
    name=pin_name,
    title="City of Chicago - Business License Data (VALIDATED)",
    type="csv",
    versioned=True,
)

## Data set (2): Food inspections

<https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5>

In [None]:
# Read the raw food inspection pin from the board; the pin name matches the
# one listed in the introduction of this notebook.
pin_name = f"{user_name}/chicago-food-inspection-data"
food_inspection_raw = board.pin_read(pin_name)
# Bare last expression so the notebook renders the loaded object.
food_inspection_raw

In [None]:
# Expected shape of the food inspection data.
# Every column is coerced to its declared dtype; columns marked
# `nullable=True` may contain missing values, all others may not.
food_inspection_schema = pa.DataFrameSchema({
    # Inspection ids must not repeat.
    "inspection_id": pa.Column(str, coerce=True, unique=True),
    "dba_name": pa.Column(str, coerce=True),
    "aka_name": pa.Column(str, coerce=True, nullable=True),
    "license_": pa.Column(str, coerce=True),
    "facility_type": pa.Column(pa.Category, coerce=True, nullable=True),
    "risk": pa.Column(str, coerce=True),
    "address": pa.Column(str, coerce=True),
    # Every city value must be exactly "CHICAGO".
    "city": pa.Column(str, coerce=True, checks=[
        pa.Check.eq("CHICAGO")
    ]),
    "state": pa.Column(str, coerce=True),
    "zip": pa.Column(str, coerce=True),
    "inspection_date": pa.Column(pa.DateTime, coerce=True),
    "inspection_type": pa.Column(str, coerce=True),
    "results": pa.Column(str, coerce=True),
    "violations": pa.Column(str, coerce=True, nullable=True),
    # Coarse sanity bounds on the coordinates.
    "latitude": pa.Column(pa.Float, coerce=True, nullable=True, checks=[
        pa.Check.between(38, 44)
    ]),
    "longitude": pa.Column(pa.Float, coerce=True, nullable=True, checks=[
        pa.Check.between(-89, -84)
    ]),
    "location": pa.Column(str, coerce=True, nullable=True),
})

# Validate (and coerce) the raw frame; raises if any column breaks the schema.
food_inspection_validated = food_inspection_schema.validate(food_inspection_raw)

# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
food_inspection_validated.info()
food_inspection_validated

In [None]:
# Publish the validated food inspection data to Connect as a versioned pin.
# NOTE(review): the pin is stored as CSV, so the category/datetime dtypes
# coerced during validation presumably will not survive a read-back —
# confirm that downstream readers re-apply them.
pin_name = f"{user_name}/chicago-food-inspection-data-validated"
board.pin_write(
    food_inspection_validated,
    name=pin_name,
    title="City of Chicago - Food Inspection Data (VALIDATED)",
    type="csv",
    versioned=True,
)

# Tmp

In [None]:
# profile = ProfileReport(business_license_raw, title="Profiling Report")
# profile