Code generated by Codeium, using the HTML file as context.

In [36]:
import os
import glob
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

# Define the folder where HTML files are stored
html_folder = "./mock_roe_4"

# Business attributes looked up in each table, in the order they appear as
# CSV columns (after "postal_code").
BUSINESS_FIELDS = (
    "business_id",
    "address",
    "city",
    "phone_number",
    "tax_code",
    "business_certificate",
    "owner_name",
    "owner_address",
    "latitude",
    "longitude",
    "owner_city",
    "owner_state",
    "owner_zip",
)


def extract_postal_code(heading_text):
    """Return the last whitespace-separated word of a heading, or None.

    Args:
        heading_text: Text of a page's <h1> element, or None when the page
            has no heading.

    Returns:
        The final word of the heading (used as the postal code), or None when
        the heading is missing or blank.
    """
    words = (heading_text or "").split()
    return words[-1] if words else None


def build_record(postal_code, business_data):
    """Build one output row from the key/value pairs parsed out of a table.

    Args:
        postal_code: Postal code taken from the page heading (may be None).
        business_data: Mapping of field name -> text value read from a table.

    Returns:
        A dict with "postal_code" followed by every name in BUSINESS_FIELDS.
        Fields absent from ``business_data`` are set to None instead of
        raising KeyError, so one incomplete table cannot abort the whole run.
    """
    record = {"postal_code": postal_code}
    record.update((field, business_data.get(field)) for field in BUSINESS_FIELDS)
    return record


# List all HTML files in the folder. glob returns them in arbitrary order, so
# sort to make the row order of the output reproducible between runs.
html_files = sorted(glob.glob(os.path.join(html_folder, "*.html")))

# List to store extracted data
data = []

# Extract data from each HTML file
for file in html_files:
    with open(file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Extract postal code from the heading. It is recomputed for every file so
    # that a page without an <h1> yields None rather than silently reusing the
    # postal code of the previously processed file.
    h1_tag = soup.find("h1")
    postal_code = extract_postal_code(h1_tag.text if h1_tag else None)

    # Find all tables
    tables = soup.find_all("table")

    for table in tables:
        rows = table.find_all("tr")
        business_data = {}

        # Each 4-cell row holds two key/value pairs: key, value, key, value.
        for row in rows:
            cells = row.find_all("td")
            if len(cells) == 4:
                key1, val1, key2, val2 = [cell.text.strip() for cell in cells]
                business_data[key1] = val1
                business_data[key2] = val2

        # Tables with no key/value rows carry no business data; skip them.
        if not business_data:
            continue

        data.append(build_record(postal_code, business_data))

# Convert list to DataFrame. The columns are given explicitly so the schema is
# stable even when no rows were extracted.
df = pd.DataFrame(data, columns=["postal_code", *BUSINESS_FIELDS])

# Save to CSV file
df.to_csv("business_data.csv", index=False)

In [38]:
# Restrict the businesses to the postal code under study.
in_target_postal_code = df["postal_code"] == "94110"
wdata = df[in_target_postal_code]

In [39]:
# Display the rows selected for postal code 94110.
wdata

Unnamed: 0,postal_code,business_id,address,city,phone_number,tax_code,business_certificate,owner_name,owner_address,latitude,longitude,owner_city,owner_state,owner_zip
2483,94110,45,3202 FOLSOM St,S.F.,,H24,340024,"HARB, CHARLES AND KRISTIN",1150 SANCHEZ,37.7471,-122.414,S.F.,CA,94114
2484,94110,95,419 CORTLAND Ave,S.F.,,H24,380576,"LIGURIA INVESTMENTS (Giuseppe Manna, Vega Free...",419 Cortland Avenue,37.7392,-122.417,S.F.,CA,94110
2485,94110,180,3459 MISSION St,SF,14155288985,H24,452950,"LEMUS, ANA",3459 MISSION St,37.7415,-122.422,SF,CA,94110
2486,94110,184,1109 VALENCIA St,SF,14155827659,H24,341342,"NAVARRETE, VICTOR",1109 VALENCIA St,37.7551,-122.421,SF,CA,94110
2487,94110,217,400 CORTLAND Ave,S.F.,,H24,347088,"BENARAFA, A.A. AND BOUJEBHA, M",780 25TH AVE. #3,37.7391,-122.418,S.F.,CA,94121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066,94110,88525,2805 Mission St,,14155253309,H25,1016611,"24th Street Mac, Inc",588 Sutter St #236,,,San Francisco,CA,94102
3067,94110,88575,2154 Mission St,,14155611716,H24,1039737,Almeer Food Inc.,273 88th St. Apt. #1,,,Daly City,CA,94015
3068,94110,88687,2948 Folsom St,San Francisco,,H74,1036368,Eli Goldstein,"477 O'Farrell St, Apt 502",,,San Francisco,CA,94102
3069,94110,89191,1501 Cortland Ave,San Francisco,,H74,1029359,"Culinary Eye, Inc.",1501 Cortland Ave.,,,San Francisco,CA,34110


In [None]:
"""
Cell generated by Data Wrangler.
"""
# NOTE: the Data Wrangler filter below is disabled (commented out); the rest
# of the notebook works on the unfiltered `wdata`.
# If re-enabled, it keeps rows whose address contains the literal substring
# "St", matched case-insensitively (case=False), so any address containing
# "st" anywhere would also pass — NOTE(review): confirm that is the intent.
# def clean_data(wdata):
#     # Filter rows based on column: 'address'
#     wdata = wdata[wdata['address'].str.contains("St", regex=False, na=False, case=False)]
#     return wdata

# wdata_clean = clean_data(wdata.copy())
# wdata_clean.head()

In [28]:
# Keep only the join key and the postal code; the other business attributes
# are not used by the cells that follow.
kept_columns = ["business_id", "postal_code"]
wdata = wdata[kept_columns]

In [29]:
# Convert business_id from text to a numeric dtype so it can be joined on the
# integer business_id of the violations table; values that cannot be parsed
# become NaN (errors='coerce').
# assign() returns a new frame instead of writing into a frame derived from a
# slice of `df`, which avoids pandas' SettingWithCopyWarning, and calling
# pd.to_numeric on the Series replaces the column-wise DataFrame.apply.
wdata = wdata.assign(business_id=pd.to_numeric(wdata['business_id'], errors='coerce'))

In [11]:
# Check the column dtypes after the numeric conversion of business_id.
wdata.dtypes

business_id     int64
postal_code    object
dtype: object

In [30]:
import sqlalchemy as sql

# Connection URL of the SQLite database and the query that loads every row of
# its `violations` table.
violations_db_url = "sqlite:///./mock_roe_4/violations.db"
violations_query = "select * from violations"

engine = sql.create_engine(violations_db_url)
db_data = pd.read_sql(violations_query, engine)

In [31]:
# Display the violations table loaded from the SQLite database.
db_data

Unnamed: 0,business_id,date,violation_type_id,risk_category,description
0,10,2014-07-29,103129,Moderate Risk,Insufficient hot water or running water
1,10,2014-07-29,103144,Low Risk,Unapproved or unmaintained equipment or utensils
2,10,2014-01-14,103119,Moderate Risk,Inadequate and inaccessible handwashing facili...
3,10,2014-01-14,103145,Low Risk,Improper storage of equipment utensils or linens
4,10,2014-01-14,103154,Low Risk,Unclean or degraded floors walls or ceilings
...,...,...,...,...,...
36045,88878,2016-08-19,103144,Low Risk,Unapproved or unmaintained equipment or utensils
36046,88878,2016-08-19,103124,Moderate Risk,Inadequately cleaned or sanitized food contact...
36047,89072,2016-09-22,103120,Moderate Risk,Moderate risk food holding temperature
36048,89072,2016-09-22,103131,Moderate Risk,Moderate risk vermin infestation


In [14]:
# Inspect the column dtypes of the violations table; business_id is the key
# used for the merge that follows.
db_data.dtypes

business_id           int64
date                 object
violation_type_id    object
risk_category        object
description          object
dtype: object

In [32]:
# Inner join on business_id: keeps only violations that belong to the selected
# businesses, and only businesses that have at least one violation row.
new_data = wdata.merge(db_data, how="inner", on="business_id")

In [33]:
# Sanity check: number of non-null values per column for each postal code in
# the merged data.
rows_per_postal_code = new_data.groupby("postal_code").count()
rows_per_postal_code

Unnamed: 0_level_0,business_id,date,violation_type_id,risk_category,description
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
94110,3234,3234,3234,3234,3234


In [34]:
# Parse the date strings so the weekday can be derived from them.
new_data['date'] = pd.to_datetime(new_data['date'])

# Select Moderate Risk violations dated on a Monday (dayofweek == 0).
is_moderate_risk = new_data['risk_category'] == 'Moderate Risk'
is_monday = new_data['date'].dt.dayofweek == 0
filtered_data = new_data[is_moderate_risk & is_monday]

# One row per business with the number of matching violations.
violation_counts = (
    filtered_data
    .groupby('business_id')
    .size()
    .reset_index(name='violation_count')
)

print(violation_counts)

     business_id  violation_count
0             45                5
1             95                2
2            217                1
3            580                2
4            821                1
..           ...              ...
133        84195                3
134        84573                2
135        85953                1
136        86350                2
137        86386                1

[138 rows x 2 columns]


In [35]:
# Total number of Monday / Moderate Risk violations across all businesses.
total_violations = violation_counts["violation_count"].sum()
total_violations

np.int64(249)