Code generated by Codeium, using the HTML file as context.

In [2]:
import os
import glob
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

# Scrape every HTML page in `html_folder` into one table of business records
# (`df`) and save it as CSV. Each <table> that contains key/value rows yields
# one record; the postal code is taken from the page's <h1> heading.

# Define the folder where HTML files are stored
html_folder = "../../mock4/mock_roe_4"

# Fields read from each table, in the column order of the output CSV.
# (postal_code comes from the page heading, not from the table.)
business_fields = [
    "business_id",
    "address",
    "city",
    "phone_number",
    "tax_code",
    "business_certificate",
    "owner_name",
    "owner_address",
    "latitude",
    "longitude",
    "owner_city",
    "owner_state",
    "owner_zip",
]

# List all HTML files in the folder. glob returns paths in arbitrary order,
# so sort them to make the row order of `df` reproducible between runs.
html_files = sorted(glob.glob(os.path.join(html_folder, "*.html")))

# List to store extracted data
data = []

# Extract data from each HTML file
for file in html_files:
    # BeautifulSoup reads the whole handle in its constructor, so the file
    # can be closed before the parsed tree is walked.
    with open(file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Extract postal code from the heading (its last whitespace-separated word).
    # Reset for every file: without this, a page lacking a usable <h1> would
    # silently reuse the previous file's postal code, or raise NameError if it
    # happened to be the first file.
    postal_code = None
    h1_tag = soup.find("h1")
    if h1_tag:
        heading_words = h1_tag.text.split()
        if heading_words:  # an empty <h1> has no last word to take
            postal_code = heading_words[-1]

    # Find all tables
    for table in soup.find_all("table"):
        business_data = {}

        # Rows with exactly four cells are read as two key/value pairs
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) == 4:
                key1, val1, key2, val2 = [cell.text.strip() for cell in cells]
                business_data[key1] = val1
                business_data[key2] = val2

        # A table that produced no key/value pairs is not a business record
        if not business_data:
            continue

        # .get() leaves a missing field empty (None) instead of aborting the
        # whole scrape with a KeyError on one incomplete table.
        record = {"postal_code": postal_code}
        record.update({field: business_data.get(field) for field in business_fields})
        data.append(record)

# Convert list to DataFrame; explicit columns keep the schema stable even when
# no records were found, so later cells can still select columns by name.
df = pd.DataFrame(data, columns=["postal_code"] + business_fields)

# Save to CSV file
df.to_csv("../../mock4/business_data.csv", index=False)

In [19]:
df.head()

Unnamed: 0,postal_code,business_id,address,city,phone_number,tax_code,business_certificate,owner_name,owner_address,latitude,longitude,owner_city,owner_state,owner_zip
0,92672,64660,1530 Haight St,SF,14155250643.0,H08,319919,"Haight Street Market, Inc. / Konstantinos Vard...",1530 Market Street,37.77,-122.448,San Francisco,CA,94117
1,94013,37167,888 Howard St 2nd Floor,SF,,H25,424376,"CDC SF, LLC",2041 Rosecrans Avenue P.O.Box 916,37.7817,-122.405,El Segundo,CA,90245
2,94013,37169,888 Howard St 4th Floor,SF,,H24,424376,"CDC SF, LLC",2041 Rosecrans Ave. P.O. Box 916,37.7817,-122.405,El Segundo,CA,90245
3,94014,74674,101 Bayshore Blvd,SF,14155831168.0,H79,478579,Ann Yip,41 Lighthouse Lane,,,Daly City,CA,94014
4,94014,80829,428 11th St,san Francisco CA,14155735897.0,H79,476141,Seung Kyun Han,776 Easton Ave,,,San Bruno,CA,94060


In [17]:
df.shape

(6356, 14)

In [3]:
# Restrict the scraped businesses to postal code 94110 (compared as a string)
wdata = df.loc[df["postal_code"].eq("94110")]

In [4]:
"""
Cell generated by Data Wrangler.
"""
# NOTE: everything below is commented out, so this cell applies no filtering
# and defines nothing; `wdata` is left exactly as the previous cell produced it.
# Disabled draft: keep only rows whose 'address' contains "St"
# (literal match, case-insensitive, rows with a missing address excluded).
# def clean_data(wdata):
#     # Filter rows based on column: 'address'
#     wdata = wdata[wdata['address'].str.contains("St", regex=False, na=False, case=False)]
#     return wdata

# wdata_clean = clean_data(wdata.copy())
# wdata_clean.head()

'\nCell generated by Data Wrangler.\n'

In [5]:
# Keep only the join key and the postal code. The explicit copy makes `wdata`
# an independent frame rather than a selection derived from `df`, so assigning
# to its columns afterwards cannot trigger SettingWithCopyWarning.
wdata = wdata[["business_id", "postal_code"]].copy()

In [6]:
# Convert business_id from scraped text to a numeric dtype so it can be joined
# against the integer business_id of the violations table. Values that cannot
# be parsed become NaN (errors="coerce").
# assign() returns a new frame instead of writing into a slice of `df`, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
wdata = wdata.assign(business_id=pd.to_numeric(wdata["business_id"], errors="coerce"))

In [7]:
wdata.dtypes

business_id     int64
postal_code    object
dtype: object

In [18]:
wdata.head()

Unnamed: 0,business_id,postal_code
2483,45,94110
2484,95,94110
2485,180,94110
2486,184,94110
2487,217,94110


In [9]:
import sqlalchemy as sql

# Read the complete `violations` table from the SQLite database into pandas
db_url = "sqlite:///../../mock4/mock_roe_4/violations.db"
engine = sql.create_engine(db_url)
db_data = pd.read_sql(sql="select * from violations", con=engine)

In [None]:
# Can also use sqlite3 (standard library) instead of SQLAlchemy.
# The path below matches the database file used in the SQLAlchemy URL above.
# import sqlite3
# conn = sqlite3.connect("../../mock4/mock_roe_4/violations.db")
# db_data_2 = pd.read_sql("select * from violations", conn)

In [10]:
db_data

Unnamed: 0,business_id,date,violation_type_id,risk_category,description
0,10,2014-07-29,103129,Moderate Risk,Insufficient hot water or running water
1,10,2014-07-29,103144,Low Risk,Unapproved or unmaintained equipment or utensils
2,10,2014-01-14,103119,Moderate Risk,Inadequate and inaccessible handwashing facili...
3,10,2014-01-14,103145,Low Risk,Improper storage of equipment utensils or linens
4,10,2014-01-14,103154,Low Risk,Unclean or degraded floors walls or ceilings
...,...,...,...,...,...
36045,88878,2016-08-19,103144,Low Risk,Unapproved or unmaintained equipment or utensils
36046,88878,2016-08-19,103124,Moderate Risk,Inadequately cleaned or sanitized food contact...
36047,89072,2016-09-22,103120,Moderate Risk,Moderate risk food holding temperature
36048,89072,2016-09-22,103131,Moderate Risk,Moderate risk vermin infestation


In [11]:
db_data.dtypes

business_id           int64
date                 object
violation_type_id    object
risk_category        object
description          object
dtype: object

In [12]:
# Attach each violation to its 94110 business; the inner join drops
# businesses without violations and violations of other businesses
new_data = wdata.merge(db_data, how="inner", on="business_id")

In [14]:
# Parse the violation dates so the weekday can be derived from them
new_data['date'] = pd.to_datetime(new_data['date'])

# Keep Moderate Risk violations that were recorded on a Monday (dayofweek 0)
is_moderate = new_data['risk_category'].eq('Moderate Risk')
is_monday = new_data['date'].dt.dayofweek.eq(0)
filtered_data = new_data[is_moderate & is_monday]

# One row per business with the number of matching violations
violation_counts = (
    filtered_data
    .groupby('business_id')
    .size()
    .reset_index(name='violation_count')
)

print(violation_counts)

     business_id  violation_count
0             45                5
1             95                2
2            217                1
3            580                2
4            821                1
..           ...              ...
133        84195                3
134        84573                2
135        85953                1
136        86350                2
137        86386                1

[138 rows x 2 columns]


In [16]:
# Q1 answer: total number of Moderate Risk violations recorded on a Monday
# for businesses in postal code 94110 (sum of the per-business counts).
violation_counts["violation_count"].sum()

np.int64(249)