# LIBARY

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import fiona
import pyogrio
import os

import requests
import zipfile
import os
from io import BytesIO
from bs4 import BeautifulSoup

# DOWNLOAD BUILDING DATA

In [156]:
# URL of the webpage to scrape
url = "https://disasters.geoplatform.gov/USA_Structures/"

def fetch_state_links():
    """Fetches state names and their corresponding links from the webpage."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all("a", href=True)
        return {link.text.strip(): link["href"] for link in links if "Deliverable" in link["href"]}
    else:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        return {}

def get_link_by_state(state_name, state_links):
    """Returns the link for a given state name."""
    return state_links.get(state_name, "State not found")   

In [158]:
def download_and_extract_zip(state_name, state_links):
    """Downloads and extracts a ZIP file from the given URL."""
    url = get_link_by_state(state_name, state_links)
    parent_dir = os.path.dirname(os.getcwd())
    output_dir = os.path.join(parent_dir, 'Data', 'building_data_gdb')

    response = requests.get(url, stream=True)
    if response.status_code == 200:
        os.makedirs(output_dir, exist_ok=True)
        zip_path = os.path.join(output_dir, f"{state_name}_Structures.zip")
        
        with open(zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
        
        print(f"Downloaded and extracted to {output_dir}")
    else:
        print("Failed to download the ZIP file.")

In [166]:
state_links = fetch_state_links()
download_and_extract_zip("California", state_links)

Downloaded and extracted to /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Gradschool/4_SPRING25/Capstone/github/EarthquakeDamageModel_Heinz/Data/building_data_gdb


# READ BUILDING DATA

In [164]:
def get_building_data_directory(stateid="CA"):
    cwd = os.getcwd()

    # get parent directory
    parent_dir = os.path.dirname(cwd)
    # get the building data directory
    building_data_directory = os.path.join(parent_dir, 'Data', 'building_data_gdb')
    # find all folder in the building data directory
    folders = [f for f in os.listdir(building_data_directory) if os.path.isdir(os.path.join(building_data_directory, f))]
    # get the folder that ends with stateid
    stateid_dir= [f for f in folders if f.endswith(f'{stateid}')][0]

    return os.path.join(building_data_directory, stateid_dir, f'{stateid}_Structures.gdb')

def get_building_data_csv(stateid):
    building_data_directory = get_building_data_directory()

    # get the csv file
    return os.path.join(building_data_directory, f'{stateid}.csv')

In [None]:
# if false makedir
def create_directory_if_not_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        print(f"Directory {directory} already exists.")

In [169]:
# Check if a csv file for a state is exists
    # if exists, read it
    # if not, check if the gdb file exists
    # if exists, read it
def read_building_data(stateid):
    building_data_directory = get_building_data_directory(stateid)

    # get the csv file
    csv_path = get_building_data_csv(stateid)

    if os.path.exists(csv_path):
        print(f"Reading {csv_path}")
        return gpd.read_file(csv_path)
    else:
        print(f"{csv_path} does not exist.")
        gdb_path = os.path.join(building_data_directory)
        if os.path.exists(gdb_path):
            print(f"Reading {gdb_path}")
            return gpd.read_file(gdb_path)
        else:
            print(f"{gdb_path} does not exist.")
            print("Please download the gdb file from the USGS website.")

In [4]:
states_data = [
    ("Alabama", "AL"), ("Alaska", "AK"), ("Arizona", "AZ"), ("Arkansas", "AR"),
    ("California", "CA"), ("Colorado", "CO"), ("Connecticut", "CT"), ("Delaware", "DE"),
    ("Florida", "FL"), ("Georgia", "GA"), ("Hawaii", "HI"), ("Idaho", "ID"),
    ("Illinois", "IL"), ("Indiana", "IN"), ("Iowa", "IA"), ("Kansas", "KS"),
    ("Kentucky", "KY"), ("Louisiana", "LA"), ("Maine", "ME"), ("Maryland", "MD"),
    ("Massachusetts", "MA"), ("Michigan", "MI"), ("Minnesota", "MN"), ("Mississippi", "MS"),
    ("Missouri", "MO"), ("Montana", "MT"), ("Nebraska", "NE"), ("Nevada", "NV"),
    ("New Hampshire", "NH"), ("New Jersey", "NJ"), ("New Mexico", "NM"), ("New York", "NY"),
    ("North Carolina", "NC"), ("North Dakota", "ND"), ("Ohio", "OH"), ("Oklahoma", "OK"),
    ("Oregon", "OR"), ("Pennsylvania", "PA"), ("Rhode Island", "RI"), ("South Carolina", "SC"),
    ("South Dakota", "SD"), ("Tennessee", "TN"), ("Texas", "TX"), ("Utah", "UT"),
    ("Vermont", "VT"), ("Virginia", "VA"), ("Washington", "WA"), ("West Virginia", "WV"),
    ("Wisconsin", "WI"), ("Wyoming", "WY")
]

In [None]:
gdf = read_building_data("CA")

/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Gradschool/4_SPRING25/Capstone/github/EarthquakeDamageModel_Heinz/Data/building_data_gdb/Deliverable20230728CA/CA_Structures.gdb/CA.csv does not exist.
Reading /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Gradschool/4_SPRING25/Capstone/github/EarthquakeDamageModel_Heinz/Data/building_data_gdb/Deliverable20230728CA/CA_Structures.gdb


In [168]:
gdf

# AGGREGATE BUILDING DATA

In [31]:
# function to remap OCC_CLS and PRIM_OCC
def remap_occupancy_classes(gdf):
    # Define the mapping dictionaries
    building_data = gdf[['BUILD_ID', 'OCC_CLS', 'PRIM_OCC', 'SQFEET', 'FIPS', 'CENSUSCODE', 'LONGITUDE', 'LATITUDE', 'Shape_Length', 'Shape_Area',
                    'geometry']]
    # mapping the occupancy class
    mapping = {
        'Agriculture':'OTHER', 'Education':'OTHER', 'Residential':'RESIDENTIAL', 'Unclassified':'OTHER',
        'Commercial':'OTHER', 'Government':'OTHER', 'Industrial':'OTHER', 'Utility and Misc':'OTHER',
        'Assembly':'OTHER'
    }
    building_data['OCC_CLS'] = building_data['OCC_CLS'].map(mapping)

    # mapping the primary occupancy
    mapping = {i:'OTHER' for i in building_data['PRIM_OCC'].unique() if i not in ['Single Family Dwelling', 'Multi - Family Dwelling']}
    residential = {'Single Family Dwelling':'SINGLE FAMILY', 'Multi - Family Dwelling':'MULTI FAMILY'}
    mapping.update(residential)
    building_data['PRIM_OCC'] = building_data['PRIM_OCC'].map(mapping)
    return building_data


# function to aggregate the building counts by GEODI, OCC_CLS, PRIM_OCC
def aggregate_building_counts(gdf):
    building_data = remap_occupancy_classes(gdf)
    # group by GEODI, OCC_CLS, PRIM_OCC and sum the counts
    count_building_data = building_data.groupby(['CENSUSCODE', 'OCC_CLS', 'PRIM_OCC']).agg({'BUILD_ID':'count', 'SQFEET':'sum'}).reset_index()
    # rename the columns
    count_building_data = count_building_data.rename(columns={'BUILD_ID':'COUNT'})
    return count_building_data

In [32]:
count_building_data = aggregate_building_counts(gdf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [33]:
count_building_data

Unnamed: 0,CENSUSCODE,OCC_CLS,PRIM_OCC,COUNT,SQFEET
0,06001400100,OTHER,OTHER,104,5.108791e+05
1,06001400100,RESIDENTIAL,MULTI FAMILY,22,3.491409e+04
2,06001400100,RESIDENTIAL,OTHER,10,1.186831e+05
3,06001400100,RESIDENTIAL,SINGLE FAMILY,1162,2.378443e+06
4,06001400200,OTHER,OTHER,46,3.368561e+05
...,...,...,...,...,...
34196,06115041101,RESIDENTIAL,SINGLE FAMILY,903,1.527391e+06
34197,06115041102,OTHER,OTHER,189,3.991922e+05
34198,06115041102,RESIDENTIAL,MULTI FAMILY,67,1.241931e+05
34199,06115041102,RESIDENTIAL,OTHER,354,5.515065e+05


In [37]:
def pivot_building_data(count_building_data):
    df = count_building_data.copy()

    # Create a pivot table
    df_pivot = df.pivot_table(index="CENSUSCODE", columns=["OCC_CLS", "PRIM_OCC"], values="COUNT", aggfunc="sum", fill_value=0)

    # Flatten MultiIndex columns
    df_pivot.columns = [f"{col[0]}_{col[1]}" for col in df_pivot.columns]
    df_pivot = df_pivot.reset_index()

    return df_pivot

In [38]:
df_pivot = pivot_building_data(count_building_data)

In [39]:
df_pivot

Unnamed: 0,CENSUSCODE,OTHER_OTHER,RESIDENTIAL_MULTI FAMILY,RESIDENTIAL_OTHER,RESIDENTIAL_SINGLE FAMILY
0,06001400100,104,22,10,1162
1,06001400200,46,110,2,538
2,06001400300,67,416,7,1139
3,06001400400,57,391,3,777
4,06001400500,56,342,6,614
...,...,...,...,...,...
9108,06115040902,759,0,0,0
9109,06115041001,579,90,360,1807
9110,06115041002,270,155,628,1771
9111,06115041101,373,86,596,903
