In [1]:
# Imports essential libraries for data manipulation (pandas), spatial data processing (geopandas), database connection (sqlalchemy), and mapping (leafmap).

import getpass
import os
from urllib.parse import quote_plus

import geopandas as gpd
import leafmap
import pandas as pd
from sqlalchemy import create_engine, text

# Set up DB connection parameters within PostGIS

In [2]:
# Database connection settings.
# Values are taken from the standard libpq environment variables when they are set,
# so credentials do not have to be committed with the notebook. The non-secret
# settings fall back to the previous local-development values.
username = os.environ.get("PGUSER", "postgres")
host = os.environ.get("PGHOST", "localhost")
dbname = os.environ.get("PGDATABASE", "Buildings")
port = os.environ.get("PGPORT", "5432")

# The password is never stored in the notebook: read PGPASSWORD, otherwise prompt for it.
password = os.environ.get("PGPASSWORD") or getpass.getpass(
    f"PostgreSQL password for {username}@{host}: "
)

# OGR-style "PG:" connection string handed to the GDB/SHP import helpers further down.
pg_connection = f"PG:host={host} port={port} dbname={dbname} user={username} password={password}"

# SQLAlchemy engine used for every read/write in this notebook. User name and password
# are percent-encoded so characters such as "@", ":" or "/" cannot corrupt the URL.
engine = create_engine(
    f"postgresql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{dbname}"
)

# Download data from CSDI Portal

In [None]:
# Download a file geodatabase (FGDB) and unpack it into the project data folder.
# download_and_extract_gdb comes from the local fgdbDL module; its implementation is
# not visible from this notebook, so the details of the download/extract are assumed.

from fgdbDL import download_and_extract_gdb 

# Alternative source: CSDI "Buildings" dataset (swap with the active URL below to fetch it instead)
#url = "https://static.csdi.gov.hk/csdi-webpage/download/51d63757e2675874af80eef94afb6a35/fgdb"

# Active source: "iB1000 Topographic Map" (query string requests the full seamless set in FGDB format)
url = "https://open.hkmapservice.gov.hk/OpenData/directDownload?productName=iB1000&sheetName=Fullset_Seamless&productFormat=FGDB"

# Destination folder, relative to the notebook's working directory
storage_path = "../../geospatial_data/ls_project2/"

download_and_extract_gdb(url, storage_path)

In [None]:
# Download a shapefile package from the CSDI Portal and unpack it into the project data folder.
# download_and_extract_shp comes from the local shpDL module (implementation not visible here).

from shpDL import download_and_extract_shp

# Alternative dataset (swap with the active URL below to fetch it instead)
#url = 'https://static.csdi.gov.hk/csdi-webpage/download/0e55c533715b5da3ae0ca6e6024e90b4/shp'

# Active dataset.
# NOTE(review): presumably one of the shapefiles read later (BDBIAR / DCD) - confirm which id is which
url = 'https://static.csdi.gov.hk/csdi-webpage/download/83cd933a39c7525581d6aa429a981c90/shp'

# Destination folder, relative to the notebook's working directory
storage_path = "../../geospatial_data/ls_project2/"

download_and_extract_shp(url, storage_path)

In [None]:
# Fetch Outline Zoning Plan (OZP) layers from the Town Planning Board WFS service and
# load them into the "OZP" schema of the PostGIS database.
# fetch_and_process_wfs_data comes from the local ozp2pgsql module (implementation not visible here).

from ozp2pgsql import fetch_and_process_wfs_data

# WFS endpoint (GetCapabilities request)
wfs_url = 'https://www.ozp.tpb.gov.hk/arcgis/services/DATA/OZP_PLAN_CSDI/MapServer/WFSServer?request=GetCapabilities&service=WFS'
# Local working folder handed to the helper
download_dir = "../../geospatial_data/ls_project2/ozp"
# Same database as `engine`, expressed as a plain connection URL built from the settings cell
postgis_conn_string = f"postgresql://{username}:{password}@{host}:{port}/{dbname}"
# Target schema; matches schema_ozp used when the merged OZP table is read back
schema_name = "OZP"

fetch_and_process_wfs_data(wfs_url, download_dir, postgis_conn_string, schema_name=schema_name)

# Set up paths and layers

In [3]:
# Locations of the downloaded source datasets; all of them live in one project data folder.
_data_dir = "../../geospatial_data/ls_project2"

# Building Footprint file geodatabase
blg_gdb_path = f"{_data_dir}/Building_Footprint.gdb"

# Land parcel / lot file geodatabase
lot_gdb_path = f"{_data_dir}/LandParcel_Lot.gdb"

# Building information and age records (BDBIAR) shapefile
bdbiar_shp_path = f"{_data_dir}/BDBIAR.shp"

# iB1000 topographic map file geodatabase
ib1000_gdb_path = f"{_data_dir}/iB1000.gdb"

# District boundary (DCD) shapefile
dcd_shp_path = f"{_data_dir}/DCD.shp"

# Import into a PostgreSQL database using ogr2ogr

In [None]:
# Load the Building Footprint geodatabase into PostGIS.
# transfer_gdb_to_postgis is a local helper (gdb2pgsql module; described in this notebook as an ogr2ogr wrapper).
# The third argument "Buildings" matches the schema the building tables are later read from (schema_buildings).
from gdb2pgsql import transfer_gdb_to_postgis

transfer_gdb_to_postgis(blg_gdb_path, pg_connection, "Buildings")

In [None]:
# Load the land parcel / lot geodatabase into PostGIS.
# transfer_gdb_to_postgis is a local helper (gdb2pgsql module; described in this notebook as an ogr2ogr wrapper).
# The third argument "Lot" matches the schema the lot tables are later read from (schema_lot).
from gdb2pgsql import transfer_gdb_to_postgis

transfer_gdb_to_postgis(lot_gdb_path, pg_connection,"Lot")

In [None]:
# Load the Building information and age records (BDBIAR) shapefile into PostGIS.
# import_shapefile_to_postgresql is a local helper (shp2pgsql module; described in this notebook as an ogr2ogr wrapper).
# The third argument "Building_Age" matches the schema BDBIAR is later read from (schema_buildings_age).
from shp2pgsql import import_shapefile_to_postgresql

import_shapefile_to_postgresql(bdbiar_shp_path, pg_connection, "Building_Age")

In [None]:
# Load the iB1000 topographic map geodatabase into PostGIS.
# transfer_gdb_to_postgis is a local helper (gdb2pgsql module; described in this notebook as an ogr2ogr wrapper).
# The third argument "iB1000" matches the schema the BUILDING/SITE tables are later read from (schema_ib1000).
from gdb2pgsql import transfer_gdb_to_postgis

transfer_gdb_to_postgis(ib1000_gdb_path, pg_connection,"iB1000")

In [None]:
# Load the district boundary (DCD) shapefile into PostGIS.
# import_shapefile_to_postgresql is a local helper (shp2pgsql module; described in this notebook as an ogr2ogr wrapper).
# The third argument "DCD" matches the schema the district table is later read from (schema_dcd).
from shp2pgsql import import_shapefile_to_postgresql

import_shapefile_to_postgresql(dcd_shp_path, pg_connection, "DCD")

In [None]:
# Inspection aid: list the layers of a geodatabase (and, going by the helper's name, their types).
# Pass any of the *_gdb_path variables; the lot geodatabase is inspected here.
# list_layers_with_types comes from the local gdbList module (implementation not visible here).

from gdbList import list_layers_with_types

list_layers_with_types(lot_gdb_path)

# Read data from PostgreSQL database into dataframes

In [4]:
# Schema names. The double quotes are part of the identifier text that is placed
# into the SQL, so the mixed-case names are sent to PostgreSQL exactly as written.
schema_buildings = '"Buildings"'
schema_buildings_age = '"Building_Age"'
schema_ib1000 = '"iB1000"'
schema_ozp = '"OZP"'
schema_lot = '"Lot"'
schema_dcd = '"DCD"'

# Table names (inserted into the SQL without quotes).
table_op = "OCCUPATION_PERMIT"
table_op_blgstr = "OP_BUILDING_STRUCTURE"
table_blginfo = "BUILDING_INFO"
table_blgstr = "BUILDING_STRUCTURE"
table_blgcat = "CT_BUILDING_CATEGORY"
table_bdbiar = "BDBIAR"
table_ib1000_blg = "BUILDING"
table_ib1000_site = "SITE"
table_gdf_merged_ozp = "gdf_merged_ozp"
table_gdf_lot = "LOT"
table_lotlandinfo = "LOTLANDINFO"
table_lot_register = "LOT_REGISTER"
table_landdocument = "LANDDOCUMENT"
table_dcd = "DCD"


def _select_all(schema, table):
    """Return a SQLAlchemy text clause selecting every column of `schema.table`."""
    return text(f"SELECT * FROM {schema}.{table}")


# One full-table SELECT per source table; consumed by the read cell that follows.
# Buildings schema
sql_op = _select_all(schema_buildings, table_op)
sql_op_blgstr = _select_all(schema_buildings, table_op_blgstr)
sql_blginfo = _select_all(schema_buildings, table_blginfo)
sql_blstr = _select_all(schema_buildings, table_blgstr)
sql_blgcat = _select_all(schema_buildings, table_blgcat)
# Building age records
sql_bdbiar = _select_all(schema_buildings_age, table_bdbiar)
# iB1000 topographic map
sql_ib1000_blg = _select_all(schema_ib1000, table_ib1000_blg)
sql_ib1000_site = _select_all(schema_ib1000, table_ib1000_site)
# Outline Zoning Plans
sql_gdf_merged_ozp = _select_all(schema_ozp, table_gdf_merged_ozp)
# Lots
sql_gdf_lot = _select_all(schema_lot, table_gdf_lot)
sql_lotlandinfo = _select_all(schema_lot, table_lotlandinfo)
sql_lot_register = _select_all(schema_lot, table_lot_register)
sql_landdocument = _select_all(schema_lot, table_landdocument)
# District boundaries
sql_gdf_dcd = _select_all(schema_dcd, table_dcd)

In [5]:
# Load every source table through ONE connection that is closed when the block ends.
# (Calling engine.connect() once per read, as before, opened 14 connections and never
# closed any of them.)
with engine.connect() as conn:
    # Attribute-only tables -> pandas DataFrames
    df_op = pd.read_sql(sql_op, con=conn)
    df_op_blgstr = pd.read_sql(sql_op_blgstr, con=conn)
    df_blgcat = pd.read_sql(sql_blgcat, con=conn)
    df_blginfo = pd.read_sql(sql_blginfo, con=conn)
    df_lotlandinfo = pd.read_sql(sql_lotlandinfo, con=conn)
    df_lot_register = pd.read_sql(sql_lot_register, con=conn)
    df_landdocument = pd.read_sql(sql_landdocument, con=conn)

    # Tables with geometry -> GeoDataFrames. The geometry column name differs per
    # table ("shape", "wkb_geometry" or "geometry"), so it is given explicitly.
    gdf_blgstr = gpd.read_postgis(sql_blstr, con=conn, geom_col="shape")
    gdf_bdbiar = gpd.read_postgis(sql_bdbiar, con=conn, geom_col="wkb_geometry")
    gdf_ib1000_blg = gpd.read_postgis(sql_ib1000_blg, con=conn, geom_col="shape")
    gdf_ib1000_site = gpd.read_postgis(sql_ib1000_site, con=conn, geom_col="shape")
    gdf_merged_ozp = gpd.read_postgis(sql_gdf_merged_ozp, con=conn, geom_col="geometry")
    gdf_lot = gpd.read_postgis(sql_gdf_lot, con=conn, geom_col="shape")
    gdf_dcd = gpd.read_postgis(sql_gdf_dcd, con=conn, geom_col="wkb_geometry")

In [None]:
# Zoning labels in DESC_ENG carry a sub-zone serial number in one of three styles:
#   "Commercial(3)", "Commercial (3)" or "Residential (Group A)12".
# Each row below names a base category, the suffix style, and the serial numbers that
# are collapsed onto that base category. range(1, k + 1) covers serials 1..k.
# Deriving the target from the key itself removes the copy-paste risk of the old
# hand-written table, where "Residential (Group A)27"/"28" were mapped to
# "Residential (Group B)".
_OZP_SERIAL_VARIANTS = [
    # (base category, suffix template, serial numbers)
    ('Agriculture', '({n})', range(1, 2)),
    ('Commercial / Residential', '({n})', range(1, 5)),
    ('Commercial', '({n})', range(1, 12)),
    ('Commercial', ' ({n})', range(1, 5)),
    ('Comprehensive Development Area', '({n})', range(1, 7)),
    ('Comprehensive Development Area', ' ({n})', [2]),
    ('Conservation Area', '({n})', range(1, 2)),
    ('Government, Institution or Community', '({n})', range(1, 14)),
    ('Government, Institution or Community', ' ({n})', [1]),
    ('Green Belt', '({n})', range(1, 3)),
    ('Industrial', '({n})', range(1, 4)),
    ('Open Space', '({n})', range(1, 4)),
    ('Other Specified Uses', '({n})', range(1, 7)),
    ('Residential (Group A)', '{n}', range(1, 29)),
    ('Residential (Group B)', '{n}', range(1, 21)),
    ('Residential (Group C)', '{n}', range(1, 16)),
    ('Residential (Group D)', '{n}', range(1, 2)),
    ('Residential (Group E)', '{n}', range(1, 3)),
    ('Village Type Development', '({n})', range(1, 2)),
    ('Recreation', '({n})', range(1, 2)),
    ('Site of Special Scientific Interest', '({n})', range(1, 2)),
    ('Coastal Protection Area', '({n})', range(1, 2)),
]

# Label with serial number -> base category (same 126 keys as the previous literal table).
remap_dict = {
    f"{base}{template.format(n=n)}": base
    for base, template, numbers in _OZP_SERIAL_VARIANTS
    for n in numbers
}

# Apply the remapping to the GeoDataFrame and keep unmapped values unchanged
gdf_merged_ozp['DESC_ENG'] = gdf_merged_ozp['DESC_ENG'].map(remap_dict).fillna(gdf_merged_ozp['DESC_ENG'])

# Overwrite the stored OZP table with the cleaned labels. This replaces the very table
# gdf_merged_ozp was read from, so the original labels are not kept in the database:
# if the table was already rewritten by the earlier (faulty) mapping, re-run the OZP
# WFS import before this cell to restore the Group A 27/28 rows.
gdf_merged_ozp.to_postgis("gdf_merged_ozp", engine, if_exists="replace", schema="OZP")

In [6]:
# Keep only the permit number and permit date from the occupation-permit table and
# parse the date as a timezone-aware (UTC) timestamp. `.assign` returns a new frame,
# so nothing is written into a slice of df_op; the previous attribute assignment on
# the slice triggered pandas' SettingWithCopyWarning.
df_op_subset = df_op[["opno", "opdate"]].assign(
    opdate=lambda frame: pd.to_datetime(frame["opdate"], utc=True)
)

# Attach the permit date to the permit <-> building-structure link table.
# how="right" keeps every permit, including permits with no building-structure row.
df_merge_op_blgstr = pd.merge(df_op_blgstr,
                            df_op_subset,
                            on="opno",
                            how="right")

In [7]:
# Join key plus the two permit attributes that are carried onto the building structures
permit_columns = ["buildingstructureid", "opno", "opdate"]
df_merge_op_blgstr_subset = df_merge_op_blgstr.loc[:, permit_columns]

# Left join: every building structure is kept, with its permit number/date where one exists
gdf_merge_blgstr = gdf_blgstr.merge(
    df_merge_op_blgstr_subset,
    how="left",
    on="buildingstructureid",
)

In [8]:
# Join key plus the info type/description fields from the building info table
info_columns = ["buildingstructureid", "infotype", "infodescription"]
df_blginfo_subset = df_blginfo.loc[:, info_columns]

# Left join onto the running building-structure table (all structures are kept)
gdf_merge_blgstr = gdf_merge_blgstr.merge(
    df_blginfo_subset,
    how="left",
    on="buildingstructureid",
)

In [9]:
# Building-category lookup: keep code/description/note and rename them so that the
# code column carries the same name ("category") as the key on the building structures.
df_blgcat_subset = df_blgcat[["code", "description", "note"]].rename(
    columns={
        "code": "category",
        "description": "catdesc",
        "note": "catnote",
    }
)

# Cast the building-side key to int64 (via object) before joining on it.
# NOTE(review): presumably needed so the key dtype matches the lookup's code column - confirm
gdf_merge_blgstr["category"] = gdf_merge_blgstr["category"].astype("object").astype("int64")

# Left join the category description/note onto every building structure
gdf_merge_blgstr = gdf_merge_blgstr.merge(
    df_blgcat_subset,
    how="left",
    on="category",
)

In [10]:
# Reference date for the age calculations: the current date, normalised to midnight
# and timezone-aware (UTC) so it can be subtracted from the UTC permit dates.
today = pd.to_datetime('today', utc=True).normalize()

# Stamp every building row with the same calculation date
gdf_merge_blgstr["calcdate"] = today

In [11]:
# Building age in years: time from the occupation-permit date to the calculation date.
# NOTE(review): a year is taken as exactly 365 days, so leap days are not accounted for
# and the figure runs slightly high for old buildings.
gdf_merge_blgstr['age_blg'] = (gdf_merge_blgstr["calcdate"] - gdf_merge_blgstr["opdate"]) / pd.Timedelta(days=365)

# Filter Building structure and Building age by "Tower" type

In [12]:
# Strip the literal " District" suffix so the names match the district lists below
gdf_dcd['name_en'] = gdf_dcd['name_en'].str.replace(' District', '', regex=False)

# District names grouped by region
hong_kong_districts = ["Central and Western", "Eastern", "Southern", "Wan Chai"]
kowloon_districts = ["Kowloon City", "Kwun Tong", "Sham Shui Po", "Wong Tai Sin", "Yau Tsim Mong"]
new_territories_districts = ["Islands", "Kwai Tsing", "North", "Sai Kung", "Sha Tin", "Tai Po", "Tsuen Wan", "Tuen Mun", "Yuen Long"]

# Start with no region, then label each district that appears in one of the groups;
# districts found in none of the lists keep None.
gdf_dcd['region'] = None
for region_label, member_districts in (
    ('Hong Kong', hong_kong_districts),
    ('Kowloon', kowloon_districts),
    ('New Territories', new_territories_districts),
):
    in_region = gdf_dcd['name_en'].isin(member_districts)
    gdf_dcd.loc[in_region, 'region'] = region_label

In [13]:
# Reproject the building age records in place to EPSG:2326 (Hong Kong 1980 Grid).
# NOTE(review): presumably done so they share a CRS with the layers they are spatially
# joined to below - confirm the CRS of gdf_dcd and gdf_blgstr.
gdf_bdbiar.to_crs(epsg=2326, inplace=True)

In [14]:
# District attributes to carry over: name, region, and the geometry needed for the join
gdf_dcd_subset = gdf_dcd[["name_en", "region", "wkb_geometry"]]

# Tag every building age record with the district it intersects. The join now uses
# the subset (previously it was built but the full gdf_dcd was joined), so only
# "name_en" and "region" are added instead of every district-table column.
# NOTE(review): sjoin expects both layers in the same CRS; gdf_bdbiar is in EPSG:2326 -
# confirm gdf_dcd is as well.
gdf_bdbiar = (
    gpd.sjoin(gdf_bdbiar, gdf_dcd_subset, how="left")
    .rename(columns={'name_en': 'district'})  # the district name becomes "district"
    .drop(columns=['index_right'])            # helper column added by sjoin
)

In [15]:
# Building structures whose structure type code is "T"
is_tower_structure = gdf_merge_blgstr["buildingstructuretype"] == "T"
gdf_blgstr_tower = gdf_merge_blgstr[is_tower_structure]

# Building age records flagged "Tower" in nsearch4_e
is_tower_record = gdf_bdbiar["nsearch4_e"] == "Tower"
gdf_bdbiar_tower = gdf_bdbiar[is_tower_record]

In [16]:
# Attach to each tower structure the nearest tower age record within a distance of 10
# (in CRS units); structures with no record in range are kept with empty attributes.
# The "index_right" helper column added by the join is dropped straight away.
gdf_sjoin_blgstr = gpd.sjoin_nearest(
    gdf_blgstr_tower,
    gdf_bdbiar_tower,
    how="left",
    max_distance=10,
).drop(columns=['index_right'])

In [17]:
# Parse the age-record date; values that cannot be parsed become NaT
gdf_sjoin_blgstr['nsearch3_e'] = pd.to_datetime(gdf_sjoin_blgstr['nsearch3_e'], errors='coerce')

# Drop the timezone from the calculation date so it can be subtracted from the
# timezone-naive dates parsed above
gdf_sjoin_blgstr['calcdate'] = gdf_sjoin_blgstr['calcdate'].dt.tz_localize(None)

# Age in years according to the age records (a year counted as 365 days)
elapsed_since_record = gdf_sjoin_blgstr["calcdate"] - gdf_sjoin_blgstr["nsearch3_e"]
gdf_sjoin_blgstr['age_bdbiar'] = elapsed_since_record / pd.Timedelta(days=365)

In [18]:
# Site attributes to carry over, plus the geometry needed for the spatial join
site_columns = ["sitestype", "sitecode", "shape"]
gdf_ib1000_site_subset = gdf_ib1000_site[site_columns]

# Tag each building with the iB1000 site(s) it intersects; the "index_right" helper
# column added by the join is dropped.
# NOTE(review): a building intersecting several sites yields one row per site
gdf_sjoin_blgstr = gpd.sjoin(
    gdf_sjoin_blgstr,
    gdf_ib1000_site_subset,
    how="left",
).drop(columns=['index_right'])

In [19]:
# Zoning attributes to carry over, plus the zone geometry
gdf_merged_ozp_subset = gdf_merged_ozp[["DESC_ENG", "SPUSE_ENG", "geometry"]]

# Cut every building footprint by the zoning polygons it intersects
ozp_pieces = gpd.overlay(gdf_sjoin_blgstr, gdf_merged_ozp_subset, how="intersection")

# Keep one piece per building: the one with the largest intersected area.
# Sorting ascending by area and keeping the last duplicate selects that piece; the
# temporary "area" column is removed afterwards.
gdf_overlay_blgstr_ozp = (
    ozp_pieces
    .assign(area=ozp_pieces.geometry.area)
    .sort_values(by='area')
    .drop_duplicates(subset='buildingstructureid', keep='last')
    .drop(columns=['area'])
)

In [20]:
# Join key plus the zoning description and specified-use columns chosen by the overlay
ozp_columns = ["buildingstructureid", "DESC_ENG", "SPUSE_ENG"]
gdf_overlay_blgstr_ozp_subset = gdf_overlay_blgstr_ozp.loc[:, ozp_columns]

# Left join the dominant zoning onto the building table (buildings outside any zone stay, with empty values)
gdf_sjoin_blgstr = gdf_sjoin_blgstr.merge(
    gdf_overlay_blgstr_ozp_subset,
    how="left",
    on="buildingstructureid",
)

In [21]:
# From the land documents keep only the join key ("prn") and the user type
df_landdocument_subset = df_landdocument.loc[:, ["prn", "usertype"]]

# Attach the user type to every lot-register row (rows without a land document are kept)
df_merge_lot_usertype = df_lot_register.merge(
    df_landdocument_subset,
    how="left",
    on="prn",
)

In [22]:
# Rename the register's lot id column so it matches the "lotcsuid" key on the lot geometries
df_merge_lot_usertype = df_merge_lot_usertype.rename(
    columns={'lotcsuid_landsdformat': 'lotcsuid'}
)

# Lot id plus user type is all that is needed from the register
df_merge_lot_usertype_subset = df_merge_lot_usertype.loc[:, ["lotcsuid", "usertype"]]

# Attach the user type to every lot polygon (lots without a register entry are kept)
gdf_lot_usertype = gdf_lot.merge(
    df_merge_lot_usertype_subset,
    how="left",
    on="lotcsuid",
)

In [23]:
# Smallest share of a building footprint that a lot must cover to count as a match
MIN_LOT_AREA_RATIO = 0.2

# Lot attribute to carry over ("usertype"), plus the lot geometry
gdf_lot_usertype_subset = gdf_lot_usertype[["usertype", "shape"]]

# Cut every building footprint by the lots it intersects
gdf_overlay_blgstr_lot = gpd.overlay(gdf_sjoin_blgstr, gdf_lot_usertype_subset, how="intersection")

# Area of each building/lot intersection piece
gdf_overlay_blgstr_lot['intersected_area'] = gdf_overlay_blgstr_lot.geometry.area

# Full footprint area per building, held in a separate lookup with one row per building
# id. gdf_sjoin_blgstr itself is no longer modified here: writing "original_area" into
# it made a re-run of this cell fail with a KeyError (the column then came through the
# overlay as well and the join below produced original_area_x / original_area_y).
# De-duplicating the lookup also stops the join from multiplying rows when a building
# id occurs more than once.
df_original_area = (
    gdf_sjoin_blgstr[['buildingstructureid']]
    .assign(original_area=gdf_sjoin_blgstr.geometry.area)
    .drop_duplicates(subset='buildingstructureid')
)

# Bring the footprint area next to each intersection piece
gdf_overlay_blgstr_lot = gdf_overlay_blgstr_lot.merge(df_original_area, on='buildingstructureid')

# Share of the footprint covered by the lot
gdf_overlay_blgstr_lot['area_ratio'] = gdf_overlay_blgstr_lot['intersected_area'] / gdf_overlay_blgstr_lot['original_area']

# Drop pieces below the coverage threshold, then keep one piece per building: the one
# with the largest intersected area (ascending sort + keep='last'). Chaining returns
# new frames, so no in-place operation runs on a filtered slice.
gdf_overlay_blgstr_lot = (
    gdf_overlay_blgstr_lot[gdf_overlay_blgstr_lot['area_ratio'] >= MIN_LOT_AREA_RATIO]
    .sort_values(by='intersected_area')
    .drop_duplicates(subset='buildingstructureid', keep='last')
)

In [24]:
# Join key, the matched lot's user type, and the three area figures from the lot overlay
lot_columns = ["buildingstructureid", "usertype", 'intersected_area', 'original_area', 'area_ratio']
gdf_overlay_blgstr_lot_subset = gdf_overlay_blgstr_lot.loc[:, lot_columns]

# Left join the dominant lot onto the building table (buildings without a qualifying lot are kept)
gdf_sjoin_blgstr = gdf_sjoin_blgstr.merge(
    gdf_overlay_blgstr_lot_subset,
    how="left",
    on="buildingstructureid",
)

# The area helper columns are no longer needed on the overlay table
gdf_overlay_blgstr_lot = gdf_overlay_blgstr_lot.drop(
    columns=['intersected_area', 'original_area', 'area_ratio']
)

In [25]:
# Final age: the occupation-permit based age, falling back to the age-record based
# age where the former is missing
gdf_sjoin_blgstr['age_final'] = gdf_sjoin_blgstr['age_blg'].fillna(gdf_sjoin_blgstr['age_bdbiar'])

In [26]:
# Site type labels in code order; code 1 is the first entry, code 15 the last
sitestype_dict = dict(enumerate(
    [
        "Farm",
        "Accommodation",
        "Commerce",
        "Education & Training",
        "Health & Medical Service",
        "Leisure, Culture & Sports",
        "Organization",
        "Public & Social Service",
        "Religion",
        "Tourist Attraction",
        "Military",
        "Industry",
        "Works Area",
        "Transportation",
        "Utilities",
    ],
    start=1,
))

# Swap the numeric codes in 'sitestype' for their labels; other values are left as they are
gdf_sjoin_blgstr['sitestype'] = gdf_sjoin_blgstr['sitestype'].replace(sitestype_dict)

In [27]:
# Rows whose age-record usage ('nsearch5_e') is missing or is the catch-all 'Others'
condition = gdf_sjoin_blgstr['nsearch5_e'].isna() | gdf_sjoin_blgstr['nsearch5_e'].eq('Others')

# Final building use, by priority:
#   1. the age-record usage, unless it is missing / 'Others'
#   2. the iB1000 site type for those rows
#   3. the lot user type where still empty
#   4. the zoning description where still empty
gdf_sjoin_blgstr['blguse_final'] = (
    gdf_sjoin_blgstr['nsearch5_e']
    .mask(condition, gdf_sjoin_blgstr['sitestype'])
    .fillna(gdf_sjoin_blgstr['usertype'])
    .fillna(gdf_sjoin_blgstr['DESC_ENG'])
)

In [28]:
# Columns that make up the output table, in output order ("shape" is the geometry)
output_columns = [
    # identifiers and location
    "buildingstructureid", "buildingcsuid", "buildingstructuretype",
    "region", "district",
    # building info and category
    "infotype", "infodescription", "catdesc", "catnote", "status",
    "officialbuildingnameen", "officialbuildingnametc",
    # storeys and heights
    "numabovegroundstoreys", "numbasementstoreys", "topheight", "baseheight",
    # occupation permit and ages
    "opno", "opdate", "age_blg", "age_bdbiar", "age_final",
    # building age record attributes
    "address_e", "address_c",
    "search1_e", "search1_c", "search2_e", "search2_c",
    "nsearch2_e", "nsearch2_c", "nsearch3_e", "nsearch3_c",
    "nsearch4_e", "nsearch4_c", "nsearch5_e", "nsearch5_c",
    # site, zoning and lot attributes
    "sitestype", "sitecode", "DESC_ENG", "SPUSE_ENG", "usertype",
    # derived use and geometry
    "blguse_final", "shape",
]

# Drop every other column
gdf_sjoin_blgstr = gdf_sjoin_blgstr.loc[:, output_columns]

In [29]:
# Replace missing names / uses with the placeholder "N/A"
for label_column in ('officialbuildingnameen', 'officialbuildingnametc', 'blguse_final'):
    gdf_sjoin_blgstr[label_column] = gdf_sjoin_blgstr[label_column].fillna("N/A")

# Title-case the building use so spelling variants line up with the lookup applied next
gdf_sjoin_blgstr['blguse_final'] = gdf_sjoin_blgstr['blguse_final'].str.title()

In [30]:
# Lookup table with one row per (old_use -> new_use) pair
df_new_blguse = pd.read_csv("new_blguse.csv")

# old_use -> new_use; if an old_use appears more than once, the last row wins
mapping_dict = dict(zip(df_new_blguse["old_use"], df_new_blguse["new_use"]))

# Translate the building use through the lookup.
# NOTE(review): uses that are absent from the lookup end up as "N/A" rather than
# keeping their original value - confirm this is intended
gdf_sjoin_blgstr['blguse_final'] = (
    gdf_sjoin_blgstr['blguse_final']
    .map(mapping_dict)
    .fillna("N/A")
)

In [31]:
# Write the final building table to PostGIS as "Output"."gdf_sjoin_blgstr_10m",
# replacing any existing table of that name.
# NOTE(review): the "_10m" suffix presumably refers to the max_distance=10 used in the nearest join - confirm
gdf_sjoin_blgstr.to_postgis("gdf_sjoin_blgstr_10m", engine, schema='Output', if_exists="replace")

In [None]:
import csv

def list_to_text_table(data):
    """Render the items of `data` as a one-column plain-text table.

    The table starts with a "Category" header and a dashed rule, followed by one
    line per item. Every cell is left-aligned and padded to 40 characters (longer
    items are not truncated), and every line, including the last, ends with a newline.

    Args:
        data: iterable of items that support left-aligned string formatting.

    Returns:
        The table as a single string.
    """
    column_width = 40
    lines = [f"{'Category':<{column_width}}", '-' * column_width]
    lines.extend(f"{item:<{column_width}}" for item in data)
    return '\n'.join(lines) + '\n'

# Distinct final building-use values in the output table
data = gdf_sjoin_blgstr.blguse_final.unique()

# Render the distinct values as a one-column text table
text_table = list_to_text_table(data)

# Display the table for a visual check of the categories
print(text_table)

In [None]:
# Export the distinct building-use categories (`data` from the previous cell) to a
# one-column CSV: a "Category" header row followed by one row per value
csv_filename = "../../geospatial_data/ls_project2/123.csv"
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([["Category"], *([item] for item in data)])

print(f"Data has been exported to {csv_filename}")