### Import packages

In [9]:
from google.cloud import bigquery, bigquery_storage
import pandas as pd
import geopandas as gpd
import warnings
warnings.filterwarnings('ignore')

### Instantiate a BQ client

In [10]:
bqclient = bigquery.Client(project="logistics-customer-staging")
bqstorage_client = bigquery_storage.BigQueryReadClient()

### Pull the WoF data

In [11]:
wof_data_query = """
    SELECT
        geometry,
        wof_country_code,
        wof_name,
        wof_placetype_eng
    FROM `logistics-data-storage-staging.long_term_pricing.wof_data_per_bad_gadm_data_market`
"""

df_wof = pd.DataFrame(bqclient.query(query=wof_data_query).result().to_dataframe(bqstorage_client=bqstorage_client, progress_bar_type='tqdm'))

### Create sub-dataframes that contain only the data for the respective place types

In [12]:
df_wof_locality = df_wof[df_wof['wof_placetype_eng'] == 'locality'].reset_index(drop=True)
df_wof_neighbourhood = df_wof[df_wof['wof_placetype_eng'] == 'neighbourhood'].reset_index(drop=True)
df_wof_county = df_wof[df_wof['wof_placetype_eng'] == 'county'].reset_index(drop=True)

### Create a function that prepares the dataframes by cross joining the locality dataframe with the neighbourhood/county dataframe

In [14]:
def dataset_prep(country_code):
    """
    A function that prepares the dataframes by cross joining the locality dataframe with the neighbourhood/county dataframe
    """
    # Filter the dataframes for the respective country
    df_wof_locality_func = df_wof_locality[df_wof_locality['wof_country_code'] == country_code]
    df_wof_neighbourhood_func = df_wof_neighbourhood[df_wof_neighbourhood['wof_country_code'] == country_code]
    df_wof_county_func = df_wof_county[df_wof_county['wof_country_code'] == country_code]

    # Merge the locality dataframe with the neighbourhood dataframe
    df_wof_locality_neighbourhood = pd.merge(
        left=df_wof_locality_func,
        right=df_wof_neighbourhood_func,
        how='cross',
        suffixes=('_locality', '_neighbourhood')
    )

    # Merge the locality dataframe with the neighbourhood dataframe
    df_wof_locality_county = pd.merge(
        left=df_wof_locality_func,
        right=df_wof_county_func,
        how='cross',
        suffixes=('_locality', '_county')
    )

    # Change the geometry columns to geopandas geometry columns
    if not df_wof_locality_neighbourhood.empty:
        df_wof_locality_neighbourhood['geometry_locality'] = gpd.GeoSeries.from_wkt(df_wof_locality_neighbourhood['geometry_locality'])
        df_wof_locality_neighbourhood['geometry_neighbourhood'] = gpd.GeoSeries.from_wkt(df_wof_locality_neighbourhood['geometry_neighbourhood'])
    else:
        print(f"Skipping df_wof_locality_neighbourhood for {country_code}")
    
    if not df_wof_locality_county.empty:
        df_wof_locality_county['geometry_locality'] = gpd.GeoSeries.from_wkt(df_wof_locality_county['geometry_locality'])
        df_wof_locality_county['geometry_county'] = gpd.GeoSeries.from_wkt(df_wof_locality_county['geometry_county'])
        print(f"Skipping df_wof_locality_county for {country_code}")

    return df_wof_locality_neighbourhood, df_wof_locality_county

### Loop through all countries and prepare the dataframes

In [15]:
df_wof_locality_neighbourhood = []
df_wof_locality_county = []
for ctry in df_wof["wof_country_code"].unique():
    print("Prepping the data for country code: " + ctry)
    df_wof_locality_neighbourhood_iter, df_wof_locality_county_iter = dataset_prep(country_code=ctry)
    df_wof_locality_neighbourhood.append(df_wof_locality_neighbourhood_iter)
    df_wof_locality_county.append(df_wof_locality_county_iter)

# Concatenate the dataframes
df_wof_locality_neighbourhood = pd.concat(df_wof_locality_neighbourhood).reset_index(drop=True)
df_wof_locality_county = pd.concat(df_wof_locality_county).reset_index(drop=True)

Prepping the data for country code: BH
Skipping df_wof_locality_neighbourhood for BH
Prepping the data for country code: CZ
Prepping the data for country code: HK
Skipping df_wof_locality_neighbourhood for HK
Prepping the data for country code: KW
Skipping df_wof_locality_neighbourhood for KW
Prepping the data for country code: MY
Skipping df_wof_locality_county for MY
Prepping the data for country code: QA
Skipping df_wof_locality_county for QA
Prepping the data for country code: SG
Skipping df_wof_locality_neighbourhood for SG
Prepping the data for country code: TW
Skipping df_wof_locality_county for TW


### Add the is_intersects column to both dataframes to check whether the `neighbourhood/county` granularities intersects with the `locality` granularity. If they intersect, then we need to choose only one of them per country

In [17]:
df_wof_locality_neighbourhood["is_intersects"] = df_wof_locality_neighbourhood.apply(lambda x: x['geometry_locality'].intersects(x['geometry_neighbourhood']), axis=1)
df_wof_locality_county["is_intersects"] = df_wof_locality_county.apply(lambda x: x['geometry_locality'].intersects(x['geometry_county']), axis=1)

In [27]:
print("The number of intersections between locality and neighbourhood:")
print(df_wof_locality_neighbourhood[["wof_country_code_locality", "is_intersects"]].value_counts().sort_index())
print("\n")
print("The number of intersections between locality and county:")
print(df_wof_locality_county[["wof_country_code_locality", "is_intersects"]].value_counts().sort_index())

The number of intersections between locality and neighbourhood:
wof_country_code_locality  is_intersects
CZ                         False            154498
                           True                181
MY                         False             46419
                           True                131
QA                         True                 54
TW                         False             69772
                           True                872
Name: count, dtype: int64


The number of intersections between locality and county:
wof_country_code_locality  is_intersects
MY                         False            70265
                           True               785
QA                         False               22
                           True                66
TW                         False              551
                           True                37
Name: count, dtype: int64
