In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from shapely.ops import unary_union
from shapely.affinity import scale
from shapely.geometry import Polygon
from concurrent.futures import ThreadPoolExecutor

In [2]:
# One GeoJSON per administrative level, keyed by the level's column name.
LEVEL_FILES = {
    'div': '../../preprocessing/geodata/maps/dist/div/div-c005-s020-vw-pr.geojson',
    'dis': '../../preprocessing/geodata/maps/dist/dis/dis-c005-s020-vw-pr.geojson',
    'upa': '../../preprocessing/geodata/maps/dist/upa/upa-c005-s020-vw-pr.geojson',
    'uni': '../../preprocessing/geodata/maps/dist/uni/uni-c005-s020-vw-pr.geojson',
    'mou': '../../preprocessing/geodata/maps/dist/mou/mou-c005-s020-vw-pr.geojson',
}

# Every layer is reprojected to EPSG:32645 so all later area and intersection
# computations share one coordinate system.
layers = {level: gpd.read_file(path).to_crs(epsg=32645) for level, path in LEVEL_FILES.items()}
divisions, districts, upazilas, unions, mouzas = (
    layers[level] for level in ('div', 'dis', 'upa', 'uni', 'mou')
)

# A region key is the '@'-joined chain of names from the division down to the
# region itself; a null anywhere in the chain makes the whole key null.
for frame, parts in (
    (divisions, ['div']),
    (districts, ['div', 'dis']),
    (upazilas, ['div', 'dis', 'upa']),
    (unions, ['div', 'dis', 'upa', 'uni']),
    (mouzas, ['div', 'dis', 'upa', 'uni', 'mou']),
):
    key = frame[parts[0]]
    for part in parts[1:]:
        key = key + '@' + frame[part]
    frame['region_key'] = key

# Rules for the data structure
1. Region keys must be complete, containing no null values
2. Region keys must be unique
3. Each region (excluding divisions) must have a valid parent: the parent named in its region key must be the region that geographically contains it
4. Regions must not overlap
5. Regions must be completely contained by their parent
6. Regions must not contain gaps

# Divisions
## Region keys must be complete, containing no null values

In [3]:
divisions.info()

# Enforce rule 1 in code rather than relying on a reader comparing the
# non-null counts printed above with the frame length.
assert divisions[['div', 'region_key']].notna().all().all(), 'divisions contain null region key parts'

print('\n')
print('area, div, geometry non-null is 8. df length is also 8')

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   area        8 non-null      float64 
 1   div         8 non-null      object  
 2   geometry    8 non-null      geometry
 3   region_key  8 non-null      object  
dtypes: float64(1), geometry(1), object(2)
memory usage: 388.0+ bytes


area, div, geometry non-null is 8. df length is also 8


## Region keys must be unique

In [4]:
# Rule 2: count how often each region key occurs; any count above one is a duplicate.
div_vc = divisions['region_key'].value_counts()
div_duplicates = div_vc[div_vc > 1]

if len(div_duplicates) == 0:
    print('divion region keys unique')
else:
    # Report the failure explicitly; previously duplicates produced no output at all.
    print(f'{len(div_duplicates)} not unique')

divion region keys unique


## Each region (excluding divs) must have a valid parent

divs do not require parent

## Regions must not overlap

In [5]:
# Compare every ordered pair of divisions; an overlapping pair therefore shows
# up twice, once as (i, j) and once as (j, i).
overlap_pairs = [
    (i, j)
    for i, geom1 in divisions.iterrows()
    for j, geom2 in divisions.iterrows()
    if i != j and geom1['geometry'].overlaps(geom2['geometry'])
]

# Report the results
if not overlap_pairs:
    print("No overlapping regions found.")
else:
    print("Overlapping regions found:")
    for first, second in overlap_pairs:
        print(f"Region {first} overlaps with Region {second}")

No overlapping regions found.


## Regions must be completely contained by parent

divs do not require parent

## Regions must not contain gaps

In [6]:
def check_region_containment(parents, children) -> list:
    """Check that the children labelled under each parent fill that parent.

    A child belongs to a parent when its region key starts with the parent's
    key followed by the '@' separator. For every parent that has at least one
    such child, two totals are compared with the parent's own area, each with
    a tolerance of +/-1%:

    * the summed area of the children, and
    * the summed area of each child's intersection with the parent.

    Args:
        parents: frame with 'region_key' and 'geometry' columns.
        children: frame with 'region_key' and 'geometry' columns, one level
            (or more) below ``parents``.

    Returns:
        A list of human-readable issue strings; empty when every parent passes.
        Parents without any labelled child are skipped, not reported.
    """
    containment_issues = []
    child_keys = children['region_key']

    for _, parent in parents.iterrows():
        parent_region_key = parent['region_key']
        if not isinstance(parent_region_key, str):
            # A parent without a key cannot have labelled children.
            continue

        # Anchor the match at the start of the key and include the separator:
        # a plain substring test would also pick up regions that merely contain
        # the parent's name elsewhere in their key (e.g. 'X@Dhaka' for 'Dhaka').
        # na=False keeps children with a null key out of the mask.
        is_child = child_keys.str.startswith(parent_region_key + '@', na=False)
        child_regions = children[is_child]

        if child_regions.empty:
            continue

        parent_geometry = parent['geometry']
        parent_area = parent_geometry.area

        child_geometries = list(child_regions['geometry'])
        total_child_area = sum(geometry.area for geometry in child_geometries)
        total_intersection_area = sum(
            geometry.intersection(parent_geometry).area for geometry in child_geometries
        )
        
        if total_child_area < 0.99 * parent_area or total_child_area > 1.01 * parent_area:
            containment_issues.append(f'''
                Area mismatch for parent region key: {parent_region_key} 
                Total child area: {total_child_area}, Parent area: {parent_area}
            ''')
        
        if total_intersection_area < 0.99 * parent_area or total_intersection_area > 1.01 * parent_area:
            containment_issues.append(f'''
                Containment error for parent region key: {parent_region_key} 
                Total intersection area: {total_intersection_area}, Parent area: {parent_area}
            ''')

    return containment_issues

# Districts are the children that must tile each division.
containment_issues = check_region_containment(divisions, districts)

if containment_issues:
    print(f'{len(containment_issues)} containment issues found:')
    print(*containment_issues, sep='\n')
else:
    print("No parent gap issues")

No parent gap issues


# Districts
## Region keys must be complete, containing no null values

In [7]:
districts.info()

# Enforce rule 1 in code rather than relying on a reader comparing the
# non-null counts printed above with the frame length.
assert districts[['div', 'dis', 'region_key']].notna().all().all(), 'districts contain null region key parts'

print('\n')
print('area, div, dis, geometry non-null is 64. df length is also 64')

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   area        64 non-null     float64 
 1   dis         64 non-null     object  
 2   div         64 non-null     object  
 3   geometry    64 non-null     geometry
 4   region_key  64 non-null     object  
dtypes: float64(1), geometry(1), object(3)
memory usage: 2.6+ KB


area, div, dis, geometry non-null is 64. df length is also 64


## Region keys must be unique

In [8]:
# Rule 2: count how often each region key occurs; any count above one is a duplicate.
dis_vc = districts['region_key'].value_counts()
dis_duplicates = dis_vc[dis_vc > 1]

if len(dis_duplicates) == 0:
    print('district region keys unique')
else:
    # Report the failure explicitly; previously duplicates produced no output at all.
    print(f'{len(dis_duplicates)} not unique')

district region keys unique


## Each region (excluding divs) must have a valid parent

In [9]:
def find_best_parent(child_geometry, potential_parents):
    """Pick the candidate parent that shares the most area with a child.

    Args:
        child_geometry: geometry of the child region.
        potential_parents: frame of candidate parents with a 'geometry' column.

    Returns:
        ``(row, fraction)`` where ``row`` is the candidate row with the largest
        strictly positive intersection area (the first one on ties) or ``None``
        when no candidate intersects, and ``fraction`` is that intersection
        area divided by the child's area (0 for a zero-area child).
    """
    winner = None
    largest_shared_area = 0

    for _, candidate in potential_parents.iterrows():
        shared_area = child_geometry.intersection(candidate['geometry']).area
        if shared_area > largest_shared_area:
            winner, largest_shared_area = candidate, shared_area

    own_area = child_geometry.area
    if own_area > 0:
        return winner, largest_shared_area / own_area
    return winner, 0

def get_labelled_parent(child_region, divisions):
    """Look up the parent row named by a child's region key.

    The parent key is the child's key with its last '@'-separated component
    removed. Despite its name, ``divisions`` is the parent-level frame for
    whatever level the child belongs to.

    Returns:
        The single matching parent row, or ``None`` when zero or several rows
        carry that key.
    """
    # rpartition yields '' as the head when the key has no '@' at all.
    parent_key, _, _ = child_region['region_key'].rpartition('@')
    matches = divisions[divisions['region_key'] == parent_key]
    return matches.iloc[0] if len(matches) == 1 else None

def validate_region_parents(regions, parent_regions, parent_level):
    """Check that each region's labelled parent is also its geographic parent.

    For every region the candidate parents are the rows of ``parent_regions``
    whose ``parent_level`` column equals the region's own value in that column
    (falling back to all of ``parent_regions`` when there is none). The
    candidate with the largest intersection is compared with the parent named
    by the region key.

    Args:
        regions: child-level frame with 'region_key', 'geometry' and a
            ``parent_level`` column.
        parent_regions: parent-level frame with the same columns; the labelled
            parent is resolved in this frame.
        parent_level: name of the column holding the parent's name
            (e.g. 'div', 'dis', 'upa', 'uni').

    Returns:
        One dict per region: ``{'region': row, 'errors': [messages]}``. A
        region with problems is also plotted together with its parents.
    """
    errors = []

    for i, row1 in regions.iterrows():
        err_dict = { 'region': row1, 'errors': [] }

        potential_parents = parent_regions[parent_regions[parent_level] == row1[parent_level]]

        labelled_parent = None
        if len(potential_parents) == 0:
            print(f"Cannot find regions labelled parent region {i}")
            err_dict['errors'].append('Labelled parent region not found')

            # No parent carries the labelled name: search every parent instead.
            potential_parents = parent_regions
        else:
            # A null region key cannot be split into a parent key.
            if isinstance(row1['region_key'], str):
                labelled_parent = get_labelled_parent(row1, parent_regions)
            if labelled_parent is None:
                err_dict['errors'].append('Labelled parent region key not found or not unique')

        best_parent, _ = find_best_parent(row1['geometry'], potential_parents)

        if best_parent is None:
            err_dict['errors'].append('No geographic parent found')
            print('No geographic parent found')
        elif labelled_parent is not None and best_parent['region_key'] != labelled_parent['region_key']:
            err_dict['errors'].append(f"Best parent region key ({best_parent['region_key']}) does not match labelled parent region key ({labelled_parent['region_key']})")
            print(f"Best parent region key: {best_parent['region_key']}")
            print(f"Labelled parent region key: {labelled_parent['region_key']}")

        if err_dict['errors']:
            fig, ax = plt.subplots()

            if best_parent is not None:
                gpd.GeoDataFrame(geometry=[best_parent['geometry']]).plot(ax=ax, linewidth=2, edgecolor='green', facecolor='none', alpha=0.5, label='Geographic Parent')

            if labelled_parent is not None:
                gpd.GeoDataFrame(geometry=[labelled_parent['geometry']]).plot(ax=ax, linewidth=2, edgecolor='red', facecolor='none', alpha=0.5, label='Labelled Parent')

            gpd.GeoDataFrame(geometry=[row1['geometry']]).plot(ax=ax, linewidth=2, edgecolor='blue', facecolor='none', alpha=0.5, label='Child')
            plt.legend()
            plt.show()
            # Release the figure; many failing regions would otherwise pile up open figures.
            plt.close(fig)

        errors.append(err_dict)
    return errors

errors = validate_region_parents(districts, divisions, 'div')

# Every region contributes one entry; only the messages inside count as errors.
total_errors = sum(len(entry['errors']) for entry in errors)
print(f'Total number of errors: {total_errors}')
print("No errors found." if total_errors == 0 else f"Errors found: {total_errors}")


Total number of errors: 0
No errors found.


## Regions must not overlap

In [10]:
def check_overlap(i, geom1, regions):
    """List ``(i, j)`` for every row ``j`` of ``regions`` whose geometry overlaps ``geom1``.

    The row labelled ``i`` itself is skipped.
    """
    return [
        (i, j)
        for j, other in regions.iterrows()
        if i != j and geom1.overlaps(other['geometry'])
    ]

def find_overlapping_regions(regions):
    """Find all overlapping pairs within ``regions`` using a thread pool.

    One ``check_overlap`` task is submitted per region; results are collected
    in submission order and progress is printed every 1000 regions and once at
    the end.

    Returns:
        A list of ``(i, j)`` index-label pairs; each overlap appears in both orders.
    """
    found = []
    n_regions = len(regions)
    report_every = 1000

    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(check_overlap, label, row['geometry'], regions)
            for label, row in regions.iterrows()
        ]

        for done, task in enumerate(pending, start=1):
            found.extend(task.result())

            if done % report_every == 0 or done == n_regions:
                print(f"Processed {done} out of {n_regions} items.")

    return found

overlaps = find_overlapping_regions(districts)

if not overlaps:
    print("No overlapping regions found.")
else:
    print("Overlapping regions found:")
    for first, second in overlaps:
        print(f"Region {first} overlaps with Region {second}")


Processed 64 out of 64 items.
No overlapping regions found.


## Regions must be completely contained by parent

In [11]:
def check_individual_containment(children, parents, threshold=0.99) -> list:
    """Check that each child lies (almost) entirely inside its labelled parent.

    The parent is the row of ``parents`` whose region key equals the child's
    key with its last '@'-separated component removed (the first such row if
    there are several).

    Args:
        children: frame with 'region_key' and 'geometry' columns.
        parents: frame with 'region_key' and 'geometry' columns.
        threshold: minimum fraction of the child's area that must fall inside
            the parent.

    Returns:
        A list of issue strings: one per child whose contained fraction is
        below ``threshold`` (or above 1.01), and one per child whose parent
        cannot be found. A zero-area child counts as 0% contained.
    """
    containment_issues = []

    # Resolve parents through a dict built once instead of re-filtering the
    # parent frame for every child; setdefault keeps the first row per key.
    parent_by_key = {}
    for key, geometry in zip(parents['region_key'], parents['geometry']):
        parent_by_key.setdefault(key, geometry)

    for child_key, child_geometry in zip(children['region_key'], children['geometry']):
        # A null key cannot be split, so it is reported as having no parent.
        parent_region_key = '@'.join(child_key.split('@')[:-1]) if isinstance(child_key, str) else None

        if parent_region_key in parent_by_key:
            parent = parent_by_key[parent_region_key]
            intersection_area = child_geometry.intersection(parent).area
            child_area = child_geometry.area
            # Same zero-area convention as find_best_parent: avoid dividing by zero.
            intersection_percentage = intersection_area / child_area if child_area > 0 else 0
            
            if intersection_percentage < threshold or intersection_percentage > 1.01:
                containment_issues.append(f'''
                    Containment error region key: {child_key} 
                    Intersection percentage: {intersection_percentage}
                ''')
        else:
            containment_issues.append(f"Parent region not found for child region key: {child_key}")
    return containment_issues

containment_issues = check_individual_containment(districts, divisions)

if containment_issues:
    print(f'{len(containment_issues)} containment issues found:')
    print(*containment_issues, sep='\n')
else:
    print("No containment issues")


No containment issues


## Regions must not contain gaps

In [12]:
# check_region_containment takes (parents, children). Passing the districts as
# parents of the divisions matches no child key at all, so the check could
# never fail. The districts must tile their divisions.
containment_issues = check_region_containment(divisions, districts)

if len(containment_issues) > 0:
    print(f'{len(containment_issues)} containment issues found:')
    for issue in containment_issues:
        print(issue)
else:
    print("No parent gap issues")


No parent gap issues


# Upazilas
## Region keys must be complete, containing no null values

In [13]:
upazilas.info()

# Enforce rule 1 in code rather than relying on a reader comparing the
# non-null counts printed above with the frame length.
assert upazilas[['div', 'dis', 'upa', 'region_key']].notna().all().all(), 'upazilas contain null region key parts'

print('\n')
print('area, div, dis, upa, geometry non-null is 544. df length is also 544')

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   area        544 non-null    float64 
 1   upa         544 non-null    object  
 2   dis         544 non-null    object  
 3   div         544 non-null    object  
 4   geometry    544 non-null    geometry
 5   region_key  544 non-null    object  
dtypes: float64(1), geometry(1), object(4)
memory usage: 25.6+ KB


area, div, dis, upa, geometry non-null is 544. df length is also 544


## Region keys must be unique

In [14]:
# Rule 2: count how often each region key occurs; any count above one is a duplicate.
upa_vc = upazilas['region_key'].value_counts()
upa_duplicates = upa_vc[upa_vc > 1]

if len(upa_duplicates) == 0:
    print('upazila region keys unique')
else:
    # Report the failure explicitly; previously duplicates produced no output at all.
    print(f'{len(upa_duplicates)} not unique')

upazila region keys unique


## Each region (excluding divs) must have a valid parent which it is contained by and labelled by region key as within

In [15]:
errors = validate_region_parents(upazilas, districts, 'dis')

# Every region contributes one entry; only the messages inside count as errors.
total_errors = sum(len(entry['errors']) for entry in errors)
print(f'Total number of errors: {total_errors}')
print("No errors found." if total_errors == 0 else f"Errors found: {total_errors}")

Total number of errors: 0
No errors found.


## Regions must not overlap

In [16]:
overlaps = find_overlapping_regions(upazilas)

if not overlaps:
    print("No overlapping regions found.")
else:
    print("Overlapping regions found:")
    for first, second in overlaps:
        print(f"Region {first} overlaps with Region {second}")

Processed 544 out of 544 items.
No overlapping regions found.


## Regions must be completely contained by parent

In [17]:
containment_issues = check_individual_containment(upazilas, districts)

if containment_issues:
    print(f'{len(containment_issues)} containment issues found:')
    print(*containment_issues, sep='\n')
else:
    print("No containment issues")

No containment issues


## Regions must not contain gaps

In [18]:
# check_region_containment takes (parents, children). Passing the upazilas as
# parents of the districts matches no child key at all, so the check could
# never fail. The upazilas must tile their districts.
containment_issues = check_region_containment(districts, upazilas)

if len(containment_issues) > 0:
    print(f'{len(containment_issues)} containment issues found:')
    for issue in containment_issues:
        print(issue)
else:
    print("No parent gap issues")

No parent gap issues


# Unions
## Region keys must be complete, containing no null values

In [19]:
unions.info()

# Enforce rule 1 in code rather than relying on a reader comparing the
# non-null counts printed above with the frame length.
assert unions[['div', 'dis', 'upa', 'uni', 'region_key']].notna().all().all(), 'unions contain null region key parts'

print('\n')
print('area, div, dis, upa, uni, geometry non-null is 5160. df length is also 5160')

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5160 entries, 0 to 5159
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   area        5160 non-null   float64 
 1   uni         5160 non-null   object  
 2   upa         5160 non-null   object  
 3   dis         5160 non-null   object  
 4   div         5160 non-null   object  
 5   geometry    5160 non-null   geometry
 6   region_key  5160 non-null   object  
dtypes: float64(1), geometry(1), object(5)
memory usage: 282.3+ KB


area, div, dis, upa, uni, geometry non-null is 5160. df length is also 5160


## Region keys must be unique

In [20]:
# Per the work in map-data-fix.ipynb, unions that share a region key can be
# repaired by merging them. Mouzas cannot be handled that simply, because some
# of them sit in the wrong parent region.
uni_vc = unions['region_key'].value_counts()
repeated_union_keys = uni_vc[uni_vc > 1]

if repeated_union_keys.empty:
    print('union region keys unique')
else:
    print(f'{len(repeated_union_keys)} not unique')

def merge_regions_by_key(unions):
    """Collapse rows that share a region key into one row per key.

    Geometries of a group are dissolved with ``unary_union``, the name columns
    take the group's first value and the areas are summed. 'region_key' comes
    back as a regular column.
    """
    def _dissolve(geometries):
        return unary_union(geometries)

    aggregations = {
        'geometry': _dissolve,
        'div': 'first',
        'dis': 'first',
        'upa': 'first',
        'uni': 'first',
        'area': 'sum',
    }

    per_key = unions.groupby('region_key').agg(aggregations)
    return per_key.reset_index()

unions = merge_regions_by_key(unions)
unions['region_key'] = unions['div'] + '@' + unions['dis'] + '@' + unions['upa'] + '@' + unions['uni']

# Count duplicate keys once and reuse the result for both messages below.
uni_vc = unions['region_key'].value_counts()
duplicate_keys = uni_vc[uni_vc > 1]
print('Unions with duplicate region keys:', len(duplicate_keys))

# info() prints its report itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
unions.info()

if len(duplicate_keys) == 0:
    print('union region keys unique')
else:
    print(f'{len(duplicate_keys)} not unique')

union region keys unique
Unions with duplicate region keys: 0
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5160 entries, 0 to 5159
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   region_key  5160 non-null   object  
 1   geometry    5160 non-null   geometry
 2   div         5160 non-null   object  
 3   dis         5160 non-null   object  
 4   upa         5160 non-null   object  
 5   uni         5160 non-null   object  
 6   area        5160 non-null   float64 
dtypes: float64(1), geometry(1), object(5)
memory usage: 282.3+ KB
None
union region keys unique


## Each region (excluding divs) must have a valid parent which it is contained by and labelled by region key as within

In [21]:
errors = validate_region_parents(unions, upazilas, 'upa')

# Every region contributes one entry; only the messages inside count as errors.
total_errors = sum(len(entry['errors']) for entry in errors)
print(f'Total number of errors: {total_errors}')
print("No errors found." if total_errors == 0 else f"Errors found: {total_errors}")

Total number of errors: 0
No errors found.


## Regions must not overlap

In [22]:
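# NOTE(review): the overlap check is disabled for unions, presumably because the
# all-pairs comparison is too slow at this size (confirm). Rule 4 is therefore
# NOT verified for unions until this is re-enabled or replaced.
# overlaps = find_overlapping_regions(unions)

# if overlaps:
#     print("Overlapping regions found:")
#     for pair in overlaps:
#         print(f"Region {pair[0]} overlaps with Region {pair[1]}")
# else:
#     print("No overlapping regions found.")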

## Regions must be completely contained by parent

In [23]:
containment_issues = check_individual_containment(unions, upazilas)

if containment_issues:
    print(f'{len(containment_issues)} containment issues found:')
    print(*containment_issues, sep='\n')
else:
    print("No containment issues")

No containment issues


## Regions must not contain gaps

In [24]:
# This cell sits in the Unions section but repeated the upazila-level call, so
# the unions were never checked. The unions must tile their upazilas;
# check_region_containment takes (parents, children).
containment_issues = check_region_containment(upazilas, unions)

if len(containment_issues) > 0:
    print(f'{len(containment_issues)} containment issues found:')
    for issue in containment_issues:
        print(issue)
else:
    print("No parent gap issues")

No parent gap issues


In [34]:
# Work on copies so the frames validated above stay untouched.
unc = unions.copy()

# Mouzas without a geometry are dropped from the working copy.
mc = mouzas.copy().dropna(subset=['geometry'])

# Recompute the mouza area column from the geometry itself.
mc['area'] = mc['geometry'].area

In [35]:
# find big mouza, visually identified in qgis
big_mou = mc[
    (mc['div'] == 'Dhaka') & 
    (mc['dis'] == 'Dhaka') & 
    (mc['upa'].isnull()) & 
    (mc['uni'].isnull()) & 
    (mc['mou'].isnull())
]

# find unions within big mou
unis_within_big_mou = unc[unc['geometry'].centroid.within(big_mou['geometry'].iloc[0])].copy().to_crs(epsg=32645)
unis_within_big_mou.info()
mc.info()
unis_within_big_mou['mou'] = unis_within_big_mou['uni']
mc = pd.concat([mc, unis_within_big_mou], ignore_index=True)

# reset region key
mc['region_key'] = mc['div'] + '@' + mc['dis'] + '@' + mc['upa'] + '@' + mc['uni'] + '@' + mc['mou']

# remove big mouza
mc = mc[~(
    (mc['div'] == 'Dhaka') & 
    (mc['dis'] == 'Dhaka') & 
    (mc['upa'].isnull()) & 
    (mc['uni'].isnull()) & 
    (mc['mou'].isnull())
)]

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 137 entries, 1464 to 1666
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   region_key  137 non-null    object  
 1   geometry    137 non-null    geometry
 2   div         137 non-null    object  
 3   dis         137 non-null    object  
 4   upa         137 non-null    object  
 5   uni         137 non-null    object  
 6   area        137 non-null    float64 
dtypes: float64(1), geometry(1), object(5)
memory usage: 8.6+ KB
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 58187 entries, 0 to 58187
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   area        58187 non-null  float64 
 1   div         58155 non-null  object  
 2   dis         58158 non-null  object  
 3   upa         58157 non-null  object  
 4   uni         58154 non-null  object  
 5   mou         57097 non-null  object  

  return GeometryArray(data, crs=_get_common_crs(to_concat))


## Region keys must be complete, containing no null values

In [None]:
# Union centroids are computed once; they are only used to find the nearest
# union for mouzas that intersect no union at all.
unc['centroid'] = unc['geometry'].centroid


def _append_rows(frame, rows):
    """Return ``frame`` with the row dicts in ``rows`` appended and the index renumbered.

    The rows are wrapped in a GeoDataFrame with the frame's CRS so the result
    keeps a proper geometry column.
    """
    if not rows:
        return frame.reset_index(drop=True)
    extra = gpd.GeoDataFrame(rows, geometry='geometry', crs=getattr(frame, 'crs', None))
    return pd.concat([frame, extra], ignore_index=True)


# Step 1: label each mouza with the union it overlaps most.
# NOTE(review): this overwrites div/dis/upa/uni for every mouza, not only the
# ones with missing labels. Confirm that is intended.
for count, (i, m) in enumerate(mc.iterrows(), start=1):
    if count % 500 == 0:
        print(f"Processing row {count} / {len(mc)}")

    parent, _ = find_best_parent(m['geometry'], unc)
    if parent is None:
        # if mouza is not within a union, add its geometry
        # to the nearest union by centroid
        m_centroid = m['geometry'].centroid
        # Distances stay in a local Series, so no scratch column is left on
        # `unc` (it would otherwise end up in the exported file).
        distances = unc['centroid'].apply(lambda x: m_centroid.distance(x))
        nearest_union = distances.idxmin()

        # expand union to include parentless mouza
        unc.at[nearest_union, 'geometry'] = unc.loc[nearest_union, 'geometry'].union(m['geometry'])
        parent, _ = find_best_parent(m['geometry'], unc)

        if parent is None:
            print(f'still no parent for {i} after merging geometry to nearest union')
            # Nothing to copy labels from; leave this mouza's labels as they are.
            continue

    mc.at[i, 'div'] = parent['div']
    mc.at[i, 'dis'] = parent['dis']
    mc.at[i, 'upa'] = parent['upa']
    mc.at[i, 'uni'] = parent['uni']

print('step 1')

# Step 2: give unnamed mouzas the name of their union, merging them when a
# union holds more than one.
rows_to_add = []
rows_to_remove = []

# Group on the full union key: upazila and union names alone are not unique
# across districts, and grouping on them would merge unrelated unions.
for _, union_group in mc.groupby(['div', 'dis', 'upa', 'uni']):
    mouza_missing = union_group[union_group['mou'].isnull()]

    if len(mouza_missing) == 0:
        continue

    if len(mouza_missing) == 1:
        # If there is only one mouza missing, set its 'mou' to the 'uni' value
        mouza_index = mouza_missing.index[0]
        mc.at[mouza_index, 'mou'] = mc.loc[mouza_index, 'uni']
        continue

    # Merge geometries of all mouza_missing into a single polygon
    merged_geometry = mouza_missing.geometry.union_all()

    rows_to_add.append({
        'div': mouza_missing['div'].iloc[0],
        'dis': mouza_missing['dis'].iloc[0],
        'upa': mouza_missing['upa'].iloc[0],
        'uni': mouza_missing['uni'].iloc[0],
        'mou': mouza_missing['uni'].iloc[0],
        'geometry': merged_geometry,
        # Keep the area column populated for the merged row.
        'area': merged_geometry.area,
    })
    rows_to_remove.extend(mouza_missing.index.tolist())

print('step 2')

# Swap the merged originals for their single replacement rows
mc = _append_rows(mc.drop(rows_to_remove), rows_to_add)

# Update region_key column
mc['region_key'] = mc['div'] + '@' + mc['dis'] + '@' + mc['upa'] + '@' + mc['uni'] + '@' + mc['mou']

# Step 3: merge rows that now share a region key.
rows_to_add = []
rows_to_remove = []

for region_key, group in mc.groupby('region_key'):
    if len(group) > 1:
        # Merge geometries of duplicates
        merged_geometry = group.geometry.union_all()

        rows_to_add.append({
            'div': group['div'].iloc[0],
            'dis': group['dis'].iloc[0],
            'upa': group['upa'].iloc[0],
            'uni': group['uni'].iloc[0],
            'mou': group['mou'].iloc[0],
            'geometry': merged_geometry,
            'area': merged_geometry.area,
            'region_key': region_key
        })
        rows_to_remove.extend(group.index.tolist())

# The merged rows already carry their region_key, so no recomputation is needed.
mc = _append_rows(mc.drop(rows_to_remove), rows_to_add)

# Output the resulting DataFrame info
mc.info()

# Check if there are still any mouzas without names
unnamed_mouzas = mc[mc['mou'].isnull()]
print(f'Number of unnamed mouzas: {len(unnamed_mouzas)}')

if len(unnamed_mouzas) > 0:
    print(unnamed_mouzas)

## Region keys must be unique

In [None]:
# Rule 2 for the repaired mouzas: any key counted more than once is a duplicate.
mou_vc = mc['region_key'].value_counts()
repeated_mouza_keys = mou_vc[mou_vc > 1]

if repeated_mouza_keys.empty:
    print('mouza region keys unique')
else:
    print(f'{len(repeated_mouza_keys)} not unique')

## Each region (excluding divs) must have a valid parent which it is contained by and labelled by region key as within

In [None]:
errors = validate_region_parents(mc, unc, 'uni')

# Every region contributes one entry; only the messages inside count as errors.
total_errors = sum(len(entry['errors']) for entry in errors)
print(f'Total number of errors: {total_errors}')
print("No errors found." if total_errors == 0 else f"Errors found: {total_errors}")

## Regions must not overlap

In [None]:
# Splitting the mouzas into 128 chunks by index and checking each chunk on its
# own only ever compared mouzas that happened to share a chunk, so most
# neighbouring pairs were never tested. A spatial-index query tests every
# mouza against all mouzas whose bounding boxes meet it.
mouza_geoms = gpd.GeoSeries(mc['geometry'].values, index=mc.index)

# Two parallel arrays of positions: query geometry and matching tree geometry.
query_pos, match_pos = mouza_geoms.sindex.query(mouza_geoms, predicate='overlaps')

# Translate positions back to index labels. Each overlap is listed in both
# orders, as find_overlapping_regions does for the other levels.
overlaps = sorted(
    (mouza_geoms.index[a], mouza_geoms.index[b])
    for a, b in zip(query_pos, match_pos)
    if a != b
)

if overlaps:
    print("Overlapping regions found:")
    for pair in overlaps:
        print(f"Region {pair[0]} overlaps with Region {pair[1]}")
else:
    print("No overlapping regions found.")

## Regions must be completely contained by parent

In [None]:
# Mouzas use a looser 90% threshold than the other levels.
containment_issues = check_individual_containment(mc, unc, 0.9)

if len(containment_issues) > 0:
    print(f'{len(containment_issues)} containment issues found:')
    
    for issue in containment_issues:
        print(issue)
        
        # Both message formats produced by check_individual_containment contain
        # "region key: <key>"; take the rest of that line. Indexing into the
        # lines of the message failed on the single-line "Parent region not
        # found" variant.
        mouza_region_key = issue.split('region key:', 1)[-1].split('\n', 1)[0].strip()
        matches = mc[mc['region_key'] == mouza_region_key]
        if matches.empty:
            # e.g. a mouza with a null key: nothing to plot.
            print('================================')
            continue
        mouza = matches.iloc[0]
        
        # Get labelled parent for the current issue (None when it cannot be resolved)
        labelled_parent = get_labelled_parent(
            mouza,
            unc,
        )

        fig, ax = plt.subplots()
        
        if labelled_parent is not None:
            gpd.GeoDataFrame(geometry=[labelled_parent['geometry']]).plot(ax=ax, linewidth=2, edgecolor='red', facecolor='none', alpha=0.5)
        gpd.GeoDataFrame(geometry=[mouza['geometry']]).plot(ax=ax, linewidth=1, edgecolor='blue', facecolor='none', alpha=0.5)
        
        # No artist carries a label, so a legend would be empty; title the
        # figure with the offending key instead.
        ax.set_title(mouza_region_key)
        plt.show()
        plt.close(fig)
        print('================================')

else:
    print("No containment issues")


## Regions must not contain gaps

In [None]:
# check_region_containment takes (parents, children). With the mouzas passed
# as parents no child key ever matched, so the check could never fail. The
# mouzas must tile their unions; `unc` is used because it holds the union
# geometries after parentless mouzas were merged in, and it is what gets exported.
containment_issues = check_region_containment(unc, mc)

if len(containment_issues) > 0:
    print(f'{len(containment_issues)} containment issues found:')
    for issue in containment_issues:
        print(issue)
else:
    print("No parent gap issues")

In [None]:
# Write the validated layers next to the notebook.
divisions.to_file('div.geojson', driver='GeoJSON')
districts.to_file('dis.geojson', driver='GeoJSON')
upazilas.to_file('upa.geojson', driver='GeoJSON')

# `centroid` and `distance_to_geometry` are scratch columns added while
# repairing the mouzas; neither belongs in the export. errors='ignore' keeps
# the cell working whether or not both were created.
unc.drop(columns=['centroid', 'distance_to_geometry'], errors='ignore').to_file('uni.geojson', driver='GeoJSON')
mc.to_file('mou.geojson', driver='GeoJSON')