The goal of this notebook is to create a GeoDataFrame containing information about each solar panel annotation. This GeoDataFrame will be useful in creating image-mask pairs of tiles of the Cape Town GeoTIFF images. The code to create the GeoDataFrame and the image-mask pairs is available in this notebook.   

In [2]:
# import necessary packages
import json
import pandas as pd
import geopandas as gpd
import os
import numpy as np
import rasterio
import ast
import cv2
import re
import shutil
import random
from rasterio.transform import rowcol
from rasterio.windows import Window
from shapely.geometry import Polygon
from shapely.geometry import shape
from shapely.geometry import box
from collections import namedtuple
from tqdm import tqdm
from PIL import Image

In [None]:
# Determine the images that have been annotated:
# load the image-tracking spreadsheet and keep the ID and annotator of every row marked 'Completed'.
status = pd.read_excel(
    r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\CapeTown_ImageIDs.xlsx"
)
is_completed = status["Status"] == 'Completed'
completed = status.loc[is_completed, ['Image ID', 'Annotator']]
completed

  warn(msg)


Unnamed: 0,Image ID,Annotator
56,W07A_1,Gary Alvarez Mejia
63,W07A_16,Abby Finkle
72,W07A_24,Vanshika Mittal
106,W07C_10,Biz Yoder
107,W07C_11,Biz Yoder
...,...,...
2563,W57C_3,Fiona Bolte-Bradhurst
2564,W57C_4,Fiona Bolte-Bradhurst
2565,W57C_5,Fiona Bolte-Bradhurst
2568,W57C_8,Fiona Bolte-Bradhurst


There are 2807 images of Cape Town in total. 
As of Nov 29, there were 544 completed images, including those that have no solar panel annotation. 151 geotiff images have annotations.
As of Feb 1st, there were 731 completed images, including those that have no solar panel annotation. 251 geotiff images have annotations.

In [11]:
# Distinct annotator names among the completed images
completed.loc[:, 'Annotator'].unique()

array(['Gary Alvarez Mejia', 'Abby Finkle', 'Vanshika Mittal',
       'Biz Yoder', 'Zeinab Mukhtar', 'Shehr Naz Ashraf', 'Ye Khaung Oo',
       'Veena Shirsath', 'Brian Mulu Mutua', 'Halle Evans',
       'Ummamah Shah', 'Fiona Bolte-Bradhurst'], dtype=object)

Abby Finkle has only one completed image. Gary Alvarez Mejia has two.

I couldn't find Abby Finkle's annotations layer. Veena didn't upload an annotations layer.

In [None]:
# Filter completed annotations
selected_annotators = ['Gary Alvarez Mejia', 'Abby Finkle', 'Vanshika Mittal',
       'Biz Yoder', 'Zeinab Mukhtar', 'Shehr Naz Ashraf', 'Ye Khaung Oo',
       'Veena Shirsath', 'Brian Mulu Mutua', 'Halle Evans',
       'Ummamah Shah', 'Fiona Bolte-Bradhurst']
filtered_completed_df = completed[completed['Annotator'].isin(selected_annotators)]

# Map annotator names to keys.
# Only annotators with an available annotation layer have an entry here.
key_mapping = {
    'Biz Yoder': 'biz',
    'Brian Mulu Mutua': 'mutua',
    'Fiona Bolte-Bradhurst': 'fiona',
    'Ummamah Shah': 'shah',
    'Ye Khaung Oo': 'ye',
    'Gary Alvarez Mejia': 'mejia',
    'Vanshika Mittal': 'mittal',
    'Zeinab Mukhtar': 'mukhtar',
    'Shehr Naz Ashraf': 'shehr',
    'Halle Evans': 'evans'
}

# Set file paths (one annotation shapefile per annotator key)
file_paths = {
    'biz': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Biz Yoder\yoder_annotations 0128_v2.shp",
    'mutua': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Brian Mulu Mutua\mutua_annotations 1027.shp",
    'fiona': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Fiona Bolte-Bradhurst\1.31\bolte.bradhurst_annotations_1.31.shp",
    'shah': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Ummamah Shah\Shah_30.shp",
    'ye': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Ye Khaung Oo\Ye_annotations.shp",
    'mejia': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Gary Alvarez Mejia\11-13\alvarezmejia_annotations.shp",
    'mittal': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Vanshika Mittal\Mittal_annotations.shp",
    'mukhtar': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Zeinab Mukhtar\mostupdated\mukhtar_annotations_dec5.shp",
    'shehr': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Shehr Naz Ashraf\ashraf_annotations.shp",
    'evans': r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\Annotation layers\Halle Evans\evans_annotations_layer.shp"
}


# Collect the completed image IDs of each annotator, keyed by full annotator name.
image_ids_by_annotator_name = filtered_completed_df.groupby('Annotator')['Image ID'].apply(list).to_dict()

# Re-key by the short annotator key. Annotators that appear in `selected_annotators`
# but have no entry in `key_mapping` (no annotation layer to load) are skipped here;
# indexing `key_mapping` with their name would raise a KeyError.
annotator_image_ids = {
    key_mapping[full_name]: image_ids
    for full_name, image_ids in image_ids_by_annotator_name.items()
    if full_name in key_mapping
}

In [None]:
# Read the CRS of one of the GeoTIFF images. Every annotation layer is later
# reprojected to this CRS so that polygons and image bounds are comparable.
# NOTE(review): this assumes all image tiles share the CRS of this one tile - confirm.
input_tif = r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\AP2023_TIFFs_Bass\2023_RGB_8cm_W18B_5.tif"
with rasterio.open(input_tif) as src:
    geotif_crs = src.crs

# Clean the annotation layers by removing those marked as PV pool and PV heater
def clean_annotation_layers(layer):
    """Reproject an annotation layer and strip non-panel annotations and helper columns.

    Parameters
    ----------
    layer : GeoDataFrame
        One annotator's annotation layer.

    Returns
    -------
    GeoDataFrame
        A new frame in the module-level `geotif_crs`, with an added 'area' column
        (polygon area in the units of that CRS), without the rows flagged 1 in
        'PV_Pool' / 'PV_pool' / 'PV_heater', without the bookkeeping columns,
        and with a fresh 0..n-1 index.
    """
    # to_crs returns a new frame, so the caller's layer is left untouched
    cleaned = layer.to_crs(geotif_crs)
    cleaned['area'] = cleaned.geometry.area

    # remove annotations tagged as PV pool and PV heater
    # (the pool flag shows up under two different capitalisations)
    for flag_column in ('PV_Pool', 'PV_pool', 'PV_heater'):
        if flag_column in cleaned.columns:
            cleaned = cleaned[cleaned[flag_column] != 1]

    # drop whichever of the helper columns this particular layer carries
    unwanted_columns = ['layer', 'path', 'PV_heater', 'uncertflag', 'PV_Pool', 'PV_pool']
    present_columns = [name for name in unwanted_columns if name in cleaned.columns]
    cleaned = cleaned.drop(columns=present_columns)

    return cleaned.reset_index(drop=True)

In [None]:
# Load and process annotations
annotations_list = []
for annotator, path in file_paths.items():
    layer = gpd.read_file(path)
    layer['annotator'] = annotator
    # For every annotator except 'biz' and 'evans', keep only rows whose 'path' is null.
    # NOTE(review): presumably a non-null 'path' marks rows merged in from another
    # layer - confirm. The column is treated as optional (as clean_annotation_layers
    # does), so a layer without it is kept whole instead of raising a KeyError.
    if annotator not in ('biz', 'evans') and 'path' in layer.columns:
        layer = layer[layer['path'].isnull()]
    annotations_list.append(clean_annotation_layers(layer))

# Concatenate all annotations and add important polygon features (polygon's centroid coordinates)
# ignore_index=True already yields a fresh 0..n-1 index, which becomes the polygon id.
annotations = pd.concat(annotations_list, ignore_index=True)
annotations['id'] = annotations.index

# Compute the centroids once and store only their coordinates (in the GeoTIFF CRS).
centroids = annotations.geometry.centroid
annotations['centroid_latitude'] = centroids.y
annotations['centroid_longitude'] = centroids.x

# Drop rows without a geometry. The explicit copy makes the result an independent
# frame, so the column assignments below (and the per-row writes done later) do not
# operate on a slice of the concatenated frame.
annotations = annotations[annotations['geometry'].notnull()].copy()

# Add additional columns to annotations (filled in once each polygon is matched to an image)
annotations[['image_name', 'nw_corner_of_image_latitude', 'nw_corner_of_image_longitude', 
             'se_corner_of_image_latitude', 'se_corner_of_image_longitude']] = None

annotations

Unnamed: 0,id,area,geometry,annotator,centroid_latitude,centroid_longitude,image_name,nw_corner_of_image_latitude,nw_corner_of_image_longitude,se_corner_of_image_latitude,se_corner_of_image_longitude
0,0,23.162657,"POLYGON ((-19993.55 -3769759.6, -19988.801 -37...",biz,-3.769763e+06,-19991.996891,,,,,
1,1,8.567160,"POLYGON ((-13352.818 -3768899.929, -13345.695 ...",biz,-3.768902e+06,-13349.578806,,,,,
2,2,9.287182,"POLYGON ((-13353.655 -3768901.389, -13346.512 ...",biz,-3.768904e+06,-13350.422909,,,,,
3,3,8.580228,"POLYGON ((-13354.608 -3768903.024, -13347.583 ...",biz,-3.768906e+06,-13351.395241,,,,,
4,4,10.808848,"POLYGON ((-13354.102 -3768907.616, -13353.441 ...",biz,-3.768910e+06,-13351.308980,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
19908,19908,4.039285,"POLYGON ((-33704.692 -3760339.71, -33701.724 -...",evans,-3.760340e+06,-33703.292947,,,,,
19909,19909,2.332195,"POLYGON ((-33661.945 -3760112.949, -33660.812 ...",evans,-3.760114e+06,-33661.385134,,,,,
19910,19910,26.211835,"POLYGON ((-30450.965 -3763339.573, -30447.33 -...",evans,-3.763343e+06,-30448.351613,,,,,
19911,19911,25.601888,"POLYGON ((-30458.329 -3763342.933, -30454.279 ...",evans,-3.763346e+06,-30455.592527,,,,,


Next, we need to determine which image each polygon belongs to. We do this by reading the border coordinates of each annotator's images, then checking which image's bounds contain each polygon's centroid.

In [None]:
def get_image_border_coordinates(image_path):
    """Open the raster at `image_path` and return its `bounds` attribute.

    The returned object exposes `left`, `bottom`, `right` and `top`, which is
    how the rest of the notebook reads it.
    """
    with rasterio.open(image_path) as raster:
        image_bounds = raster.bounds
    return image_bounds

folder_path = r'C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\AP2023_TIFFs_Bass'
prefix = '2023_RGB_8cm_'

# Per annotator: image ID -> bounds of that image
annotator_border_coordinates = {}
completed_ann = []

# Bounds already read, keyed by image ID, so that no GeoTIFF is opened twice
processed_images = {}

# Walk through every annotator's completed images and record the bounds of those found on disk
for annotator, image_names in annotator_image_ids.items():
    border_coordinates = {}
    for image_name in image_names:
        image_path = os.path.join(folder_path, f"{prefix}{image_name}.tif")

        # images that are not present in the folder are skipped without a message
        if not os.path.exists(image_path):
            continue

        if image_name not in processed_images:
            processed_images[image_name] = get_image_border_coordinates(image_path)
        border_coordinates[image_name] = processed_images[image_name]

    annotator_border_coordinates[annotator] = border_coordinates
    completed_ann.append(annotator)

In [None]:
# some more data manipulations before associating the polygons to the images
BoundingBox = namedtuple('BoundingBox', ['left', 'bottom', 'right', 'top'])

# Turn every bounds object into a plain dict with the keys left / bottom / right / top.
# This cell expects bounds objects with those attributes, so it cannot be re-run on
# its own output without first re-running the cell that builds annotator_border_coordinates.
transformed_dict = {}
for annotator, images in annotator_border_coordinates.items():
    bounds_by_image = {}
    for image, bounds in images.items():
        bounds_by_image[image] = {side: getattr(bounds, side) for side in BoundingBox._fields}
    transformed_dict[annotator] = bounds_by_image

annotator_border_coordinates = transformed_dict 

# Function to check if the centroid (consequently the polygon) is within a bounding box
def is_point_within_bounds(lat, lon, bounds):
    """Tell whether the point (lon, lat) lies inside `bounds`, edges included.

    `bounds` is a mapping with the keys 'left', 'right' (compared against `lon`)
    and 'bottom', 'top' (compared against `lat`).
    """
    within_horizontal = bounds['left'] <= lon <= bounds['right']
    if not within_horizontal:
        return within_horizontal
    return bounds['bottom'] <= lat <= bounds['top']

In [None]:
# Merge all annotators' images into a single lookup: image ID -> bounds dict
flattened_coordinates = {}
for images in annotator_border_coordinates.values():
    flattened_coordinates.update(images)

# Annotation column -> side of the image bounds stored in it
corner_columns = {
    'nw_corner_of_image_latitude': 'top',
    'nw_corner_of_image_longitude': 'left',
    'se_corner_of_image_latitude': 'bottom',
    'se_corner_of_image_longitude': 'right',
}

# For every polygon, find the first image whose bounds contain its centroid
for idx, row in annotations.iterrows():
    centroid_lat = row['centroid_latitude']
    centroid_lon = row['centroid_longitude']

    image_name = next(
        (
            candidate
            for candidate, candidate_bounds in flattened_coordinates.items()
            if is_point_within_bounds(centroid_lat, centroid_lon, candidate_bounds)
        ),
        None,
    )
    # polygons whose centroid falls in none of the images keep their empty columns
    if image_name is None:
        continue

    bounds = flattened_coordinates[image_name]
    annotations.loc[idx, 'image_name'] = image_name
    for column, side in corner_columns.items():
        annotations.loc[idx, column] = bounds[side]

# Now we associated the polygons to the images
annotations

Unnamed: 0,id,area,geometry,annotator,centroid_latitude,centroid_longitude,image_name,nw_corner_of_image_latitude,nw_corner_of_image_longitude,se_corner_of_image_latitude,se_corner_of_image_longitude
0,0,23.162657,"POLYGON ((-19993.55 -3769759.6, -19988.801 -37...",biz,-3.769763e+06,-19991.996891,W16C_21,-3769000.0,-20000.0,-3770000.0,-19000.0
1,1,8.567160,"POLYGON ((-13352.818 -3768899.929, -13345.695 ...",biz,-3.768902e+06,-13349.578806,W16D_17,-3768000.0,-14000.0,-3769000.0,-13000.0
2,2,9.287182,"POLYGON ((-13353.655 -3768901.389, -13346.512 ...",biz,-3.768904e+06,-13350.422909,W16D_17,-3768000.0,-14000.0,-3769000.0,-13000.0
3,3,8.580228,"POLYGON ((-13354.608 -3768903.024, -13347.583 ...",biz,-3.768906e+06,-13351.395241,W16D_17,-3768000.0,-14000.0,-3769000.0,-13000.0
4,4,10.808848,"POLYGON ((-13354.102 -3768907.616, -13353.441 ...",biz,-3.768910e+06,-13351.308980,W16D_17,-3768000.0,-14000.0,-3769000.0,-13000.0
...,...,...,...,...,...,...,...,...,...,...,...
19908,19908,4.039285,"POLYGON ((-33704.692 -3760339.71, -33701.724 -...",evans,-3.760340e+06,-33703.292947,W36B_2,-3760000.0,-34000.0,-3761000.0,-33000.0
19909,19909,2.332195,"POLYGON ((-33661.945 -3760112.949, -33660.812 ...",evans,-3.760114e+06,-33661.385134,W36B_2,-3760000.0,-34000.0,-3761000.0,-33000.0
19910,19910,26.211835,"POLYGON ((-30450.965 -3763339.573, -30447.33 -...",evans,-3.763343e+06,-30448.351613,,,,,
19911,19911,25.601888,"POLYGON ((-30458.329 -3763342.933, -30454.279 ...",evans,-3.763346e+06,-30455.592527,,,,,


In [None]:
# Saving the processed dataframe locally
# available_annotations = annotations[annotations['image_name'].notnull()]
# available_annotations.to_file(r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Teams\Team 1 Machine learning\CT - MachineLearning\S1 Machine Learning\available_annotations_feb.shp")

  available_annotations.to_file(r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Teams\Team 1 Machine learning\CT - MachineLearning\S1 Machine Learning\available_annotations_feb.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


So far, we created a dataset that contains all of the available annotations, their geographical coordinates, each polygon centroid's coordinates, the annotator name, and the image each annotation belongs to. Now, we will calculate the pixel coordinates and the area in pixels of each polygon. These pixel coordinates are necessary to cut the images into image-mask pairs.

In [None]:
# This cell will run for a while. It calculates the pixel metrics of each polygon.
# Only polygons that were matched to an image are kept.
has_image = annotations['image_name'].notna()
annotations = annotations[has_image]

def calculate_pixel_metrics_grouped(group, image_base_path):
    """Compute the pixel metrics of every annotation row in `group`.

    Parameters
    ----------
    group : DataFrame
        Annotation rows; each needs an 'image_name' and whatever
        `calculate_pixel_metrics_dynamic` reads from a row.
    image_base_path : str
        Path prefix; the GeoTIFF of a row is `image_base_path + image_name + ".tif"`.

    Returns
    -------
    DataFrame
        One row per input row (same index) holding the metrics returned by
        `calculate_pixel_metrics_dynamic`. Rows whose GeoTIFF cannot be opened
        are left empty (NaN).
    """
    # geotiff_path -> affine transform, or None when the file could not be opened.
    # Each GeoTIFF is opened once per call instead of once per row.
    transforms = {}
    metrics = []
    for _, row in group.iterrows():
        geotiff_path = image_base_path + row['image_name'] + ".tif"
        if geotiff_path not in transforms:
            try:
                with rasterio.open(geotiff_path) as src:
                    transforms[geotiff_path] = src.transform
            # rasterio reports an unopenable dataset as RasterioIOError, which a
            # plain `except FileNotFoundError` does not catch
            except (FileNotFoundError, rasterio.errors.RasterioIOError):
                print(f"GeoTIFF not found: {geotiff_path}")
                transforms[geotiff_path] = None

        transform = transforms[geotiff_path]
        if transform is None:
            # an empty record keeps the list homogeneous (all dicts), so the
            # DataFrame constructor below still works and yields a NaN row
            metrics.append({})
        else:
            metrics.append(calculate_pixel_metrics_dynamic(row, transform))

    return pd.DataFrame(metrics, index=group.index)

def calculate_pixel_metrics_dynamic(row, transform):
    """Express one annotation polygon in pixel coordinates of its image.

    Parameters
    ----------
    row : mapping
        Must provide the polygon under 'geometry'.
    transform : affine transform of the image, as accepted by `rowcol`.

    Returns
    -------
    dict
        'polygon_vertices_pixels'   - (row, col) of every exterior-ring vertex,
        'centroid_latitude_pixels'  - pixel row of the centroid,
        'centroid_longitude_pixels' - pixel column of the centroid,
        'area_pixels'               - area of the polygon built from the pixel vertices.
    """
    geo_polygon = shape(row['geometry'])
    geo_centroid = geo_polygon.centroid
    centroid_row_col = rowcol(transform, geo_centroid.x, geo_centroid.y)

    # (row, col) pixel position of each vertex of the exterior ring
    pixel_vertices = []
    for vertex in geo_polygon.exterior.coords:
        pixel_vertices.append(rowcol(transform, vertex[0], vertex[1]))

    # the pixel-space polygon takes (col, row) pairs, i.e. (x, y) order
    ring_col_row = [(px_col, px_row) for px_row, px_col in pixel_vertices]
    pixel_polygon = shape({'type': 'Polygon', 'coordinates': [ring_col_row]})

    return {
        'polygon_vertices_pixels': pixel_vertices,
        'centroid_latitude_pixels': centroid_row_col[0],
        'centroid_longitude_pixels': centroid_row_col[1],
        'area_pixels': pixel_polygon.area,
    }

# Prefix shared by all GeoTIFF paths; the image ID and ".tif" are appended to it
image_base_path = r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\AP2023_TIFFs_Bass\2023_RGB_8cm_"

# Group the annotations by image_name
grouped_annotations = annotations.groupby('image_name')

# One metrics frame per image, each indexed like the annotation rows it describes
pixel_metrics_list = []
for name, group in tqdm(grouped_annotations, desc="Processing Images"):
    print(f"Processing image: {name}")
    pixel_metrics_list.append(calculate_pixel_metrics_grouped(group, image_base_path))

# Concatenate all metrics into a single DataFrame
pixel_metrics_df = pd.concat(pixel_metrics_list)

# Index-aligned join: adds the metric columns to the annotations DataFrame
annotations = annotations.join(pixel_metrics_df)

Processing Images:   0%|          | 0/184 [00:00<?, ?it/s]

Processing image: W07A_1


Processing Images:   1%|          | 1/184 [00:00<02:05,  1.46it/s]

Processing image: W07A_24


Processing Images:   1%|          | 2/184 [00:01<02:02,  1.49it/s]

Processing image: W07C_10


Processing Images:   2%|▏         | 3/184 [00:05<06:43,  2.23s/it]

Processing image: W07C_11


Processing Images:   2%|▏         | 4/184 [00:14<15:16,  5.09s/it]

Processing image: W07C_12


Processing Images:   3%|▎         | 5/184 [00:29<25:34,  8.58s/it]

Processing image: W07C_13


Processing Images:   3%|▎         | 6/184 [00:59<46:42, 15.75s/it]

Processing image: W07C_16


Processing Images:   4%|▍         | 7/184 [01:09<41:19, 14.01s/it]

Processing image: W07C_17


Processing Images:   4%|▍         | 8/184 [01:12<30:11, 10.29s/it]

Processing image: W07C_2


Processing Images:   5%|▍         | 9/184 [01:13<22:06,  7.58s/it]

Processing image: W07C_21


Processing Images:   5%|▌         | 10/184 [01:18<19:46,  6.82s/it]

Processing image: W07C_22


Processing Images:   6%|▌         | 11/184 [01:25<19:33,  6.78s/it]

Processing image: W07C_23


Processing Images:   7%|▋         | 12/184 [01:25<13:51,  4.84s/it]

Processing image: W07C_3


Processing Images:   7%|▋         | 13/184 [01:31<14:09,  4.97s/it]

Processing image: W07C_4


Processing Images:   8%|▊         | 14/184 [01:32<10:38,  3.75s/it]

Processing image: W07C_5


Processing Images:   8%|▊         | 15/184 [01:33<08:36,  3.05s/it]

Processing image: W07C_6


Processing Images:   9%|▊         | 16/184 [01:36<08:12,  2.93s/it]

Processing image: W07C_7


Processing Images:   9%|▉         | 17/184 [01:37<06:36,  2.37s/it]

Processing image: W07C_8


Processing Images:  10%|▉         | 18/184 [01:50<15:45,  5.70s/it]

Processing image: W07C_9


Processing Images:  10%|█         | 19/184 [01:55<14:31,  5.28s/it]

Processing image: W07D_1


Processing Images:  11%|█         | 20/184 [01:55<10:42,  3.92s/it]

Processing image: W07D_6


Processing Images:  11%|█▏        | 21/184 [01:56<07:55,  2.92s/it]

Processing image: W08A_1


Processing Images:  12%|█▎        | 23/184 [01:59<05:53,  2.20s/it]

Processing image: W08A_12
Processing image: W08A_2


Processing Images:  13%|█▎        | 24/184 [02:00<04:50,  1.82s/it]

Processing image: W08B_4


Processing Images:  14%|█▎        | 25/184 [02:01<03:42,  1.40s/it]

Processing image: W08B_9


Processing Images:  14%|█▍        | 26/184 [02:02<03:47,  1.44s/it]

Processing image: W12A_17


Processing Images:  15%|█▍        | 27/184 [02:08<07:02,  2.69s/it]

Processing image: W12A_21


Processing Images:  15%|█▌        | 28/184 [02:13<08:36,  3.31s/it]

Processing image: W12A_22


Processing Images:  16%|█▋        | 30/184 [02:31<13:59,  5.45s/it]

Processing image: W12A_23


Processing Images:  17%|█▋        | 31/184 [02:31<09:50,  3.86s/it]

Processing image: W12A_24
Processing image: W12C_11


Processing Images:  17%|█▋        | 32/184 [02:33<08:13,  3.24s/it]

Processing image: W12C_12


Processing Images:  18%|█▊        | 33/184 [02:37<08:57,  3.56s/it]

Processing image: W12C_16


Processing Images:  18%|█▊        | 34/184 [02:38<06:45,  2.70s/it]

Processing image: W12C_4


Processing Images:  19%|█▉        | 35/184 [02:42<07:59,  3.22s/it]

Processing image: W12C_6


Processing Images:  20%|██        | 37/184 [02:43<04:23,  1.79s/it]

Processing image: W13C_11
Processing image: W13C_13


Processing Images:  21%|██        | 38/184 [18:38<11:40:22, 287.83s/it]

Processing image: W13C_16


Processing Images:  21%|██        | 39/184 [34:26<19:33:53, 485.75s/it]

Processing image: W13C_18


Processing Images:  22%|██▏       | 40/184 [36:06<14:47:59, 370.00s/it]

Processing image: W13C_22


Processing Images:  22%|██▏       | 41/184 [37:44<11:27:46, 288.57s/it]

Processing image: W13C_8


Processing Images:  23%|██▎       | 42/184 [39:30<9:13:20, 233.80s/it] 

Processing image: W14A_11


Processing Images:  23%|██▎       | 43/184 [41:20<7:41:37, 196.44s/it]

Processing image: W14A_12


Processing Images:  24%|██▍       | 44/184 [43:20<6:45:06, 173.62s/it]

Processing image: W14A_21


Processing Images:  24%|██▍       | 45/184 [44:59<5:50:19, 151.22s/it]

Processing image: W16C_14


Processing Images:  25%|██▌       | 46/184 [46:55<5:23:26, 140.63s/it]

Processing image: W16C_15


Processing Images:  26%|██▌       | 47/184 [48:47<5:01:22, 131.99s/it]

Processing image: W16C_17


Processing Images:  26%|██▌       | 48/184 [50:51<4:53:59, 129.70s/it]

Processing image: W16C_18


Processing Images:  27%|██▋       | 49/184 [52:49<4:43:52, 126.17s/it]

Processing image: W16C_19


Processing Images:  27%|██▋       | 50/184 [55:01<4:45:21, 127.77s/it]

Processing image: W16C_20


Processing Images:  28%|██▊       | 51/184 [57:08<4:43:16, 127.80s/it]

Processing image: W16C_21


Processing Images:  28%|██▊       | 52/184 [59:41<4:57:39, 135.30s/it]

Processing image: W16C_22


Processing Images:  29%|██▉       | 53/184 [1:01:37<4:42:50, 129.55s/it]

Processing image: W16D_12


Processing Images:  29%|██▉       | 54/184 [1:03:15<4:19:38, 119.84s/it]

Processing image: W16D_16


Processing Images:  30%|██▉       | 55/184 [1:05:14<4:17:07, 119.59s/it]

Processing image: W16D_17


Processing Images:  30%|███       | 56/184 [1:06:59<4:05:58, 115.30s/it]

Processing image: W16D_21


Processing Images:  31%|███       | 57/184 [1:09:14<4:16:29, 121.18s/it]

Processing image: W16D_22


Processing Images:  32%|███▏      | 58/184 [1:11:09<4:10:40, 119.37s/it]

Processing image: W16D_23


Processing Images:  32%|███▏      | 59/184 [1:12:55<4:00:40, 115.52s/it]

Processing image: W16D_24


Processing Images:  33%|███▎      | 60/184 [1:14:41<3:52:36, 112.55s/it]

Processing image: W16D_25


Processing Images:  33%|███▎      | 61/184 [1:16:30<3:48:39, 111.54s/it]

Processing image: W16D_7


Processing Images:  34%|███▎      | 62/184 [1:18:16<3:43:10, 109.75s/it]

Processing image: W17B_11


Processing Images:  34%|███▍      | 63/184 [1:20:20<3:49:59, 114.04s/it]

Processing image: W17B_2


Processing Images:  35%|███▍      | 64/184 [1:22:22<3:53:06, 116.56s/it]

Processing image: W18B_5


Processing Images:  35%|███▌      | 65/184 [1:24:17<3:50:15, 116.10s/it]

Processing image: W18B_8


Processing Images:  36%|███▌      | 66/184 [1:25:47<3:32:41, 108.15s/it]

Processing image: W18B_9


Processing Images:  36%|███▋      | 67/184 [1:27:51<3:40:09, 112.90s/it]

Processing image: W18C_4


Processing Images:  37%|███▋      | 68/184 [1:29:27<3:28:20, 107.77s/it]

Processing image: W18D_21


Processing Images:  38%|███▊      | 69/184 [1:30:01<2:44:07, 85.63s/it] 

Processing image: W18D_22


Processing Images:  38%|███▊      | 70/184 [1:32:00<3:02:04, 95.83s/it]

Processing image: W19B_2


Processing Images:  39%|███▊      | 71/184 [1:33:34<2:59:15, 95.18s/it]

Processing image: W22D_22


Processing Images:  39%|███▉      | 72/184 [1:35:05<2:55:34, 94.06s/it]

Processing image: W23A_13


Processing Images:  40%|███▉      | 73/184 [1:36:38<2:53:10, 93.61s/it]

Processing image: W23A_14


Processing Images:  40%|████      | 74/184 [1:38:19<2:55:45, 95.87s/it]

Processing image: W25A_3


Processing Images:  41%|████      | 75/184 [1:40:10<3:02:22, 100.39s/it]

Processing image: W25A_4


Processing Images:  41%|████▏     | 76/184 [1:41:59<3:05:10, 102.88s/it]

Processing image: W25A_5


Processing Images:  42%|████▏     | 77/184 [1:43:56<3:11:24, 107.33s/it]

Processing image: W25A_6


Processing Images:  42%|████▏     | 78/184 [1:45:48<3:11:51, 108.60s/it]

Processing image: W25A_7


Processing Images:  43%|████▎     | 79/184 [1:47:50<3:16:55, 112.52s/it]

Processing image: W25A_8


Processing Images:  43%|████▎     | 80/184 [1:49:51<3:19:43, 115.22s/it]

Processing image: W25A_9


Processing Images:  44%|████▍     | 81/184 [1:51:44<3:16:19, 114.37s/it]

Processing image: W25B_1


Processing Images:  45%|████▍     | 82/184 [1:53:24<3:07:16, 110.16s/it]

Processing image: W25B_12


Processing Images:  45%|████▌     | 83/184 [1:55:12<3:04:18, 109.49s/it]

Processing image: W25B_16


Processing Images:  46%|████▌     | 84/184 [1:57:10<3:06:34, 111.95s/it]

Processing image: W25B_17


Processing Images:  46%|████▌     | 85/184 [1:59:06<3:06:44, 113.18s/it]

Processing image: W25B_2


Processing Images:  47%|████▋     | 86/184 [2:00:48<2:59:32, 109.92s/it]

Processing image: W25B_21


Processing Images:  47%|████▋     | 87/184 [2:02:41<2:59:13, 110.86s/it]

Processing image: W25B_6


Processing Images:  48%|████▊     | 88/184 [2:04:31<2:57:02, 110.65s/it]

Processing image: W25B_7


Processing Images:  48%|████▊     | 89/184 [2:06:14<2:51:31, 108.33s/it]

Processing image: W25C_1


Processing Images:  49%|████▉     | 90/184 [2:07:54<2:45:42, 105.77s/it]

Processing image: W25C_11


Processing Images:  49%|████▉     | 91/184 [2:09:39<2:43:38, 105.58s/it]

Processing image: W25C_12


Processing Images:  50%|█████     | 92/184 [2:11:18<2:39:04, 103.74s/it]

Processing image: W25C_13


Processing Images:  51%|█████     | 93/184 [2:13:02<2:37:09, 103.63s/it]

Processing image: W33A_10


Processing Images:  51%|█████     | 94/184 [2:14:41<2:33:22, 102.25s/it]

Processing image: W33A_18


Processing Images:  52%|█████▏    | 95/184 [2:16:13<2:26:58, 99.09s/it] 

Processing image: W33A_9


Processing Images:  52%|█████▏    | 96/184 [2:17:40<2:20:08, 95.55s/it]

Processing image: W33B_2


Processing Images:  53%|█████▎    | 97/184 [2:19:05<2:14:06, 92.49s/it]

Processing image: W33C_10


Processing Images:  53%|█████▎    | 98/184 [2:20:30<2:09:28, 90.33s/it]

Processing image: W33C_7


Processing Images:  54%|█████▍    | 99/184 [2:22:02<2:08:16, 90.54s/it]

Processing image: W33C_8


Processing Images:  54%|█████▍    | 100/184 [2:23:27<2:04:47, 89.14s/it]

Processing image: W33C_9


Processing Images:  55%|█████▍    | 101/184 [2:25:05<2:06:44, 91.62s/it]

Processing image: W36A_19


Processing Images:  55%|█████▌    | 102/184 [2:26:51<2:11:02, 95.88s/it]

Processing image: W36A_2


Processing Images:  56%|█████▌    | 103/184 [2:28:22<2:07:46, 94.65s/it]

Processing image: W36A_20


Processing Images:  57%|█████▋    | 104/184 [2:30:07<2:10:13, 97.66s/it]

Processing image: W36A_21


Processing Images:  57%|█████▋    | 105/184 [2:31:55<2:12:36, 100.71s/it]

Processing image: W36A_22


Processing Images:  58%|█████▊    | 106/184 [2:33:39<2:12:21, 101.82s/it]

Processing image: W36A_23


Processing Images:  58%|█████▊    | 107/184 [2:35:27<2:12:46, 103.46s/it]

Processing image: W36A_24


Processing Images:  59%|█████▊    | 108/184 [2:37:05<2:09:18, 102.08s/it]

Processing image: W36A_25


Processing Images:  59%|█████▉    | 109/184 [2:38:57<2:11:13, 104.98s/it]

Processing image: W36A_3


Processing Images:  60%|█████▉    | 110/184 [2:40:28<2:04:03, 100.59s/it]

Processing image: W36A_6


Processing Images:  60%|██████    | 111/184 [2:42:14<2:04:21, 102.22s/it]

Processing image: W36A_7


Processing Images:  61%|██████    | 112/184 [2:43:59<2:03:49, 103.19s/it]

Processing image: W36A_8


Processing Images:  61%|██████▏   | 113/184 [2:45:24<1:55:41, 97.77s/it] 

Processing image: W36B_1


Processing Images:  62%|██████▏   | 114/184 [2:47:13<1:57:57, 101.10s/it]

Processing image: W36B_10


Processing Images:  62%|██████▎   | 115/184 [2:49:05<2:00:02, 104.39s/it]

Processing image: W36B_11


Processing Images:  63%|██████▎   | 116/184 [2:50:53<1:59:35, 105.53s/it]

Processing image: W36B_12


Processing Images:  64%|██████▎   | 117/184 [2:52:42<1:58:47, 106.37s/it]

Processing image: W36B_14


Processing Images:  64%|██████▍   | 118/184 [2:54:33<1:58:41, 107.90s/it]

Processing image: W36B_16


Processing Images:  65%|██████▍   | 119/184 [2:56:18<1:55:51, 106.94s/it]

Processing image: W36B_17


Processing Images:  65%|██████▌   | 120/184 [2:58:02<1:53:19, 106.24s/it]

Processing image: W36B_18


Processing Images:  66%|██████▌   | 121/184 [2:59:57<1:54:07, 108.69s/it]

Processing image: W36B_2


Processing Images:  66%|██████▋   | 122/184 [3:01:44<1:51:50, 108.24s/it]

Processing image: W45C_16


Processing Images:  67%|██████▋   | 123/184 [3:03:27<1:48:29, 106.72s/it]

Processing image: W47B_18


Processing Images:  67%|██████▋   | 124/184 [3:05:21<1:48:50, 108.84s/it]

Processing image: W47B_3


Processing Images:  68%|██████▊   | 125/184 [3:06:51<1:41:35, 103.31s/it]

Processing image: W47C_1


Processing Images:  68%|██████▊   | 126/184 [3:08:47<1:43:26, 107.01s/it]

Processing image: W47C_2


Processing Images:  69%|██████▉   | 127/184 [3:10:17<1:36:57, 102.06s/it]

Processing image: W47C_3


Processing Images:  70%|██████▉   | 128/184 [3:11:06<1:20:21, 86.09s/it] 

Processing image: W47C_6


Processing Images:  70%|███████   | 129/184 [3:12:30<1:18:10, 85.28s/it]

Processing image: W48C_11


Processing Images:  71%|███████   | 130/184 [3:13:48<1:14:55, 83.26s/it]

Processing image: W48C_16


Processing Images:  71%|███████   | 131/184 [3:15:22<1:16:27, 86.55s/it]

Processing image: W48C_22


Processing Images:  72%|███████▏  | 132/184 [3:16:36<1:11:31, 82.52s/it]

Processing image: W48C_6


Processing Images:  72%|███████▏  | 133/184 [3:17:02<55:52, 65.73s/it]  

Processing image: W49A_11


Processing Images:  73%|███████▎  | 134/184 [3:18:38<1:02:13, 74.66s/it]

Processing image: W49A_16


Processing Images:  73%|███████▎  | 135/184 [3:20:20<1:07:42, 82.90s/it]

Processing image: W49A_2


Processing Images:  74%|███████▍  | 136/184 [3:21:43<1:06:22, 82.97s/it]

Processing image: W50D_25


Processing Images:  74%|███████▍  | 137/184 [3:23:31<1:10:49, 90.41s/it]

Processing image: W50D_4


Processing Images:  75%|███████▌  | 138/184 [3:25:02<1:09:27, 90.59s/it]

Processing image: W50D_5


Processing Images:  76%|███████▌  | 139/184 [3:26:40<1:09:39, 92.88s/it]

Processing image: W52D_25


Processing Images:  76%|███████▌  | 140/184 [3:28:25<1:10:49, 96.58s/it]

Processing image: W53B_14


Processing Images:  77%|███████▋  | 141/184 [3:29:59<1:08:39, 95.80s/it]

Processing image: W53B_18


Processing Images:  77%|███████▋  | 142/184 [3:30:22<51:48, 74.02s/it]  

Processing image: W53B_19


Processing Images:  78%|███████▊  | 143/184 [3:32:12<57:49, 84.61s/it]

Processing image: W53B_20


Processing Images:  78%|███████▊  | 144/184 [3:33:55<1:00:05, 90.13s/it]

Processing image: W53B_23


Processing Images:  79%|███████▉  | 145/184 [3:34:49<51:38, 79.46s/it]  

Processing image: W53B_24


Processing Images:  79%|███████▉  | 146/184 [3:36:50<58:08, 91.80s/it]

Processing image: W57A_17


Processing Images:  80%|███████▉  | 147/184 [3:38:41<1:00:16, 97.74s/it]

Processing image: W57A_18


Processing Images:  80%|████████  | 148/184 [3:40:23<59:15, 98.77s/it]  

Processing image: W57A_19


Processing Images:  81%|████████  | 149/184 [3:42:04<58:01, 99.48s/it]

Processing image: W57A_21


Processing Images:  82%|████████▏ | 150/184 [3:43:40<55:46, 98.44s/it]

Processing image: W57A_22


Processing Images:  82%|████████▏ | 151/184 [3:45:34<56:45, 103.20s/it]

Processing image: W57A_23


Processing Images:  83%|████████▎ | 152/184 [3:47:20<55:27, 104.00s/it]

Processing image: W57A_24


Processing Images:  83%|████████▎ | 153/184 [3:49:12<54:58, 106.39s/it]

Processing image: W57A_25


Processing Images:  84%|████████▎ | 154/184 [3:50:51<52:02, 104.09s/it]

Processing image: W57B_10


Processing Images:  84%|████████▍ | 155/184 [3:52:40<51:09, 105.83s/it]

Processing image: W57B_12


Processing Images:  85%|████████▍ | 156/184 [3:54:31<50:02, 107.22s/it]

Processing image: W57B_13


Processing Images:  85%|████████▌ | 157/184 [3:56:26<49:21, 109.68s/it]

Processing image: W57B_14


Processing Images:  86%|████████▌ | 158/184 [3:58:09<46:33, 107.45s/it]

Processing image: W57B_15


Processing Images:  86%|████████▋ | 159/184 [3:59:55<44:38, 107.16s/it]

Processing image: W57B_18


Processing Images:  87%|████████▋ | 160/184 [4:01:53<44:06, 110.26s/it]

Processing image: W57B_19


Processing Images:  88%|████████▊ | 161/184 [4:03:47<42:46, 111.59s/it]

Processing image: W57B_2


Processing Images:  88%|████████▊ | 162/184 [4:05:25<39:24, 107.46s/it]

Processing image: W57B_20


Processing Images:  89%|████████▊ | 163/184 [4:07:15<37:53, 108.27s/it]

Processing image: W57B_3


Processing Images:  89%|████████▉ | 164/184 [4:08:59<35:38, 106.90s/it]

Processing image: W57B_4


Processing Images:  90%|████████▉ | 165/184 [4:10:54<34:35, 109.23s/it]

Processing image: W57B_5


Processing Images:  90%|█████████ | 166/184 [4:12:32<31:47, 105.95s/it]

Processing image: W57B_6


Processing Images:  91%|█████████ | 167/184 [4:14:30<31:02, 109.59s/it]

Processing image: W57B_7


Processing Images:  91%|█████████▏| 168/184 [4:16:20<29:15, 109.73s/it]

Processing image: W57B_8


Processing Images:  92%|█████████▏| 169/184 [4:18:06<27:06, 108.46s/it]

Processing image: W57B_9


Processing Images:  92%|█████████▏| 170/184 [4:19:46<24:45, 106.08s/it]

Processing image: W57C_10


Processing Images:  93%|█████████▎| 171/184 [4:21:28<22:41, 104.72s/it]

Processing image: W57C_12


Processing Images:  93%|█████████▎| 172/184 [4:23:13<20:58, 104.91s/it]

Processing image: W57C_17


Processing Images:  94%|█████████▍| 173/184 [4:24:49<18:44, 102.25s/it]

Processing image: W57C_2


Processing Images:  95%|█████████▍| 174/184 [4:26:38<17:21, 104.12s/it]

Processing image: W57C_21


Processing Images:  95%|█████████▌| 175/184 [4:28:23<15:41, 104.65s/it]

Processing image: W57C_22


Processing Images:  96%|█████████▌| 176/184 [4:30:07<13:55, 104.40s/it]

Processing image: W57C_23


Processing Images:  96%|█████████▌| 177/184 [4:32:04<12:36, 108.12s/it]

Processing image: W57C_24


Processing Images:  97%|█████████▋| 178/184 [4:33:51<10:47, 107.90s/it]

Processing image: W57C_25


Processing Images:  97%|█████████▋| 179/184 [4:35:35<08:53, 106.70s/it]

Processing image: W57C_3


Processing Images:  98%|█████████▊| 180/184 [4:37:21<07:05, 106.47s/it]

Processing image: W57C_4


Processing Images:  98%|█████████▊| 181/184 [4:39:10<05:21, 107.13s/it]

Processing image: W57C_5


Processing Images:  99%|█████████▉| 182/184 [4:40:57<03:34, 107.10s/it]

Processing image: W57C_8


Processing Images:  99%|█████████▉| 183/184 [4:42:40<01:45, 105.91s/it]

Processing image: W57C_9


Processing Images: 100%|██████████| 184/184 [4:44:31<00:00, 92.78s/it] 


In [None]:
# Check the order of the columns of the annotations dataframe and replace them here
# NOTE: this assignment renames the columns purely by position, so the list
# below must match the current column order of `annotations` exactly; a
# reordered frame would be silently mislabeled.
annotations.columns = ['id', 'annotator', 'area', 'centroid_latitude', 'centroid_longitude',
       'image_name', 'nw_corner_of_image_latitude',
       'nw_corner_of_image_longitude', 'se_corner_of_image_latitude',
       'se_corner_of_image_longitude', 'polygon_vertices_pixels',
       'centroid_latitude_pixels', 'centroid_longitude_pixels', 'area_pixels',
       'geometry']

# Save the final dataframe. Now we are ready to create the image-mask pairs
annotations.to_file(r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Teams\Team 1 Machine learning\CT - MachineLearning\S1 Machine Learning\annotations_final_feb.geojson")

Before creating the image-mask pairs, the `polygon_vertices_pixels` column of the annotations dataframe has to be cleaned and converted from strings back into lists of vertices.

In [105]:
# Work on a copy so the cleaning steps do not modify `annotations` directly.
# NOTE: every step below calls re.sub on each value, so the column must still
# hold strings. Once a later cell rebinds `annotations` to the parsed frame
# (lists instead of strings), re-running this cell raises a TypeError.
df = annotations.copy()

# Clean the strings to remove np.int32 calls and fix any incomplete strings
# e.g. "np.int32(12)" becomes "12"
df['polygon_vertices_pixels'] = df['polygon_vertices_pixels'].apply(
    lambda x: re.sub(r'np\.int32\((\d+)\)', r'\1', x)
)

# Replace parentheses with square brackets
# e.g. "(12, 34)" becomes "[12, 34]"; only pairs of non-negative integers
# separated by ", " are matched.
df['polygon_vertices_pixels'] = df['polygon_vertices_pixels'].apply(
    lambda x: re.sub(r'\((\d+), (\d+)\)', r'[\1, \2]', x)
)

# Ensure all tuples are properly closed
def fix_incomplete_tuples(polygon_str):
    """Rebuild a vertex-list string from its well-formed ``[a, b]`` pairs.

    Every substring of the form ``[<digits>, <digits>]`` is collected in order
    of appearance and re-joined into a single bracketed, comma-separated list.
    Any other text (for example a truncated trailing pair) is discarded.

    Args:
        polygon_str: String holding the serialized polygon vertices.

    Returns:
        A string such as ``'[[1, 2], [3, 4]]'``, or ``'[]'`` when no complete
        pair is found.
    """
    complete_pairs = (
        match.group(0) for match in re.finditer(r'\[\d+, \d+\]', polygon_str)
    )
    return '[{}]'.format(', '.join(complete_pairs))

# Keep only the complete "[a, b]" pairs of every string and re-wrap them in
# one outer list, dropping any truncated remainder.
df['polygon_vertices_pixels'] = df['polygon_vertices_pixels'].apply(fix_incomplete_tuples)

# Convert the string representation of lists back to actual lists
# (ast.literal_eval only evaluates Python literals, never arbitrary code).
df['polygon_vertices_pixels'] = df['polygon_vertices_pixels'].apply(
    lambda x: ast.literal_eval(x)
)

# Preview of the parsed column as a sanity check.
print(df['polygon_vertices_pixels'])

0        [[9495, 80], [9522, 139], [9575, 118], [9545, ...
1        [[11249, 8089], [11302, 8178], [11313, 8172], ...
2        [[11267, 8079], [11321, 8168], [11333, 8160], ...
3        [[11287, 8067], [11341, 8155], [11351, 8147], ...
4        [[11345, 8073], [11332, 8081], [11367, 8137], ...
                               ...                        
11303    [[6492, 3431], [6500, 3479], [6523, 3474], [65...
11304    [[8268, 1976], [8264, 1990], [8280, 1996], [82...
11305    [[7834, 2180], [7825, 2200], [7854, 2211], [78...
11306    [[4246, 3691], [4249, 3728], [4265, 3727], [42...
11307    [[1411, 4225], [1411, 4239], [1437, 4239], [14...
Name: polygon_vertices_pixels, Length: 11308, dtype: object


In [106]:
# From here on `annotations` refers to the cleaned frame, whose
# polygon_vertices_pixels column holds parsed lists instead of strings.
annotations = df

Now that the dataframe is ready, we can finally create the image-mask pairs. Below are the functions we use to do so.

In [None]:
def create_mask(image_shape, polygons):
    """Rasterize polygons into a binary (0/255) mask for one tile.

    Args:
        image_shape: Shape of the tile; only the first two entries are used
            as the mask's shape.
        polygons: Iterable of vertex sequences, each convertible to an
            ``(N, 2)`` integer array, expressed in tile-local coordinates.

    Returns:
        A ``uint8`` array in which pixels covered by any polygon are 255 and
        all other pixels are 0. The drawn mask is flipped vertically and then
        rotated 90 degrees clockwise before being returned; the two steps
        combined amount to a transpose of the drawn mask.
    """
    mask = np.zeros(image_shape[:2], dtype="uint8")
    for polygon in polygons:
        # cv2.fillPoly only accepts 32-bit integer points. Vertices built from
        # Python lists take the platform's default integer width (often
        # int64), so cast explicitly instead of relying on that default.
        points = np.asarray(polygon, dtype=np.int32)
        if points.size == 0:
            # Nothing to draw for an empty vertex list.
            continue
        cv2.fillPoly(mask, [points], 255)
    flipped_mask = cv2.flip(mask, 0)
    rotated_mask = cv2.rotate(flipped_mask, cv2.ROTATE_90_CLOCKWISE)
    return rotated_mask

def save_tile_and_mask(tile, mask, tile_index_pixels, tile_dir, mask_dir, image_name):
    """Write one tile and its mask as PNG files with matching names.

    The tile is saved as ``i_<image_name>_<tile_index_pixels>.png`` in
    ``tile_dir`` and the mask as ``m_<image_name>_<tile_index_pixels>.png`` in
    ``mask_dir``.

    Args:
        tile: Tile pixels in RGB channel order; converted to BGR for OpenCV.
        mask: Single-channel mask belonging to the tile.
        tile_index_pixels: String identifying the tile inside its image.
        tile_dir: Directory receiving the tile image.
        mask_dir: Directory receiving the mask image.
        image_name: Name of the source image, embedded in both file names.

    Raises:
        OSError: If OpenCV reports that either file could not be written.
    """
    tile_filename = os.path.join(tile_dir, f'i_{image_name}_{tile_index_pixels}.png')
    mask_filename = os.path.join(mask_dir, f'm_{image_name}_{tile_index_pixels}.png')

    # cv2.imwrite signals failure (missing directory, unwritable path, ...) by
    # returning False rather than raising, which would otherwise leave holes
    # or unpaired files in the dataset without any warning.
    if not cv2.imwrite(tile_filename, cv2.cvtColor(tile, cv2.COLOR_RGB2BGR)):
        raise OSError(f"Could not write tile image: {tile_filename}")
    if not cv2.imwrite(mask_filename, mask):
        raise OSError(f"Could not write mask image: {mask_filename}")

def adjust_polygon_coordinates(polygons, x_offset, y_offset):
    """Shift every polygon by a fixed offset.

    ``[x_offset, y_offset]`` is subtracted from each vertex of each polygon
    via NumPy broadcasting, so the first vertex component is reduced by
    ``x_offset`` and the second by ``y_offset``.

    Args:
        polygons: Iterable of polygons, each a sequence of two-component
            vertices (nested lists or arrays).
        x_offset: Amount subtracted from the first component of every vertex.
        y_offset: Amount subtracted from the second component of every vertex.

    Returns:
        A list with one shifted array per input polygon, in input order.
    """
    offset = np.array([x_offset, y_offset])
    return [polygon - offset for polygon in polygons]

def process_geotiff(image_name, geotiff_path, tile_size, df, tile_dir, mask_dir):
    """Cut one GeoTIFF into square tiles and save every annotated tile with its mask.

    The raster is zero-padded on its bottom and right edges up to a multiple
    of ``tile_size`` and then walked tile by tile. An annotation belongs to a
    tile when its centroid lies within the tile bounds; both edges are
    inclusive, so a centroid exactly on a shared edge is assigned to both
    neighbouring tiles. Only tiles whose mask contains at least one non-zero
    pixel are written to disk.

    Note: membership is decided by the centroid alone, so the part of a
    polygon that reaches into a neighbouring tile is not drawn in that
    neighbour's mask.

    Args:
        image_name: Name of the image, embedded in the output file names.
        geotiff_path: Path of the GeoTIFF to read.
        tile_size: Edge length of the square tiles, in pixels.
        df: Annotations of this image. Must provide the columns
            ``polygon_vertices_pixels``, ``centroid_longitude_pixels`` and
            ``centroid_latitude_pixels``.
        tile_dir: Directory receiving the tile images.
        mask_dir: Directory receiving the mask images.
    """
    # The whole raster is loaded into memory, so the file can be closed
    # before the (long) tiling loop starts.
    with rasterio.open(geotiff_path) as src:
        geotiff_array = src.read()

    # Move the leading axis last so the array is laid out (rows, cols, bands).
    if len(geotiff_array.shape) == 3:
        geotiff_array = np.transpose(geotiff_array, (1, 2, 0))

    height, width = geotiff_array.shape[:2]

    # Calculate padding
    pad_height = (tile_size - height % tile_size) % tile_size
    pad_width = (tile_size - width % tile_size) % tile_size

    # Add padding to the image
    padded_image = np.pad(geotiff_array, ((0, pad_height), (0, pad_width), (0, 0)), mode='constant')
    padded_height, padded_width = padded_image.shape[:2]

    # Pull the annotation columns out once: the tile loop then only needs
    # vectorised comparisons instead of a full DataFrame scan per tile.
    centroid_x = df['centroid_longitude_pixels'].to_numpy(dtype=float)
    centroid_y = df['centroid_latitude_pixels'].to_numpy(dtype=float)
    polygons = df['polygon_vertices_pixels'].tolist()

    for y in range(0, padded_height, tile_size):
        in_row_band = (centroid_y >= y) & (centroid_y <= y + tile_size)
        if not in_row_band.any():
            # No annotation in this band of tiles, so nothing would be saved.
            continue

        for x in range(0, padded_width, tile_size):
            in_tile = in_row_band & (centroid_x >= x) & (centroid_x <= x + tile_size)
            polygons_in_tile = [polygons[i] for i in np.flatnonzero(in_tile)]
            if not polygons_in_tile:
                # An empty mask is never saved, so skip the tile altogether.
                continue

            tile = padded_image[y:y + tile_size, x:x + tile_size]

            # NOTE(review): the offsets are passed as (y, x). Together with
            # the transpose applied in create_mask this suggests the vertices
            # are stored in (row, col) order; confirm against the annotations.
            adjusted_polygons = adjust_polygon_coordinates(polygons_in_tile, y, x)
            mask = create_mask(tile.shape, adjusted_polygons)

            # save only the masks and tiles that contain annotations
            if np.any(mask > 0):
                tile_index_pixels = f"{y // tile_size}_{x // tile_size}"
                save_tile_and_mask(tile, mask, tile_index_pixels, tile_dir, mask_dir, image_name)


def process_all_images_in_folder(folder_path, annotations_df, tile_size, tile_dir, mask_dir, processed_images_list):
    """Tile every annotated image whose GeoTIFF can be found on disk.

    For each distinct ``image_name`` in ``annotations_df`` the file
    ``folder_path + image_name + ".tif"`` is looked up. Images without a file
    are skipped silently. For every image found, its name and a running
    counter are printed, the name is appended to ``processed_images_list`` and
    the image is tiled with only its own annotations.

    Args:
        folder_path: String prefix prepended to the image name (plain
            concatenation, not a directory join).
        annotations_df: Annotations of all images; needs an ``image_name``
            column.
        tile_size: Edge length of the tiles, in pixels.
        tile_dir: Directory receiving the tile images.
        mask_dir: Directory receiving the mask images.
        processed_images_list: List extended in place with processed names.
    """
    found_so_far = 0
    for image_name in annotations_df['image_name'].unique():
        image_path = "".join((folder_path, image_name, ".tif"))
        if not os.path.exists(image_path):
            continue

        print(image_name)
        print(found_so_far)
        found_so_far += 1
        processed_images_list.append(image_name)

        rows_for_image = annotations_df.loc[annotations_df['image_name'] == image_name]
        process_geotiff(image_name, image_path, tile_size, rows_for_image, tile_dir, mask_dir)


In [None]:
# path where you want to save the tiles
tile_dir = r'C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Teams\Team 1 Machine learning\CT - MachineLearning\S1 Machine Learning\dataset\tiles_320'
# path where you want to save the masks
mask_dir = r'C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Teams\Team 1 Machine learning\CT - MachineLearning\S1 Machine Learning\dataset\masks_320'
os.makedirs(tile_dir, exist_ok=True)
os.makedirs(mask_dir, exist_ok=True)

# path where the original images are saved
# NOTE: this is a file-name prefix, not a directory: the image name and
# ".tif" are appended to it directly.
dataset_path = r"C:\Users\AICHA\Box\Cape Town Energy Transitions Bass Connections\Class Materials 2024-2025\Aerial Imagery\AP2023_TIFFs_Bass\2023_RGB_8cm_"

# Choose the tile size of your new dataset
# (the output folder names above end in "_320"; rename them if you change this)
tile_size = 320

# The names of the processed images will be printed to monitor the progress
# and are also collected in this list.
processed_images_list = []
# Pass the configured tile_size instead of repeating the literal, so that
# changing the value above actually changes the tiling.
process_all_images_in_folder(dataset_path, annotations, tile_size=tile_size, tile_dir=tile_dir, mask_dir=mask_dir, processed_images_list=processed_images_list)

W16C_21
W16D_17
W18B_8
W18B_9
W48C_11
W57C_21
W57C_17
W57C_24
W57B_10
W36A_2
W57B_5
W57B_13
W57B_14
W45C_16
W25A_3
W57C_25
W57C_23
W57C_8
W57C_9
W57C_12
W57A_18
W57A_19
W57A_23
W57A_24
W57A_22
W57C_10
W57C_3
W57C_4
W57C_2
W57A_21
W47C_1
W36A_8
W36A_3
W16C_22
W25A_6
W18B_5
W47C_2
W57A_25
W57C_5
W25C_12
W25C_13
W25C_11
W47C_6
W57B_12
W16D_25
W16D_24
W07C_10
W07C_11
W07C_12
W07C_13
W07C_16
W07C_17
W07C_2
W07C_21
W07C_22
W07C_23
W07C_3
W07C_4
W07C_5
W07C_6
W07C_7
W07C_8
W07C_9
W07D_1
W07D_6
W08A_1
W08A_12
W08A_2
W08B_4
W08B_9
W12A_17
W12A_21
W12A_22
W12A_23
W12C_11
W12C_12
W12C_16
W12C_4
W12C_6
W13C_11
W13C_13
W13C_16
W13C_18
W13C_22
W13C_8
W14A_11
W14A_12
W14A_21
W16C_14
W16C_15
W16C_17
W16C_18
W16C_19
W16C_20
W16D_12
W16D_16
W16D_21
W16D_22
W16D_23
W16D_7
W17B_11
W33A_9
W33A_10
W33A_18
W33B_2
W33C_7
W33C_8
W33C_9
W33C_10
W57A_17
W57B_2
W57B_3
W57B_4
W57B_6
W57B_7
W57B_8
W57B_9
W57B_18
W57B_19
W57B_20
W57B_15
W57C_22
W48C_6
W48C_16
W48C_22
W49A_11
W49A_2
W49A_16
W50D_4
W50D_5
W50D_25
W47C

Many of the tiles don't contain any solar panels. To avoid wasting training resources, we select only the tiles that contain panels to work with (it may still be useful to train the model on some tiles without any panels to increase model robustness).

In [None]:
# Organizing the folders locally -> copy the masks and tiles to the destination folders

# copy the masks that contain the target to a separate folder
def check_mask_has_target(mask_path):
    """Tell whether the mask image at ``mask_path`` has any non-zero pixel.

    Args:
        mask_path: Path of the mask file, read as a grayscale image.

    Returns:
        True if at least one pixel is greater than zero. False if the mask is
        all zeros or if OpenCV cannot read the path as an image (for example
        a sub-directory or a non-image file sitting in the masks folder).
    """
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising when a path cannot be
    # decoded; comparing None with 0 would raise a TypeError.
    if mask is None:
        return False
    return bool(np.any(mask > 0))

def copy_masks_with_target(source_folder, destination_folder):
    """Copy every mask that contains a target into ``destination_folder``.

    Each entry of ``source_folder`` is tested with ``check_mask_has_target``;
    entries that pass are copied and a confirmation line is printed for each.
    The destination folder is created if it does not exist yet.

    Args:
        source_folder: Folder holding the candidate masks.
        destination_folder: Folder receiving the masks that contain a target.
    """
    os.makedirs(destination_folder, exist_ok=True)

    for filename in os.listdir(source_folder):
        candidate_path = os.path.join(source_folder, filename)
        if not check_mask_has_target(candidate_path):
            continue
        shutil.copy(candidate_path, destination_folder)
        print(f"Copied {filename} to {destination_folder}")
            
# copy the corresponding images to a new folder
def copy_corresponding_images(mask_folder, image_folder, destination_folder):
    """Copy the image that belongs to each mask into ``destination_folder``.

    The image name is derived from the mask name by replacing its first
    character with ``'i'``. Masks whose image is not present in
    ``image_folder`` are skipped; a confirmation line is printed for every
    image copied. The destination folder is created if it does not exist yet.

    Args:
        mask_folder: Folder whose entries determine which images are copied.
        image_folder: Folder in which the matching images are looked up.
        destination_folder: Folder receiving the copied images.
    """
    os.makedirs(destination_folder, exist_ok=True)

    image_names = ['i' + mask_name[1:] for mask_name in os.listdir(mask_folder)]
    for image_filename in image_names:
        image_path = os.path.join(image_folder, image_filename)
        if not os.path.exists(image_path):
            continue
        shutil.copy(image_path, destination_folder)
        print(f"Copied {image_filename} to {destination_folder}")



In [None]:
# Step 1: keep only the masks that contain at least one target pixel.
source_folder = '/home/as1233/data/cape_town/masks'
destination_folder = '/home/as1233/data/cape_town/masks_target'

copy_masks_with_target(source_folder, destination_folder)

# Step 2: collect the tiles that belong to the masks kept in step 1.
# NOTE: `destination_folder` is reassigned below and now points at the image
# output folder, not at the masks folder used in step 1.
mask_folder = '/home/as1233/data/cape_town/masks_target'
image_folder = '/home/as1233/data/cape_town/tiles'
destination_folder = '/home/as1233/data/cape_town/images_target'

copy_corresponding_images(mask_folder, image_folder, destination_folder)

In [None]:
# split the dataset into train, test, and val datasets

def split_data(mask_folder, image_folder, train_folder, val_folder, test_folder, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=None):
    """Randomly split mask/image pairs into train, validation and test folders.

    Each of the three output folders receives an ``images`` and a ``masks``
    sub-folder. The image of a mask is found by replacing the first character
    of the mask's file name with ``'i'``. Files are copied, not moved.

    Args:
        mask_folder: Folder holding the masks; its entries define the dataset.
        image_folder: Folder holding the matching images.
        train_folder: Output folder of the training split.
        val_folder: Output folder of the validation split.
        test_folder: Output folder of the test split.
        train_ratio: Fraction of the pairs assigned to the training split.
        val_ratio: Fraction of the pairs assigned to the validation split.
        test_ratio: Kept for backward compatibility and not used: the test
            split always receives whatever remains after the other two.
        seed: Optional seed for a reproducible split. With ``None`` the
            module-level ``random`` state is used, as before.

    Raises:
        FileNotFoundError: If a mask has no matching image. This is checked
            before anything is copied, so no partial split is left behind.
    """
    for split_folder in (train_folder, val_folder, test_folder):
        for kind in ('images', 'masks'):
            os.makedirs(os.path.join(split_folder, kind), exist_ok=True)

    # os.listdir gives no ordering guarantee; sorting first makes a seeded
    # shuffle produce the same split on every machine and run.
    mask_files = sorted(os.listdir(mask_folder))

    missing_images = [
        'i' + mask_filename[1:]
        for mask_filename in mask_files
        if not os.path.exists(os.path.join(image_folder, 'i' + mask_filename[1:]))
    ]
    if missing_images:
        raise FileNotFoundError(
            f"{len(missing_images)} mask(s) have no matching image in {image_folder}, "
            f"e.g. {missing_images[0]}"
        )

    if seed is None:
        random.shuffle(mask_files)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(mask_files)

    total_files = len(mask_files)
    train_count = int(total_files * train_ratio)
    val_count = int(total_files * val_ratio)

    train_files = mask_files[:train_count]
    val_files = mask_files[train_count:train_count + val_count]
    test_files = mask_files[train_count + val_count:]

    def copy_files(file_list, dest_image_folder, dest_mask_folder):
        """Copy each mask in ``file_list`` and its image to the given folders."""
        for mask_filename in file_list:
            shutil.copy(os.path.join(mask_folder, mask_filename), dest_mask_folder)

            image_filename = 'i' + mask_filename[1:]
            shutil.copy(os.path.join(image_folder, image_filename), dest_image_folder)

    copy_files(train_files, os.path.join(train_folder, 'images'), os.path.join(train_folder, 'masks'))
    copy_files(val_files, os.path.join(val_folder, 'images'), os.path.join(val_folder, 'masks'))
    copy_files(test_files, os.path.join(test_folder, 'images'), os.path.join(test_folder, 'masks'))

# Inputs: the masks that contain a target and their matching tiles.
mask_folder = '/home/as1233/data/cape_town/masks_target'
image_folder = '/home/as1233/data/cape_town/images_target'
# Outputs: one folder per split, each with 'images' and 'masks' sub-folders.
train_folder = '/home/as1233/data/cape_town/train'
val_folder = '/home/as1233/data/cape_town/val'
test_folder = '/home/as1233/data/cape_town/test'

# Seed the shuffle so that re-running the notebook does not silently produce
# a different train/val/test assignment.
random.seed(42)
split_data(mask_folder, image_folder, train_folder, val_folder, test_folder)