# Preparing Data for U-Net Model

**Author**: Sage McGinley-Smith  
**Class**: CS 230: Deep Learning  
**Date**: November 2024

# Install and Load Necessary Packages and Mount Drive

In [None]:
!pip install rasterio google-cloud-storage
import os
import rasterio
from google.cloud import storage
from google.colab import drive
# Mount Google Drive at /content/drive so files stored in Drive can be read by path.
drive.mount('/content/drive')
# Tell Google Cloud client libraries where to find the credentials key file.
# The value below is a placeholder: replace it with the real path to the .json key
# before running any cell that talks to Cloud Storage.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "insert path to .json key file in drive here"

# Define Functions for Tiling and Bucketing
Each image is cut into 128 × 128-pixel tiles; partial tiles along the right and bottom edges are skipped.

In [None]:
def upload_to_gcp(bucket_name, source_file_name, destination_blob_name, client=None):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket,
            including any "folder/" style prefix.
        client: Optional, already-constructed storage client to reuse. When
            omitted, a new ``storage.Client()`` is created for this call, so
            existing three-argument callers behave exactly as before. Passing
            a shared client avoids rebuilding one for every uploaded file.

    Returns:
        None. A confirmation line is printed once the upload call returns.
    """
    storage_client = client if client is not None else storage.Client()
    blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

In [None]:
def tile_and_upload(geotiff_path, bucket_name, tile_size=128):
    """Cut a GeoTIFF into square tiles and upload each tile to a GCS bucket.

    The destination inside the bucket is ``<folder>/<quarter>/<tile file>``:
    ``<folder>`` is ``mask-tiles`` when the file name contains "mask" and
    ``sentinel-tiles`` otherwise, and ``<quarter>`` is the first of
    q1, q2, q3, q4 found in the file name. Each tile is written to the current
    working directory, uploaded with ``upload_to_gcp``, and then deleted.

    Only complete ``tile_size`` x ``tile_size`` tiles are produced; leftover
    strips along the right and bottom edges are skipped. Every tile keeps the
    source CRS, band count and dtype, and gets the affine transform of its own
    window.

    Args:
        geotiff_path: Path to the source GeoTIFF.
        bucket_name: Name of the destination bucket.
        tile_size: Edge length of each tile in pixels (default 128).

    Raises:
        ValueError: If ``tile_size`` is not positive, or if the file name
            contains none of q1, q2, q3, q4.
    """
    if tile_size <= 0:
        raise ValueError("tile_size must be a positive integer.")

    # Base name up to the first dot; it prefixes every tile file name.
    filename = os.path.basename(geotiff_path).split(".")[0]

    # Quarters are checked in order, so the first identifier present wins.
    quarter_folder = next(
        (quarter for quarter in ("q1", "q2", "q3", "q4") if quarter in filename),
        None,
    )
    if quarter_folder is None:
        raise ValueError("Filename does not contain a valid quarter identifier (q1, q2, q3, q4).")

    # Masks and Sentinel imagery are kept in separate top-level folders.
    folder = "mask-tiles" if "mask" in filename else "sentinel-tiles"

    with rasterio.open(geotiff_path) as src:
        # Visit only offsets where a whole tile fits, so partial edge tiles
        # are never read from disk just to be thrown away.
        col_offsets = range(0, src.width - tile_size + 1, tile_size)
        row_offsets = range(0, src.height - tile_size + 1, tile_size)

        for i in col_offsets:
            for j in row_offsets:
                window = rasterio.windows.Window(i, j, tile_size, tile_size)
                transform = src.window_transform(window)
                tile_data = src.read(window=window)

                # Tile names carry the column (i) and row (j) pixel offsets.
                tile_filename = f"{filename}_tile_{i}_{j}.tif"
                tile_path = f"./{tile_filename}"

                try:
                    with rasterio.open(
                        tile_path,
                        'w',
                        driver='GTiff',
                        height=tile_size,
                        width=tile_size,
                        count=src.count,
                        dtype=tile_data.dtype,
                        crs=src.crs,
                        transform=transform
                    ) as dst:
                        dst.write(tile_data)

                    destination_blob_name = f"{folder}/{quarter_folder}/{tile_filename}"
                    upload_to_gcp(bucket_name, tile_path, destination_blob_name)
                finally:
                    # Always remove the local tile, even when the write or the
                    # upload fails, so failed runs do not leave files behind.
                    if os.path.exists(tile_path):
                        os.remove(tile_path)

# Loop Through Images and Tile + Upload Them

In [None]:
# Destination bucket that receives every generated tile.
bucket_name = "230-project-tiles"

# All full-quad rasters sit in one Drive folder and are named
# "<kind>_q<quarter>_2019.tif"; the four images are listed first, then the four masks.
training_data_dir = "/content/drive/My Drive/Senior Project/Training_Data_Full_Quads"
geotiff_paths = [
    f"{training_data_dir}/{kind}_q{quarter}_2019.tif"
    for kind in ("image", "mask")
    for quarter in (1, 2, 3, 4)
]

# Tile each raster in turn and upload its tiles to the bucket.
for geotiff_path in geotiff_paths:
    tile_and_upload(geotiff_path, bucket_name)