# Generate Cloud-Optimized Geotiffs from tiles


## Load Libraries

Check that gdal is installed

In [1]:
!gdalinfo --version

GDAL 3.6.4, released 2023/04/17


In [2]:
from osgeo import gdal
import subprocess
import json 
import pandas as pd
from google.cloud import storage
import os
import glob


In [3]:
os.environ['GS_NO_SIGN_REQUEST'] = 'YES'
os.environ['GDAL_NUM_THREADS'] = '5'

Log in to google cloud if needed 

In [4]:
#!{gcloud auth login --update-adc}

In [5]:
!{gcloud config set project swhm-dev}

Updated property [core/project].


Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update



## Functions

A function to list blobs on the storage bucket 

In [13]:
def list_blobs_with_prefix(bucket_name, prefix, delimiter=None):
    """Lists all the blobs in the bucket that begin with the prefix.
    """
    storage_client = storage.Client()
   
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

    # Note: The call returns a response only when the iterator is consumed.
    blob_list = []
    for blob in blobs:
        blob_list.append(blob.name)

    if delimiter:
        print("Prefixes:")
        for prefix in blobs.prefixes:
            blob_list.append([prefix])
    
    return blob_list


### 1. Download images

In [14]:
def dl(lay_name):
    cmd = f'gsutil -m cp -R gs://swhm-image-exports/{lay_name} .'
    !{cmd}

### 2. Reproject images

saves reprojected images to /tmp 

In [15]:
def reproject(lay_name, target_crs='EPSG:3857'): 

    directory_path = lay_name
               #make a list of the files in the directory 
    files = os.listdir(directory_path+"/reprojected")
    print(files)
    # create a new file for writing
    list_file = "files.txt"
    try:
        os.remove(list_file)
    except OSError:
        pass
    
    for filename in files:
        if filename.endswith(".tif"):
            input_path = os.path.join(directory_path, filename)
            output_path = os.path.join(directory_path+"/reprojected", filename)
            cmd = f'gdalwarp -t_srs {target_crs} -overwrite {input_path} {output_path}'
            !{cmd}

### 3. Make vrt



In [19]:
#https://gdal.org/programs/gdal_translate.html
#cmdoption-gdal_translate-ovr

def makevrt(lay_name):
    directory_path = lay_name+'/*.tif'
    print('Making VRT...')
    cmd = f'gdalbuildvrt  output.vrt {directory_path}'
    !{cmd}
    print('VRT Complete!')
    

def ul(file_name, lay_name):
    print('Uploading Layer...')
    cmd = f'gsutil cp {file_name} gs://live_data_layers/rasters/{lay_name}.tif'
    !{cmd}
    print('Layer upload complete!') 

---

## 4. Wrapper Function

In [17]:
def convert_layer(lay_name): 
    '''
    Function that does the following: 
    1. Downloads from gcp bucket 
    2. makes a virtual raster - saves to output.vrt 
    3. Checks the projection and reprojects if necessary - saves to tmp.tif, otherwise translates to tmp.tif 
    ''' 
    

    
    #check projection 
    p = subprocess.run(["rio", "info", "output.vrt"], capture_output=True, text=True)
    raster_info = json.loads(p.stdout)

#reproject if needed 
    if (raster_info['crs']) != "EPSG:3857": 
        #reproject 
        print(f'reprojecting from {raster_info["crs"]}')
        warp_cmd = 'gdalwarp -t_srs EPSG:3857 -overwrite output.vrt tmp.tif \
         -co NUM_THREADS=5 -co TILED=YES -co COMPRESS=LZW -co BIGTIFF=YES \
         --config CHECK_DISK_FREE_SPACE FALSE'
        !{warp_cmd}

    else: 
        print(f'saving as {lay_name}_cog.tif') 
        translate_cmd = f'gdal_translate output.vrt {lay_name}_cog.tif \
        -co TILED=YES -co COMPRESS=LZW \
        -co BIGTIFF=YES \
         -co NUM_THREADS=5 -co TILED=YES --config CHECK_DISK_FREE_SPACE FALSE'
        !{translate_cmd}
        
    #rebuild pyramids
    
    if (raster_info["dtype"] != 'uint8'): 
            print(f'rebuilding overviews using average sampling')
            resampling = 'average'
            #!{'gdaladdo -r average tmp.tif'}
    else: 
            print(f'rebuilding overviews using nearest neighbors sampling') 
            resampling = 'nearest' 
            #!{'gdaladdo -r nearest tmp.tif'}
        
    print('translating tmp.tif to cog raster') 
    
    translate_cmd = f"rio cogeo create tmp.tif rio_{lay_name}_cog.tif \
    --allow-intermediate-compression \
    --cog-profile lzw \
    --web-optimized \
    --config CHECK_DISK_FREE_SPACE=FALSE \
    --overview-resampling={resampling} \
    --resampling={resampling}"

    # translate_cmd = f'gdal_translate tmp.tif {lay_name}_cog.tif \
    # -co TILED=YES -co COMPRESS=LZW -co COPY_SRC_OVERVIEWS=YES \
    # -co BIGTIFF=YES -stats\
    #  -co NUM_THREADS=5  --config CHECK_DISK_FREE_SPACE FALSE'
    # 
    !{translate_cmd}

           



In [24]:

resampling = 'average' 

translate_cmd = f"rio cogeo create {lay_name}_cog.tif rio_{lay_name}_cog.tif \
    --allow-intermediate-compression \
    --cog-profile lzw \
    --config CHECK_DISK_FREE_SPACE=FALSE \
    --overview-resampling={resampling}"

In [25]:
translate_cmd

'rio cogeo create Imperviousness_cog.tif rio_Imperviousness_cog.tif     --allow-intermediate-compression     --cog-profile lzw     --config CHECK_DISK_FREE_SPACE=FALSE     --overview-resampling=average'

## Get list of objects in data bucket

In [11]:
BUCKET_NAME = 'swhm-image-exports'
blobsout = list_blobs_with_prefix(BUCKET_NAME,'')

In [12]:
df = pd.DataFrame(blobsout, columns=['file_path'])#.iloc[1:]
#df['folder_name'] = df['file_path'].str.split(BUCKET_NAME, 1,expand = True)
df['gdal_path'] = df['file_path'].str.replace('gs://', '/vsigs/') 


In [13]:
lay_names= df['file_path'].str.split('/', 0).str[0]#.str.replace('/','',regex=False)
df['layer_name'] = lay_names.str.split('/',1).str[0]
#lay_names

  lay_names= df['file_path'].str.split('/', 0).str[0]#.str.replace('/','',regex=False)
  df['layer_name'] = lay_names.str.split('/',1).str[0]


In [14]:
lay_names = df["layer_name"].unique()
print(lay_names)



['Imperviousness' 'Land_Cover']


In [18]:
#function to check if the file is a Cloud Optimized GeoTIFF
def is_cog(file_path):
    command = ["rio", "cogeo", "validate", "--strict",file_path]
    result = subprocess.run(command, capture_output=True, text=True)
    return result.stderr

## Loop through layer names

Use this to run the pipeline for all layers in a list

In [19]:
def run_pipeline(lay_name): 
    #remove old files.tif 
    files_to_check = ["tmp.tif", "output.vrt", lay_name+"_cog.tiff"]

    for file in files_to_check:
        if os.path.exists(file):
            os.remove(file)
            print(f"{file} has been deleted.")

    #download layer 
    dl(lay_name)
    makevrt(lay_name)
    #convert layer 
    convert_layer(lay_name) 
    fn = f'{lay_name}_cog.tif'
    #check converted layer 
    cog_check = is_cog(fn)
    if(cog_check is None):
        print('✅ Valid COG! Beginning upload...')
        #upload layer 
        ul(fn,lay_name)

    else: 
        print('❌', cog_check)

In [23]:
# # rio_translate_cmd = f"rio cogeo create {lay_name}_cog.tif rio_{lay_name}_cog.tif \
#  --allow-intermediate-compression --cog-profile lzw --web-optimized --config CHECK_DISK_FREE_SPACE=FALSE"
# !{rio_translate_cmd}

In [27]:
run_pipeline("Imperviousness")

output.vrt has been deleted.
If you experience problems with multiprocessing on MacOS, they might be related to https://bugs.python.org/issue33725. You can disable multiprocessing by editing your .boto config or by adding the following flag to your command: `-o "GSUtil:parallel_process_count=1"`. Note that multithreading is still available even if you disable multiprocessing.

Copying gs://swhm-image-exports/Imperviousness/Imperviousness0000000000-0000000000.tif...
Copying gs://swhm-image-exports/Imperviousness/Imperviousness0000000000-0000046592.tif...
Copying gs://swhm-image-exports/Imperviousness/Imperviousness0000000000-0000069888.tif...
Copying gs://swhm-image-exports/Imperviousness/Imperviousness0000000000-0000023296.tif...
Copying gs://swhm-image-exports/Imperviousness/Imperviousness0000000000-0000093184.tif...
Copying gs://swhm-image-exports/Imperviousness/Imperviousness0000000000-0000139776.tif...
Copying gs://swhm-image-exports/Imperviousness/Imperviousness0000000000-00002096

In [28]:
rio_info_cmd = f"rio cogeo info rio_{lay_name}_cog.tif"
!{rio_info_cmd}

[1mDriver:[0m GTiff
[1mFile:[0m /Users/christiannilsen/Documents/repos/data_pipelines/ipynb/rio_Imperviousness_cog.tif
[1mCOG:[0m True
[1mCompression:[0m LZW
[1mColorSpace:[0m None

[1mProfile[0m
    [1mWidth:[0m            396288
    [1mHeight:[0m           378880
    [1mBands:[0m            1
    [1mTiled:[0m            True
    [1mDtype:[0m            float64
    [1mNoData:[0m           None
    [1mAlpha Band:[0m       False
    [1mInternal Mask:[0m    False
    [1mInterleave:[0m       BAND
    [1mColorMap:[0m         False
    [1mColorInterp:[0m      ('gray',)
    [1mScales:[0m           (1.0,)
    [1mOffsets:[0m          (0.0,)

[1mGeo[0m
    [1mCrs:[0m              EPSG:3857
    [1mOrigin:[0m           (-13904201.1931867, 6328374.445786312)
    [1mResolution:[0m       (1.194328566955879, -1.194328566955879)
    [1mBoundingBox:[0m      (-13904201.1931867, 5875867.238338069, -13430903.11404489, 6328374.445786312)
    [1mMinZoom:[0m 

In [None]:
# #check projection 
# p = subprocess.run(["rio", "info", "output.vrt"], capture_output=True, text=True)
# raster_info = json.loads(p.stdout)
# raster_info

In [None]:
# warp_cmd = 'gdalwarp -t_srs EPSG:3857 -overwrite output.vrt tmp.tif \
# -co TILED=YES -co COMPRESS=LZW -co BIGTIFF=YES \
# -ot Float32 \
# --config CHECK_DISK_FREE_SPACE FALSE'
# !{warp_cmd}

In [1]:
# #file_name = f'rio_{lay_name}.tif'
lay_name = "Imperviousness"
file_name = f'rio_{lay_name}_cog.tif'
cmd = f'gsutil cp {file_name} gs://live_data_layers/rasters/{lay_name}.tif'
print(cmd)
#!{cmd}

gsutil cp rio_Imperviousness_cog.tif gs://live_data_layers/rasters/Imperviousness.tif


In [46]:

ee_cmd = f"earthengine upload image --asset_id=projects/ee-swhm/assets/production_layers/{lay_name} \
gs://live_data_layers/rasters/{lay_name}.tif" 

!{ee_cmd}


Started upload task with ID: YLI34QN4DBMD4V5UNJENFJJ6
