# Generate Cloud-Optimized Geotiffs from tiles


## Load Libraries

Check that gdal is installed

In [1]:
!gdalinfo --version

GDAL 3.6.4, released 2023/04/17


In [15]:
#from osgeo import gdal
import subprocess
import json 
#import pandas as pd
#from google.cloud import storage
import os
import glob


In [3]:
os.environ['GS_NO_SIGN_REQUEST'] = 'YES'
os.environ['GDAL_NUM_THREADS'] = '5'

Log in to google cloud if needed 

In [4]:
#!{gcloud auth login --update-adc}

In [5]:
!{gcloud config set project swhm-dev}

Updated property [core/project].


Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update



To take a quick anonymous survey, run:
  $ gcloud survey



## Functions

A function to list blobs on the storage bucket 

In [6]:
def list_blobs_with_prefix(bucket_name, prefix, delimiter=None):
    """Lists all the blobs in the bucket that begin with the prefix.
    """
    storage_client = storage.Client()
   
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

    # Note: The call returns a response only when the iterator is consumed.
    blob_list = []
    for blob in blobs:
        blob_list.append(blob.name)

    if delimiter:
        print("Prefixes:")
        for prefix in blobs.prefixes:
            blob_list.append([prefix])
    
    return blob_list


### 1. Download images

In [7]:
def dl(lay_name):
    cmd = f'gcloud storage cp -R gs://swhm-image-exports/{lay_name} .'
    !{cmd}

### 2. Reproject images

saves reprojected images to /tmp 

In [6]:
def reproject(lay_name, target_crs='EPSG:3857'): 

    directory_path = lay_name
               #make a list of the files in the directory 
    files = os.listdir(directory_path+"/reprojected")
    print(files)
    # create a new file for writing
    list_file = "files.txt"
    try:
        os.remove(list_file)
    except OSError:
        pass
    
    for filename in files:
        if filename.endswith(".tif"):
            input_path = os.path.join(directory_path, filename)
            output_path = os.path.join(directory_path+"/reprojected", filename)
            cmd = f'gdalwarp -t_srs {target_crs} -overwrite {input_path} {output_path}'
            !{cmd}

### 3. Make vrt



In [7]:
#https://gdal.org/programs/gdal_translate.html
#cmdoption-gdal_translate-ovr

def makevrt(lay_name):
    directory_path = lay_name+'/*.tif'
    print('Making VRT...')
    cmd = f'gdalbuildvrt  output.vrt {directory_path}'
    !{cmd}
    print('VRT Complete!')
    

def ul(file_name, lay_name):
    print('Uploading Layer...')
    cmd = f'gcloud storage cp {file_name} gs://live_data_layers/rasters/{lay_name}.tif'
    !{cmd}
    print('Layer upload complete!') 

---

## 4. Wrapper Function

In [8]:
def convert_layer(lay_name): 
    '''
    Function that does the following: 
    1. Downloads from gcp bucket 
    2. makes a virtual raster - saves to output.vrt 
    3. Checks the projection and reprojects if necessary - saves to tmp.tif, otherwise translates to tmp.tif 
    ''' 
    

    
    #check projection 
    p = subprocess.run(["rio", "info", "output.vrt"], capture_output=True, text=True)
    raster_info = json.loads(p.stdout)

#reproject if needed 
    if (raster_info['crs']) != "EPSG:3857": 
        #reproject 
        print(f'reprojecting from {raster_info["crs"]}')
        warp_cmd = f'gdalwarp -t_srs EPSG:3857 -overwrite output.vrt {lay_name}_cog.tif \
         -co NUM_THREADS=5 -co TILED=YES -co COMPRESS=LZW -co BIGTIFF=YES \
         --config CHECK_DISK_FREE_SPACE FALSE'
        !{warp_cmd}

    else: 
        print(f'saving as {lay_name}_cog.tif') 
        translate_cmd = f'gdal_translate output.vrt {lay_name}_cog.tif \
        -co TILED=YES -co COMPRESS=LZW \
        -co BIGTIFF=YES \
         -co NUM_THREADS=5 -co TILED=YES --config CHECK_DISK_FREE_SPACE FALSE'
        !{translate_cmd}
        
    #rebuild pyramids
    
    if (raster_info["dtype"] != 'uint8'): 
            print(f'rebuilding overviews using average sampling')
            resampling = 'average'
            #!{'gdaladdo -r average tmp.tif'}
    else: 
            print(f'rebuilding overviews using nearest neighbors sampling') 
            resampling = 'nearest' 
            #!{'gdaladdo -r nearest tmp.tif'}
        
    print('translating to rio cog raster') 
    
    translate_cmd = f"rio cogeo create {lay_name}_cog.tif rio_{lay_name}_cog.tif \
    --allow-intermediate-compression \
    --cog-profile lzw \
    --web-optimized \
    --config CHECK_DISK_FREE_SPACE=FALSE \
    --overview-resampling={resampling} \
    --resampling={resampling}"

    # translate_cmd = f'gdal_translate tmp.tif {lay_name}_cog.tif \
    # -co TILED=YES -co COMPRESS=LZW -co COPY_SRC_OVERVIEWS=YES \
    # -co BIGTIFF=YES -stats\
    #  -co NUM_THREADS=5  --config CHECK_DISK_FREE_SPACE FALSE'
    # 
    !{translate_cmd}

           



In [12]:
# lay_name = 'Soils'
# resampling = 'nearest' 

# translate_cmd = f"rio cogeo create {lay_name}_cog.tif rio_{lay_name}_cog.tif \
#     --allow-intermediate-compression \
#     --cog-profile lzw \
#     --config CHECK_DISK_FREE_SPACE=FALSE \
#     --overview-resampling={resampling}"

In [13]:
# !{translate_cmd}

## Get list of objects in data bucket

In [23]:
BUCKET_NAME = 'swhm-image-exports'
blobsout = list_blobs_with_prefix(BUCKET_NAME,'')

In [24]:
df = pd.DataFrame(blobsout, columns=['file_path'])#.iloc[1:]
#df['folder_name'] = df['file_path'].str.split(BUCKET_NAME, 1,expand = True)
df['gdal_path'] = df['file_path'].str.replace('gs://', '/vsigs/') 


In [25]:
lay_names= df['file_path'].str.split('/', 0).str[0]#.str.replace('/','',regex=False)
df['layer_name'] = lay_names.str.split('/',1).str[0]
#lay_names

  lay_names= df['file_path'].str.split('/', 0).str[0]#.str.replace('/','',regex=False)
  df['layer_name'] = lay_names.str.split('/',1).str[0]


In [26]:
lay_names = df["layer_name"].unique()
print(lay_names)



['copper_concentration_ug_per_L' 'p_concentration_ug_per_L'
 'tkn_concentration_ug_per_L' 'tss_concentration_ug_per_L'
 'zinc_concentration_ug_per_L']


In [9]:
#function to check if the file is a Cloud Optimized GeoTIFF
def is_cog(file_path):
    command = ["rio", "cogeo", "validate", "--strict",file_path]
    result = subprocess.run(command, capture_output=True, text=True)
    return result.stderr

## Loop through layer names

Use this to run the pipeline for all layers in a list

In [10]:
def run_pipeline(lay_name): 
    #remove old files.tif 
    files_to_check = ["tmp.tif", "output.vrt", lay_name+"_cog.tiff"]

    for file in files_to_check:
        if os.path.exists(file):
            os.remove(file)
            print(f"{file} has been deleted.")

    #download layer 
    dl(lay_name)
    makevrt(lay_name)
    #convert layer 
    convert_layer(lay_name) 
    fn = f'rio_{lay_name}_cog.tif'
    #check converted layer 
    cog_check = is_cog(fn)
    if(cog_check is None):
        print('✅ Valid COG! Beginning upload...')
        #upload layer 
        ul(fn,lay_name)

    else: 
        print('❌', cog_check)

In [19]:
# # rio_translate_cmd = f"rio cogeo create {lay_name}_cog.tif rio_{lay_name}_cog.tif \
#  --allow-intermediate-compression --cog-profile lzw --web-optimized --config CHECK_DISK_FREE_SPACE=FALSE"
# !{rio_translate_cmd}

# Scratch Pad below

In [17]:
lay_name = "imperviousness"
makevrt(lay_name)
#fn = f'rio_{lay_name}_cog.tif'
#dl(lay_name)
#ul(fn,lay_name)
#makevrt(lay_name)
    #convert alayer 
#convert_layer(lay_name) 

Making VRT...
0...10...20...30...40...50...60...70...80...90...100 - done.
VRT Complete!


In [18]:
convert_layer(lay_name) 

saving as imperviousness_cog.tif
Input file size is 72206, 63597
0...10...20...30...40...50...60...70...80...90...100 - done.
rebuilding overviews using average sampling
translating to rio cog raster
Reading input: /Users/christiannilsen/Documents/repos/data_pipelines/ipynb/imperviousness_cog.tif
[?25l  [####################################]  100%          [?25h
Adding overviews...
Updating dataset tags...
Writing output to: /Users/christiannilsen/Documents/repos/data_pipelines/ipynb/rio_imperviousness_cog.tif


In [27]:
cmd = f"gdaladdo -r average {lay_name}_cog.tif"
!{cmd}

0...10...20...30...40...50...60...70...80...90...100 - done.


In [25]:
rio_info_cmd = f"rio cogeo info {lay_name}_cog.tif"
!{rio_info_cmd}

[1mDriver:[0m GTiff
[1mFile:[0m /Users/christiannilsen/Documents/repos/data_pipelines/ipynb/imperviousness_cog.tif
[1mCOG:[0m False
[1mCompression:[0m LZW
[1mColorSpace:[0m None

[1mProfile[0m
    [1mWidth:[0m            72206
    [1mHeight:[0m           63597
    [1mBands:[0m            1
    [1mTiled:[0m            True
    [1mDtype:[0m            float64
    [1mNoData:[0m           None
    [1mAlpha Band:[0m       False
    [1mInternal Mask:[0m    False
    [1mInterleave:[0m       BAND
    [1mColorMap:[0m         False
    [1mColorInterp:[0m      ('gray',)
    [1mScales:[0m           (1.0,)
    [1mOffsets:[0m          (0.0,)

[1mGeo[0m
    [1mCrs:[0m              EPSG:3857
    [1mOrigin:[0m           (-14120950.1931867, 6440286.445786312)
    [1mResolution:[0m       (12.500000000000002, -12.500000000000002)
    [1mBoundingBox:[0m      (-14120950.1931867, 5645323.945786312, -13218375.1931867, 6440286.445786312)
    [1mMinZoom:[0m     

In [7]:
file_name = f'rio_{lay_name}.tif'

file_name = f'rio_{lay_name}_cog.tif'
cmd = f'gcloud storage cp {file_name} gs://live_data_layers/rasters/{lay_name}.tif'
print(cmd)
!{cmd}

gcloud storage cp rio_Land_Cover_cog.tif gs://live_data_layers/rasters/Land_Cover.tif
Copying file://rio_Land_Cover_cog.tif to gs://live_data_layers/rasters/Land_Cover.tif
uploading large objects. If you would like to opt-out and instead
perform a normal upload, run: gcloud config set
storage/parallel_composite_upload_enabled False If you would like to
storage/parallel_composite_upload_enabled True Note that with parallel
composite upload, your object might be uploaded as composite objects
(https://cloud.google.com/storage/docs/composite-objects) which means
that any user who downloads such objects will need to have crc32c
library to compute checksum for data integrity checking. This library
is already present with Cloud SDK, so downloading these objects using
gcloud storage should not be an issue.

⠧ Completed files 0/1 | 0B/2.4GiB                                              Resuming upload for gcloud/tmp/parallel_composite_uploads/see_gcloud_storage_cp_help_for_details/2460348662_c1

In [32]:

layers = ['HSPF_Land_Cover_Type', 'Hydrologic_Response_Units', 'Imperviousness']

for lay_name in layers:
    ee_cmd = f"earthengine upload image --asset_id=projects/ee-swhm/assets/staging/{lay_name} \
    gs://live_data_layers/rasters/{lay_name}.tif" 
    !{ee_cmd}


Started upload task with ID: 6Y43ZQYA2DJLG6SRAKJIU4DW
Started upload task with ID: 7YNPEX5YFPXPPPK6WMD2KJC7
Started upload task with ID: BYQDZAGB3UTB7THMVQQCDEN3


In [33]:

ee_cmd = f"earthengine upload image --asset_id=projects/ee-swhm/assets/staging/{lay_name} \
gs://live_data_layers/rasters/{lay_name}.tif" 
!{ee_cmd}


Started upload task with ID: MBRFSBE2PYBGD5QGD5KVGKT7


Check pryamiding policy

In [3]:
ee_cmd = f"earthengine asset info projects/ee-swhm/assets/production_layers/{lay_name}"
!{ee_cmd}

{
  "bands": [
    {
      "dataType": {
        "precision": "INT",
        "range": {
          "max": 255
        }
      },
      "grid": {
        "affineTransform": {
          "scaleX": 1.194328566955879,
          "scaleY": -1.194328566955879,
          "translateX": -13904201.1931867,
          "translateY": 6328374.445786312
        },
        "crsCode": "EPSG:3857",
        "dimensions": {
          "height": 378880,
          "width": 396288
        }
      },
      "id": "b1",
      "pyramidingPolicy": "MEAN"
    }
  ],
  "geometry": {
    "coordinates": [
      [
        [
          -124.90357548563415,
          49.31438728345055
        ],
        [
          -124.90357466151073,
          46.592740774523946
        ],
        [
          -124.7457889554153,
          46.59283783034879
        ],
        [
          -124.58800331573232,
          46.59271796025322
        ],
        [
          -124.40531309087523,
          46.592836910698516
        ],
        [
     