# Monitor download state and task progress
Run this notebook to visualise download state and monitor download tasks in progress on Google Earth Engine.

In [1]:
# Necessary imports
import json
import os
import time

# Set before geopandas is imported so that the setting is in place at import time
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import pandas as pd
from dotenv import load_dotenv
from tqdm.auto import tqdm

from db_utils import DB

## Load environment and project details

As with the other notebooks, we load credentials and project details from a hidden ```.env``` file.

In [2]:
# Load environment variables (including path to credentials) from '.env' file
env_file_path = "../.env"

# Fail early, with a targeted message, if any part of the environment is missing.
assert load_dotenv(dotenv_path=env_file_path), "[ERR] Failed to load environment!"

# Google credentials: the variable must be set and must point to an existing key file.
assert "GOOGLE_APPLICATION_CREDENTIALS" in os.environ, "[ERR] Missing $GOOGLE_APPLICATION_CREDENTIALS!"
assert "GS_USER_PROJECT" in os.environ, "[ERR] Missing $GS_USER_PROJECT!"
key_file_path = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
assert os.path.exists(key_file_path), f"[ERR] Google credential key file does not exist: \n{key_file_path} "

# Base directory of the project: the variable must be set and the path must exist.
assert "ML4FLOODS_BASE_DIR" in os.environ, "[ERR] Missing $ML4FLOODS_BASE_DIR!"
base_path = os.environ["ML4FLOODS_BASE_DIR"]
assert os.path.exists(base_path), f"[ERR] Base path does not exist: \n{base_path} "
print("[INFO] Successfully loaded FloodMapper environment.")

[INFO] Successfully loaded FloodMapper environment.


## Query the download state from the database

In [3]:
# All work is conducted under a unique session name.
# This value is passed as the parameter of the session filter
# ("WHERE sp.session = %s") in the patch query below.
session_name = "EMSR586"

In [4]:
# Connect to the database (point to the .env file for credentials).
# The resulting 'db_conn' object is what the query cells below call
# run_query() on.
db_conn = DB(env_file_path)

[INFO] Connecting to DB 'floodmapper-db'.
[INFO] Connection successfully established.


In [9]:
# Fetch every image belonging to the session, with its download status and
# the geometry of its grid patch serialised as WKT text.
query = (
    "SELECT DISTINCT im.image_id, im.satellite, "
    "im.patch_name, im.status, ST_AsText(gr.geometry) "
    "FROM image_downloads im "
    "INNER JOIN grid_loc gr "
    "ON im.patch_name = gr.patch_name "
    "INNER JOIN session_patches sp "
    "ON im.patch_name = sp.patch_name "
    "WHERE sp.session = %s ;"
)
data = (session_name,)
grid_df = db_conn.run_query(query, data, fetch=True)
print(f"[INFO] Returned {len(grid_df)} rows.")

# Swap the WKT text column for a parsed geometry column, then wrap the
# table in a GeoDataFrame using lon/lat coordinates (EPSG:4326).
grid_df['geometry'] = gpd.GeoSeries.from_wkt(grid_df.pop('st_astext'))
grid_gdf = gpd.GeoDataFrame(grid_df, geometry='geometry', crs="EPSG:4326")
grid_gdf

[INFO] Returned 522 rows.


Unnamed: 0,image_id,satellite,patch_name,status,geometry
0,GRID31658_Landsat_2022-06-15,Landsat,GRID31658,1,"POLYGON ((150.51000 -33.20516, 150.51000 -32.9..."
1,GRID31658_Landsat_2022-07-09,Landsat,GRID31658,1,"POLYGON ((150.51000 -33.20516, 150.51000 -32.9..."
2,GRID31658_Landsat_2022-07-17,Landsat,GRID31658,1,"POLYGON ((150.51000 -33.20516, 150.51000 -32.9..."
3,GRID31658_S2_2022-06-13,S2,GRID31658,1,"POLYGON ((150.51000 -33.20516, 150.51000 -32.9..."
4,GRID31658_S2_2022-06-18,S2,GRID31658,0,"POLYGON ((150.51000 -33.20516, 150.51000 -32.9..."
...,...,...,...,...,...
517,GRID33181_S2_2022-07-13,S2,GRID33181,0,"POLYGON ((152.31000 -32.80516, 152.31000 -32.5..."
518,GRID33181_S2_2022-07-15,S2,GRID33181,1,"POLYGON ((152.31000 -32.80516, 152.31000 -32.5..."
519,GRID33181_S2_2022-07-18,S2,GRID33181,0,"POLYGON ((152.31000 -32.80516, 152.31000 -32.5..."
520,GRID33181_S2_2022-07-20,S2,GRID33181,1,"POLYGON ((152.31000 -32.80516, 152.31000 -32.5..."


In [22]:
# Keep only the rows whose status is 1 (downloaded), then group them per patch
is_downloaded = grid_gdf["status"] == 1
grid_dl_gdf = grid_gdf.loc[is_downloaded]
patch_grp = grid_dl_gdf.groupby("patch_name")

In [31]:
# Print the downloaded images of each patch: one table per patch,
# each followed by an empty line.
for _patch_name, patch_rows in patch_grp:
    print(patch_rows, end="\n\n")

                       image_id satellite patch_name  status  \
0  GRID31658_Landsat_2022-06-15   Landsat  GRID31658       1   
1  GRID31658_Landsat_2022-07-09   Landsat  GRID31658       1   
2  GRID31658_Landsat_2022-07-17   Landsat  GRID31658       1   
3       GRID31658_S2_2022-06-13        S2  GRID31658       1   
5       GRID31658_S2_2022-07-03        S2  GRID31658       1   
6       GRID31658_S2_2022-07-08        S2  GRID31658       1   
7       GRID31658_S2_2022-07-13        S2  GRID31658       1   
8       GRID31658_S2_2022-07-18        S2  GRID31658       1   
9       GRID31658_S2_2022-07-23        S2  GRID31658       1   

                                            geometry  
0  POLYGON ((150.51000 -33.20516, 150.51000 -32.9...  
1  POLYGON ((150.51000 -33.20516, 150.51000 -32.9...  
2  POLYGON ((150.51000 -33.20516, 150.51000 -32.9...  
3  POLYGON ((150.51000 -33.20516, 150.51000 -32.9...  
5  POLYGON ((150.51000 -33.20516, 150.51000 -32.9...  
6  POLYGON ((150.51000 -33.20

In [None]:
# Path to the JSON file output by the download script
json_path = "../scripts/2023-03-16_11.50.44.json"
# Last expression of the cell: displays whether the file exists
os.path.exists(json_path)

## Display a progress bar

The cells here can be quickly run in sequence to produce progress bars for the tasks being tracked by the database. Note that the ```01_download_images.py``` script must remain running for this notebook to work. 

In [None]:
# Connect to the database, pointing to the same .env file for credentials
# as the connection made earlier in this notebook, so that both connections
# are configured identically.
db_conn = DB(env_file_path)

In [None]:
# Load the JSON; the context manager closes the file handle as soon as
# the content has been parsed.
with open(json_path, "r") as json_file:
    task_list = json.load(json_file)
n_tasks = len(task_list)
print(f"JSON currently contains {n_tasks} task entries.")

# Convert to a DataFrame
tasks_df = pd.DataFrame(task_list)
# The grid name is the part of the task description before the first "_"
tasks_df["gridname"] = tasks_df["description"].str.split("_").str[0]
tasks_df

In [None]:
# Look up the download status of every image referenced by a task.
# Task descriptions are used as image ids; duplicates are removed first.
image_ids = tuple(tasks_df['description'].unique())
query = (
    "SELECT image_id, status "
    "FROM image_downloads "
    "WHERE image_id IN %s;"
)
data = (image_ids,)
image_db = db_conn.run_query(query, data, fetch=True)
image_db

In [None]:
# Seconds to wait between two polls of the database (tune as needed).
POLL_INTERVAL_S = 5.0

# Status values that mean a download is no longer in progress.
# -1 marks a download that is still in progress; 0 and 1 are final.
DONE_STATUSES = (0, 1)

status_query = ("SELECT image_id, status "
                "FROM image_downloads "
                "WHERE image_id IN %s;")

# Track progress on a copy so that 'tasks_df' is left untouched and this
# cell can be re-run without reloading the JSON.
pending_df = tasks_df.copy()

# Initialise progress bar for all available tasks.
batch_bar = tqdm(total=len(pending_df),
                 dynamic_ncols=True,
                 leave=False,
                 position=0,
                 desc="All Tasks",
                 colour="GREEN")

# Logic: poll the database for the status of the tasks that are still
# pending and remove those whose download has finished. The status must
# be re-queried on every pass, otherwise in-progress tasks would never
# be seen to complete and the loop would not terminate.
try:
    while len(pending_df) >= 1:

        # Fetch the current status of the images still being tracked
        pending_ids = tuple(pending_df['description'].unique())
        status_df = db_conn.run_query(status_query, (pending_ids,), fetch=True)

        # Tasks whose image has reached a final status are done. Tasks
        # without a row in the query result simply stay pending.
        finished = status_df['status'].isin(DONE_STATUSES)
        done_ids = status_df.loc[finished, 'image_id']
        done_mask = pending_df['description'].isin(done_ids)
        n_done = int(done_mask.sum())

        # Drop finished tasks and advance the bar by the same amount
        if n_done > 0:
            pending_df = pending_df.loc[~done_mask]
            batch_bar.update(n_done)

        # Wait before polling again, but only if work remains
        if len(pending_df) >= 1:
            time.sleep(POLL_INTERVAL_S)
finally:
    # Release the bar even if the cell is interrupted
    batch_bar.close()