In [None]:
# BLANK

import pathlib
from datetime import datetime
import time

import torch
from torch import cuda

import glob
import json
import os
import shutil
import sys
import ast
import random
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import skimage.io as io
from shapely import Polygon
from matplotlib import pyplot as plt
from PIL import Image
from torch import cuda
import supervision as sv
from ultralytics import YOLO
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm, trange
from importlib import reload

# Clone georip repo here: https://www.github.com/joeletho/georip.git

# Cloned repo directory
sys.path.append("path/to/georip")

import georip


In [None]:
# BLANK

# Show the interpreter version string so the environment this notebook ran in is recorded.
print(sys.version)

In [None]:
# BLANK

# Re-import georip (importlib.reload) and print the module object it returns;
# useful after editing the cloned repo without restarting the kernel.
print(reload(georip))

In [None]:
# BLANK

# Pick the torch device: CUDA when torch reports it as available, CPU otherwise.
has_gpu = cuda.is_available()
device = torch.device('cuda') if has_gpu else torch.device('cpu')

print(device)
if has_gpu:
    # Name of the first CUDA device (index 0)
    print(cuda.get_device_name(0))


#### Example directory structure:
```
Root
  ├── GEORIP_YOLO
  │         ├── datasets
  │         └── models
  ├── NDVI
  ├── QGIS
  ├── Readme.txt
  ├── Shapefiles
  ...
```

In [None]:
# BLANK
# Registry of every path (and a few names) used by the notebook, keyed by label.
path_map = {
    # Change this to your project path
    'ROOT': Path("path/to/project/root"),
    'PROJECT_NAME': 'GEORIP_YOLO',
}

# Project directory and its dataset / model sub-directories
path_map['georip'] = path_map['ROOT'] / path_map['PROJECT_NAME']
path_map.update(
    GEORIP_DS=path_map['georip'] / 'datasets',
    GEORIP_MODELS=path_map['georip'] / 'models',
)

In [None]:
# BLANK
def make_directories(paths_map, verbose=True, exist_ok=False):
    """Resolve every ``Path`` in ``paths_map`` in place and create the directories.

    Entries that are not ``Path`` objects are left untouched. A ``Path`` that is
    an existing file, or whose final component has a suffix, is treated as a
    file: it is resolved but nothing is created for it (not even its parent).
    Every other ``Path`` is resolved and created with ``parents=True``.

    Args:
        paths_map: Mapping of label -> value; ``Path`` values are replaced by
            their resolved form.
        verbose: When true, print a header, each created directory and a footer.
        exist_ok: Forwarded to ``Path.mkdir``; when false an already existing
            directory raises ``FileExistsError``.
    """
    def _say(*parts):
        if verbose:
            print(*parts)

    _say("Creating directory structure")
    for key in list(paths_map):
        entry = paths_map[key]
        if not isinstance(entry, Path):
            continue
        # Decide file-vs-directory on the path as given, before resolving it.
        looks_like_file = entry.is_file() or entry.suffix != ""
        resolved = entry.resolve()
        paths_map[key] = resolved
        if looks_like_file:
            continue
        resolved.mkdir(parents=True, exist_ok=exist_ok)
        _say('  ', resolved)
    _say("Complete")

def make_project_paths(root,*, verbose=True, exist_ok=False):
    """Register the project-level paths in ``path_map`` and create the directories.

    Requires ``path_map['GEORIP_DS']`` to be set already. The paths are copied
    into the global ``path_map`` as built (before ``make_directories`` resolves
    its own copy), then ``make_directories`` creates every directory entry.

    Args:
        root: Project root containing the ``NDVI`` and ``Shapefiles`` folders.
        verbose: Forwarded to ``make_directories``.
        exist_ok: Forwarded to ``make_directories``.
    """
    shapefiles_dir = Path(root, 'Shapefiles')
    ds_meta_dir = path_map['GEORIP_DS'] / 'meta'

    paths = {
        'NDVI': Path(root, 'NDVI', 'NDVI Difference Rasters'),
        'SHAPE_FILES': shapefiles_dir,
        'GEORIP_DS_META': ds_meta_dir,
        'GEORIP_DS_CSV': ds_meta_dir / 'csv',
        'GEORIP_DS_SHP': ds_meta_dir / 'shp',
        # Data
        'PRED_SHP': shapefiles_dir / 'ModelPredictions',
        'SHPZ10_SHP': shapefiles_dir / 'Treatments_UTMz10_Only_08-18-24' / 'Treatments_UTMz10_Only_08-18-24_GEE.shp',
        'SHPZ10_SHP_BACKGROUND': shapefiles_dir / 'facts_calmapper_utmz10n/facts_calmapper_utmz10n.shp',
    }

    path_map.update(paths)
    make_directories(paths, verbose=verbose, exist_ok=exist_ok)

        
def make_dataset_paths(ds_root, models_root, model_name, *,verbose=True, exist_ok=False):
    """Register the per-dataset paths for ``model_name`` and create the directories.

    All entries are copied into the global ``path_map`` (overwriting any
    previous dataset's entries), the directory entries are created through
    ``make_directories``, and finally the prediction shapefile paths are
    derived from ``path_map['PRED_SHP']``, which must already be registered.

    Args:
        ds_root: Directory that holds one sub-directory per dataset.
        models_root: Directory that holds one sub-directory per model.
        model_name: Name used for both sub-directories and the prediction files.
        verbose: Forwarded to ``make_directories``.
        exist_ok: Forwarded to ``make_directories``.
    """
    ds_base, models_base = Path(ds_root), Path(models_root)

    dataset_dir = ds_base / model_name
    meta_dir = dataset_dir / 'meta'
    csv_dir = meta_dir / 'csv'
    images_dir = dataset_dir / 'images'
    labels_dir = dataset_dir / 'labels'

    # Insertion order is kept identical to the directory creation order.
    paths = {
        'MODEL_NAME': model_name,
        'GEORIP_MODEL': models_base / model_name,
        'GEORIP_DS_MODEL': dataset_dir,
        'GEORIP_DS_MODEL_META': meta_dir,
        'GEORIP_DS_MODEL_SHP': meta_dir / 'shp',
        'GEORIP_DS_MODEL_CSV': csv_dir,
        # Same location as GEORIP_DS_MODEL_META
        'GEORIP_DS_DATA': dataset_dir / 'meta',
        'GEORIP_DS_CONFIG_FILE': dataset_dir / 'config' / 'data.yaml',
        'GEORIP_DS_YOLO_DATA_FILE': dataset_dir / 'meta' / 'yolo_ndvi_ds.csv',
        # Images and labels
        'GEORIP_DS_IMAGES': images_dir,
        'GEORIP_DS_LABELS': labels_dir,
        'GEORIP_DS_LABELS_GENERATED': labels_dir / 'generated',
        'GEORIP_DS_tileS': images_dir / 'tiles',
        'GEORIP_DS_PNGS': images_dir / 'png',
        'GEORIP_DS_TIFS': images_dir / 'tif',
    }

    # train / test / val folders for images first, then for labels
    for kind, parent in (('IMAGES', images_dir), ('LABELS', labels_dir)):
        for subset in ('train', 'test', 'val'):
            paths[f'GEORIP_DS_{kind}_{subset.upper()}'] = parent / subset

    # Metadata: zone 10 CSV files
    zone10_csv_files = (
        ('CSVZ10', 'Treatments_UTMz10.csv'),
        ('CSVZ10_NORM', 'Treatments_UTMz10_normalized.csv'),
        ('CSVZ10_CLEANED', 'Treatments_UTMz10_normalized_cleaned.csv'),
        ('CSVZ10_TILED', 'Treatments_UTMz10_normalized_tiled.csv'),
        ('CSVZ10_TILED_LABELS_UTM', 'Treatments_z10utm_tiled_labels.csv'),
        ('CSVZ10_TILED_LABELS_PIXEL', 'Treatments_z10pixel_tiled_labels.csv'),
        ('CSVZ10_TILED_LABELS_PIXEL_ENCODED', 'Treatments_z10pixel_tiled_labels_encoded.csv'),
        ('CSVZ10_TILED_LABELS_PREYOLO', 'Treatments_z10pixel_tiles_labels_encoded_preyolo.csv'),
    )
    for key, filename in zone10_csv_files:
        paths[key] = csv_dir / filename

    path_map.update(paths)

    make_directories(paths, verbose=verbose, exist_ok=exist_ok)

    # Prediction shapefiles live in the project-level predictions folder.
    predictions_dir = path_map['PRED_SHP']
    path_map['SHPZ10_PRED_SHP'] = predictions_dir / f"Treatmentsz10_{model_name}.shp"
    path_map['SHPZ11_PRED_SHP'] = predictions_dir / f"Treatmentsz11_{model_name}.shp"

# Register the project-level paths under ROOT and create their directories.
# exist_ok=True keeps this cell re-runnable when the directories already exist.
make_project_paths(path_map['ROOT'], exist_ok=True)

In [None]:
# BLANK
# Class encoder function
def classify(row):
    """Return the ``(class_id, class_name)`` pair for one row.

    A row is ``(0, "Treatment")`` only when its ``'geometry'`` entry exists, is
    not empty and has an area greater than 1; every other row is
    ``(-1, "Background")``.

    Args:
        row: Mapping-like object supporting ``.get('geometry')``.
    """
    shape = row.get('geometry')
    if shape is None or shape.is_empty:
        return (-1, "Background")
    if shape.area > 1:
        return (0, "Treatment")
    return (-1, "Background")

In [None]:
# BLANK
import pandas as pd

def parse_dates(df, column_name):
    """Parse ``df[column_name]`` into datetimes, coercing bad values to ``NaT``.

    The column is first parsed with ``format='mixed'``. If that attempt raises,
    the failure is printed and the column is parsed again with pandas' default
    format handling, so the caller always receives a datetime Series instead of
    an implicit ``None`` (which previously surfaced later as an unrelated
    ``AttributeError`` on ``.dt``).

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to parse.

    Returns:
        The parsed column; unparseable entries are ``NaT``.

    Raises:
        Exception: Whatever the fallback parse raises, e.g. ``KeyError`` when
            the column does not exist.
    """
    try:
        return pd.to_datetime(df[column_name], format='mixed', errors='coerce')
    except Exception as e:
        print(f"Mixed format failed for {column_name}: {e}")
    # Fallback: let pandas use its default format handling. Errors raised here
    # propagate so the real problem is not hidden.
    return pd.to_datetime(df[column_name], errors='coerce')

# Fix field name mismatch: when the background shapefile carries full dates
# ("StartDate"/"EndDate"), replace them with nullable-integer "StartYear"/"EndYear"
# columns and write the result back over the same shapefile.
background_df = georip.io.load_shapefile(path_map['SHPZ10_SHP_BACKGROUND'])
if background_df.get("StartDate") is not None:
    for bound in ("Start", "End"):
        parsed = parse_dates(background_df, f"{bound}Date")
        background_df[f"{bound}Year"] = parsed.dt.year.astype(pd.Int64Dtype())
    background_df = background_df.drop(columns=["StartDate", "EndDate"])
    georip.io.save_as_shp(background_df, path_map['SHPZ10_SHP_BACKGROUND'], exist_ok=True)


In [None]:
# BLANK
from itertools import product

from georip.modeling.utils import DatasetSplitMode

# Dataset configuration settings. Each list is a sweep axis: one dataset is
# generated per combination of values.
CHIP_SIZE = [320, 640]  # passed to create() as tile_size
YEARS = [2019, 2020, 2021, 2022, 2023]  # consecutive entries form (start, end) pairs
SPLIT = 0.70
SPLIT_MODE = [DatasetSplitMode.All, DatasetSplitMode.Collection]
BACKGROUND_RATIO = [1.0]
SHUFFLE_SPLIT = [False, True]
TREATMENTS = [0, 1, 2, 3, 4, 5, 6, 7]  # 0 selects every treatment except code 8

# Only reflected in the dataset name; it is not passed to create().
SHUFFLE_BACKGROUND = False

# Datasets with fewer images than this are discarded as not viable.
MIN_IMAGES = 40

# (Optional) Root directory the datasets will live under when they are used
# later on; it is written into each generated YAML config. Defaults to the
# directory the datasets are generated in.
DATASET_TARGET_ROOT = path_map['GEORIP_DS']

# False: build only the first TREATMENTS entry (0 == all treatments).
# True:  build one dataset per TREATMENTS entry.
single_treatment = False

# Error list
errors = []

# Source rasters. When the directory is a 'vv'/'vh' sub-folder, the label used
# in the progress message comes from its parent directory instead.
ndvi_path = path_map['NDVI']
ndvi_label_dir = ndvi_path.parent if ndvi_path.name in ('vv', 'vh') else ndvi_path
ndvi_name = ndvi_label_dir.name.split()[0]

# Sweep axes, in the same order as the original nested loops
treatments = TREATMENTS if single_treatment else TREATMENTS[:1]
year_pairs = list(zip(YEARS[:-1], YEARS[1:]))
combos = list(
    product(CHIP_SIZE, treatments, year_pairs, SPLIT_MODE, SHUFFLE_SPLIT, BACKGROUND_RATIO)
)

# Configure TQDM progress bar: exactly one tick per dataset attempted
total_updates = len(combos)
root_pbar = trange(total_updates)
updates = 0

# Load the master shapefile once; every dataset works on its own filtered copy.
shpz10_master = georip.io.load_shapefile(path_map['SHPZ10_SHP'])

for size, treatment, years, mode, shuffle_split, background_ratio in combos:
    # A missing start year disables the year filter
    years = None if years is None or years[0] is None else years
    years_str = str(years) if years is None else f"{years[0]}to{years[1]}"
    treatment_str = 'all' if treatment == 0 else str(treatment)

    # Update pbar message
    model_info = (
        f"DS: {ndvi_name}, T: {treatment_str}, Y: {years_str}, SM: {mode}, "
        f"S: {SPLIT}, SS: {shuffle_split}, B: {background_ratio}"
    )
    root_pbar.set_description(f"Creating dataset {updates+1}: {model_info}")

    # Create the model name and create its repository.
    # NOTE(review): if str(mode) contains a '.', make_directories treats the
    # model directories as files and relies on their children to create them
    # - confirm against DatasetSplitMode.
    path_map['MODEL_NAME'] = (
        f"yolo_treatments={treatment_str}"
        f"_years={years_str}"
        f"_imgsz={size if size is not None else 'Default'}"
        f"_split={int(SPLIT*100)}"
        f"_mode={mode}"
        f"_shuffle-split={shuffle_split}"
        f"_bg={str(background_ratio).replace('.', '_')}"
        f"{'_shuffle-bg=True' if SHUFFLE_BACKGROUND else ''}"
    )
    make_dataset_paths(
        path_map['GEORIP_DS'],
        path_map['GEORIP_MODELS'],
        path_map['MODEL_NAME'],
        verbose=False,
        exist_ok=True
    )

    # Select the rows for this dataset
    if treatment == 0:
        # All treatments (code 8 is excluded)
        selected = shpz10_master['TreatmentT'] != 8
    else:
        # Individual treatment
        selected = shpz10_master['TreatmentT'] == treatment
    # Copy so the rename below never writes into a view of the master frame
    shpz10 = shpz10_master[selected].copy()

    # Rename these rows to align filenames that are parsed later
    shpz10.loc[shpz10['Subregion'] == "Humboldt", "Subregion"] = 'Humboldt4'
    if years is not None:
        shpz10 = shpz10[(shpz10['StartYear'] == years[0]) & (shpz10['EndYear'] == years[1])]

    # Declare the path of the base files used for this dataset
    BASE_FILEPATH = Path(f'base_years={years_str}', 'Treatments_UTMz10_Only_08-18-24')
    base_csv = path_map['GEORIP_DS_CSV'] / BASE_FILEPATH.with_suffix('.csv')
    base_shp = path_map['GEORIP_DS_SHP'] / BASE_FILEPATH.with_suffix('.shp')

    # Save the files
    georip.io.save_as_csv(shpz10, base_csv, exist_ok=True)
    georip.io.save_as_shp(shpz10, base_shp, exist_ok=True)

    try:
        # Make the dataset using the base shapefile
        yolo_ds = georip.datasets.YOLONDVIDifferenceDataset.create(
            source = base_shp,
            source_images_dir = ndvi_path,
            output_dir = path_map['GEORIP_DS_MODEL'],
            region_column = ['Subregion', 'gee_region'],
            year_start_column = "StartYear",
            year_end_column = "EndYear",
            geometry_column = "geometry",
            years = years,
            background = False,
            background_ratio = background_ratio,
            split=mode,
            split_ratio= SPLIT,  # 0.7 (70/30)
            shuffle_split = shuffle_split,  # True/False
            generate_labels = True,
            generate_train_data = True,  # True/False
            tile_size=size,
            translate_xy = True,  # True/False
            class_encoder= classify,  # None or callback(row)
            exist_ok = True,  # True/False
            clear_output_dir = True,  # True/False
            save_shp = True,  # True/False
            save_gpkg = True,  # True/False
            save_csv = True,  # True/False
            pbar_leave = True,  # True/False
            convert_to_png = True,
            use_segments = True,
            num_workers = 8,
            preserve_fields = ["TreatmentT"]
        )
        if len(yolo_ds.images) < MIN_IMAGES:
            # If the size is too small the dataset encounters issues so we limit it to a
            # size that may provide a decent number of images for training
            raise ValueError("Too few images to be viable dataset")

        # Write the YAML config, rooted at the directory where the dataset will be used
        yolo_ds.generate_yaml_file(
            root_abs_path=Path(DATASET_TARGET_ROOT, path_map['MODEL_NAME']),
            dest_abs_path=path_map['GEORIP_DS_MODEL'] / 'config',
        )
    except Exception as e:
        # Record the error message and remove the created files. ignore_errors
        # keeps a partially created (or already removed) directory from
        # aborting the whole sweep.
        errors.append(f"{path_map['MODEL_NAME']}: {e}")
        shutil.rmtree(path_map['GEORIP_DS_MODEL'], ignore_errors=True)

    # Update the pbar and the update counter
    root_pbar.update()
    updates += 1

root_pbar.set_description(f"Dataset completed with {len(errors)} errors.")
root_pbar.refresh()
root_pbar.close()

if len(errors) > 0:
    print("The following errors occurred:\n", "\n".join(errors), file=sys.stderr)


In [None]:
# BLANK
# Summarize the dataset currently bound to `yolo_ds`, i.e. the last one the
# generation loop created successfully.
# NOTE(review): `yolo_ds` is undefined if no create() call succeeded.
yolo_ds.summary()