# Add new labeled data 🛰️

**Description:** Standalone notebook for adding new training and evaluation data.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nasaharvest/openmapflow/blob/main/openmapflow/notebooks/new_data.ipynb)

# 1. Setup

If you don't already have one, obtain a GitHub Personal Access Token using the steps [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token). Save this token somewhere private.

In [None]:
try:
    from google.colab import files
    IN_COLAB = True
except:
    IN_COLAB = False
    
if IN_COLAB:
    from getpass import getpass
    github_url = input("Github HTTPS URL: ")
    email = input("Github email: ")
    username = input("Github username: ")
    token = getpass('Github Personal Access Token:')

    !git config --global user.email $username
    !git config --global user.name $email
    !git clone {github_url.replace("https://", f"https://{username}:{token}@")}

    !pip install openmapflow -q
else:
    !pip install google-auth -q
    print("Running notebook outside Google Colab. Assuming in local repository.")

In [None]:
from pathlib import Path
from ipywidgets import Box
from tqdm.notebook import tqdm
from openmapflow.constants import CONFIG_FILE
from openmapflow.utils import colab_gee_gcloud_login

import ipywidgets as widgets
import os

cwd = Path.cwd()
root = None
for p in [cwd, cwd.parent, cwd.parent.parent]:
    if (p / CONFIG_FILE).exists():
        root = p
        break
if root == None:
    root = input("Path to project_root: ")
%cd {root}

from openmapflow.config import PROJECT_ROOT, DataPaths, GCLOUD_PROJECT_ID

In [None]:
box_layout = widgets.Layout(flex_flow='column')

# Radio buttons selecting which of the two workflows this run follows.
options = ["Add new labels", "Check progress of previously uploaded labels"]
use = widgets.RadioButtons(
    options=options,
    style={'description_width': 'initial'},
    value=options[0],
    description='',
    disabled=False
)


def _clean_branch_name(line):
    """Return the bare branch name from one line of `git branch` output.

    Returns None for lines that do not name a branch: blank lines, symbolic
    refs such as "origin/HEAD -> origin/main", and detached-HEAD markers
    such as "(HEAD detached at ...)".
    """
    name = line.replace("*", "").strip()
    if name == "" or "->" in name or name.startswith("("):
        return None
    if name.startswith("origin/"):
        name = name[len("origin/"):]
    return name


# Collect local and remote branch names; dict.fromkeys removes duplicates
# (a branch present both locally and on the remote) while keeping order.
local_branches = os.popen('git branch').read().split("\n")
remote_branches = os.popen('git branch -r').read().split("\n")
cleaned_names = (_clean_branch_name(line) for line in local_branches + remote_branches)
branches_available = list(dict.fromkeys(name for name in cleaned_names if name))

new_branch = widgets.Text(description='Enter a new branch name',
                        style={'description_width': 'initial'})
existing_branch = widgets.Dropdown(options=branches_available, 
                              description="Branch with existing labels",
                              style={'description_width': 'initial'})
# Only the new-branch text box is shown initially ("Add new labels" is preselected).
existing_branch.layout.visibility = "hidden"


def change_visibility(event):
    """Show the input that matches the selected radio option.

    `event` is the change notification for the radio buttons' `index` trait;
    index 0 is "Add new labels", anything else is the progress check.
    """
    show_new = event["new"] == 0
    existing_branch.layout.visibility = "hidden" if show_new else "visible" 
    new_branch.layout.display = "block" if show_new else "none"


# Observe only the `index` trait so the handler receives the selected position directly.
use.observe(change_visibility, names="index")
Box(children=[use, new_branch, existing_branch], layout=box_layout)

In [None]:
checking_progress_only = new_branch.value == ""
if checking_progress_only:
    !git checkout {existing_branch.value}
    !git pull
else:
    !git checkout -b'{new_branch.value}'

# 2. Download latest data
Data is stored in remote storage (e.g. Google Drive), so authentication is necessary.

In [None]:
if IN_COLAB or not checking_progress_only:
    for p in tqdm([DataPaths.MODELS, DataPaths.PROCESSED_LABELS, DataPaths.COMPRESSED_FEATURES]):
        !dvc pull {p} -q

    !tar -xzf {DataPaths.COMPRESSED_FEATURES} -C data

# 3. Upload labels

In [None]:
if checking_progress_only:
    print("Checking progress only, skipping this cell.")
else:
    # Keep asking until the name is non-empty and does not point at an
    # existing, non-empty location under the raw-labels directory.
    prompt = "Dataset name (suggested format: <Country_Region_Year>): "
    while True:
        dataset_name = input(prompt).strip()
        if dataset_name == "":
            # An empty name would resolve to the raw-labels directory itself.
            prompt = "Dataset name cannot be empty, try again: "
            continue
        dataset_dir = PROJECT_ROOT / DataPaths.RAW_LABELS / dataset_name
        name_taken = dataset_dir.exists() and (
            not dataset_dir.is_dir() or any(dataset_dir.iterdir())
        )
        if name_taken:
            prompt = "Dataset name already exists, try a different name: "
            continue
        # parents=True: the raw-labels directory itself may not exist yet.
        dataset_dir.mkdir(parents=True, exist_ok=True)
        break

    print("--------------------------------------------------")
    print(f"Dataset: {dataset_name} directory created")
    print("---------------------------------------------------")
    
    if IN_COLAB:
        # Colab upload dialog; the keys of the result are the uploaded file names.
        uploaded = files.upload()

        # Move each uploaded file from the working directory into the dataset directory.
        for file_name in uploaded.keys():
            Path(file_name).rename(dataset_dir / file_name)
    else:
        print(f"Please add file(s) into {dataset_dir}")

# 4. Create features
<img src="https://storage.googleapis.com/harvest-public-assets/openmapflow/new_data.png"/>

In [None]:
if checking_progress_only:
    print("Checking progress only, skipping this cell.")
else:
    # Manual step: the user edits datasets.py, then confirms here.
    user_confirmation = input(
        "Open datasets.py and add a `LabeledDataset` object representing the labels just added.\n"
        "Added `LabeledDataset` y/[n]: "
    )
    # Anything other than "y" (ignoring case and surrounding whitespace) counts as "no".
    if user_confirmation.strip().lower() != "y":
        print("New features can only be created when a `LabeledDataset` object is added.")

In [None]:
from openmapflow.config import GCLOUD_PROJECT_ID

In [None]:
# TODO figure out public bucket permissions
if IN_COLAB:
    colab_gee_gcloud_login(GCLOUD_PROJECT_ID, google)
else:
    !earthengine authenticate

`openmapflow create-features` creates features from labels and earth observation data referenced in datasets.py.

It first checks whether the necessary earth observation data is already available in Cloud Storage, or whether an Earth Engine task is already active. Google Cloud and Earth Engine authentication is therefore needed.

In [None]:
# Invoke the `openmapflow create-features` CLI command.
!openmapflow create-features

In [None]:
# Print the contents of the file at DataPaths.DATASETS.
!cat {DataPaths.DATASETS}

In [None]:
# Show the uncommitted git changes to the file at DataPaths.DATASETS.
!git diff {DataPaths.DATASETS}

# 5. Pushing the new data to the repository

In [None]:
# Pushing to remote storage
for p in tqdm([DataPaths.RAW_LABELS, DataPaths.PROCESSED_LABELS, DataPaths.COMPRESSED_FEATURES]):
    !dvc commit {p} -f -q
!dvc push

In [None]:
# Pushing reference to github
commit_message = input("Commit message: ")
!git add .
!git commit -m '{commit_message}'
!git push 

Create a Pull Request so the data can be merged into the main branch.