Use Prochlorococcus example to figure out how to match data to a place (I will ultimately do BATS)\
Krista Longnecker, 15 June 2025
### Update 
Did not go too far down this path as it's clear I will need the HPC because this takes a bit of memory. Keep this notebook in case the notes I took here are useful.


In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys
sys.path.append("../config")
from config import API_KEY
import pycmap
from pycmap.viz import plot_timeseries
import os #sometimes useful

In [12]:
#first, need to figure out how to run .py scripts from jupyter notebook
#scripts are all in matchToBATS
%run collect.py


********************************
Downloading  ('tblSeaFlow_v1_5', ['cruise', 'abundance_prochloro', 'abundance_synecho', 'abundance_picoeuk'])  ...
********************************


********************************
Downloading  ('tblFlombaum', ['prochlorococcus_abundance_flombaum', 'synechococcus_abundance_flombaum'])  ...
********************************



In [13]:
%run colocalize.py

KeyboardInterrupt: 

In [7]:
#have an issue, see if I can work through the steps here to figure out why

In [8]:
#Stick some code below this spot as a holding zone, but put in a stop so the notebook doesn't try and run the bits as they may or may not work.

In [9]:
 raise SystemExit("Stop execution here")

SystemExit: Stop execution here


To exit: use 'exit', 'quit', or Ctrl-D.



In [None]:
import os, glob
import concurrent.futures
import pycmap
from config.config import API_KEY
from settings import DATA_DIR, COLOCALIZED_DIR
from common import halt, makedir, environmental_datasets
import pandas as pd
import datetime
from dateutil.parser import parse



def cyano_csv_files(cyanoDir):
    """
    Returns a sorted list of paths to the csv files that hold observations of cyanobacteria.

    cyanoDir is the directory to search (not recursive); a trailing path
    separator is optional.
    """
    # os.path.join adds the separator only when it is missing, so "./data" and
    # "./data/" both search inside the directory. Plain concatenation turned
    # "./data" into the pattern "./data*.csv", which looks next to the folder.
    pattern = os.path.join(cyanoDir, "*.csv")
    # glob makes no promise about ordering; sort so that positional indexing
    # into the result picks the same file on every run.
    return sorted(glob.glob(pattern))


def add_env_columns(df, envs):
    """
    Adds a new column to the dataframe for each environmental variable.

    Every name listed under the "variables" key of each entry in `envs` becomes
    a column filled with None, unless the dataframe already has a column with
    that name. The dataframe is modified in place and also returned.
    """
    for dataset in envs.values():
        for column in dataset.get("variables"):
            if column in df.columns:
                # never overwrite data that is already there
                continue
            df[column] = None
    return df
    

In [None]:
def add_env_temporal_coverage(api, envs):
    """
    Adds the temporal coverage of each environmental dataset to the envs dictionary.

    For every table name in `envs`, the minimum and maximum [time] values are
    queried through `api`. When the query returns at least one row, the values
    of its first row are stored under the "startTime" and "endTime" keys of
    that table's entry; entries whose query comes back empty are left as they
    were. The dictionary is updated in place and also returned.
    """
    for table in envs:
        coverage = api.query(f"SELECT MIN([time]) startTime, MAX([time]) endTime FROM {table}")
        if len(coverage) == 0:
            continue
        entry = envs[table]
        for key in ("startTime", "endTime"):
            entry[key] = coverage.loc[0, key]
    return envs

In [None]:
# Point at the local data folder. This rebinds the DATA_DIR name that was
# imported from settings in an earlier cell.
DATA_DIR = "./data/"           
# Collect the csv files found in that folder.
cyanoFiles = cyano_csv_files(DATA_DIR)

In [None]:
cyanoFiles

In [None]:
    # Set up what the colocalization steps below use: a pycmap API client built
    # with the configured token, the output folder (presumably created by
    # makedir -- confirm in common.py), and the environmental datasets dictionary.
    api = pycmap.API(token=API_KEY)
    makedir(COLOCALIZED_DIR)
    envs = environmental_datasets()        

In [None]:
envs

In [None]:
#this adds time information to an existing envs dictionary
envs = add_env_temporal_coverage(api, envs)

In [None]:
envs

In [None]:
cyanoFiles

In [None]:
# Work with a single observation file (the second entry in cyanoFiles).
cyanoFile = cyanoFiles[1]
df = pd.read_csv(cyanoFile)
# Add a column for every environmental variable that is not already present.
df = add_env_columns(df, envs)
# Split into a list of single-row dataframes, the input shape match() checks for.
# NOTE(review): df.loc[i] over range(len(df)) relies on the default 0..n-1
# index that read_csv produces here.
dfs = [df.loc[i].to_frame().T for i in range(len(df))]

In [None]:
# Start from an empty dataframe; presumably meant to collect the matched rows
# (nothing in the visible cells fills it yet).
colocalizedDF  = pd.DataFrame({})

In [None]:
def match(df, api, envs, cyanoFile, rowCount):
    """
    Takes a single-row dataframe containing cyano observations and colocalizes it with the
    environmental variables included in the `envs` argument. The tolerance parameters
    are also included in the `envs` argument.

    Parameters
    ----------
    df : single-row dataframe with "time", "lat" and "lon" columns; "depth" is
        optional and treated as 0 when the column is absent. Its index is reset
        in place and the matched values are written into it.
    api : object whose `query(sql)` method returns a dataframe.
    envs : dict keyed by table name. Each entry provides "variables",
        "tolerances" (time in days, lat, lon, depth), "hasDepth",
        "isClimatology" and, for non-climatology tables, "startTime"/"endTime".
    cyanoFile, rowCount : only used in the progress message.

    Returns
    -------
    The same dataframe, with each environmental variable set to the value found
    in the first row of the query result for its table.
    """
    import warnings

    def get_month(dt):
        return parse(dt).month

    def shift_dt(dt, delta):
        delta = float(delta)
        dt = parse(dt)
        dt += datetime.timedelta(days=delta)
        return dt.strftime("%Y-%m-%d %H:%M:%S")

    def in_time_window(sourceDT, targetMinDT, targetMaxDT):
        # drop the ".000Z" tail so the bounds parse like the observation time
        targetMinDT = targetMinDT.split(".000Z")[0]
        targetMaxDT = targetMaxDT.split(".000Z")[0]
        return not (
                    parse(sourceDT) < parse(targetMinDT) or
                    parse(sourceDT) > parse(targetMaxDT)
                    )

    def construc_query(table, env, t, lat, lon, depth):
        variables = env["variables"]
        timeTolerance = env["tolerances"][0]
        latTolerance = env["tolerances"][1]
        lonTolerance = env["tolerances"][2]
        depthTolerance = env["tolerances"][3]
        hasDepth = env["hasDepth"]
        isClimatology = env["isClimatology"]
        inTimeRange = True
        if not isClimatology:
            # add_env_temporal_coverage() stores the coverage only when its
            # query returned rows, so these keys can be missing. Indexing them
            # directly raised KeyError: 'startTime'.
            startTime = env.get("startTime")
            endTime = env.get("endTime")
            if startTime is None or endTime is None:
                warnings.warn(
                    f"No temporal coverage (startTime/endTime) recorded for {table}; "
                    "querying by observation time without the climatology fallback."
                )
            else:
                inTimeRange = in_time_window(t, startTime, endTime)
        selectClause = "SELECT " + ", ".join([f"AVG({v}) {v}" for v in variables]) + " FROM " + table
        # Climatologies, and observations outside the dataset's coverage, are
        # matched on calendar month instead of on a time window.
        if isClimatology or not inTimeRange:
            timeClause = f" WHERE [month]={get_month(t)} "
        else:
            timeClause = f" WHERE [time] BETWEEN '{shift_dt(t, -timeTolerance)}' AND '{shift_dt(t, timeTolerance)}' "
        latClause = f" AND lat BETWEEN {lat-latTolerance} AND {lat+latTolerance} "
        lonClause = f" AND lon BETWEEN {lon-lonTolerance} AND {lon+lonTolerance} "
        depthClause = ""
        if hasDepth:
            depthClause = f" AND depth BETWEEN {depth-depthTolerance} AND {depth+depthTolerance} "
        return selectClause + timeClause + latClause + lonClause + depthClause


    if len(df) != 1: halt(f"Invalid dataframe input.\nExpected a single row dataframe but received {len(df)} rows.")
    # remember the original row label for the progress message before resetting
    rowIndex = df.index.values[0]
    df.reset_index(drop=True, inplace=True)
    t = df.iloc[0]["time"]
    lat = df.iloc[0]["lat"]
    lon = df.iloc[0]["lon"]
    depth = 0
    if 'depth' in df.columns: depth = df.iloc[0]["depth"]
    for table, env in envs.items():
        print(f"{rowIndex} / {rowCount-1}\n\t{datetime.datetime.now()}: Colocalizing {table} with {cyanoFile} ...")
        query = construc_query(table, env, t, lat, lon, depth)
        matchedEnv = api.query(query)
        if len(matchedEnv)>0:
            for v in env["variables"]: df.at[0, v] = matchedEnv.iloc[0][v]
    return df

In [None]:

def get_month(dt):
    """
    Returns the month number of the date/time string `dt`.
    """
    parsed = parse(dt)
    return parsed.month

def shift_dt(dt, delta):
    """
    Shifts the date/time string `dt` by `delta` days (negative values move it
    back) and returns the result formatted as "YYYY-MM-DD HH:MM:SS".
    """
    days = float(delta)
    moment = parse(dt)
    shifted = moment + datetime.timedelta(days=days)
    return shifted.strftime("%Y-%m-%d %H:%M:%S")

def in_time_window(sourceDT, targetMinDT, targetMaxDT):
    """
    Returns True when `sourceDT` is neither earlier than `targetMinDT` nor
    later than `targetMaxDT` (both bounds inclusive).

    All three arguments are date/time strings. For the two bounds, anything
    from the first ".000Z" onward is dropped before parsing.
    """
    lower, _, _ = targetMinDT.partition(".000Z")
    upper, _, _ = targetMaxDT.partition(".000Z")
    moment = parse(sourceDT)
    return parse(lower) <= moment <= parse(upper)

In [None]:
def construc_query(table, env, t, lat, lon, depth):
    """
    Builds the SQL query that averages the variables of one environmental table
    around a single observation.

    Parameters
    ----------
    table : name of the table to query.
    env : dict with keys "variables", "tolerances" (time in days, lat, lon,
        depth), "hasDepth", "isClimatology" and, for non-climatology tables,
        "startTime"/"endTime".
    t : observation date/time string.
    lat, lon, depth : numeric position of the observation.

    Returns
    -------
    The query string. Rows are matched on a time window around `t`, or on the
    calendar month of `t` when the table is a climatology or `t` falls outside
    the table's recorded coverage.
    """
    import warnings

    variables = env["variables"]
    timeTolerance, latTolerance, lonTolerance, depthTolerance = env["tolerances"][:4]
    hasDepth = env["hasDepth"]
    isClimatology = env["isClimatology"]
    inTimeRange = True
    if not isClimatology:
        # add_env_temporal_coverage() stores the coverage only when its query
        # returned rows, so these keys can be missing. Indexing them directly
        # raised KeyError: 'startTime'.
        startTime = env.get("startTime")
        endTime = env.get("endTime")
        if startTime is None or endTime is None:
            warnings.warn(
                f"No temporal coverage (startTime/endTime) recorded for {table}; "
                "querying by observation time without the climatology fallback."
            )
        else:
            inTimeRange = in_time_window(t, startTime, endTime)
    selectClause = "SELECT " + ", ".join([f"AVG({v}) {v}" for v in variables]) + " FROM " + table
    # Climatologies, and observations outside the dataset's coverage, are
    # matched on calendar month instead of on a time window.
    if isClimatology or not inTimeRange:
        timeClause = f" WHERE [month]={get_month(t)} "
    else:
        timeClause = f" WHERE [time] BETWEEN '{shift_dt(t, -timeTolerance)}' AND '{shift_dt(t, timeTolerance)}' "
    latClause = f" AND lat BETWEEN {lat-latTolerance} AND {lat+latTolerance} "
    lonClause = f" AND lon BETWEEN {lon-lonTolerance} AND {lon+lonTolerance} "
    depthClause = ""
    if hasDepth:
        depthClause = f" AND depth BETWEEN {depth-depthTolerance} AND {depth+depthTolerance} "
    return selectClause + timeClause + latClause + lonClause + depthClause

In [None]:
envs.items()

In [None]:
env

In [None]:
env["startTime"]


In [None]:
pip install debugger

In [None]:
#this will make a file with everything: data/compiled/compiled.csv 
%run compiler.py