# Tropical Cyclones Data Download and Conversion

This script downloads the SMOS/SMAP/SAR data from the [Cyclobs API](https://cyclobs.ifremer.fr/app/docs/) and converts the resulting .nc files into .png files.

## Imports and configurations

In [None]:
# Set the working directory used by the rest of the notebook: the folders in
# `config`, "best_track" and "labels" are all given as relative paths.
# Insert your desired path to work on
import os
os.chdir('../data')

Import the required packages.

In [None]:
# General imports
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import netCDF4
from glob import glob
import cv2
#from google.colab.patches import cv2_imshow
from datetime import datetime
import re
import random

# import rasterio as rio
# import rioxarray # geospatial extension for xarray
# import bokeh.io
# import cartopy
# import cartopy.crs as ccrs
# import geoviews as gv
# import geoviews.feature as gf

# bokeh.io.output_notebook()
# gv.extension('bokeh','matplotlib')

Set folder structure.

In [None]:
# Names of the working folders (relative to the current working directory).
config = {
    # Folder holding the downloaded .nc products
    'in_nc': 'nc',
    # SAR features stacked along the 3rd axis of the output images; the folder
    # is named after their acronyms
    'out_feats': 'VV_VH_WS',
    # Folder holding the ARCHER products and the ARCHER/IFREMER link tables
    'out_archer': 'link_ARCHER_IFREMER'
}

# Create the folder structure. A plain loop is used because the calls are made
# only for their side effect, and exist_ok=True keeps the cell safe to re-run.
for out_dir in config.values():
    os.makedirs(out_dir, exist_ok=True)

## 1. Download the .nc products from the Cyclobs API
The "request_url" has to be changed according to the category, mission (Sentinel-1 A and B / SMOS / SMAP) and product type (swath) to be downloaded, as follows:
* **Category 1**: request_url="https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-1&cat_max=cat-2&mission=S1B,S1A&product_type=swath&include_cols=all"
* **Category 2**: request_url="https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-2&cat_max=cat-3&mission=S1B,S1A&product_type=swath&include_cols=all"
* **Category 3**: request_url="https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-3&cat_max=cat-4&mission=S1B,S1A&product_type=swath&include_cols=all"
* **Category 4**: request_url="https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-4&cat_max=cat-5&mission=S1B,S1A&product_type=swath&include_cols=all"
* **Category 5**: request_url="https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-5&mission=S1B,S1A&product_type=swath&include_cols=all"

Please refer to https://cyclobs.ifremer.fr/app/docs/getData.html for more details.

In [None]:
import subprocess

# Download path
download_path = f"{config['in_nc']}/category5"
os.makedirs(download_path, exist_ok=True)

# Make the request using the Cyclobs API and store the result in a pandas dataframe
#request_url="https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-4&cat_max=cat-5&mission=S1B,S1A&product_type=swath&include_cols=all"
request_url="https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-5&mission=S1B,S1A&product_type=swath&include_cols=all"
df_request = pd.read_csv(request_url)

# Add download path
df_request['path'] = df_request['data_url'].map(lambda x : os.path.join(download_path,os.path.basename(x)))
#df_request

# The map projection is only needed by the disabled reprojection/plotting code
# below, and the cartopy import is commented out, so it stays disabled too.
#projection=ccrs.Mercator()

# Download 'data_url' to 'path' with wget, and read files
datasets = []
for idx,entry in tqdm(df_request.iterrows(), total=df_request.shape[0]):
    # An argument list (no shell) keeps the URL from being interpreted by the
    # shell; `cwd` replaces the former `cd <dir> ;` prefix.
    try:
        ret = subprocess.run(['wget', '-N', entry['data_url']],
                             cwd=os.path.dirname(entry['path'])).returncode
    except FileNotFoundError as err:
        # wget is not installed: treat it like any other failed download
        print("Error:", err)
        ret = 1

    if ret == 0 :
        ds = xr.open_dataset(entry['path'])
        datasets.append(ds)
        #datasets.append(ds.rio.reproject(projection.proj4_params))
    else:
        datasets.append(None) # error fetching file

#print(datasets)
df_request['dataset'] = datasets

"""
df_request['dataset'].iloc[0]['wind_speed']

gv_list=[gf.coastline.opts(projection=projection)]
for ds in df_request['dataset']:
    print(ds)
    gv_list.append(gv.Image(ds['wind_speed'].squeeze()[::5,::5],crs=projection).opts(cmap='jet',tools=['hover']))
    
gv.Overlay(gv_list).options(width=800, height=500)
"""

### 1.1. Download data_url, sid and name of each Tropical Cyclone

In [None]:
# Folder where the summary of the Cyclobs request is stored
download_path = "best_track"
os.makedirs(download_path, exist_ok=True)

# Request the products of the SMAP, SMOS and Sentinel-1 A/B missions (cat_min=cat-5)
request_url = "https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-5&mission=SMAP,SMOS,S1B,S1A&include_cols=all"
df_request = pd.read_csv(request_url)

# Local path of each product inside download_path
df_request['path'] = [os.path.join(download_path, os.path.basename(url))
                      for url in df_request['data_url']]

# Keep, for each product, its file name, storm id and cyclone name
info = pd.DataFrame({
    'data_url': df_request['data_url'].map(os.path.basename),
    'sid': df_request['sid'],
    'TC_name': df_request['cyclone_name'],
})
info

In [None]:
# Write the file name / storm id / cyclone name table (with its header row,
# without the index column) inside `download_path`.
info.to_csv(f'{download_path}/Cyclobs_info_names.csv', index=False, header=True)  

## 2. Save the features in the .nc product as an image (3 channels = 3 features)

In [None]:
def extractFeature(infoImage, feat, mask=None, mask_dilated=None):
    """Read one variable of a product and min-max scale its first slice to [0, 1].

    Parameters
    ----------
    infoImage : object with a ``variables`` mapping (e.g. ``netCDF4.Dataset``)
        Product the feature is read from.
    feat : str
        Name of the variable to extract.
    mask : array-like, optional
        Land mask; when given (and ``mask_dilated`` is None) the pixels where
        ``mask[0] != 0`` are set to 0 before scaling.
    mask_dilated : array-like, optional
        Mask with the shape of one slice, used instead of ``mask[0]``. It is
        only taken into account when ``mask`` is not None.

    Returns
    -------
    array
        ``feature[0]`` shifted and divided so that its minimum is 0 and its
        maximum is 1. A constant slice yields all zeros instead of NaN.
    """
    feature = infoImage.variables[feat][:]

    if mask is not None:
        # Mask out the land values
        land = mask[0] if mask_dilated is None else mask_dilated
        feature[0][land != 0] = 0
        #feature[0][land == 1] = 1  # if we want land==1

    band = feature[0]
    lowest = np.min(band)
    value_range = np.max(band) - lowest
    # A constant slice has a zero range: dividing by 1 instead returns zeros
    # rather than NaN (0/0) and avoids the runtime warning.
    if value_range == 0:
        value_range = 1
    return (band - lowest) / value_range

In [None]:
# Options of the conversion:
#   with_land_mask - set the pixels flagged by "mask_flag" to 0 before scaling
#   dilate_mask    - grow the land mask with an 11x11 kernel before applying it
with_land_mask = True
dilate_mask = False
categories = ["category1", "category2", "category3", "category4", "category5"]

for category in categories:
    # Directory where .nc files are
    saved_dir = f"{config['in_nc']}/{category}"
    nc_list = glob(f"{saved_dir}/*.nc")

    # Directory where corresponding images will be saved
    images_dir = f"{config['out_feats']}/{category}"
    #images_dir = "Vmax/{}".format(category)
    #images_dir = "masks/{}".format(category)
    os.makedirs(images_dir, exist_ok=True)

    #lon_dir = f"{images_dir}/lon"
    #os.makedirs(lon_dir, exist_ok=True)
    #lat_dir = f"{images_dir}/lat"
    #os.makedirs(lat_dir, exist_ok=True)

    # Number of products whose wind streaks orientation contains NaNs. It is
    # incremented only for those products, so the total printed below matches
    # its label.
    count = 0
    for nc_image in nc_list:
        # Read .nc product. The context manager closes the file on every exit
        # path, including errors other than the KeyError handled below.
        with netCDF4.Dataset(nc_image, mode='r') as full_info_image:
            try:
                # Without a land mask nothing is masked out; defining both
                # names up front keeps the calls below valid in that case.
                mask = None
                mask_dilated = None
                if with_land_mask:
                    # Extract mask flag
                    mask = full_info_image.variables["mask_flag"][:]
                    #print(np.unique(mask[0]))
                    #plt.imshow(mask[0])
                    #plt.show()

                    if dilate_mask:
                        mask[mask != 0] = 1
                        kernel = np.ones((11, 11), np.int8)
                        mask_dilated = cv2.dilate(mask[0], kernel, iterations=1)

                # Co-polarization (VV)
                feature_co = extractFeature(full_info_image, "nrcs_detrend_co", mask, mask_dilated)

                # Cross-polarization (VH)
                feature_cross = extractFeature(full_info_image, "nrcs_detrend_cross", mask, mask_dilated)

                # Wind Speed (WS)
                feature_wind = extractFeature(full_info_image, "wind_speed", mask, mask_dilated)

                # Wind Streaks Orientation (WSO) (sinWSO and cosWSO)
                feature_wso = full_info_image.variables["wind_streaks_orientation"][:]
                #print(np.nanmax(feature_wso[0]), np.nanmin(feature_wso[0]))
                if np.isnan(feature_wso).any():
                    count += 1
                feature_wso = np.nan_to_num(feature_wso[0])

                # Sine and cosine of the orientation (degrees -> radians),
                # each min-max scaled to [0, 1]. They are only used by the
                # alternative output image commented out below.
                sin_feature_wso = np.sin(feature_wso * np.pi/180.)
                sin_feature_wso = (sin_feature_wso - np.min(sin_feature_wso))/(np.max(sin_feature_wso) - np.min(sin_feature_wso))

                cos_feature_wso = np.cos(feature_wso * np.pi/180.)
                cos_feature_wso = (cos_feature_wso - np.min(cos_feature_wso))/(np.max(cos_feature_wso) - np.min(cos_feature_wso))

                # Extract longitude and latitude features
                #feature_lon = full_info_image.variables["lon"][:]
                #feature_lat = full_info_image.variables["lat"][:]

                # Extract time registered; it becomes the name of the image
                tmax_units = datetime.strptime(full_info_image.measurementDate, '%Y-%m-%dT%H:%M:%SZ')
                tmax_units = tmax_units.strftime('%Y-%m-%d %H_%M_%S')

                # Stack matrices along 3rd axis (depth-wise)
                output_image = np.dstack((feature_co, feature_cross, feature_wind))  # VV+VH+WS
                #output_image = np.dstack((feature_wind[0], np.sin(feature_wso[0] * np.pi/180.), np.cos(feature_wso[0] * np.pi/180.)))

                # Replace any negative value by 0
                output_image = np.clip(output_image, a_min=0, a_max=None)
                #print('Shape:', output_image.shape)
                #print("Normalised: max={}, min={}".format(np.max(output_image), np.min(output_image)))
                #plt.imshow(output_image)
                #plt.show()
            except KeyError as err:
                # A variable is missing from this product: report and skip it
                print("Error:", err)
                continue

        # Save information extracted
        plt.imsave("{}/{}.png".format(images_dir, tmax_units), output_image, format='png')
        #np.save("{}/{}_mask.npy".format(images_dir, tmax_units), np.array(mask[0]))
        #np.save("{}/{}_Vmax.npy".format(images_dir, tmax_units), np.max(feature_wind[0]))
        #np.save("{}/{}.npy".format(lon_dir, tmax_units), np.mean(feature_lon[0]))
        #np.save("{}/{}.npy".format(lat_dir, tmax_units), np.mean(feature_lat[0]))
    print('Number of products with NaNs in WSO:', count)

### 2.2. Delete duplicate files

In [None]:
# For each pair of adjacent categories, delete from the lower category every
# image whose file name also exists in the higher one.
for cat in range(5, 1, -1):
    print(f"Category {cat-1}:")
    higher_dir = f"{config['out_feats']}/category{cat}"
    lower_dir = f"{config['out_feats']}/category{cat-1}"

    # A set gives constant-time look-ups instead of comparing every pair of
    # files; basename is used because glob results may mix path separators.
    names_toKeep = {os.path.basename(path) for path in glob(f"{higher_dir}/*.png")}

    for path in glob(f"{lower_dir}/*.png"):
        name = os.path.basename(path)
        if name in names_toKeep:
            print('File removed:', name)
            os.remove(f"{lower_dir}/{name}")

In [None]:
# List the .png images of the category5 output folder and show how many there are
source_files = glob(f"{config['out_feats']}/category5/*.png")
len(source_files)

## 3. Split data between train, validation and test csv files

In [None]:
category = ["category1", "category2", "category3", "category4", "category5"]
labels_path = "labels/via_project_11Jan2021_12h22m_csv.csv"

df = pd.read_csv(labels_path)
cols = df.columns.tolist()
# Retrieve the coordinates of the bounding boxes: keep every integer found in
# the region description. The whole column is assigned at once because the
# chained form df[col][idx] = ... may write to a temporary copy.
df['region_shape_attributes'] = df['region_shape_attributes'].apply(
    lambda attributes: re.findall(r'\d+', attributes))

# Use a seed so to obtain the same splitting every time
random.seed(20)

df_train, df_val, df_test, df_full = [], [], [], []
for cat in category:
    cat_dir = f"{config['out_feats']}/{cat}"
    lon_dir = f"{cat_dir}/lon"
    lat_dir = f"{cat_dir}/lat"

    for image_path in glob(f'{cat_dir}/*.png'):
        image_name = os.path.basename(image_path)
        stem = image_name.split('.')[0]

        # Optional per-image longitude/latitude files (None when absent)
        lon_list = glob('{}/{}.npy'.format(lon_dir, stem))
        lat_list = glob('{}/{}.npy'.format(lat_dir, stem))
        lon = lon_list[0] if lon_list else None
        lat = lat_list[0] if lat_list else None

        # Row of the labels table describing this image
        matches = df.index[df.filename == image_name]
        if len(matches) == 0:
            # An image without annotation cannot be labelled: skip it instead
            # of failing with an IndexError.
            print("Warning: no label found for", image_path, "- image skipped.")
            continue
        index = matches[0]
        bb_present = 1 if df['region_count'][index] == 1 else 0

        aux = [image_path, lat, lon, bb_present, df['region_shape_attributes'][index]]
        df_full.append(aux)

        # Split used: 60, 20, 20. randrange(100) draws from 0..99, so the
        # thresholds below give exactly those proportions (randint(0, 100)
        # had 101 outcomes).
        # NOTE(review): this changes the seeded split produced by earlier runs.
        rand = random.randrange(100)
        if rand < 60:
            df_train.append(aux)
        elif rand < 80:
            df_val.append(aux)
        else:
            df_test.append(aux)

col_names = ['image', 'lat', 'lon', 'label', 'bbox_shape']
df_train = pd.DataFrame(df_train, columns = col_names)
df_val = pd.DataFrame(df_val, columns = col_names)
df_test = pd.DataFrame(df_test, columns = col_names)
df_full = pd.DataFrame(df_full, columns = col_names)

# Drop the coordinate columns of the full dataset when no image has them.
# NOTE(review): the train/val/test tables keep their (empty) lat/lon columns,
# as before - confirm whether they should be dropped as well.
if df_full['lat'].isna().all() and df_full['lon'].isna().all():
    #print("Warning: lat and lon variables not considered.")
    df_full.drop(columns=['lat', 'lon'], inplace=True)

# Create path to save the csv files
CSV_PATH = f"{config['out_feats']}/csv"
os.makedirs(CSV_PATH, exist_ok=True)

# Write every non-empty table
for csv_name, frame in (('training', df_train), ('val', df_val),
                        ('test', df_test), ('full_dataset', df_full)):
    if not frame.empty:
        frame.to_csv(f'{CSV_PATH}/{csv_name}.csv', index=False)

## 4. Test the location of the bounding boxes around the eye

In [None]:
# The 'bbox_shape' column holds lists that were written to the csv as text.
# NOTE(review): eval executes any expression stored in that column - only use
# it on files generated by this notebook (ast.literal_eval is a safer parser).
df = pd.read_csv(f"{config['out_feats']}/csv/full_dataset.csv", converters={'bbox_shape': eval})

for idx, row in df.iterrows():
    print("Image:", row['image'], "label:", row['label'])
    img = cv2.imread(row['image'])
    if img is None:
        # cv2.imread returns None instead of raising when the file is unreadable
        print("Warning: could not read", row['image'])
        continue

    # OpenCV returns the channels as BGR while matplotlib expects RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Scale to [0, 1] for display; an all-black image is shown as is
    peak = np.max(img)
    b = img / peak if peak > 0 else img
    fig, ax = plt.subplots(figsize=(5,8))
    ax.imshow(b)

    bbox = row['bbox_shape']
    if len(bbox) >= 4:
        # First four integers of the annotation: anchor point (x, y) of the
        # rectangle, then its width and height
        bb_x, bb_y, bb_width, bb_height = (int(value) for value in bbox[:4])

        ax.add_patch(matplotlib.patches.Rectangle((bb_x, bb_y), bb_width, bb_height, color ='green', fc='none'))
    plt.show()

## 5. Link .nc files to ARCHER products

### 5.1. Read all txt files from ARCHER and convert them to csv files

In [None]:
# Directory where ARCHER products are stored
archer_dir = f"{config['out_archer']}/ARCHER_products"
folders = glob(f"{archer_dir}/*")
cnt = 0

for folder in folders:
    #print(folder)
    files = glob(f"{folder}/*")
    file = os.path.join(folder, "archerTrackTable.txt")

    if file in files:
        cnt+=1
        try:
            # Read txt file as pandas df
            cols = ["Date/Time (UTC)", "Source Sensor", "Vmax (kts)", "t_off (hrs)", "Geo-ref Lat",
                    "Lon", "Opr Ctr Lat", "Lon2", "50% cert rad", "95% cert rad", "Eye diam (deg)", "% cert eye", "50% cert fxa"]
            df = pd.read_csv(file, sep="\s{2,}", engine='python', skiprows=1, names = cols, index_col=False)
            df["Date/Time (UTC)"] = pd.to_datetime(df["Date/Time (UTC)"], format='%Y-%m-%d %H:%M:%S')
            df.to_csv(f"{folder}/archerTrackTable.csv", index=False)

        except ValueError as err:
            print("Error:", err)
            file = os.path.join(folder, "summaryTable.txt")
            if file in files:
                try:
                    # read .txt file as pandas df
                    cols = ["Date/Time (UTC)", "Source Sensor", "Vmax (kts)", "ARCHER Lat", "Lon", "Geo-ref Lat",
                            "Lon2", "50% cert rad", "95% cert rad", "Eye diam (deg)", "% cert eye", "50% cert fxa"]
                    df = pd.read_csv(file, sep="\s{2,}", engine='python', skiprows=1, names = cols, index_col=False)
                    df["Date/Time (UTC)"] = df["Date/Time (UTC)"].apply(lambda date: date if " *" not in date else date.replace(" *", ""))
                    df["Date/Time (UTC)"] = pd.to_datetime(df["Date/Time (UTC)"], format='%Y-%m-%d %H:%M:%S')
                    df.to_csv(f"{folder}/summaryTable.csv", index=False)
                
                except ValueError as err:
                    print("Second Error:", err)
                    continue
print(cnt)

### 5.2. Link each .nc file to an ARCHER csv file

#### 5.2.1. Gather info from .nc files

In [None]:
categories = ["category1", "category2", "category3", "category4", "category5"]

nc_files = []
date_time = []
extreme_coords = []
for category in categories:
    print(category)
    # Directory where .nc files are
    saved_dir = f"{config['in_nc']}/{category}"

    # Skip the products whose file name also exists in a higher category
    # folder. For category5 there is no higher category, so the set is empty
    # and every file is kept.
    cat = int(re.findall(r"\d", category)[0])
    higher_names = {
        os.path.basename(path)
        for cat_h in range(cat + 1, 6)
        for path in glob(f"{config['in_nc']}/category{cat_h}/*.nc")
    }
    nc_list = [path for path in glob(f"{saved_dir}/*.nc")
               if os.path.basename(path) not in higher_names]

    # Number of products of this category that were read successfully
    cnt = 0
    for nc_image in nc_list:
        # Read .nc product; the context manager closes it on every exit path
        with netCDF4.Dataset(nc_image, mode='r') as full_info_image:
            try:
                # Try to read this feature as it usually gives problems:
                # products without it raise KeyError and are skipped
                feature_cross = full_info_image.variables["nrcs_detrend_cross"][:]

                # extract time registered
                tmax_units = datetime.strptime(full_info_image.measurementDate, '%Y-%m-%dT%H:%M:%SZ')

                # extract longitude and latitude features
                feature_lon = full_info_image.variables["lon"][:]
                feature_lat = full_info_image.variables["lat"][:]
            except KeyError as err:
                print("Error:", err)
                continue

        # [lat min, lat max, lon min, lon max] as plain Python floats, so that
        # the list is written to csv as bare numbers whatever the NumPy version
        coords = [float(np.min(feature_lat[0])), float(np.max(feature_lat[0])),
                  float(np.min(feature_lon[0])), float(np.max(feature_lon[0]))]

        nc_files.append(nc_image)
        date_time.append(tmax_units)
        extreme_coords.append(coords)
        cnt += 1
    print(cnt)

df = pd.DataFrame({"nc_files": nc_files, "date_time": date_time, "extreme_coords": extreme_coords})
# Name of the image generated from each product
df["image"] = df["date_time"].apply(lambda i: i.strftime('%Y-%m-%d %H_%M_%S'+'.png'))
# One independent empty list per product, filled when linking to ARCHER
df["ARCHER product"] = [[] for _ in range(len(df))]
df

In [None]:
# Save the table of .nc products (without the index column) in the ARCHER link folder
df.to_csv(f"{config['out_archer']}/IFREMER_info.csv", index=False)

#### 5.2.2. Link .nc files to ARCHER

In [None]:
# Reload the table of .nc products. The 'ARCHER product' and 'extreme_coords'
# columns are parsed back from their text form with eval.
# NOTE(review): eval executes any expression stored in the csv - only use it
# on files generated by this notebook (ast.literal_eval is a safer parser).
df = pd.read_csv(f"{config['out_archer']}/IFREMER_info.csv", converters={'ARCHER product': eval, 'extreme_coords': eval})
df["date_time"] = pd.to_datetime(df["date_time"])

# directory where ARCHER products are stored
archer_dir = f"{config['out_archer']}/ARCHER_products"
folders = glob(f"{archer_dir}/*")
# Number of ARCHER folders visited by the linking loop
cnt = 0

def link_ARCHER_IFREMER(x, EndDate, StartDate, lat_arr, lon_arr):
    """Tell whether a SAR product overlaps an ARCHER track in time and space.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide 'date_time' (acquisition time) and 'extreme_coords'
        (four values, used as [lat min, lat max, lon min, lon max]).
    EndDate, StartDate :
        Last and first time of the ARCHER track; note the argument order.
    lat_arr, lon_arr : array-like of float
        Latitudes and longitudes of the ARCHER track.

    Returns
    -------
    bool
        True when the acquisition time lies within [StartDate, EndDate] and
        at least one track latitude and at least one track longitude fall
        inside the extent of the product; False otherwise.
    """
    # The builtin float replaces the np.float alias, which was removed from
    # NumPy. A local array is used so that the caller's row is not modified.
    extent = np.array(x['extreme_coords'], dtype=float)
    lat_arr = np.asarray(lat_arr, dtype=float)
    lon_arr = np.asarray(lon_arr, dtype=float)
    print("StartDate:", StartDate, ", EndDate:", EndDate, ", SAR datetime:", x['date_time'])

    if not (StartDate <= x['date_time'] <= EndDate):
        return False

    print("Inside datetime!")
    print(extent)

    print(lat_arr)
    lat_ok = bool(np.any((extent[0] <= lat_arr) & (lat_arr <= extent[1])))
    if lat_ok:
        print("LAT OK!")

    print(lon_arr)
    lon_ok = bool(np.any((extent[2] <= lon_arr) & (lon_arr <= extent[3])))
    if lon_ok:
        print("LON OK!")

    return lat_ok and lon_ok

for folder in folders:
    cnt +=1
    #print(folder)
    csv_files = glob(f"{folder}/*.csv")

    # When a folder holds several tables, only use those with "new" in their path
    if len(csv_files) > 1:
        csv_files = [s for s in csv_files if "new" in s]

    for csv_file in csv_files:
        archer_df = pd.read_csv(csv_file, header=0)
        archer_df["Date/Time (UTC)"] = pd.to_datetime(archer_df["Date/Time (UTC)"])
        archer_df = archer_df.dropna()

        StartDate = archer_df["Date/Time (UTC)"].min()
        EndDate = archer_df["Date/Time (UTC)"].max()
        if EndDate - StartDate > pd.Timedelta(days=30):    # time difference of more than 30 days
            # Report the suspicious table. This `if` previously had only a
            # comment as its body, which is an IndentationError.
            print(folder, "- StartDate:", StartDate, "EndDate:", EndDate)

        # Products acquired while this ARCHER track was active
        inTime_df = df[(df.date_time >= StartDate) & (df.date_time <= EndDate)]
        if inTime_df.empty:
            continue

        # Track coordinates as floats; non-numeric entries such as "***"
        # become NaN and are dropped
        lat_arr = pd.to_numeric(archer_df["Geo-ref Lat"], errors="coerce").dropna().to_numpy(dtype=float)

        lon_col = "Lon" if "archerTrackTable" in os.path.basename(csv_file) else "Lon2"
        lon_arr = pd.to_numeric(archer_df[lon_col], errors="coerce").dropna().to_numpy(dtype=float)

        # Only the in-time products can match, so only those are tested. The
        # list stored in df is fetched with .at and extended in place.
        for idx, row in inTime_df.iterrows():
            if link_ARCHER_IFREMER(row, EndDate, StartDate, lat_arr, lon_arr):
                df.at[idx, "ARCHER product"].append(csv_file)
df

In [None]:
# Save the table of products with their linked ARCHER csv files (without the index column)
df.to_csv(f"{config['out_archer']}/ARCHER_IFREMER_link_by_date_and_coords.csv", index=False)

#### 5.2.3. Deal with problematic ARCHER products

In [None]:
# ARCHER folders whose tables span more than 30 days between StartDate and
# EndDate (problematic):
# link_ARCHER_IFREMER/ARCHER_products/2016_30W - StartDate: 2016-12-21 05:30:00 EndDate: 2017-11-15 11:30:00
# link_ARCHER_IFREMER/ARCHER_products/2018_03S - StartDate: 2018-01-02 17:30:00 EndDate: 2018-11-11 17:30:00
# link_ARCHER_IFREMER/ARCHER_products/2018_09P - StartDate: 2018-02-13 06:00:00 EndDate: 2019-01-07 21:00:00
# link_ARCHER_IFREMER/ARCHER_products/2020_07S - StartDate: 2020-01-11 11:30:00 EndDate: 2020-12-30 17:30:00
# link_ARCHER_IFREMER/ARCHER_products/2020_10S - StartDate: 2020-01-26 17:30:00 EndDate: 2021-01-19 11:30:00

# Folder to process in this run
folder = f"{config['out_archer']}/ARCHER_products/2020_10S"

for csv_file in glob(f"{folder}/*.csv"):
    # Load the table ordered by time, without incomplete rows
    archer_df = pd.read_csv(csv_file, header=0)
    archer_df["Date/Time (UTC)"] = pd.to_datetime(archer_df["Date/Time (UTC)"])
    archer_df = archer_df.sort_values(by=["Date/Time (UTC)"], ignore_index=True).dropna()

    # Time elapsed between consecutive observations
    time_diffs = archer_df['Date/Time (UTC)'].diff().dropna()
    print(time_diffs)

    # Positions (within time_diffs) of the gaps longer than 5 days
    gap_days = (time_diffs / np.timedelta64(1, 'D')).to_numpy()
    gap_positions = np.flatnonzero(gap_days > 5)
    if gap_positions.size == 0:
        continue

    # Cut the table at the first long gap: time_diffs starts one row later
    # than archer_df, hence the +1
    split_at = gap_positions[0] + 1
    df1 = archer_df.iloc[:split_at]
    df2 = archer_df.iloc[split_at:]
    print(df1)
    print(df2)

    # Let the user inspect both halves before they are written
    input("Press Enter to continue...")
    stem = os.path.basename(csv_file)[:-4]
    df1.to_csv("{}/{}_new1.csv".format(folder, stem), index=False)
    df2.to_csv("{}/{}_new2.csv".format(folder, stem), index=False)

### 5.3. Test location of eye according to ARCHER and best track

In [None]:
def find_nearest(array, value):
    """Locate the element of `array` that is closest to `value`.

    Parameters
    ----------
    array : array-like
        Values to search, of any dimension (plain or masked array).
    value : float
        Value looked for.

    Returns
    -------
    nearest : scalar
        Element of `array` with the smallest absolute difference to `value`.
    index : tuple of int
        Position of that element, one entry per dimension of `array`.

    NaN and masked cells are never selected unless every cell is NaN or
    masked, in which case the first position is returned.
    """
    grid = np.asanyarray(array)

    # Distances are computed once; masked and NaN cells are pushed to infinity
    # so that argmin cannot pick them while a valid cell exists.
    distance = np.ma.filled(np.abs(grid - value).astype(float), np.inf)
    distance = np.where(np.isnan(distance), np.inf, distance)

    nearest_idx = np.unravel_index(np.argmin(distance), grid.shape)
    print("Value wanted: {} and closest value in X: {}".format(value, grid[nearest_idx]))
    return grid[nearest_idx], nearest_idx

In [None]:
# Request the Sentinel-1 A/B swath products from cat-1 to cat-5
request_url = "https://cyclobs.ifremer.fr/app/api/getData?cat_min=cat-1&cat_max=cat-5&mission=S1B,S1A&product_type=swath&include_cols=all"
df_request = pd.read_csv(request_url)

# Keep, for each product, its file name, storm id and cyclone name
nc_CyclObs_info = pd.DataFrame({
    'data_url': df_request['data_url'].map(os.path.basename),
    'sid': df_request['sid'],
    'TC_name': df_request['cyclone_name'],
})
nc_CyclObs_info

In [None]:
# The 'ARCHER product' column holds lists that were written to the csv as text.
# NOTE(review): eval executes any expression stored in that column - only use
# it on files generated by this notebook (ast.literal_eval is a safer parser).
df = pd.read_csv(f"{config['out_archer']}/ARCHER_IFREMER_link_by_date_and_coords.csv", converters={'ARCHER product': eval})
df["date_time"] = pd.to_datetime(df["date_time"])

best_track_df = pd.read_csv("best_track/ibtracs.since1980.list.v04r00.csv", header=0, skiprows=range(1, 2))
best_track_df["ISO_TIME"] = pd.to_datetime(best_track_df["ISO_TIME"])
best_track_df['ISO_TIME_str'] = best_track_df['ISO_TIME'].dt.strftime("%Y-%m-%d")


def _interpolate_at(times, values, when):
    """Linearly interpolate `values` (sampled at `times`) at the instant `when`.

    The interpolation is weighted by the actual time gaps between samples.
    Non-numeric or missing samples are ignored. Outside the sampled period the
    nearest end value is returned; NaN is returned when no valid sample exists.
    """
    samples = pd.DataFrame({
        "t": pd.to_datetime(times),
        "v": pd.to_numeric(values, errors="coerce"),
    }).dropna().sort_values("t")
    if samples.empty:
        return np.nan
    # Seconds relative to `when`, so the wanted instant is at 0
    seconds = (samples["t"] - when).dt.total_seconds().to_numpy()
    return float(np.interp(0.0, seconds, samples["v"].to_numpy(dtype=float)))


# Smallest time gap between each SAR acquisition and its ARCHER records
time_diffs = []

for cnt, (index, row) in enumerate(df.iterrows(), start=1):
    print(row)
    sar_time = row["date_time"]

    # The context manager closes the product once its variables are read
    with netCDF4.Dataset(row["nc_files"], mode='r') as full_info_image:
        mask = full_info_image.variables["mask_flag"][:]
        mask[mask != 0] = 1
        # dilate mask
        kernel = np.ones((11, 11), np.int8)
        mask_dilated = cv2.dilate(mask[0], kernel, iterations = 1)

        # Same masking and scaling as the image conversion: reuse extractFeature
        feature_co = extractFeature(full_info_image, "nrcs_detrend_co", mask, mask_dilated)
        feature_cross = extractFeature(full_info_image, "nrcs_detrend_cross", mask, mask_dilated)
        feature_wind = extractFeature(full_info_image, "wind_speed", mask, mask_dilated)

        # extract longitude and latitude features
        feature_lon = full_info_image.variables["lon"][:]
        feature_lat = full_info_image.variables["lat"][:]

    # stack matrices along 3rd axis (depth-wise)
    output_image = np.dstack((feature_co, feature_cross, feature_wind))

    # Storm id of the product (None when the product is not in the request)
    nc_info = nc_CyclObs_info.loc[nc_CyclObs_info["data_url"] == os.path.basename(row["nc_files"])]
    nc_sid = nc_info["sid"].values[0].upper() if not nc_info.empty else None
    print(nc_sid)

    # prepare plot of coordinates
    fig, ax = plt.subplots()
    ax.imshow(output_image)

    # filter best_track_df by date
    aux = best_track_df.loc[best_track_df['ISO_TIME_str'] == sar_time.strftime("%Y-%m-%d"),
                            ["USA_ATCF_ID", "ISO_TIME", "LAT", "LON"]]
    if not aux.empty:
        # When several storms were active that day, keep the one of this product
        several_storms = aux['USA_ATCF_ID'].nunique(dropna=False) > 1
        best_track = aux.loc[aux['USA_ATCF_ID'] == nc_sid] if several_storms else aux

        if not best_track.empty:
            # Best-track position at the SAR acquisition time
            bt_lat_value = _interpolate_at(best_track["ISO_TIME"], best_track["LAT"], sar_time)
            bt_lon_value = _interpolate_at(best_track["ISO_TIME"], best_track["LON"], sar_time)

            if not (np.isnan(bt_lat_value) or np.isnan(bt_lon_value)):
                _, bt_idx_lat = find_nearest(feature_lat[0], bt_lat_value)
                _, bt_idx_lon = find_nearest(feature_lon[0], bt_lon_value)
                bt_x = bt_idx_lon[1]
                bt_y = bt_idx_lat[0]
                #print("BT Center:", bt_x, bt_y)

                ax.plot(bt_x, bt_y, '^', mfc='none', label = "Best track")

    for item in row['ARCHER product']:
        print(item)
        archer_df = pd.read_csv(item)
        archer_df["Date/Time (UTC)"] = pd.to_datetime(archer_df["Date/Time (UTC)"])

        lon_col = "Lon" if "archerTrackTable" in os.path.basename(item) else "Lon2"
        # Work on a copy; non-numeric entries such as "***" become NaN and the
        # corresponding rows are dropped
        coords_df = archer_df[["Date/Time (UTC)", "Geo-ref Lat", lon_col]].copy()
        coords_df["Geo-ref Lat"] = pd.to_numeric(coords_df["Geo-ref Lat"], errors="coerce")
        coords_df[lon_col] = pd.to_numeric(coords_df[lon_col], errors="coerce")
        coords_df = coords_df.dropna().sort_values(by=["Date/Time (UTC)"], ignore_index=True)
        print(coords_df)
        if coords_df.empty:
            print("No valid ARCHER position in", item)
            continue

        # Check time difference between ARCHER records and SAR measurement date:
        # gap to the closest record
        closest_gap = (coords_df["Date/Time (UTC)"] - sar_time).abs().min()
        print(closest_gap)
        time_diffs.append(closest_gap)

        # ARCHER position at the SAR acquisition time
        lat_value = _interpolate_at(coords_df["Date/Time (UTC)"], coords_df["Geo-ref Lat"], sar_time)
        lon_value = _interpolate_at(coords_df["Date/Time (UTC)"], coords_df[lon_col], sar_time)

        lat, idx_lat = find_nearest(feature_lat[0], lat_value)
        lon, idx_lon = find_nearest(feature_lon[0], lon_value)
        x = idx_lon[1]
        y = idx_lat[0]
        print("ARCHER Center:", x, y)

        ax.plot(x, y, 'o', mfc='none', label = "ARCHER")

    ax.legend(title = "Source", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

    #plots_dir = f"{config['out_archer']}/plot_coords"
    #os.makedirs(plots_dir, exist_ok=True)
    #fig.savefig("{}/TC_center_by_source_{}.jpg".format(plots_dir, cnt), bbox_inches='tight')

#time_diffs

In [None]:
# One-column table with the collected time gaps
time_diff_df = pd.DataFrame(time_diffs, columns=['Date'])
#print(time_diff_df)

# Express the gaps in hours
hours = time_diff_df['Date'] / np.timedelta64(1, 'h')
#hours = hours[hours < 48]

# Histogram of the gaps, with horizontal grid lines
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
ax.set_xlabel('Time difference (hours)')
ax.set_title('Time difference between SAR and ARCHER products')
hours.plot.hist(bins=100, ax=ax)
ax.grid(True, axis='y')
plt.show()

#fig.savefig(f"{config['out_archer']}/time_diff_hist.png", bbox_inches='tight')

## 6. Test black pixels in image

In [None]:
# Read the image path
image_path = f"{config['out_feats']}/category2/2018-10-24 01_06_05.png"
im = cv2.imread(image_path) # loads the image as an 8-bit BGR array (None when it cannot be read)
if im is None:
    # Fail with a clear message instead of a cryptic error in cvtColor
    raise FileNotFoundError(f"Could not read image: {image_path}")
image = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)   # BGR -> RGB
plt.imshow(image)

n_pixels = image.shape[0]*image.shape[1]
print(image.size)
print(image.shape)
print(n_pixels)
# Number of non-zero pixels in each channel
print(cv2.countNonZero(image[:,:,0]))
print(cv2.countNonZero(image[:,:,1]))
print(cv2.countNonZero(image[:,:,2]))
# Percentage of pixels whose first channel is zero (black pixels)
percentage = (n_pixels - cv2.countNonZero(image[:,:,0]))/n_pixels * 100
percentage

In [None]:
# Read the image path
image_path = f"{config['out_feats']}/category2/2020-09-22 10_17_34.png"
im = cv2.imread(image_path) # loads the image as an 8-bit BGR array (None when it cannot be read)
if im is None:
    # Fail with a clear message instead of a cryptic error in cvtColor
    raise FileNotFoundError(f"Could not read image: {image_path}")
image = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)   # BGR -> RGB
plt.imshow(image)

n_pixels = image.shape[0]*image.shape[1]
print(image.size)
print(image.shape)
print(n_pixels)
# Number of non-zero pixels in the first channel
print(cv2.countNonZero(image[:,:,0]))
# Percentage of pixels whose first channel is zero (black pixels)
percentage = (n_pixels - cv2.countNonZero(image[:,:,0]))/n_pixels * 100
percentage