# Gap Analysis for IDW Interpolation at Weekly and Monthly Intervals

* [1. Weekly Preprocess](#week)
* [2. Monthly Preprocess](#month)

In [10]:
import pyproj
import pandas as pd
import rasterio
import glob
import warnings
# import arcpy
from pyproj import Proj, Transformer
warnings.filterwarnings('ignore')

In [2]:
# Root data folder (machine-specific absolute path); every input and output
# below is resolved relative to it.
gis_path = r'F:\SEACAR_WQ_2024/GIS_Data/'
# Load the discrete water-quality table. low_memory=False disables chunked
# parsing so each column gets a single inferred dtype.
dfDis = pd.read_csv(gis_path + 'OEAT_Discrete_WQ-2024-May-06.csv', low_memory=False)

## 1. Weekly Preprocess <a class="anchor" id="week"></a>
### Aggregate the discrete samples taken at the same location into one row, and assign each sample to its period.

In [3]:
# Waterbody acronyms to process and the sheet-name suffixes of the period
# definition workbook (sheet name = acronym + suffix).
area_ab = ["GTM","EB","CH","BB","BBS"]
period_type = [" 52 week"," Month"]
def select_data_period(df, area, period, period_col="Week"):
    """Label the samples of one waterbody with the period they fall in.

    Reads the sheet named ``str(area) + str(period)`` from
    ``All_Waterbodies_Season_Month_Week_Definitions.xlsx`` and, for every row
    of that sheet, selects the samples of ``area`` whose ``SampleDate`` lies
    in ``[Start Date, End Date)``.

    Parameters
    ----------
    df : DataFrame with ``WbodyAcronym`` and ``SampleDate`` columns;
        ``SampleDate`` must be comparable with pandas timestamps.
    area : waterbody acronym; also the first part of the sheet name.
    period : sheet-name suffix, e.g. ``" 52 week"``.
    period_col : column of the definitions sheet whose value is written to
        the new ``Period`` column. Defaults to ``"Week"``.

    Returns
    -------
    DataFrame of the selected samples with an added ``Period`` column and a
    fresh 0..n-1 index. Samples outside every period are dropped; a sample
    matching several overlapping periods appears once per period.

    Raises
    ------
    ValueError
        From ``pd.concat`` when the definitions sheet has no rows.
    """
    sheet_name = str(area) + str(period)
    df_period_table = pd.read_excel(
        gis_path + "All_Waterbodies_Season_Month_Week_Definitions.xlsx",
        sheet_name=sheet_name,
    )
    df_period_table['Start Date'] = pd.to_datetime(df_period_table['Start Date'])
    df_period_table['End Date'] = pd.to_datetime(df_period_table['End Date'])

    df_select_area = df[df["WbodyAcronym"] == str(area)]
    sample_dates = df_select_area['SampleDate']

    sub_dfs = []
    for _, period_row in df_period_table.iterrows():
        # NOTE(review): the end date is exclusive, so a sample stamped exactly
        # at 'End Date' is not counted in this period - confirm against how the
        # definitions workbook expresses period ends.
        in_period = (sample_dates >= period_row['Start Date']) & (sample_dates < period_row['End Date'])
        # .assign returns a new frame, so no column is written onto a filtered
        # slice of the caller's data (no SettingWithCopyWarning).
        sub_dfs.append(df_select_area[in_period].assign(Period=period_row[period_col]))

    return pd.concat(sub_dfs, ignore_index=True)

In [4]:
# Parse SampleDate with an explicit format so it can be compared against the
# period start/end timestamps; a value that does not match the format raises.
dfDis['SampleDate'] = pd.to_datetime(dfDis['SampleDate'], format='%Y-%m-%d %H:%M:%S.%f')

In [5]:
# Build one table of weekly mean results: for every waterbody, label samples
# with their week, then average ResultValue per location/parameter/week.
sel_week_temp = []
for each in area_ab:
    df_week_temp = select_data_period(dfDis, str(each), " 52 week")
    df_week_temp_group = (
        df_week_temp
        .groupby([
            'WaterBody',
            'ParameterName',
            'ParameterUnits',
            'Year',
            'Season',
            'Latitude_DD',
            'Longitude_DD',
            'WbodyAcronym',
            "Period",
        ])["ResultValue"]
        .mean()
        .reset_index()
    )
    sel_week_temp.append(df_week_temp_group)

# Stack the per-waterbody tables and renumber the rows.
df_week_select_Mean = pd.concat(sel_week_temp, ignore_index=True)

In [6]:
# Short code for each ParameterName; the code is the parameter part of the
# IDW raster file names built further down.
param_shortnames = {
    'Salinity':          'Sal_ppt',
    'Total Nitrogen':    'TN_mgl',
    'Dissolved Oxygen':  'DO_mgl',
    'Turbidity':         'Turb_ntu',
    'Secchi Depth':      'Secc_m',
    'Water Temperature': 'T_c',
}

In [7]:
# Translate parameter names to the short codes used in raster file names;
# names that are not keys of param_shortnames become NaN.
df_week_select_Mean["ParaAcronym"] = df_week_select_Mean["ParameterName"].map(param_shortnames)

In [14]:
# Define the EPSG codes for source (EPSG:4326) and target (EPSG:3086) coordinate systems
source_epsg = 'EPSG:4326'
target_epsg = 'EPSG:3086'

# Create a PyProj Transformer for the conversion.
# always_xy=True fixes the axis order to (longitude, latitude) in, (x, y) out.
transformer = Transformer.from_crs(source_epsg, target_epsg, always_xy=True)


def transform_coordinates(row):
    """Project one row's Longitude_DD/Latitude_DD to the target CRS.

    Returns a Series with entries 'x' and 'y'. Kept for row-wise use; the
    bulk conversion below calls the transformer directly.
    """
    x, y = transformer.transform(row['Longitude_DD'], row['Latitude_DD'])
    return pd.Series({'x': x, 'y': y})


# Convert every point in one vectorized call instead of a row-by-row apply,
# which would build a temporary Series per row.
df_week_select_Mean['x'], df_week_select_Mean['y'] = transformer.transform(
    df_week_select_Mean['Longitude_DD'].to_numpy(),
    df_week_select_Mean['Latitude_DD'].to_numpy(),
)

In [16]:
# Preview the first ten weekly rows, now including the projected x/y columns.
df_week_select_Mean.head(10)

Unnamed: 0,WaterBody,ParameterName,ParameterUnits,Year,Season,Latitude_DD,Longitude_DD,WbodyAcronym,Period,ResultValue,ParaAcronym,x,y
0,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,25,5.695,DO_mgl,669975.848287,626752.656623
1,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,29,6.36,DO_mgl,669975.848287,626752.656623
2,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,34,6.55,DO_mgl,669975.848287,626752.656623
3,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,25,6.21,DO_mgl,668862.259531,631692.835328
4,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,29,6.195,DO_mgl,668862.259531,631692.835328
5,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,34,6.78,DO_mgl,668862.259531,631692.835328
6,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,25,4.552308,DO_mgl,665019.217025,631919.730604
7,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,29,3.904615,DO_mgl,665019.217025,631919.730604
8,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,32,5.91,DO_mgl,665019.217025,631919.730604
9,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.66822,-81.2631,GTM,29,4.66,DO_mgl,664501.021891,631983.670235


## List the existing TIFF files

In [17]:
import os

# Full paths of every weekly IDW raster under the output folder.
raster_file_list = glob.glob(gis_path+"raster_output/idw_week/*.tif")
# Keep only the file names. os.path.basename strips the directory part for
# whichever separator the platform uses, not just for backslashes.
raster_name_list = [os.path.basename(each) for each in raster_file_list]

## Extract the raster value at each sample point from the corresponding TIFF file

In [18]:
from rasterio.windows import Window

# For every aggregated weekly row, look up the value of the matching IDW raster
# at the row's projected location. "" marks "no value": the raster does not
# exist, the point lies outside the raster, or the cell value is below -9999.
value_list = []
available_rasters = set(raster_name_list)  # O(1) membership test per row
for index, row in df_week_select_Mean.iterrows():
    wb     = row["WbodyAcronym"]
    para   = row["ParaAcronym"]
    year   = str(row["Year"])
    period = str(row["Period"])
    name   = f"{wb}_{para}_IDW_{year}_{period}.tif"
    x_cor  = row["x"]
    y_cor  = row["y"]
    path   = str(gis_path+"/raster_output/idw_week/"+name)

    value = ""
    if name in available_rasters:
        with rasterio.open(path) as src:
            # Use names distinct from the loop's `row` so the DataFrame row is
            # not overwritten by the pixel row index.
            pix_row, pix_col = src.index(x_cor, y_cor)
            # Outside the raster extent, indices would wrap around (negative)
            # or raise; treat those points as having no value.
            if 0 <= pix_row < src.height and 0 <= pix_col < src.width:
                # Read just the one cell instead of loading the whole band.
                cell = src.read(1, window=Window(pix_col, pix_row, 1, 1))[0, 0]
                if not cell < -9999:
                    value = cell
    value_list.append(value)

In [19]:
# Attach the raster-derived values; value_list holds one entry per row, in
# the DataFrame's row order.
df_week_select_Mean["ConValue"] = value_list

In [20]:
# Preview the first ten weekly rows with observed (ResultValue) and
# raster-derived (ConValue) values side by side.
df_week_select_Mean.head(10)

Unnamed: 0,WaterBody,ParameterName,ParameterUnits,Year,Season,Latitude_DD,Longitude_DD,WbodyAcronym,Period,ResultValue,ParaAcronym,x,y,ConValue
0,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,25,5.695,DO_mgl,669975.848287,626752.656623,4.721951
1,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,29,6.36,DO_mgl,669975.848287,626752.656623,4.092683
2,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,34,6.55,DO_mgl,669975.848287,626752.656623,6.492334
3,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,25,6.21,DO_mgl,668862.259531,631692.835328,4.721951
4,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,29,6.195,DO_mgl,668862.259531,631692.835328,4.092683
5,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,34,6.78,DO_mgl,668862.259531,631692.835328,6.492334
6,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,25,4.552308,DO_mgl,665019.217025,631919.730604,4.722013
7,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,29,3.904615,DO_mgl,665019.217025,631919.730604,4.092873
8,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,32,5.91,DO_mgl,665019.217025,631919.730604,5.649822
9,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.66822,-81.2631,GTM,29,4.66,DO_mgl,664501.021891,631983.670235,4.105548


In [21]:
# Save the weekly comparison table; with the default settings the DataFrame
# index is written as the first, unnamed column.
df_week_select_Mean.to_csv(gis_path + "week_difference.csv")

## 2. Monthly Preprocess <a class="anchor" id="month"></a>

In [22]:
# Waterbody acronyms to process and the sheet-name suffixes of the period
# definition workbook (sheet name = acronym + suffix).
area_ab = ["GTM","EB","CH","BB","BBS"]
period_type = [" 52 week"," Month"]
def select_data_period1(df, area, period, period_col="Month"):
    """Label the samples of one waterbody with the period they fall in.

    Reads the sheet named ``str(area) + str(period)`` from
    ``All_Waterbodies_Season_Month_Week_Definitions.xlsx`` and, for every row
    of that sheet, selects the samples of ``area`` whose ``SampleDate`` lies
    in ``[Start Date, End Date)``.

    Parameters
    ----------
    df : DataFrame with ``WbodyAcronym`` and ``SampleDate`` columns;
        ``SampleDate`` must be comparable with pandas timestamps.
    area : waterbody acronym; also the first part of the sheet name.
    period : sheet-name suffix, e.g. ``" Month"``.
    period_col : column of the definitions sheet whose value is written to
        the new ``Period`` column. Defaults to ``"Month"``.

    Returns
    -------
    DataFrame of the selected samples with an added ``Period`` column and a
    fresh 0..n-1 index. Samples outside every period are dropped; a sample
    matching several overlapping periods appears once per period.

    Raises
    ------
    ValueError
        From ``pd.concat`` when the definitions sheet has no rows.
    """
    sheet_name = str(area) + str(period)
    df_period_table = pd.read_excel(
        gis_path + "All_Waterbodies_Season_Month_Week_Definitions.xlsx",
        sheet_name=sheet_name,
    )
    df_period_table['Start Date'] = pd.to_datetime(df_period_table['Start Date'])
    df_period_table['End Date'] = pd.to_datetime(df_period_table['End Date'])

    df_select_area = df[df["WbodyAcronym"] == str(area)]
    sample_dates = df_select_area['SampleDate']

    sub_dfs = []
    for _, period_row in df_period_table.iterrows():
        # NOTE(review): the end date is exclusive, so a sample stamped exactly
        # at 'End Date' is not counted in this period - confirm against how the
        # definitions workbook expresses period ends.
        in_period = (sample_dates >= period_row['Start Date']) & (sample_dates < period_row['End Date'])
        # .assign returns a new frame, so no column is written onto a filtered
        # slice of the caller's data (no SettingWithCopyWarning).
        sub_dfs.append(df_select_area[in_period].assign(Period=period_row[period_col]))

    return pd.concat(sub_dfs, ignore_index=True)

In [23]:
# Build one table of monthly mean results: for every waterbody, label samples
# with their month, then average ResultValue per location/parameter/month.
sel_month_temp = []
for each in area_ab:
    df_month_temp = select_data_period1(dfDis, str(each), " Month")
    df_month_temp_group = (
        df_month_temp
        .groupby([
            'WaterBody',
            'ParameterName',
            'ParameterUnits',
            'Year',
            'Season',
            'Latitude_DD',
            'Longitude_DD',
            'WbodyAcronym',
            "Period",
        ])["ResultValue"]
        .mean()
        .reset_index()
    )
    sel_month_temp.append(df_month_temp_group)

# Stack the per-waterbody tables and renumber the rows.
df_month_select_Mean = pd.concat(sel_month_temp, ignore_index=True)

In [24]:
# Translate parameter names to the short codes used in raster file names;
# names that are not keys of param_shortnames become NaN.
df_month_select_Mean["ParaAcronym"] = df_month_select_Mean["ParameterName"].map(param_shortnames)

In [25]:
# Define the EPSG codes for source (EPSG:4326) and target (EPSG:3086) coordinate systems
source_epsg = 'EPSG:4326'
target_epsg = 'EPSG:3086'

# Create a PyProj Transformer for the conversion.
# always_xy=True fixes the axis order to (longitude, latitude) in, (x, y) out.
transformer = pyproj.Transformer.from_crs(source_epsg, target_epsg, always_xy=True)


def transform_coordinates(row):
    """Project one row's Longitude_DD/Latitude_DD to the target CRS.

    Returns a Series with entries 'x' and 'y'. Kept for row-wise use; the
    bulk conversion below calls the transformer directly.
    """
    x, y = transformer.transform(row['Longitude_DD'], row['Latitude_DD'])
    return pd.Series({'x': x, 'y': y})


# Convert every point in one vectorized call instead of a row-by-row apply,
# which would build a temporary Series per row.
df_month_select_Mean['x'], df_month_select_Mean['y'] = transformer.transform(
    df_month_select_Mean['Longitude_DD'].to_numpy(),
    df_month_select_Mean['Latitude_DD'].to_numpy(),
)

In [26]:
# Preview the first ten monthly rows, now including the projected x/y columns.
df_month_select_Mean.head(10)

Unnamed: 0,WaterBody,ParameterName,ParameterUnits,Year,Season,Latitude_DD,Longitude_DD,WbodyAcronym,Period,ResultValue,ParaAcronym,x,y
0,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,6,5.695,DO_mgl,669975.848287,626752.656623
1,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,7,6.36,DO_mgl,669975.848287,626752.656623
2,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,8,6.55,DO_mgl,669975.848287,626752.656623
3,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,6,6.21,DO_mgl,668862.259531,631692.835328
4,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,7,6.195,DO_mgl,668862.259531,631692.835328
5,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,8,6.78,DO_mgl,668862.259531,631692.835328
6,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,6,4.552308,DO_mgl,665019.217025,631919.730604
7,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,7,3.904615,DO_mgl,665019.217025,631919.730604
8,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,8,5.91,DO_mgl,665019.217025,631919.730604
9,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.66822,-81.2631,GTM,7,4.66,DO_mgl,664501.021891,631983.670235


In [27]:
import os

# Full paths of every monthly IDW raster under the output folder.
raster_file_list_m = glob.glob(gis_path+"raster_output/idw_month/*.tif")
# Keep only the file names. os.path.basename strips the directory part for
# whichever separator the platform uses, not just for backslashes.
raster_name_list_m = [os.path.basename(each) for each in raster_file_list_m]

In [28]:
from rasterio.windows import Window

# For every aggregated monthly row, look up the value of the matching IDW
# raster at the row's projected location. "" marks "no value": the raster does
# not exist, the point lies outside the raster, or the cell value is below -9999.
value_list_m = []
# Check against the MONTHLY raster names; testing the weekly list would skip
# monthly rasters and try to open files that are not in idw_month.
available_rasters_m = set(raster_name_list_m)
for index, row in df_month_select_Mean.iterrows():
    wb     = row["WbodyAcronym"]
    para   = row["ParaAcronym"]
    year   = str(row["Year"])
    period = str(row["Period"])
    name   = f"{wb}_{para}_IDW_{year}_{period}.tif"
    x_cor  = row["x"]
    y_cor  = row["y"]
    path   = str(gis_path+"/raster_output/idw_month/"+name)

    value = ""
    if name in available_rasters_m:
        with rasterio.open(path) as src:
            # Use names distinct from the loop's `row` so the DataFrame row is
            # not overwritten by the pixel row index.
            pix_row, pix_col = src.index(x_cor, y_cor)
            # Outside the raster extent, indices would wrap around (negative)
            # or raise; treat those points as having no value.
            if 0 <= pix_row < src.height and 0 <= pix_col < src.width:
                # Read just the one cell instead of loading the whole band.
                cell = src.read(1, window=Window(pix_col, pix_row, 1, 1))[0, 0]
                if not cell < -9999:
                    value = cell
    value_list_m.append(value)

In [29]:
# Attach the raster-derived values; value_list_m holds one entry per row, in
# the DataFrame's row order.
df_month_select_Mean["ConValue"] = value_list_m

In [30]:
# Preview the first ten monthly rows with observed (ResultValue) and
# raster-derived (ConValue) values side by side.
df_month_select_Mean.head(10)

Unnamed: 0,WaterBody,ParameterName,ParameterUnits,Year,Season,Latitude_DD,Longitude_DD,WbodyAcronym,Period,ResultValue,ParaAcronym,x,y,ConValue
0,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,6,5.695,DO_mgl,669975.848287,626752.656623,4.903136
1,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,7,6.36,DO_mgl,669975.848287,626752.656623,3.726113
2,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.620011,-81.207694,GTM,8,6.55,DO_mgl,669975.848287,626752.656623,6.487805
3,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,6,6.21,DO_mgl,668862.259531,631692.835328,4.903136
4,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,7,6.195,DO_mgl,668862.259531,631692.835328,3.726113
5,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.664722,-81.218056,GTM,8,6.78,DO_mgl,668862.259531,631692.835328,6.487805
6,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,6,4.552308,DO_mgl,665019.217025,631919.730604,4.903208
7,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,7,3.904615,DO_mgl,665019.217025,631919.730604,3.726316
8,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.667541,-81.257755,GTM,8,5.91,DO_mgl,665019.217025,631919.730604,6.487839
9,Guana Tolomato Matanzas,Dissolved Oxygen,mg/L,2016,Fall,29.66822,-81.2631,GTM,7,4.66,DO_mgl,664501.021891,631983.670235,3.73985


In [31]:
# Save the monthly comparison table; with the default settings the DataFrame
# index is written as the first, unnamed column.
df_month_select_Mean.to_csv(gis_path + "month_difference.csv")