In [300]:
import os
import numpy as np
from netCDF4 import Dataset
import pandas as pd
from collections import defaultdict
from calendar import monthrange


In [305]:

# Assuming files are named like 'data_YYYYMMDD.nc'
data_dir = '../results/daily/gcomc/chla'
file_pattern = 'data_{}.nc'  # Placeholder for file name format
lat_min_crop, lat_max_crop = 34.4, 35.7
lon_min_crop, lon_max_crop = 138.35, 140.2

In [332]:
land_mask_accumulator = None
count = 0

# First pass to identify land pixels
for file_name in os.listdir(data_dir):
    if not file_name.endswith('.nc'):
       continue
    
    count += 1
    if count % 100 == 0:
        print(file_name)

    if file_name.endswith('.nc'):
        with Dataset(os.path.join(data_dir, file_name), 'r') as nc:
            lat = nc.variables['lat'][:]
            lon = nc.variables['lon'][:]
            data = nc.variables['chlor_a'][:]  # Adjust for your variable
            p
            # Crop data
            lat_inds = np.where((lat >= lat_min_crop) & (lat <= lat_max_crop))[0]
            lon_inds = np.where((lon >= lon_min_crop) & (lon <= lon_max_crop))[0]
            cropped_data = data[:, lat_inds, :][:, :, lon_inds]
            cropped_data = np.where(cropped_data.mask, np.nan, cropped_data)
            
            if land_mask_accumulator is None:
                land_mask_accumulator = np.isnan(cropped_data)
            else:
                land_mask_accumulator &= np.isnan(cropped_data)

# Final land mask (True where land is detected)
land_mask = land_mask_accumulator.all(axis=0)


In [340]:
missing_data_info = {}
total_data_info = {}
days_in_month_info = {}
data_dir = '../results/daily/gcomc/sst'
days_in_month = 0
current_month = 0
count  = 0
# Second pass to calculate missing data excluding land
for file_name in os.listdir(data_dir):
    if not file_name.endswith('.nc'):
       continue
    count += 1
    if count % 100 == 0:
        print(file_name)

    if file_name.endswith('.nc'):
        year_month = file_name[2:10]

        with Dataset(os.path.join(data_dir, file_name), 'r') as nc:
            lat = nc.variables['lat'][:]
            lon = nc.variables['lon'][:]
            data = nc.variables['sst'][:]  # Adjust for your variable
            # Crop data again
            lat_inds = np.where((lat >= lat_min_crop) & (lat <= lat_max_crop))[0]
            lon_inds = np.where((lon >= lon_min_crop) & (lon <= lon_max_crop))[0]
            cropped_data = data[:, lat_inds, :][:, :, lon_inds]
            # print(cropped_data.min(), cropped_data.max())
            cropped_data = np.where(cropped_data.mask, np.nan, cropped_data)
            
            # Exclude land pixels from the missing data calculation
            valid_data_mask = ~np.isnan(cropped_data) | land_mask[np.newaxis, :, :]
            missing_data = np.isnan(cropped_data) & ~land_mask[np.newaxis, :, :]
            # print(missing_data.sum())
            # missing_data_info[file_name] = missing_data.sum()
            # total_data_info[file_name] = valid_data_mask.sum()
            # print(file_name, year_month)
            # print(missing_data.sum())
            # print(valid_data_mask.sum())
            # print(missing_data.sum()/ valid_data_mask.sum() * 100)
            # check if missind_data_info has the year_month key
            # print(year_month)
            month  = int(year_month[4:6])
            if month != current_month:
                days_in_month_info[year_month[:6]] = 0
                days_in_month = 0
                current_month = month
            if year_month not in missing_data_info:
                missing_data_info[year_month] = {'missing_pixels': 0, 'total_pixels': 0, 'days_counted': 0}
            
            avg_missing_percentage = (missing_data.sum() / valid_data_mask.sum()) * 100
            if avg_missing_percentage < 75:
                days_in_month += 1
            if avg_missing_percentage < 30:
                days_in_month_info[year_month[:6]] = days_in_month_info[year_month[:6]] + 1
                # print("days in month", days_in_month, days_in_month_info)
            missing_data_info[year_month]['missing_pixels'] += missing_data.sum()
            missing_data_info[year_month]['total_pixels'] += valid_data_mask.sum()
            missing_data_info[year_month]['days_counted'] += 1
                
# Calculate monthly average percentage of missing data and days with data
monthly_averages = []
print(missing_data_info)
print(missing_data_info.items())
print(days_in_month_info)
for year_month, info in missing_data_info.items():
    avg_missing_percentage = (info['missing_pixels'] / info['total_pixels']) * 100
    print(year_month)
    date = pd.to_datetime(year_month, format='%Y%m%d')
    if avg_missing_percentage < 75:
        print(avg_missing_percentage)
        monthly_averages.append({
            'date': date,
            'AvgMissingPercentage': avg_missing_percentage,
            'DaysWithData': days_in_month_info[year_month[:6]],
        })
# Convert the results to a DataFrame and export to CSV
df = pd.DataFrame(monthly_averages)
df.to_csv('GCOMC SST monthly_missing_data.csv', index=False)            

GS20180611_SST_NW_day.nc
GS20180924_SST_NW_day.nc
GS20190108_SST_NW_day.nc
GS20190423_SST_NW_day.nc
GS20190807_SST_NW_day.nc
GS20191120_SST_NW_day.nc
GS20200305_SST_NW_day.nc
GS20200618_SST_NW_day.nc
GS20201002_SST_NW_day.nc
GS20210115_SST_NW_day.nc
GS20210501_SST_NW_day.nc
GS20210814_SST_NW_day.nc
GS20211127_SST_NW_day.nc
GS20220313_SST_NW_day.nc
GS20220626_SST_NW_day.nc
GS20221010_SST_NW_day.nc
GS20230123_SST_NW_day.nc
GS20230509_SST_NW_day.nc
GS20230823_SST_NW_day.nc
GS20231210_SST_NW_day.nc
{'20180215': {'missing_pixels': 202411, 'total_pixels': 166869, 'days_counted': 1}, '20180216': {'missing_pixels': 202411, 'total_pixels': 166869, 'days_counted': 1}, '20180217': {'missing_pixels': 167840, 'total_pixels': 201440, 'days_counted': 1}, '20180227': {'missing_pixels': 170771, 'total_pixels': 198509, 'days_counted': 1}, '20180228': {'missing_pixels': 202129, 'total_pixels': 167151, 'days_counted': 1}, '20180301': {'missing_pixels': 139528, 'total_pixels': 229752, 'days_counted': 1}, '

In [262]:

# Initialize a dictionary to hold missing data information
missing_data_info = defaultdict(lambda: {'missing_pixels': 0, 'total_pixels': 0, 'days_counted': 0})

# Loop through the files in the directory
for file_name in os.listdir(data_dir)[:100]:
    if file_name.endswith('.nc'):
        # Extract year and month from the file name
        year_month = file_name.split('_')[1][:6]
        file_path = os.path.join(data_dir, file_name)
        
        # Open the dataset and read the data (assuming the variable of interest is named 'sst')
        with Dataset(file_path, 'r') as nc:
            # data = nc.variables['chlor_a'][:]  # Adjust variable name as necessary
            data = np.ma.squeeze(nc['chlor_a'][:])
            print(data.min(), data.max())

            lat = nc['lat'][:]
            lon = nc['lon'][:]

           # Find indices for cropping
            lat_inds = np.where((lat >= lat_min_crop) & (lat <= lat_max_crop))[0]
            lon_inds = np.where((lon >= lon_min_crop) & (lon <= lon_max_crop))[0]
            # Crop data
            
            # Now add a check to ensure the indices are within the bounds of the array
            if lat_inds.size > 0 and lon_inds.size > 0:
               # Adjust the slicing based on the actual dimensions of sds
               data = data[lat_inds, :][:, lon_inds]  # This is the updated line for a 2D array
               lat_cropped = lat[lat_inds]
               lon_cropped = lon[lon_inds]
            else:
               print("No data within specified crop bounds.")
            data = np.where(data.mask, np.nan, data)

            # Calculate missing data (assuming missing data is represented as NaN)
            missing_pixels = np.isnan(data).sum()
            total_pixels = data.size
            
            # Update the dictionary with information from this file
            missing_data_info[year_month]['missing_pixels'] += missing_pixels
            missing_data_info[year_month]['total_pixels'] += total_pixels
            missing_data_info[year_month]['days_counted'] += 1
# Calculate monthly average percentage of missing data and days with data
monthly_averages = []
print(missing_data_info)
print(missing_data_info.items())
for year_month, info in missing_data_info.items():
    print(info['missing_pixels'] , info['total_pixels'])
    avg_missing_percentage = (info['missing_pixels'] / info['total_pixels']) * 100
    print(avg_missing_percentage)
   #  year, month = int(year_month[:4]), int(year_month[4:])
   #  days_in_month = monthrange(year, month)[1]  # Total days in the month
   #  monthly_averages.append({
   #      'YearMonth': year_month,
   #      'AvgMissingPercentage': avg_missing_percentage,
   #      'DaysWithData': info['days_counted'],
   #      'TotalDaysInMonth': days_in_month
   #  })

# Convert the results to a DataFrame and export to CSV
# df = pd.DataFrame(monthly_averages)
# df.to_csv('/mnt/data/monthly_missing_data.csv', index=False)

0.030041967 22.319437
0.027706027 20.105795
0.030020222 22.997494
0.030462872 97.68098
0.034297086 27.403242
0.033220153 14.801711
0.03140791 13.075827
0.036999993 79.39814
0.04680855 82.14334
0.037427787 15.726149
0.032742433 23.391787
0.032105654 10.281909
0.0366507 18.228678
0.0344561 47.43055
0.039465547 72.337135
0.04132297 99.73867
0.03095945 97.713615
0.035692044 13.093591
0.031299543 15.983326
0.031706292 76.49852
0.059181657 22.31597
0.048888315 82.62365
0.040045336 46.498123
0.035526134 37.91832
0.0281968 88.715004
0.03266299 42.103077
0.030096062 92.19318
0.029853623 94.36174
0.03340136 80.956154
0.028610492 4.83347
0.032162238 93.0079
0.030397061 7.165541
0.02125884 94.86379
0.021842936 97.43935
0.025847958 82.15829
0.033365548 90.14665
0.03225382 68.772865
0.03786139 19.991955
0.039843068 21.173233
0.024994897 90.45312
0.035196654 99.81971
0.038958002 92.48344
0.04345656 5.6131153
0.0342866 99.46323
0.033263605 97.50878
0.035532735 20.059694
0.03472257 16.883205
0.04102336