In [1]:
import os
import numpy as np
from netCDF4 import Dataset
import pandas as pd
from collections import defaultdict
from calendar import monthrange

In [18]:

# Directory the daily chlorophyll-a NetCDF files are read from.
data_dir = '../results/daily/Modis/chla'

# File-name template placeholder. NOTE(review): it is not referenced by the
# visible cells, and the file names in the recorded output look like
# 'A20060705_CHL_NW_day.nc' rather than 'data_YYYYMMDD.nc' -- confirm whether
# this is still needed.
file_pattern = 'data_{}.nc'

# Bounds of the crop window. They are compared (inclusively) against the
# files' 'lat' and 'lon' variables; presumably decimal degrees -- TODO confirm.
lat_min_crop = 34.4
lat_max_crop = 35.7
lon_min_crop = 138.35
lon_max_crop = 140.2

In [25]:
# First pass: build a land mask from the chlorophyll files.
# A pixel is treated as land when it is missing (masked -> NaN) in every file
# that is scanned. NOTE: this cell expects `data_dir` to point at the
# chlorophyll directory; a later cell reassigns `data_dir`, so re-running this
# cell afterwards would read the wrong files.
land_mask_accumulator = None
count = 0

# Shape every cropped daily grid must have; files that crop to a different
# shape are reported and skipped so the masks can be AND-ed element-wise.
expected_shape = (1, 144, 160)

# os.listdir() returns entries in arbitrary order, so sort before slicing;
# otherwise the [1400:4000] window selects a different subset of files
# depending on the filesystem.
for file_name in sorted(os.listdir(data_dir))[1400:4000]:
    if not file_name.endswith('.nc'):
        continue
    count += 1
    if count % 100 == 0:
        print(file_name)  # progress indicator: every 100th NetCDF file

    # The [:] reads copy the variables into memory, so nothing below needs
    # the file to stay open.
    with Dataset(os.path.join(data_dir, file_name), 'r') as nc:
        lat = nc.variables['lat'][:]
        lon = nc.variables['lon'][:]
        data = nc.variables['chlor_a'][:]  # Adjust for your variable

    # Crop data to the configured lat/lon window (bounds inclusive).
    lat_inds = np.where((lat >= lat_min_crop) & (lat <= lat_max_crop))[0]
    lon_inds = np.where((lon >= lon_min_crop) & (lon <= lon_max_crop))[0]
    cropped_data = data[:, lat_inds, :][:, :, lon_inds]

    # Turn masked cells into NaN. getmaskarray/getdata also accept a plain
    # ndarray, so an unmasked read does not fail on a missing `.mask`.
    cropped_data = np.where(
        np.ma.getmaskarray(cropped_data), np.nan, np.ma.getdata(cropped_data)
    )
    if cropped_data.shape != expected_shape:
        print("crop error ", cropped_data.shape,  file_name)
        continue

    # Keep only the pixels that have been NaN in every file so far.
    day_is_nan = np.isnan(cropped_data)
    if land_mask_accumulator is None:
        land_mask_accumulator = day_is_nan
    else:
        land_mask_accumulator &= day_is_nan

if land_mask_accumulator is None:
    # Without this guard the next line fails with an opaque AttributeError.
    raise RuntimeError(
        f"no usable .nc files found in {data_dir!r}; cannot build the land mask"
    )

# Final land mask (True where land is detected); collapsing the leading
# length-1 axis leaves a 2-D (lat, lon) boolean array.
land_mask = land_mask_accumulator.all(axis=0)


crop error  (1, 145, 159) A20060705_CHL_NW_day.nc
A20060917_CHL_NW_day.nc
A20061227_CHL_NW_day.nc
A20070409_CHL_NW_day.nc
A20070719_CHL_NW_day.nc
A20071029_CHL_NW_day.nc
A20080210_CHL_NW_day.nc
A20080521_CHL_NW_day.nc
A20080901_CHL_NW_day.nc
A20081211_CHL_NW_day.nc
A20090323_CHL_NW_day.nc
A20090703_CHL_NW_day.nc
A20091013_CHL_NW_day.nc
A20100123_CHL_NW_day.nc
A20100505_CHL_NW_day.nc
A20100815_CHL_NW_day.nc
A20101125_CHL_NW_day.nc
A20110307_CHL_NW_day.nc
A20110617_CHL_NW_day.nc
A20110927_CHL_NW_day.nc
A20120107_CHL_NW_day.nc
A20120418_CHL_NW_day.nc
A20120728_CHL_NW_day.nc
A20121108_CHL_NW_day.nc
A20130218_CHL_NW_day.nc
A20130530_CHL_NW_day.nc
A20130910_CHL_NW_day.nc


In [30]:
# Second pass: for every daily SST file, measure how much of the non-land
# area is missing, then export one CSV row per sufficiently covered day.
# Requires `land_mask` (2-D boolean, True on land) from the land-mask cell.
data_dir = '../results/daily/Modis/sst'

# Thresholds on the percentage of missing ocean pixels.
max_missing_pct_for_report = 75    # days at or above this are left out of the CSV
max_missing_pct_for_good_day = 30  # days below this count towards 'DaysWithData'

# Keyed by the 8-digit date taken from the file name (see `year_month` below).
missing_data_info = {}
# Keyed by YYYYMM: number of days in that month below the "good day" threshold.
days_in_month_info = {}
count = 0

# Ocean pixels are everything the land mask does not flag. The denominator of
# the missing percentage is the number of ocean pixels; the previous
# "valid OR land" mask counted land as valid and omitted missing ocean pixels,
# which let the percentage exceed 100.
ocean_mask = ~land_mask[np.newaxis, :, :]
ocean_pixel_total = int(ocean_mask.sum())
if ocean_pixel_total == 0:
    raise ValueError("land_mask flags every pixel as land; nothing to measure")

# Cropped grids must broadcast against the land mask: (1, n_lat, n_lon).
expected_shape = ocean_mask.shape

# Sorted so progress output and dict insertion order are chronological
# (os.listdir() order is arbitrary).
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith('.nc'):
        continue
    count += 1
    if count % 100 == 0:
        print(file_name)  # progress indicator: every 100th NetCDF file

    # Despite its name this is the full YYYYMMDD date: characters 1-8 of names
    # like 'A20021022_SST_NW_day.nc'. The statistics are therefore per day.
    year_month = file_name[1:9]
    if not year_month.isdigit():
        print("error in file", file_name)
        continue
    month_key = year_month[:6]

    # Only the file read is best-effort; errors in the arithmetic below are
    # no longer swallowed.
    try:
        with Dataset(os.path.join(data_dir, file_name), 'r') as nc:
            lat = nc.variables['lat'][:]
            lon = nc.variables['lon'][:]
            data = nc.variables['sst'][:]  # Adjust for your variable
    except Exception as e:
        print(e)
        print("error in file", file_name)
        continue

    # Crop data again, using the same window as the land-mask pass.
    lat_inds = np.where((lat >= lat_min_crop) & (lat <= lat_max_crop))[0]
    lon_inds = np.where((lon >= lon_min_crop) & (lon <= lon_max_crop))[0]
    cropped_data = data[:, lat_inds, :][:, :, lon_inds]
    cropped_data = np.where(
        np.ma.getmaskarray(cropped_data), np.nan, np.ma.getdata(cropped_data)
    )
    if cropped_data.shape != expected_shape:
        print("crop error ", cropped_data.shape,  file_name)
        continue

    # Missing = NaN over the ocean; land pixels are excluded entirely.
    missing_pixels = int((np.isnan(cropped_data) & ocean_mask).sum())
    missing_percentage = missing_pixels / ocean_pixel_total * 100

    # Keyed on the YYYYMM string rather than "month number changed", so the
    # count is right regardless of file order and across year boundaries.
    days_in_month_info.setdefault(month_key, 0)
    if missing_percentage < max_missing_pct_for_good_day:
        days_in_month_info[month_key] += 1

    day_info = missing_data_info.setdefault(
        year_month, {'missing_pixels': 0, 'total_pixels': 0, 'days_counted': 0}
    )
    day_info['missing_pixels'] += missing_pixels
    day_info['total_pixels'] += ocean_pixel_total
    day_info['days_counted'] += 1

# One row per date whose missing percentage is under the report threshold,
# annotated with that month's count of well-covered days.
monthly_averages = []
for year_month, info in missing_data_info.items():
    avg_missing_percentage = (info['missing_pixels'] / info['total_pixels']) * 100
    if avg_missing_percentage < max_missing_pct_for_report:
        monthly_averages.append({
            'date': pd.to_datetime(year_month, format='%Y%m%d'),
            'AvgMissingPercentage': avg_missing_percentage,
            'DaysWithData': days_in_month_info[year_month[:6]],
        })

# Convert the results to a DataFrame and export to CSV. Explicit columns keep
# the header row even when no day passes the threshold.
df = pd.DataFrame(
    monthly_averages, columns=['date', 'AvgMissingPercentage', 'DaysWithData']
)
df.to_csv('MODIS SST monthly_missing_data2.csv', index=False)



A20021022_SST_NW_day.nc
A20030206_SST_NW_day.nc
A20030520_SST_NW_day.nc
A20030904_SST_NW_day.nc
A20031217_SST_NW_day.nc
A20040401_SST_NW_day.nc
A20040714_SST_NW_day.nc
A20041027_SST_NW_day.nc
A20050211_SST_NW_day.nc
A20050525_SST_NW_day.nc
A20050909_SST_NW_day.nc
A20051222_SST_NW_day.nc
A20060407_SST_NW_day.nc
A20060720_SST_NW_day.nc
A20061104_SST_NW_day.nc
A20070217_SST_NW_day.nc
A20070602_SST_NW_day.nc
A20070915_SST_NW_day.nc
A20071229_SST_NW_day.nc
A20080413_SST_NW_day.nc
A20080726_SST_NW_day.nc
A20081110_SST_NW_day.nc
A20090223_SST_NW_day.nc
A20090608_SST_NW_day.nc
A20090921_SST_NW_day.nc
A20100105_SST_NW_day.nc
A20100419_SST_NW_day.nc
A20100803_SST_NW_day.nc
A20101116_SST_NW_day.nc
A20110301_SST_NW_day.nc
A20110614_SST_NW_day.nc
A20110927_SST_NW_day.nc
A20120111_SST_NW_day.nc
A20120424_SST_NW_day.nc
A20120808_SST_NW_day.nc
A20121121_SST_NW_day.nc
A20130306_SST_NW_day.nc
A20130619_SST_NW_day.nc
A20131003_SST_NW_day.nc
A20140116_SST_NW_day.nc
A20140501_SST_NW_day.nc
A20140814_SST_NW