In [13]:
import os
import numpy as np
import xarray as xr
import pandas as pd
from functools import partial
from joblib import Parallel, delayed
from pathos.multiprocessing import ProcessingPool as Pool
# Load the ncdf4 package
import netCDF4 as nc
import rasterio
from shapely.geometry import Point
import geopandas as gpd
from rasterio.transform import from_origin
from rasterio.features import rasterize
from rasterio.transform import from_bounds

## File path and settings

In [14]:
# Set the file paths
ncname_cdi = "/Users/sabinmaharjan/projects/python/do/static/do/cdi_3.nc"
ncname_rain = "/Users/sabinmaharjan/projects/python/do/static/do/2024.forecast.nc"
pd.set_option('display.precision',2)
pd.set_option('display.float_format','{:.2f}'.format)


## Reading CDI data and creating dataframe

In [15]:
# Open the CDI file
with nc.Dataset(ncname_cdi, 'r') as nc_cdi:
    lon = nc_cdi.variables['longitude'][:]
    lat = nc_cdi.variables['latitude'][:]
    time=305
    cdi_array = nc_cdi.variables['cdi'][:, :, time]
    cdi_vec = cdi_array.flatten()
    lat_grid, lon_grid=np.meshgrid(lat,lon)
    print(len(cdi_vec))
    print(len(lat_grid.flatten()),len(lon_grid.flatten()))
    cdi_df = pd.DataFrame({
        'lat': np.round(lat_grid,2).flatten(), 
        'lon': np.round(lon_grid,2).flatten(),
        'cdi': cdi_vec
    })
    print(cdi_df.shape)
    cdi_df_crop=cdi_df.dropna()
    print(cdi_df_crop.shape)


572721
572721 572721
(572721, 3)
(273625, 3)


## Reading forecast data and creating dataframe

In [16]:
# Open the rainfall file
with nc.Dataset(ncname_rain, 'r') as ncrain:
   
    # Example: Extract latitude and longitude data from the NetCDF file
    lon_r = ncrain.variables['lon'][:]
    lat_r = ncrain.variables['lat'][:]
    lat_rounded=np.round(lat_r,2)
    lon_rounded=np.round(lon_r,2)
    print(lat_rounded[:10])
    # Define indices for selecting a specific subset of data
    time = 0
    nbins = 2

    time_r = ncrain.variables['time'][time]  # Extract the time at n=0
    time_r = nc.num2date(time_r, ncrain.variables['time'].units)  # Convert the time number to a datetime object
    month_name = time_r.strftime('%B')  # Get the month name
    print(f"The month at n=0 is: {month_name}")

    # Extract rain data for the specified indices
    rain_array = ncrain.variables['percentage_of_ensembles'][nbins-1, time-1, :, :]
    rain_vec = rain_array.flatten()
    print(len(rain_vec))
    # Tile latitude and repeat longitude to match the shape of rain_array
    lat_r_grid, lon_r_grid=np.meshgrid(lat_rounded,lon_rounded)

    print(len(lat_r_grid.flatten()),len(lon_r_grid.flatten()))
    # Create DataFrame
    rain_df = pd.DataFrame({
        'lat': lat_r_grid.flatten(), 
        'lon': lon_r_grid.flatten(), 
        'rain': rain_vec
    })

 
    print(rain_df.head(10))
    print("rain_df shape")
    print(rain_df.shape)

    rain_df_crop=rain_df.dropna()
    print(rain_df_crop.shape)
    print(rain_df_crop.head(10))

[-44.5  -44.45 -44.4  -44.35 -44.3  -44.25 -44.2  -44.15 -44.1  -44.05]
The month at n=0 is: April
612226
612226 612226
     lat    lon  rain
0 -44.50 112.00   NaN
1 -44.45 112.00   NaN
2 -44.40 112.00   NaN
3 -44.35 112.00   NaN
4 -44.30 112.00   NaN
5 -44.25 112.00   NaN
6 -44.20 112.00   NaN
7 -44.15 112.00   NaN
8 -44.10 112.00   NaN
9 -44.05 112.00   NaN
rain_df shape
(612226, 3)
(281760, 3)
         lat    lon  rain
15756 -16.80 113.10 39.39
15757 -16.75 113.10 39.39
15759 -16.65 113.10 39.39
16635 -41.95 113.20 42.42
16641 -41.65 113.20 39.39
16642 -41.60 113.20 39.39
16643 -41.55 113.20 39.39
16644 -41.50 113.20 39.39
16645 -41.45 113.20 39.39
16646 -41.40 113.20 38.38


## merging CDI and forecast using lat and lon

In [17]:
# Join the CDI and rainfall dataframes
join_df = pd.merge(rain_df,cdi_df, on=['lat', 'lon'], how='left')
print(join_df.shape)
print("join_df.head(10)")
print(join_df.head(10))
# Count the number of rows with NaN values
num_rows_with_nan = join_df.isna().any(axis=1).sum()

# Count the number of rows without NaN values
num_rows_without_nan = (~join_df.isna().any(axis=1)).sum()

print(f"Number of rows with NaN values: {num_rows_with_nan}")
print(f"Number of rows without NaN values: {num_rows_without_nan}")

rmna_df=join_df.dropna(subset=['rain'])
print(rmna_df.shape)
nan_count = join_df['rain'].isna().sum()
nan__cdi_count = join_df['cdi'].isna().sum()
print("Number of rain NaN values:", nan_count)
print("Number of CDI NaN values:", nan__cdi_count)

(612226, 4)
join_df.head(10)
     lat    lon  rain  cdi
0 -44.50 112.00   NaN  NaN
1 -44.45 112.00   NaN  NaN
2 -44.40 112.00   NaN  NaN
3 -44.35 112.00   NaN  NaN
4 -44.30 112.00   NaN  NaN
5 -44.25 112.00   NaN  NaN
6 -44.20 112.00   NaN  NaN
7 -44.15 112.00   NaN  NaN
8 -44.10 112.00   NaN  NaN
9 -44.05 112.00   NaN  NaN
Number of rows with NaN values: 604374
Number of rows without NaN values: 7852
(281760, 4)
Number of rain NaN values: 330466
Number of CDI NaN values: 601174


In [18]:
def classify_drought(row):
    cdi, rain = row['cdi'], row['rain']
    if cdi < 0.2:
        if rain < 50:
            if cdi < 0.02:
                return 5  # Persists
            else:
                return 6  # Worsens
        elif rain < 70:
            return 5  # Persists
        else:
            if 0.1 <= cdi < 0.2:
                return 2  # Removed
            else:
                return 3  # Improved
    else:
        if rain < 30:
            return 4  # Develops
        else:
            return 1  # No drought

In [19]:
ncell = len(rmna_df)
ncores = 4 # Adjust based on your system's capabilities

# Use pathos for multiprocessing
with Pool(ncores) as p:
    try:
        classified = p.map(classify_drought, [rmna_df.iloc[i] for i in range(ncell)])
    except Exception as e:
        print(f"An error occurred during multiprocessing: {e}")
        p.close() # Close the pool
        p.join() # Wait for the worker processes to exit
        raise # Re-raise the exception

print(len(classified))

281760


In [20]:
# Create the dataframe
df_out = pd.DataFrame({ 'lat': rain_df['lat'],'lon': rain_df['lon'], 'category': np.nan})


classified = np.array(classified)
df_out.loc[rmna_df.index,'category'] = classified.astype(int)

print(df_out['category'].value_counts())
print(df_out.head(10))
print(df_out.shape)

1.00    245057
4.00     36166
5.00       297
6.00       240
Name: category, dtype: int64
     lat    lon  category
0 -44.50 112.00       NaN
1 -44.45 112.00       NaN
2 -44.40 112.00       NaN
3 -44.35 112.00       NaN
4 -44.30 112.00       NaN
5 -44.25 112.00       NaN
6 -44.20 112.00       NaN
7 -44.15 112.00       NaN
8 -44.10 112.00       NaN
9 -44.05 112.00       NaN
(612226, 3)


In [21]:
# Create a DataArray from the DataFrame
da = xr.DataArray(df_out['category'].values.reshape(rain_array.shape),
                  coords=[('latitude', lat_rounded), ('longitude', lon_rounded)],
                  name='outlook')
# Add attributes
da.attrs['varunit'] = ''
da.attrs['longname'] = 'drought outlook'

# Create a Dataset from the DataArray
ds = da.to_dataset()

In [22]:
# Get the rows where NAs were removed
order = rmna_df.index.astype(int)

# Replace the category value
classified = np.array(classified)
# Check the lengths of order and classified
print("Length of order:", len(order))
print("Length of classified:", len(classified))

# Check the data types of order and classified
print("Data type of order:", order.dtype)
print("Data type of classified:", classified.dtype)

# Inspect the data
print("Order:", order)
print("Classified:", classified)

# Ensure that the index of rmna_df contains integer values
print("Index of rmna_df:", rmna_df.index)

Length of order: 281760
Length of classified: 281760
Data type of order: int64
Data type of classified: int64
Order: Int64Index([ 15756,  15757,  15759,  16635,  16641,  16642,  16643,  16644,
             16645,  16646,
            ...
            600428, 600435, 601311, 601312, 601313, 601314, 601315, 602198,
            602199, 602200],
           dtype='int64', length=281760)
Classified: [1 1 1 ... 1 1 1]
Index of rmna_df: Int64Index([ 15756,  15757,  15759,  16635,  16641,  16642,  16643,  16644,
             16645,  16646,
            ...
            600428, 600435, 601311, 601312, 601313, 601314, 601315, 602198,
            602199, 602200],
           dtype='int64', length=281760)


In [23]:
# Add the time variable to the Dataset
ds['time'] = (('time'), [time_r])
out_ncname = "/Users/sabinmaharjan/projects/python/do/static/nc/3_months/"+month_name+"_Final_2024.nc"

In [24]:
# Save the Dataset as a NetCDF file

try:
    ds.to_netcdf(out_ncname)
    print(f"file saved with name: {out_ncname}")
except Exception as e:
    print(f"An error occurred while saving the Dataset: {e}")

file saved with name: /Users/sabinmaharjan/projects/python/do/static/nc/3_months/April_Final_2024.nc
