In [1]:
import os
import numpy as np
import xarray as xr
import pandas as pd
from functools import partial
from joblib import Parallel, delayed
import multiprocessing
from pathos.multiprocessing import ProcessingPool as Pool
import matplotlib.pyplot as plt
from IPython.display import display
# Load the ncdf4 package
import netCDF4 as nc
import rasterio
from shapely.geometry import Point
import geopandas as gpd
from rasterio.transform import from_origin
from rasterio.features import rasterize
from rasterio.transform import from_bounds

## File path and settings

In [2]:
# Set the file paths
ncname_cdi = "/Users/sabinmaharjan/projects/python/do/static/file/cdi_1.nc"
ncname_rain = "/Users/sabinmaharjan/projects/python/do/static/file/p_atmos_q5_pr_s_maq5_pumedian_20240501_rt.nc"



# Get the average of 3 month

In [3]:
# Open the rain forecast data
ds2 = xr.open_dataset(ncname_rain)
rain_var = ds2["percentage_of_ensembles"]

# Extract the data for the first three months
first_three_months = rain_var[1, :3, :, :]

# Calculate the average across the first three months
rain_avg = first_three_months.mean(dim="time")
print(rain_avg.shape)
# Get the latitude and longitude coordinates
lats = ds2.lat
lons = ds2.lon

# Get the time coordinate from first_three_months
time = first_three_months.time.values

time_r = time[1]


# Convert the time value to a datetime object
time_r = pd.to_datetime(time_r)


# Extract the month name from the datetime object

month_name = time_r.strftime('%B')
print(month_name)
# Create a new Dataset with the averaged rain data and time as a separate variable
ds_avg = xr.Dataset(
    {
        "rain": (("lat", "lon"), rain_avg.values),
        "time": time,
    },
    coords={"lat": lats, "lon": lons},
)

# Rename 'lat' to 'latitude' and 'lon' to 'longitude'
ds_avg = ds_avg.rename({'lat': 'latitude', 'lon': 'longitude'})

# Convert latitude and longitude variables to double and round their values
ds_avg['latitude'] = ds_avg['latitude'].astype('double')
ds_avg['longitude'] = ds_avg['longitude'].astype('double')
ds_avg['latitude'] = ds_avg['latitude'].round(decimals=2)
ds_avg['longitude'] = ds_avg['longitude'].round(decimals=2)

ncname_rain_final = "/Users/sabinmaharjan/projects/python/do/static/file/APP2024_"+month_name+"_average.nc"


# Save the modified dataset to a new NetCDF file
ds_avg.to_netcdf(ncname_rain_final)
print(f"New file saved as: {ncname_rain_final}")

(691, 886)
June
New file saved as: /Users/sabinmaharjan/projects/python/do/static/file/APP2024_June_average.nc


## Reading CDI data and creating dataframe

In [4]:

# Open the first NetCDF file

ds1 = xr.open_dataset(ncname_cdi)
 
# Get longitude and latitude
lon = ds1["longitude"].values
lat = ds1["latitude"].values
 
# Get cdi
dname1 = "cdi"
cdi_array = ds1[dname1].values
 
# Get a single slice or layer
cdi_slice = cdi_array[:, :, 305]  # Assuming 305 is the index of the slice you want


cdi_vec = np.ravel(cdi_slice)
 
# Create DataFrame for cdi data
lon_grid, lat_grid = np.meshgrid(lon, lat)
cdi_df = pd.DataFrame({'lon': lon_grid.flatten(), 'lat': lat_grid.flatten(), 'cdi': cdi_vec})


print(cdi_slice.shape)
print(cdi_df.shape)

# Open the second NetCDF file


(681, 841)
(572721, 3)


In [5]:
print(ncname_rain_final)
ds2 = xr.open_dataset(ncname_rain_final)
 
# Get longitude and latitude
lon_r = ds2["longitude"].values
lat_r = ds2["latitude"].values
rain_avg=ds2["rain"].values
# Get rain

time_r=ds2["time"].values

rain_vec = np.ravel(rain_avg)
 
# Create DataFrame for rain data
lon_grid, lat_grid = np.meshgrid(lon_r, lat_r)

rain_df = pd.DataFrame({'lat': lat_grid.flatten(), 'lon': lon_grid.flatten(), 'rain': rain_vec})

# rain_df= rain_df.replace(np.nan, 'NA', regex=True)




print("First month:", month_name)



/Users/sabinmaharjan/projects/python/do/static/file/APP2024_June_average.nc
DatetimeIndex(['2024-05-16 12:00:00', '2024-06-16 00:00:00',
               '2024-07-16 12:00:00'],
              dtype='datetime64[ns]', freq=None)
(612226, 3)
First month: May


## merging CDI and forecast using lat and lon

In [6]:

# Perform the merge
join_df = cdi_df.merge(rain_df, how='left', on=['lon', 'lat'])
# Merge DataFrames on longitude and latitude
print(join_df.shape)
# join_dims={"lon":"lon","lat":"lat"}
# tes=xr.merge([cdi_df,rain_df],join='left',compat="override")
print("join_df.head(10)")
display(join_df)
# Count the number of rows with NaN values
num_rows_with_nan = join_df.isna().sum().sum()

# Count the number of rows without NaN values
num_rows_without_nan = join_df.notna().sum().sum()

print(f"Number of rows with NaN values: {num_rows_with_nan}")
print(f"Number of rows without NaN values: {num_rows_without_nan}")

rmna_df=join_df.dropna()
print(rmna_df.shape)
nan_count = rmna_df.isna().sum().sum()
nan__cdi_count = rmna_df.notna().sum().sum()
nan__cdi_count = rmna_df.notna().sum()
print("Number of  NaN values:", nan_count)
print("Number of NaN values:", nan__cdi_count)



(572721, 4)
join_df.head(10)


Unnamed: 0,lon,lat,cdi,rain
0,112.00,-44.0,,
1,112.05,-44.0,,
2,112.10,-44.0,,
3,112.15,-44.0,,
4,112.20,-44.0,,
...,...,...,...,...
572716,153.80,-10.0,,
572717,153.85,-10.0,,
572718,153.90,-10.0,,
572719,153.95,-10.0,,


Number of rows with NaN values: 590090
Number of rows without NaN values: 1700794
(273592, 4)
Number of  NaN values: 0
Number of NaN values: lon     273592
lat     273592
cdi     273592
rain    273592
dtype: int64


In [7]:
def classify_drought(row):
    cdi, rain = row['cdi'], row['rain']
    if cdi < 0.2:
        if rain < 50:
            if cdi < 0.02:
                return 5  # Persists
            else:
                return 6  # Worsens
        elif rain < 70:
            return 5  # Persists
        else:
            if 0.1 <= cdi < 0.2:
                return 2  # Removed
            else:
                return 3  # Improved
    else:
        if rain < 30:
            return 4  # Develops
        else:
            return 1  # No drought

In [8]:
ncell = len(rmna_df)

ncores = min(multiprocessing.cpu_count(), 4) 
# Use pathos for multiprocessing
with Pool(ncores) as p:
    try:
        classified = p.map(classify_drought, [rmna_df.iloc[i] for i in range(rmna_df.shape[0])])
    except Exception as e:
        print(f"An error occurred during multiprocessing: {e}")
        p.close() # Close the pool
        p.join() # Wait for the worker processes to exit
        raise # Re-raise the exception

print(len(classified))

273592


In [9]:
# Create the dataframe df_out
df_out = pd.DataFrame({ 'lat': cdi_df['lat'], 'lon': cdi_df['lon'],'outlook': np.nan})
print(len(rmna_df.index))

# Get the rows where NAs were removed
order = rmna_df.index.astype(int)

# Replace the category value
classified = np.array(classified)
df_out.loc[order, 'outlook'] = classified.astype(int)
print(df_out['outlook'].value_counts())



273592
1.0    235418
4.0     19749
6.0     12958
5.0      5467
Name: outlook, dtype: int64


In [10]:
print(cdi_slice.shape)
print(len(lon))
print(len(lat))

(681, 841)
841
681


In [11]:
# Create a DataArray from the DataFrame
da = xr.DataArray(df_out['outlook'].values.reshape(cdi_slice.shape),
                 
                  coords=[('lat', lat),('lon', lon)],
                  name='outlook')

# da = xr.DataArray(df_out['outlook'].values, coords=[('lon', cdi_df['lon']), ('lat', cdi_df['lat'])], dims=['lon', 'lat'])

# Add attributes
da.attrs['varunit'] = ''
da.attrs['longname'] = 'drought outlook'

# Create a Dataset from the DataArray
ds = da.to_dataset()


In [12]:
# Save the Dataset as a NetCDF file

out_ncname = "/Users/sabinmaharjan/projects/python/do/static/result/nc/3_months/app"+month_name+"_Final_2024.nc"

try:
    ds.to_netcdf(out_ncname)
    print(f"file saved with name: {out_ncname}")
except Exception as e:
    print(f"An error occurred while saving the Dataset: {e}")

file saved with name: /Users/sabinmaharjan/projects/python/do/static/result/nc/3_months/appMay_Final_2024.nc
