## Libraries installation

In [None]:
!pip install rioxarray
!pip install ruptures
!pip install scikit-learn
!pip install bayesian-changepoint-detection
!pip install netCDF4
!pip install xarray
!pip install basemap
!pip install geopandas
!pip install cartopy
!pip install pandas
!pip install dask
!pip install --upgrade xarray dask
!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras nltk spacy beautifulsoup4 requests scipy statsmodels plotly bokeh gensim Pillow
!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras nltk spacy beautifulsoup4 requests
!pip install numpy pandas matplotlib seaborn scikit-learn
!pip install dask[dataframe]
!pip install openpyxl
!pip install pandas

## NetCDF4 file reading (info)

#### You can use any one of the code cells below to see the details of the NetCDF4 file

In [None]:
import xarray as xr
# Path of the NetCDF file to inspect -- replace "your_file.nc" with your own file path
netcdf_path = "your_file.nc"
ds = xr.open_dataset(netcdf_path)

# Print a concise summary of the dataset's structure and metadata
ds.info()

In [None]:
import netCDF4  
# Open the file only for as long as it takes to print its summary, so the file
# handle is released when the cell finishes.
with netCDF4.Dataset("your_file.nc") as summary_ds:  # Replace "your_file.nc" with the path to your NetCDF file
    print(summary_ds)

#### To know the details about a particular variable

In [None]:
import netCDF4 as nc 
# Replace with the path to your NetCDF file
file_path = "your_file.nc"

# Open read-only. The handle is kept open because the following cell reads
# a variable from `dataset`.
dataset = nc.Dataset(file_path, mode='r')

# Show the file summary, then the names of all variables it contains
print(dataset)
variable_names = dataset.variables.keys()
print("Here are all the variables of your data")
print(variable_names)

In [None]:
# Look up one variable by name -- replace 'your_variable' with the variable you want to inspect
your_variable = dataset.variables['your_variable']

# Slice with [:] to read all of the variable's values
your_variable_data = your_variable[:]

# Report the array's shape and element type
shape_of_data = your_variable_data.shape
dtype_of_data = your_variable_data.dtype
print("Shape of your_variable_data:", shape_of_data)
print("Data type of your_variable_data:", dtype_of_data)

### NetCDF4 (.nc) file to data frame (df)

#### Creation of DataFrame  

In [None]:
import xarray as xr 
# Replace "your_file.nc" with the path to your NetCDF file
ds = xr.open_dataset("your_file.nc")

# Convert to a table indexed by the dataset's dimensions, then turn that
# index into ordinary columns.
indexed_table = ds.to_dataframe()
df = indexed_table.reset_index()  # Rename 'df' if you want a different DataFrame name

# Display the DataFrame
df


#### To create the data frame for a particular variable

In [None]:
import pandas as pd  
import netCDF4 as nc  
import numpy as np  

# Replace 'your_file.nc' with the path to your actual NetCDF file
file_path = "your_file.nc"

# Read the coordinate arrays and the data array; the file is closed when the
# `with` block ends. Replace the four variable names with the ones in your file.
with nc.Dataset(file_path, 'r') as dataset:
    nc_vars = dataset.variables
    time_var = nc_vars['time'][:]
    lat_var = nc_vars['lat'][:]
    lon_var = nc_vars['lon'][:]
    data_var = nc_vars['variable'][:]

# Expand the three coordinate axes to full grids ('ij' keeps the time, lat, lon order).
# NOTE(review): this assumes the data variable is stored as (time, lat, lon) -- confirm for your file.
time_2d, lat_2d, lon_2d = np.meshgrid(time_var, lat_var, lon_var, indexing='ij')
data_1d = data_var.flatten()

# One row per (time, lat, lon) combination. Rename 'df' if you want a different DataFrame name.
coordinate_grids = {'time': time_2d, 'lat': lat_2d, 'lon': lon_2d}
df = pd.DataFrame({
    **{column: grid.flatten() for column, grid in coordinate_grids.items()},
    'variable': data_1d,
})
df

## Data Transformation and Management

### Data conversion

#### CSV to NetCDF(nc)

In [None]:
import pandas as pd  
import xarray as xr 

# Read the CSV file into a Pandas DataFrame
# Input CSV and output NetCDF locations -- replace both with your own paths
csv_file = "your_file.csv"
netcdf_file = "output_file.nc"

# CSV -> pandas DataFrame -> xarray Dataset
df = pd.read_csv(csv_file)
ds = xr.Dataset.from_dataframe(df)

# Write the Dataset to disk in NetCDF format
ds.to_netcdf(netcdf_file)

# Confirm which file was converted and where the result was written
print(f"CSV file '{csv_file}' has been converted to NetCDF '{netcdf_file}'")


#### DataFrame(df) to NetCDF

In [None]:
import xarray as xr 

# Replace 'your_output_file.nc' with the path where the NetCDF file should be saved
output_netcdf_path = 'your_output_file.nc'

# Build an xarray Dataset from the DataFrame (replace `df` with the name of your
# DataFrame and 'ds' with the Dataset name you prefer), then write it to disk.
ds = xr.Dataset.from_dataframe(df)
ds.to_netcdf(output_netcdf_path)

#### Excel to NetCDF

In [None]:
import pandas as pd  
import xarray as xr  

# Read the Excel file into a Pandas DataFrame
# Input Excel and output NetCDF locations -- replace both with your own paths
excel_file = "your_file.xlsx"
netcdf_file = "output_file.nc"

# Excel sheet -> pandas DataFrame -> xarray Dataset
df = pd.read_excel(excel_file)
ds = xr.Dataset.from_dataframe(df)

# Write the Dataset to disk in NetCDF format
ds.to_netcdf(netcdf_file)

# Confirm which file was converted and where the result was written
print(f"Excel file '{excel_file}' has been converted to NetCDF '{netcdf_file}'")


### Resampling a file from one resolution to another

In [None]:
import xarray as xr  
import numpy as np 

# Load the NetCDF data
# Load the NetCDF data
data = xr.open_dataset("your_input_file.nc")  # Replace 'your_input_file.nc' with your actual NetCDF file path

# Target grid spacing in degrees (adjust to the resolution you need)
target_resolution = 0.25


def _regular_axis(start, stop, step):
    """Return start, start + step, ... up to and including stop when stop lies on the grid.

    np.arange(start, stop, step) leaves out `stop`, which would drop the last
    coordinate of the source grid; counting the steps explicitly keeps it while
    never producing a value beyond `stop`.
    """
    n_steps = int(np.floor((stop - start) / step + 1e-9))  # tolerance absorbs float round-off
    return start + step * np.arange(n_steps + 1)


# New longitude / latitude coordinates spanning the full extent of the input
new_lon = _regular_axis(float(data.lon.min()), float(data.lon.max()), target_resolution)  # Longitudes
new_lat = _regular_axis(float(data.lat.min()), float(data.lat.max()), target_resolution)  # Latitudes

# Resample the data onto the new grid
# (You can change 'linear' to another interpolation method if needed)
resampled_data = data.interp(lon=new_lon, lat=new_lat, method='linear')

print(resampled_data)

# Save the resampled data to a new NetCDF file
resampled_data.to_netcdf('your_output_file.nc')  # Replace 'your_output_file.nc' with your desired output file name


### Longitude format change

#### Convert longitude from -180 to 180 to 0 to 360

In [None]:
import pandas as pd  

# Wrap longitudes from the [-180, 180] convention into [0, 360).
# Eastern longitudes are unchanged and western ones gain 360 (e.g. -90 -> 270),
# so every point keeps its geographic position. The modulo is applied to the
# whole column at once instead of row by row.
df['lon'] = df['lon'] % 360   # Replace `df` with your DataFrame name
df

#### Convert longitude from 0 to 360 to -180 to 180

In [None]:
import pandas as pd  

# Wrap longitudes from the [0, 360] convention into [-180, 180).
# Values below 180 are unchanged and the rest lose 360 (e.g. 270 -> -90),
# so every point keeps its geographic position. The expression is applied to the
# whole column at once instead of row by row.
df['lon'] = ((df['lon'] + 180) % 360) - 180   # Replace `df` with your DataFrame name
df

### Finding missing values

In [None]:
import xarray as xr  
import numpy as np 

# Replace 'your_file.nc' with the actual file path.
# decode_times=False keeps the time axis as the raw numbers stored in the file;
# the arithmetic below treats them as a count of days since `base_date` and
# would fail on values that xarray had already decoded to datetimes.
your_dataset = xr.open_dataset('your_file.nc', decode_times=False)

# Replace 'your_time_variable' with the name of the time variable in your NetCDF file
time = your_dataset['your_time_variable']

# Convert the numeric time values to datetime format.
# NOTE(review): assumes the time values are "days since base_date" -- check the
# variable's units attribute and adjust base_date / the 'D' unit if they differ.
base_date = np.datetime64('2002-01-01T00:00:00')  # Define the base date (here 2002-01-01)
time_values = time[:]
time_datetimes = base_date + np.timedelta64(1, 'D') * time_values
time_np = time_datetimes.values

# Generate the expected sequence of monthly time steps between the first and last
# time points, then keep the months that do not occur in the data
start_time = time_np.min()
end_time = time_np.max()
expected_times = np.arange(start_time, end_time, np.timedelta64(1, 'M'), dtype='datetime64[M]')
missing_times = np.setdiff1d(expected_times, time_np.astype('datetime64[M]'))

# Express each missing month as a whole number of days since base_date
missing_numerical_time_steps = ((missing_times - base_date) / np.timedelta64(1, 'D')).astype(int)

# Print the missing numerical time steps
print("Missing numerical time steps:")
for missing_time_step in missing_numerical_time_steps:
    print(missing_time_step)  # Print each missing time step


### Interpolate NaN values

In [None]:
# Fill NaN values by interpolation.
# Replace 'df' with the name of your DataFrame that contains the NaN values.
interpolation_method = 'linear'  # Change to another interpolation method if needed
df = df.interpolate(method=interpolation_method)

# Show the DataFrame with the interpolated values
df


### Clipping the data for shapefile

In [None]:
import xarray as xr  
import geopandas as gpd  
from affine import Affine  
from rasterio.features import geometry_mask  

# Function to calculate the transform from the coordinates
def calculate_transform(ds):
    """Build the affine transform of the dataset's lon/lat grid.

    The cell size is taken from the difference between the first two values of
    the 'lon' and 'lat' coordinates, and the origin is moved half a cell away
    from the first coordinate value on both axes.

    NOTE(review): this presumes regularly spaced coordinates that mark cell
    centres, with at least two values per axis -- confirm for your data.

    Parameters
    ----------
    ds : dataset with 'lon' and 'lat' coordinates

    Returns
    -------
    Affine
        Translation to the grid origin composed with the per-cell scaling.
    """
    lons, lats = ds['lon'].values, ds['lat'].values
    cell_width = lons[1] - lons[0]
    cell_height = lats[1] - lats[0]
    origin_x = lons[0] - cell_width / 2
    origin_y = lats[0] - cell_height / 2
    return Affine.translation(origin_x, origin_y) * Affine.scale(cell_width, cell_height)

# Function to clip NetCDF data using a shapefile
def clip_netcdf_with_shapefile(ds, shapefile):
    """Keep only the grid cells of `ds` that are covered by the shapefile geometries.

    A boolean lat/lon mask is rasterised from the geometries on the dataset's
    own grid and applied with `ds.where(..., drop=True)`.

    NOTE(review): the geometries are used as-is, so they are presumably expected
    to be in the same lon/lat coordinates as the dataset -- confirm the CRS.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with 'lat' and 'lon' dimensions and coordinates.
    shapefile : geopandas.GeoDataFrame
        Geometries that define the area to keep.

    Returns
    -------
    xarray.Dataset
        The masked dataset returned by `ds.where(mask, drop=True)`.
    """
    transform = calculate_transform(ds)

    # Dataset.sizes is the name -> length mapping; mapping-style access through
    # Dataset.dims is deprecated in recent xarray releases.
    grid_shape = (ds.sizes['lat'], ds.sizes['lon'])

    # invert=True asks geometry_mask to mark the covered cells as True
    mask = geometry_mask(list(shapefile.geometry.values),
                         transform=transform,
                         invert=True,
                         out_shape=grid_shape)

    mask_da = xr.DataArray(mask, dims=("lat", "lon"), coords={"lat": ds["lat"], "lon": ds["lon"]})
    clipped_ds = ds.where(mask_da, drop=True)
    return clipped_ds


In [None]:
# Define file paths (each path is defined once so that the save call and the
# confirmation message below always agree)
nc_file_path = "D:/path/to/your_nc_file.nc"        # Replace with the path to your NetCDF file
shapefile_path = "D:/path/to/your_shapefile.shp"   # Replace with the path to your shapefile
output_path = 'path/to/your_output_file.nc'        # Where the clipped data will be saved

# Load the shapefile and the NetCDF data
shapefile = gpd.read_file(shapefile_path)
ds = xr.open_dataset(nc_file_path, engine='netcdf4')

# Clip the NetCDF data
clipped_ds = clip_netcdf_with_shapefile(ds, shapefile)  # Call the clipping function

# Save the clipped data to a new NetCDF file
clipped_ds.to_netcdf(output_path, engine='netcdf4')

print(f"Clipping completed and saved to '{output_path}'")

### Unit conversion

In [None]:
# Expected input columns of 'df' and their units:
#   'Variable1' -- meters (m)
#   'Variable2' -- cubic meters per hour (m³/hr)
#   'Variable3' -- centimeters (cm)
#   'Variable4' -- cubic meters per second (m³/sec)

# Conversion constants
MM_PER_M = 1000            # 1 meter = 1000 millimeters
SECONDS_PER_HOUR = 3600    # 1 hour = 3600 seconds
CM_PER_FT = 30.48          # 1 foot = 30.48 centimeters
FT3_PER_M3 = 35.3147       # 1 m³ = 35.3147 ft³

# m -> mm
df['Variable1_mm'] = df['Variable1'] * MM_PER_M

# m³/hr -> m³/sec
df['Variable2_m3_sec'] = df['Variable2'] / SECONDS_PER_HOUR

# cm -> ft
df['Variable3_ft'] = df['Variable3'] / CM_PER_FT

# m³/sec -> ft³/sec
df['Variable4_ft3_sec'] = df['Variable4'] * FT3_PER_M3

# Show the DataFrame including the newly added columns
df

### Merging the NetCDF4 files 

In [None]:
import os  
import xarray as xr  

# File paths for your NetCDF files (replace these with actual file paths if necessary)
filepaths = [
    "your_nc_file1.nc4",
    "your_nc_file2.nc4",
    "your_nc_file3.nc4"
]

merged_filepath = "your_merged_file.nc"  # Define the path to save the merged NetCDF file

# Merge the specified NetCDF files by their coordinates and write the result.
# The `with` block closes the opened input files once the merged file is written.
with xr.open_mfdataset(filepaths, combine='by_coords') as merged_ds:
    merged_ds.to_netcdf(merged_filepath)

# Print a success message to confirm the file has been saved
print("Merged file saved successfully:", merged_filepath)


### Splitting the NetCDF4 files

In [None]:
import xarray as xr  

# Path of the file to split -- replace 'your_nc_file.nc' with your actual file path
nc_file = "your_nc_file.nc"
ds = xr.open_dataset(nc_file)

# Split along the time axis.
# Example: 'your_nc_file.nc' holds data for 1980-2020 and is divided into
# one file covering 1980-2000 and another covering 2001-2020.
first_period = slice('1980-01-01', '2000-12-31')
second_period = slice('2001-01-01', '2020-12-31')

ds_1980_2000 = ds.sel(time=first_period)    # data from 1980 to 2000
ds_2001_2020 = ds.sel(time=second_period)   # data from 2001 to 2020

# Output paths for the two parts
split_file_1 = "split_file_1980_2000.nc"
split_file_2 = "split_file_2001_2020.nc"

# Write each part to its own NetCDF file
ds_1980_2000.to_netcdf(split_file_1)
ds_2001_2020.to_netcdf(split_file_2)

# Confirm that both files were written
print(f"First split file saved successfully: {split_file_1}")
print(f"Second split file saved successfully: {split_file_2}")

## Spatial plotting

#### For World

##### Spatial Averaging by latitude and Longitude

In [None]:
import pandas as pd 

# Columns that identify a grid cell.
# Replace 'lat' and 'lon' with your actual latitude and longitude column names,
# and 'df' with the actual name of your DataFrame.
group_keys = ['lat', 'lon']

# Average every remaining column within each latitude/longitude pair
averaged_df = df.groupby(group_keys).mean().reset_index()

# Show the per-location averages
averaged_df

##### Spatial Plot of Spatial Averaged Data

In [None]:
import matplotlib.pyplot as plt  
from mpl_toolkits.basemap import Basemap  
import numpy as np  

# Function to create the plot
def create_plot(color_scale, color_map):
    """Scatter the spatially averaged values on a Robinson-projection world map.

    Reads the notebook-level `averaged_df` DataFrame (columns 'lat', 'lon' and
    'variable name'), saves the figure to 'output_figure.png' and displays it.

    Parameters
    ----------
    color_scale : number
        Half-width of the colour range; colours span -color_scale to +color_scale.
    color_map : str or matplotlib colormap
        Colormap passed to the scatter layer.
    """
    plt.figure(figsize=(10, 8))  # new figure of fixed size

    # Other Basemap projection names you can use instead of 'robin':
    # ['robin', 'mill', 'merc', 'tmerc', 'aea', 'lcc', 'stere', 'aeqd', 'laea', 'moll', 'sinu', 'goes', 'wink3', 'fll', 'eck4', 'aitoff']
    world_map = Basemap(projection='robin', resolution='c', lat_0=0, lon_0=0)

    # Base layers: coastlines, country borders, land/lake fill and map outline
    world_map.drawcoastlines()
    world_map.drawcountries()
    world_map.fillcontinents(color='lightgray', lake_color='white')
    world_map.drawmapboundary(fill_color='white')

    # Project the lon/lat pairs into map coordinates
    lon_values = averaged_df['lon'].values
    lat_values = averaged_df['lat'].values
    map_x, map_y = world_map(lon_values, lat_values)

    ### Replace 'variable name' with your data column name ###
    scatter_layer = world_map.scatter(map_x, map_y, c=averaged_df['variable name'], cmap=color_map,
                                      vmin=-color_scale, vmax=color_scale, s=5, alpha=0.7)

    # Colour bar (drawn with its axis inverted)
    colorbar = plt.colorbar(scatter_layer, label='Variable Name', extend='both', shrink=0.6)  ### Set label name as required ###
    colorbar.ax.invert_yaxis()

    plt.title('Spatial Plot of Variable Name Data')  ### Set plot title ###
    plt.savefig('output_figure.png', dpi=300)        ### Save the plot ###
    plt.show()                                       # Display the plot

# Plot settings:
#   color_scale -- example value for the colour range
#   color_map   -- choose one of ['PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'jet_r', 'RdBu', 'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic']
color_scale, color_map = 30, 'Spectral'

# Draw, save and show the map
create_plot(color_scale=color_scale, color_map=color_map)

#### For Shape file 

In [None]:
import numpy as np  
import xarray as xr 
import geopandas as gpd  
import pandas as pd  
import matplotlib.pyplot as plt  
import cartopy.crs as ccrs  
from shapely.geometry import Point  
from scipy.interpolate import griddata  
from matplotlib.font_manager import FontProperties  

### Inputs: replace both paths with your NetCDF file and your shapefile ###
ds = xr.open_dataset(r"your_file_path.nc")
shapefile_path = r"your_shapefile_path.shp"
gdf = gpd.read_file(shapefile_path)

# Flatten the dataset into one row per coordinate combination
spatial_df = ds.to_dataframe().reset_index()

### Replace 'your variable' and 'mean_variable' with your variable name ###
# Row-wise mean over the selected variable column(s)
spatial_df['mean_variable'] = spatial_df[['your variable']].mean(axis=1)

# Average per grid cell (replace 'lat' and 'lon' with your latitude and longitude column names)
averaged_df = spatial_df.groupby(['lat', 'lon']).mean().reset_index()

# Scattered sample locations and their averaged values
latitudes, longitudes = averaged_df['lat'].values, averaged_df['lon'].values
avg_values = averaged_df['mean_variable'].values

# Bounding box of the shapefile
min_lon, min_lat, max_lon, max_lat = gdf.total_bounds

# Regular grid covering the whole bounding box.
# Change 'num_lon' and 'num_lat' to control how fine (smooth) the grid is.
num_lon = 500
num_lat = 500
lon_axis = np.linspace(min_lon, max_lon, num_lon)
lat_axis = np.linspace(min_lat, max_lat, num_lat)
grid_lons, grid_lats = np.meshgrid(lon_axis, lat_axis)

# Linearly interpolate the averaged values onto the grid
sample_points = (longitudes, latitudes)
interpolated_variable = griddata(sample_points, avg_values, (grid_lons, grid_lats), method='linear')

# PlateCarree projection used for mapping
projection = ccrs.PlateCarree()

# Flag the grid nodes that fall inside any shapefile polygon, then blank out the rest
mask = np.zeros_like(interpolated_variable, dtype=bool)
points = np.column_stack([grid_lons.ravel(), grid_lats.ravel()])
for geom in gdf.geometry:
    inside_geom = np.array([geom.contains(Point(x, y)) for x, y in points])
    mask |= inside_geom.reshape(grid_lons.shape)
interpolated_variable[~mask] = np.nan

# Plotting the figure
plt.figure(figsize=(8, 8))            ### Set figure size
ax = plt.axes(projection=projection)

# Add gridlines with customization (labels are switched off here; the tick
# labels are set explicitly further down)
gridlines = ax.gridlines(draw_labels=False, color='gray', linestyle='--', linewidth=0.5, zorder=1)
gridlines.top_labels = False
gridlines.right_labels = False
gridlines.xlabel_style = {'size': 20, 'color': 'black', 'weight': 'bold'}
gridlines.ylabel_style = {'size': 20, 'color': 'black', 'weight': 'bold'}

# Plot the interpolated data
### Adjust color map('cmap') and range('vmin' and 'vmax')
pcm = ax.pcolormesh(grid_lons, grid_lats, interpolated_variable, cmap='Spectral', transform=projection, vmin=25, vmax=310, zorder=2)

# Add color bar with a heading
cbar = plt.colorbar(pcm, ax=ax, orientation='vertical', shrink=0.6)
cbar.set_label('your variable', fontsize=20, fontweight='bold', color='black')  ### Replace 'your variable' with required colorbar label ###

# Customize color bar ticks
cbar.ax.yaxis.set_tick_params(labelsize=20, width=1.5, color='black', labelcolor='black')
tick_font = FontProperties(weight='bold', size=20)
cbar.ax.yaxis.set_ticklabels(cbar.ax.yaxis.get_ticklabels(), fontproperties=tick_font)

# Add shapefile polygons to the map
gdf.plot(ax=ax, facecolor='none', edgecolor='black', zorder=3)

# Set latitude and longitude tick labels with direction
lat_step = 4  # Spacing between latitude ticks (degrees)
lon_step = 7  # Spacing between longitude ticks (degrees)
# Tick positions run from the rounded-down minimum to the rounded-up maximum
# (the x-axis call previously had an unbalanced parenthesis and could not be parsed)
ax.set_xticks(np.arange(np.floor(min_lon), np.ceil(max_lon) + 1, lon_step))                 # Set x-axis ticks
ax.set_yticks(np.arange(np.floor(min_lat), np.ceil(max_lat) + 1, lat_step))                 # Set y-axis ticks
ax.set_xticklabels([f"{abs(int(x))}°{'E' if x > 0 else 'W'}" for x in ax.get_xticks()], fontsize=20, fontweight='bold', color='black')  # Format x-axis labels
ax.set_yticklabels([f"{abs(int(y))}°{'N' if y > 0 else 'S'}" for y in ax.get_yticks()], fontsize=20, fontweight='bold', color='black')  # Format y-axis labels

# Set the plot extent
ax.set_extent([min_lon, max_lon, min_lat, max_lat], crs=projection)

# Save the figure
plt.savefig('output_plot.tiff', dpi=300, bbox_inches='tight', facecolor='white')           # Save figure as TIFF file
plt.show()  # Display the plot


#### Spatial plotting for single time step using the index number

In [None]:
import numpy as np  
import geopandas as gpd  
import matplotlib.pyplot as plt  
import cartopy.crs as ccrs  
import xarray as xr  
from cartopy.mpl.gridliner import LATITUDE_FORMATTER, LONGITUDE_FORMATTER  
from scipy.interpolate import griddata  
from matplotlib.font_manager import FontProperties  
from shapely.geometry import Point  # Import Point class

# Load NetCDF data and shapefile
your_nc_data = xr.open_dataset("grace data/clip1_anjigrace.nc")  # Replace with actual NetCDF file path
your_shapefile = gpd.read_file("Export_Output.shp")              # Replace with actual shapefile path

# Specify time index (position along the time axis, starting at 0)
time_index = 11  # Change as needed

# Keep only the requested time step before building the table; without this
# selection the values below would be averaged over every time step in the file.
# NOTE(review): assumes the time dimension is called 'time' -- rename if your file differs.
single_step_data = your_nc_data.isel(time=time_index)

# Convert data to DataFrame and calculate mean
spatial_df = single_step_data.to_dataframe().reset_index()
spatial_df['mean_variable'] = spatial_df[['your variable']].mean(axis=1)  ### Replace 'your variable' and 'mean_variable' with your variable name ###

# Group by latitude and longitude
averaged_df = spatial_df.groupby(['lat', 'lon']).mean().reset_index()

# Extract averaged values
latitudes = averaged_df['lat'].values
longitudes = averaged_df['lon'].values
avg_values = averaged_df['mean_variable'].values

# Bounding box of the shapefile
min_lon, min_lat, max_lon, max_lat = your_shapefile.total_bounds

# Create mesh grid covering the bounding box
num_lon, num_lat = 500, 500
grid_lons = np.linspace(min_lon, max_lon, num_lon)
grid_lats = np.linspace(min_lat, max_lat, num_lat)
grid_lons, grid_lats = np.meshgrid(grid_lons, grid_lats)

# Interpolate variable onto the grid
interpolated_variable = griddata((longitudes, latitudes), avg_values, (grid_lons, grid_lats), method='linear')

# Create PlateCarree projection for mapping
projection = ccrs.PlateCarree()

# Create a mask for points within the shapefile and blank out everything outside it
mask = np.zeros_like(interpolated_variable, dtype=bool)
points = np.vstack([grid_lons.ravel(), grid_lats.ravel()]).T
for geom in your_shapefile.geometry:
    mask = mask | np.array([geom.contains(Point(x, y)) for x, y in points]).reshape(grid_lons.shape)
interpolated_variable[~mask] = np.nan

# Shared text styling for the axis tick labels
tick_text_style = dict(fontsize=20, fontweight='bold', color='black')

# Figure and map axes
plt.figure(figsize=(8, 8))  ### Set figure size ###
ax = plt.axes(projection=projection)

# Dashed gray gridlines, labelled on the left and bottom sides only
gridlines = ax.gridlines(draw_labels=True, color='gray', linestyle='--', linewidth=0.5, zorder=1)
gridlines.top_labels = False
gridlines.right_labels = False
for style_attr in ('xlabel_style', 'ylabel_style'):
    setattr(gridlines, style_attr, {'size': 20, 'color': 'black', 'weight': 'bold'})

# Draw the interpolated field
pcm = ax.pcolormesh(grid_lons, grid_lats, interpolated_variable, cmap='Spectral', transform=projection)

# Colour bar and its heading
cbar = plt.colorbar(pcm, ax=ax, orientation='vertical', shrink=0.6)
cbar.set_label('your variable', fontsize=20, fontweight='bold', color='black')   ### Replace 'your variable' with required colorbar label ###

# Colour bar tick appearance
colorbar_axis = cbar.ax.yaxis
colorbar_axis.set_tick_params(labelsize=20, width=1.5, color='black', labelcolor='black')
tick_font = FontProperties(weight='bold', size=20)
colorbar_axis.set_ticklabels(colorbar_axis.get_ticklabels(), fontproperties=tick_font)

# Outline of the shapefile polygons on top of the data
your_shapefile.plot(ax=ax, facecolor='none', edgecolor='black', zorder=3)

# Tick positions: from the rounded-down minimum to the rounded-up maximum
lat_step = 4
lon_step = 7
ax.set_xticks(np.arange(np.floor(min_lon), np.ceil(max_lon) + 1, lon_step))
ax.set_yticks(np.arange(np.floor(min_lat), np.ceil(max_lat) + 1, lat_step))

# Tick labels: absolute degrees followed by the hemisphere letter
lon_labels = [f"{abs(int(x))}°{'E' if x > 0 else 'W'}" for x in ax.get_xticks()]
lat_labels = [f"{abs(int(y))}°{'N' if y > 0 else 'S'}" for y in ax.get_yticks()]
ax.set_xticklabels(lon_labels, **tick_text_style)
ax.set_yticklabels(lat_labels, **tick_text_style)

# Restrict the view to the shapefile's bounding box
ax.set_extent([min_lon, max_lon, min_lat, max_lat], crs=projection)

# Write the figure to disk, then display it
plt.savefig('output_plot.tiff', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()


## Time series analysis

#### Trend analysis

In [None]:
 import pandas as pd  
import netCDF4 as nc  
import numpy as np  

file_path = "your_file.nc"                       # Replace 'your_file.nc' with the path to your actual NetCDF file

# Open the NetCDF file for reading
with nc.Dataset(file_path, 'r') as dataset:  
    time_var = dataset.variables['time'][:]      # Replace 'time' with the actual variable name for the time in your file
    lat_var = dataset.variables['lat'][:]        # Replace 'lat' with the actual variable name for latitude
    lon_var = dataset.variables['lon'][:]        # Replace 'lon' with the actual variable name for longitude
    data_var = dataset.variables['variable'][:]  # Replace 'variable' with the actual variable name for data

time_2d, lat_2d, lon_2d = np.meshgrid(time_var, lat_var, lon_var, indexing='ij')  
data_1d = data_var.flatten()  
df = pd.DataFrame({                              # Replace 'df' with the name you want to initialize for your DataFrame
    'time': time_2d.flatten(),  
    'lat': lat_2d.flatten(),   
    'lon': lon_2d.flatten(),    
    'variable': data_1d        
})
df  

In [None]:
import xarray as xr  
import pandas as pd  
import matplotlib.pyplot as plt 
import matplotlib.dates as mdates  
import seaborn as sns  
from scipy.stats import linregress  

# Build the datetime column first: every plotting call below uses it as the
# x-axis, so it has to exist before the first plt.plot.
# NOTE(review): pd.to_datetime reads plain numbers as nanoseconds since 1970-01-01 --
# confirm that 'time' holds date strings/timestamps, or convert it beforehand.
df['datetime'] = pd.to_datetime(df['time'])

# Calculate the trend line for the average data
### Replace 'df' with your actual DataFrame name and 'average' with the name you want for average data ###
slope_mean, intercept_mean, r_value_mean, p_value_mean, std_err_mean = linregress(df.index, df['average'])
trend_line_mean = intercept_mean + slope_mean * df.index

# Print the slope of the trend line (per step of the DataFrame index)
print(f"Slope of the trend line: {slope_mean:.3f} mm/month")
sns.set(style="whitegrid")

### Create a figure for the plot ###
plt.figure(figsize=(14, 8))

# Replace 'df' with your DataFrame and 'average' with your column name for average data
plt.plot(df['datetime'], df['average'], label='Mean P', color='#FF5733', linewidth=2)

# Plot the trend line for the mean precipitation
plt.plot(df['datetime'], trend_line_mean, color='blue', linestyle='--', label='Mean P Trend Line')

# Replace 'min' and 'max' with the appropriate column names for minimum and maximum of your data
plt.fill_between(df['datetime'], df['min'], df['max'], color='lightgray', alpha=0.5, label='Range of P')

# Formatting the x-axis to display specific years
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))  # Set x-axis to show year format
plt.gca().xaxis.set_major_locator(mdates.YearLocator())
specific_years = [1999, 2002, 2005, 2008, 2011, 2014, 2017, 2020, 2023]  ### You can choose to display which years you want on x axis ###
plt.xticks(pd.to_datetime(specific_years, format='%Y'), fontsize=28, fontweight='bold', color='black')

# Label the x-axis
plt.xlabel('Year', fontsize=28, fontweight='bold', color='black')

# Formatting the y-axis: limits span both the data and the fitted trend line
plt.yticks(fontsize=28, fontweight='bold', color='black')
plt.ylim(min(df['average'].min(), trend_line_mean.min()), max(df['average'].max(), trend_line_mean.max()))

# Label the y-axis
plt.ylabel('Your variable', fontsize=28, fontweight='bold', color='black')

# Customize the legend (an invisible extra handle is appended to the plotted ones)
handles, labels = plt.gca().get_legend_handles_labels()
handles.append(plt.Line2D([0], [0], color='none'))
plt.legend(handles=handles, loc='upper left', frameon=False, fancybox=False, shadow=False, prop={'size': 28, 'weight':'bold'}, ncol=2)

# Add grid lines and customize the axis spines
plt.grid(True, which='both', linestyle=':', linewidth=0.5, color='gray')
plt.gca().spines['right'].set_color('black')
plt.gca().spines['bottom'].set_color('black')
plt.gca().spines['left'].set_color('black')

# Replace 'df' with your DataFrame
plt.xlim(left=df['datetime'].min(), right=df['datetime'].max())

### Set the title of the plot ###
plt.title('Plot Title', fontsize=16, fontweight='bold')

# Save the plot as a .tiff file
plt.savefig('Output_file.tiff', dpi=300, bbox_inches='tight')  # Replace 'Output_file.tiff' with your desired file name when saving the plot

# Display the plot
plt.show()

#### Trend change point analysis

In [None]:
import pandas as pd  
import netCDF4 as nc  
import numpy as np  

# Replace 'your_file.nc' with the path to your actual NetCDF file
file_path = "your_file.nc"

# Read the coordinate arrays and the data array; the file is closed when the
# `with` block ends. Replace the four variable names with the ones in your file.
with nc.Dataset(file_path, 'r') as dataset:
    nc_vars = dataset.variables
    time_var = nc_vars['time'][:]
    lat_var = nc_vars['lat'][:]
    lon_var = nc_vars['lon'][:]
    data_var = nc_vars['variable'][:]

# Expand the three coordinate axes to full grids ('ij' keeps the time, lat, lon order).
# NOTE(review): this assumes the data variable is stored as (time, lat, lon) -- confirm for your file.
time_2d, lat_2d, lon_2d = np.meshgrid(time_var, lat_var, lon_var, indexing='ij')
data_1d = data_var.flatten()

# One row per (time, lat, lon) combination. Rename 'df' if you want a different DataFrame name.
coordinate_grids = {'time': time_2d, 'lat': lat_2d, 'lon': lon_2d}
df = pd.DataFrame({
    **{column: grid.flatten() for column, grid in coordinate_grids.items()},
    'variable': data_1d,
})
df

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import ruptures as rpt

# Replace 'df' with the actual DataFrame name in your analysis
df['datetime'] = pd.to_datetime(df['time'])
df.set_index('datetime', inplace=True)

# Fractional year and a continuous month count, used as the x-axis of the plot
df['Year'] = df.index.year + df.index.dayofyear / 365
df['Month'] = df['Year'] * 12

### Name of the column that holds the series to analyse -- replace with your variable name ###
# A single name is used for change point detection, the plotted curve and the
# per-segment regressions so that all three always refer to the same series.
value_col = 'average'

# Detect change points in the average data using the PELT method with an RBF kernel
signal = df[value_col].values
algo = rpt.Pelt(model="rbf").fit(signal)  # Apply PELT method for change point detection using the Radial Basis Function (RBF) model
change_points = algo.predict(pen=6)       ### Adjust the penalty value to control the sensitivity of change point detection ###

plt.figure(figsize=(14, 8))               ### Set the figure size ###

# Plot the series itself
plt.plot(df['Month'], df[value_col], label='Variable Name', color='#FF5733', linewidth=2)  # Change the color code as per your requirement

# Replace 'min' and 'max' with the column names that hold your variable's minimum and maximum
plt.fill_between(df['Month'], df['min'], df['max'], color='lightgray', alpha=0.7, label='Range of your variable')

# Plot vertical lines at detected change points; the last entry is skipped
# (it is only used below as the end of the final segment).
# Only the first line gets a legend label.
for cp in change_points[:-1]:
    plt.axvline(df['Month'].iloc[cp], color='purple', linestyle='--', label='Change Point' if cp == change_points[0] else "")
trend_colors = ['blue', 'green', 'orange', 'purple']

# Calculate and plot linear trend lines for each segment between change points
time_values = df['Month'].values.reshape(-1, 1)
previous_cp = 0
slopes = []

# Loop through each change point to create segments
for idx, cp in enumerate(change_points):
    segment = slice(previous_cp, cp)
    model_segment = LinearRegression().fit(time_values[segment], signal[segment])  # Fit a linear regression model to the segment
    slopes.append(model_segment.coef_[0])
    trend_label = f'Trend {idx + 1}'
    plt.plot(df['Month'].values[segment], model_segment.predict(time_values[segment]), linestyle='--', color=trend_colors[idx % len(trend_colors)], label=trend_label)
    previous_cp = cp

# Print slopes of each segment
for i, slope in enumerate(slopes):
    print(f"Slope of Trend {i + 1}: {slope:.3f} mm/month")

# Collect the legend entries from everything that was plotted with a label
handles, labels = plt.gca().get_legend_handles_labels()

# Two-column legend placed above the upper-left corner of the axes
plt.legend(handles=handles, labels=labels, loc='upper left', bbox_to_anchor=(0.0, 1.02),
           frameon=False, fancybox=False, shadow=False, prop={'size': 28, 'weight': 'bold'},
           ncol=2, handlelength=2.5, handletextpad=1, columnspacing=2)

# Label x and y axes
plt.xlabel('Year', fontsize=28, fontweight='bold', color='black')
plt.ylabel('Variable Name', fontsize=28, fontweight='bold', color='black')

# X ticks at specific years; positions are converted to the month scale of the
# x-axis, and the text styling is passed in the same call that sets the ticks
specific_years = [1999, 2002, 2005, 2008, 2011, 2014, 2017, 2020, 2023]   ### Set specific years as x-ticks ###
specific_months = [year * 12 for year in specific_years]
plt.xticks(specific_months, specific_years, fontsize=28, fontweight='bold', color='black')
plt.yticks(fontsize=28, fontweight='bold', color='black')

# Set the limits for the x and y axes
plt.xlim(df['Month'].min(), df['Month'].max())
plt.ylim(0, 210)  ### Adjust the y-range to your data ###

# Enhance grid and spines for a cleaner appearance
plt.grid(True, which='both', linestyle=':', linewidth=0.5, color='gray')
for side in ('top', 'right', 'bottom', 'left'):
    plt.gca().spines[side].set_color('black')

# Save the plot as a TIFF file. 300 dpi matches the other figures in this notebook;
# at 2000 dpi a 14 x 8 inch figure is roughly 28000 x 16000 pixels.
plt.savefig('your_filename.tiff', dpi=300, bbox_inches='tight', facecolor='white')  # Replace 'your_filename.tiff' with the desired filename when saving the plot

# Show the plot
plt.show()

#### Seasonal analysis

In [None]:
import pandas as pd  
import netCDF4 as nc  
import numpy as np  

file_path = "your_file.nc"                       # Replace 'your_file.nc' with the path to your actual NetCDF file

# Read the coordinate axes and the data variable; the file is closed automatically
# when the `with` block ends.
with nc.Dataset(file_path, 'r') as dataset:
    time_var = dataset.variables['time'][:]      # Replace 'time' with the actual variable name for the time in your file
    lat_var = dataset.variables['lat'][:]        # Replace 'lat' with the actual variable name for latitude
    lon_var = dataset.variables['lon'][:]        # Replace 'lon' with the actual variable name for longitude
    data_var = dataset.variables['variable'][:]  # Replace 'variable' with the actual variable name for data

# The flattening below pairs each data value with a (time, lat, lon) triple purely by
# position, so the data array must be laid out exactly as (time, lat, lon). Without this
# check a different dimension order or an extra dimension of the same total size would
# produce a silently mislabelled table.
expected_shape = (time_var.size, lat_var.size, lon_var.size)
if data_var.shape != expected_shape:
    raise ValueError(
        f"Data variable has shape {data_var.shape}, but (time, lat, lon) = {expected_shape} "
        "is required; select/transpose the variable's dimensions before building the DataFrame."
    )

# Expand the three 1-D axes to full 3-D coordinate grids ('ij' keeps the time/lat/lon order)
time_2d, lat_2d, lon_2d = np.meshgrid(time_var, lat_var, lon_var, indexing='ij')
data_1d = data_var.flatten()

# One row per (time, lat, lon) grid point
df = pd.DataFrame({                              # Replace 'df' with the name you want to initialize for your DataFrame
    'time': time_2d.flatten(),
    'lat': lat_2d.flatten(),
    'lon': lon_2d.flatten(),
    'variable': data_1d
})
df

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Expects 'df' to be a DataFrame holding the seasonal series in a column named 'your variable'

# Text styling shared by the axis labels and the tick labels
label_style = {'fontsize': 26, 'fontweight': 'bold', 'color': 'black'}
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Create the figure and grab its axes so everything below targets the same object
plt.figure(figsize=(14, 8))
seasonal_ax = plt.gca()

# Seasonality curve: DataFrame index on x, the 'your variable' column on y
seasonal_ax.plot(df.index, df['your variable'], marker='o', label='Mean SMS', linewidth=2.5)

# Title, axis labels, ticks and legend
seasonal_ax.set_title('Your Variable Name Seasonality for One Dataset', fontsize=16, fontweight='bold')  ### Change the title as per your requirement ###
seasonal_ax.set_xlabel('Month', **label_style)
seasonal_ax.set_ylabel('Your Variable', **label_style)                                                   ### Replace 'Your Variable' with the actual variable name ###
plt.xticks(np.arange(1, 13), month_names, **label_style)
plt.yticks(**label_style)
seasonal_ax.legend(loc='upper right', bbox_to_anchor=(1, 1.0), prop={'size': 26, 'weight': 'bold'}, frameon=False)

# Dotted gray grid and black spines on every side for better visibility
seasonal_ax.grid(True, which='both', linestyle=':', linewidth=0.5, color='gray')
for side in ('top', 'right', 'bottom', 'left'):
    seasonal_ax.spines[side].set_color('black')

# Restrict the x-axis to months 1 through 12
seasonal_ax.set_xlim(1, 12)
plt.tight_layout()

# Replace 'your_filename.tiff' with the desired filename when saving the plot
plt.savefig('your_filename.tiff', dpi=200, bbox_inches='tight')
plt.show()


#### Residual error 

In [None]:
# Assuming you have a monthly averaged DataFrame (residual_df) with columns for
# P, ET, Runoff and ΔS, plus a 'month' column used for grouping below

# Water-balance residual: precipitation minus ET, runoff and ΔS

### Change the variable names below to your actual column names ###
residual_df['res'] = residual_df['precip'] - residual_df['et'] - residual_df['runoff'] - residual_df['deltaS']

# Average every numeric column per calendar month.
# numeric_only=True skips non-numeric columns, which would otherwise raise a
# TypeError in pandas >= 2.0 where they are no longer dropped automatically.
residual_season = residual_df.groupby('month').mean(numeric_only=True).reset_index()

# Display the resulting DataFrame containing monthly averages
residual_season


In [None]:
import matplotlib.pyplot as plt 
import numpy as np 

# Define the columns you want to plot (replace with your variable names)
columns_to_plot = ['precip', 'et', 'runoff', 'deltaS', 'res']

# Map each plotted column to its line color.
# The keys must match the entries of columns_to_plot exactly; the residual column is
# called 'res', so a 'residual' key here would raise KeyError inside the loop below.
colors = {
    'precip': '#1f77b4',
    'et': '#ff7f0e',
    'runoff': '#2ca02c',
    'deltaS': '#d62728',
    'res': '#9467bd'
}

# Map each plotted column to the human-readable name shown in the legend
legend_names = {
    'precip': 'Precipitation',
    'et': 'ET',
    'runoff': 'Runoff',
    'deltaS': 'ΔS',
    'res': 'Residual'
}

plt.figure(figsize=(14, 8))  ### Set the figure size ###

# Draw one line per column; the residual is emphasized with a solid, thicker line
for column in columns_to_plot:
    is_residual = column == 'res'
    plt.plot(
        residual_season['month'],
        residual_season[column],
        color=colors[column],
        linestyle='-' if is_residual else '--',  # Solid line for residual, dashed for others
        linewidth=3 if is_residual else 2,       # Thicker line for residual
        label=legend_names[column]
    )

# Customize plot axes labels and their appearance
plt.xlabel('Month', fontsize=18, fontweight='bold', color='black')
plt.ylabel('Value (mm/month)', fontsize=18, fontweight='bold', color='black')

# Set custom ticks for the X-axis to show month names
plt.xticks(
    np.arange(1, 13),
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    fontsize=18, fontweight='bold', color='black'
)

# Set font settings for Y-axis ticks
plt.yticks(fontsize=18, fontweight='bold', color='black')

# Add a legend to the plot with custom font size and position
plt.legend(
    prop={'size': 18, 'weight': 'bold'},
    loc='upper center',
    bbox_to_anchor=(0.5, 1.0),
    ncol=2  # Use two columns for the legend
)

# Add a grid with specific settings to the plot
plt.grid(True, which='both', linestyle=':', linewidth=0.5, color='gray')

# Draw all four spines in black
for side in ('top', 'right', 'bottom', 'left'):
    plt.gca().spines[side].set_color('black')

# Set the X-axis limits to ensure it covers from January (1) to December (12)
plt.xlim(1, 12)

# Tighten the layout to reduce white space
plt.tight_layout()

# Replace 'your_residual_seasonal_change_plot.tiff' with the desired filename when saving the plot
plt.savefig('your_residual_seasonal_change_plot.tiff', dpi=300, bbox_inches='tight', facecolor='white')

# Display the plot
plt.show()

### Validation with observed data

In [None]:
import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt  
import matplotlib.cm as cm  

# Define a TaylorDiagram class
class TaylorDiagram:
    """Quarter-circle Taylor diagram drawn on a matplotlib polar axes.

    The angle encodes the correlation with the reference series
    (theta = arccos(correlation)) and the radius encodes the standard deviation.
    Theta zero is placed at the top ('N') and increases clockwise, so a
    correlation of 1 lies on the vertical axis and a correlation of 0 on the
    horizontal axis.

    Parameters
    ----------
    refstd : float
        Standard deviation of the reference (observed) series.
    fig : matplotlib.figure.Figure, optional
        Figure to draw into; a new figure is created when omitted.
    rect : int, optional
        Subplot specification forwarded to ``fig.add_subplot``.
    label : str, optional
        Legend label of the reference point.
    """

    def __init__(self, refstd, fig=None, rect=111, label=''):
        self.refstd = refstd
        self.fig = fig if fig is not None else plt.figure()
        self.ax = self.fig.add_subplot(rect, polar=True)
        self.ax.set_theta_zero_location('N')
        self.ax.set_theta_direction(-1)

        # Reference point (correlation 1 -> theta 0) and the arc of constant
        # standard deviation equal to the reference value
        self.ax.plot([0], [refstd], 'ko', label=label)
        arc_theta = np.linspace(0, np.pi / 2, 50)
        ref_arc, = self.ax.plot(arc_theta, np.full_like(arc_theta, refstd), 'k--')
        ref_arc.set_dashes([2, 2])

        # Radial range up to 1.5x the reference; angular range is one quadrant
        self.ax.set_ylim(0, 1.5 * refstd)
        self.ax.set_xlim(0, np.pi / 2)

        # Angular ticks are shown as correlation values (cos(theta)), radial ticks as numbers
        self.ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{np.cos(x):.2f}'))
        self.ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.2f}'))

        # Labels for the diagram
        self.ax.set_xlabel('Standard Deviation', fontsize=14, fontweight='bold')
        self.ax.set_ylabel('Standard Deviation', fontsize=14, fontweight='bold')

        # Add correlation axis label
        self.add_correlation_axis_label()

    def add_correlation_axis_label(self):
        """Write the word 'Correlation' at 45 degrees, at 1.2x the reference radius."""
        self.ax.text(np.pi/4, self.refstd * 1.2, 'Correlation', fontsize=14, fontweight='bold', color='black', ha='center', va='center')

    def add_sample(self, stddev, corrcoef, rmse, label=None, **kwargs):
        """Plot one model as a point at (arccos(corrcoef), stddev).

        Parameters
        ----------
        stddev : float
            Standard deviation of the model series (radial coordinate).
        corrcoef : float
            Correlation with the reference; clipped to [-1, 1] before taking
            the arccosine so rounding noise cannot yield NaN.
        rmse : float
            Accepted so the output of ``compute_metrics`` can be passed
            straight through; it is not drawn.
        label : str, optional
            Legend label for the point.
        **kwargs
            Forwarded to ``Axes.plot`` (e.g. ``color``).
        """
        import warnings

        corr = np.clip(corrcoef, -1.0, 1.0)
        if np.any(corr < 0):
            # Negative correlations map to theta > pi/2, outside the drawn quadrant
            warnings.warn(
                f"Sample {label!r} has a negative correlation ({corrcoef}) and lies "
                "outside the quarter-circle diagram; it will not be visible."
            )
        theta = np.arccos(corr)
        self.ax.plot(theta, stddev, 'o', label=label, **kwargs)

        # Grow the radial axis when needed so the point is not silently cut off
        if np.max(stddev) > self.ax.get_ylim()[1]:
            self.ax.set_ylim(0, 1.05 * np.max(stddev))

    def add_grid(self):
        """Turn on the polar grid."""
        self.ax.grid(True)

    def add_legend(self):
        """Add a legend to the right of the diagram with bold, black, size-14 text."""
        legend = self.ax.legend(loc='upper right', bbox_to_anchor=(1.65, 0.95))
        for text in legend.get_texts():
            text.set_fontsize(14)
            text.set_fontweight('bold')
            text.set_color('black')

# Function to compute standard deviation, correlation coefficient, and RMSE
def compute_metrics(obs, model):
    """Return (stddev, corrcoef, rmse) of a model series against observations.

    Parameters
    ----------
    obs, model : array-like
        Observed and modelled values of identical shape. Lists, NumPy arrays
        and pandas Series are accepted; values are compared by position, so
        any pandas index is ignored.

    Returns
    -------
    tuple
        ``(stddev, corrcoef, rmse)``: population standard deviation of
        ``model``, Pearson correlation between ``obs`` and ``model``, and the
        root-mean-square error between them.

    Raises
    ------
    ValueError
        If ``obs`` and ``model`` do not have the same shape.
    """
    # Coerce to arrays so plain lists work and pandas Series are not index-aligned
    # in the subtraction below (np.corrcoef is positional, so RMSE must be too)
    obs_values = np.asanyarray(obs, dtype=float)
    model_values = np.asanyarray(model, dtype=float)
    if obs_values.shape != model_values.shape:
        raise ValueError(
            f"obs and model must have the same shape, got {obs_values.shape} and {model_values.shape}"
        )

    stddev = np.std(model_values)
    corrcoef = np.corrcoef(obs_values, model_values)[0, 1]
    rmse = np.sqrt(np.mean((obs_values - model_values) ** 2))
    return stddev, corrcoef, rmse

In [None]:
# Assuming you have a DataFrame named 'df' containing all the datasets
# that you want to use for model validation, including the observed variable

observed_column = "Observed_Variable"            ### Replace "Observed_Variable" with your observed variable name ###
# Replace df with your actual DataFrame
# Every column other than the observation and the time bookkeeping columns is treated as a model
model_columns = [col for col in df.columns if col not in [observed_column, 'datetime', 'month']]

print("Model columns:", model_columns)

# Convert all columns to numeric (use errors='coerce' to turn non-numeric entries into NaN)
df = df.apply(pd.to_numeric, errors='coerce')

# Check for NaN or Inf values in the observed data
nan_count = df[observed_column].isna().sum()
inf_count = np.isinf(df[observed_column]).sum()

print(f"NaN values in observed data: {nan_count}")
print(f"Inf values in observed data: {inf_count}")

if nan_count > 0 or inf_count > 0:
    # Treat +/-Inf as missing first, so the fill value is the mean of the finite
    # observations only (a mean taken while Inf is present is itself Inf/NaN).
    # The result is assigned back explicitly: a chained
    # df[col].fillna(..., inplace=True) does not modify df under pandas copy-on-write.
    # Only the observed column is imputed; model columns are left untouched and are
    # skipped below if their metrics are not finite.
    observed_finite = df[observed_column].replace([np.inf, -np.inf], np.nan)
    df[observed_column] = observed_finite.fillna(observed_finite.mean())

# Compute metrics for observation and each model
obs = df[observed_column].values
obs_std = np.std(obs)

print(f"Standard deviation of observed data: {obs_std}")

# Check if obs_std is a valid number
if not np.isfinite(obs_std):
    raise ValueError("Standard deviation of observed data is not finite.")

models_metrics = {}  # Dictionary to store metrics for each model
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in Matplotlib 3.9;
# max(..., 1) keeps the colormap valid even when no model column is present
colors = plt.get_cmap('tab20', max(len(model_columns), 1))

for col in model_columns:
    model_values = df[col].values
    std, corr, rmse = compute_metrics(obs, model_values)

    # Skip models with NaN values
    if np.isnan(std) or np.isnan(corr) or np.isnan(rmse):
        print(f"Skipping {col} due to NaN values")
        continue

    print(f"Processing {col} - stddev: {std}, correlation: {corr}, rmse: {rmse}")
    models_metrics[col] = (std, corr, rmse)

# Plotting Taylor diagram
fig = plt.figure(figsize=(12, 8))  # Create figure with specified size
dia = TaylorDiagram(obs_std, fig=fig, label='Observation')

# One point per retained model, colored by its position in the colormap
for idx, (model, (std, corr, rmse)) in enumerate(models_metrics.items()):
    print(f"Plotting {model} with stddev: {std}, correlation: {corr}")
    dia.add_sample(std, corr, rmse, label=model, color=colors(idx))

dia.add_grid()
dia.add_legend()
plt.show()