# work with GSI diag files   
This notebook illustrates how to work with GSI diag files using a similar `obsSpaceGSI` class.

In [None]:
%%time 

# autoload external python modules if they changed
%load_ext autoreload
%autoreload 2

import os, sys
# Locate the pyDAmonitor checkout from the environment; the DAmonitor package
# import and the sample-data paths used later both depend on it.
pyDAmonitor_ROOT = os.getenv("pyDAmonitor_ROOT")
if pyDAmonitor_ROOT is None:
    # Best effort: warn and carry on, but never put None on sys.path
    # (non-string entries are ignored by the import system).
    print("!!! pyDAmonitor_ROOT is NOT set. Run `source ush/load_pyDAmonitor.sh`")
else:
    print(f"pyDAmonitor_ROOT={pyDAmonitor_ROOT}\n")
    # Keep the root at the front of sys.path without stacking a duplicate
    # entry every time this cell is re-run.
    if sys.path[:1] != [pyDAmonitor_ROOT]:
        sys.path.insert(0, pyDAmonitor_ROOT)
    
# import modules
import warnings
import math
import numpy as np
import uxarray as ux
import xarray as xr
import pandas as pd
import seaborn as sns  # seaborn handles NaN values automatically
import matplotlib.pyplot as plt
from netCDF4 import Dataset
from DAmonitor.base import query_dataset, query_data, query_obj, to_dataframe

In [None]:
# Sample GSI conventional-temperature diag files shipped under the pyDAmonitor
# root: one from the analysis ("anl") and one from the background/guess ("ges").
diag_ana, diag_bkg = (
    f"{pyDAmonitor_ROOT}/data/gsi/diag_conv_t_anl.2024050601.nc4",
    f"{pyDAmonitor_ROOT}/data/gsi/diag_conv_t_ges.2024050601.nc4",
)

## Use `obsSpaceGSI` to read GSI diag files

In [None]:
from DAmonitor.obs import obsSpaceGSI, fit_rate

# Wrap each diag file in an obsSpaceGSI object: analysis first, then background.
tana, tbkg = map(obsSpaceGSI, (diag_ana, diag_bkg))

In [None]:
# Inspect the analysis obsSpaceGSI object with the DAmonitor `query_obj` helper.
query_obj(tana)
# The diag data itself is reachable as `tana.data`; uncomment the next line
# to inspect it with `query_data`.
# query_data(tana.data)

In [None]:
# Print the `Observation_Class` variable of the analysis diag data.
print(tana.data.Observation_Class)

## convert to Pandas DataFrame

In [None]:
# Show every column when a DataFrame is rendered. The option is global, so
# setting it once is enough.
pd.set_option("display.max_columns", None)

# Convert the analysis and background diag data to pandas DataFrames.
dfana = to_dataframe(tana.data)
dfbkg = to_dataframe(tbkg.data)

# Last expression of the cell: render the analysis frame.
dfana

## Plot histogram

In [None]:
# Histogram of `Obs_Minus_Forecast_adjusted` from the analysis diag file
# (referred to as "oman" in the labels below).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    tana.data.Obs_Minus_Forecast_adjusted,
    bins=100,
    kde=False,
    color="steelblue",
    ax=ax,
)
ax.set_title("Histogram of oman")
ax.set_xlabel("oman values")
# histplot is called with its default stat ("count"), so the bar heights are
# observation counts, not densities; label the axis accordingly.
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

## Plot fitting rate

In [None]:
## assemble 133 OMB and OMA into a dictionary
# Keep only the rows with Observation_Type 133 from the analysis (a) and
# background (b) frames.
df_133_a = dfana.loc[dfana["Observation_Type"].eq(133)]
df_133_b = dfbkg.loc[dfbkg["Observation_Type"].eq(133)]

# Gather the columns passed to `fit_rate` as plain NumPy arrays:
# OMA from the analysis frame, OMB from the background frame, and the
# height taken from the analysis frame's `Station_Elevation` column.
t133 = {
    key: frame[column].to_numpy()
    for key, frame, column in (
        ('oman', df_133_a, "Obs_Minus_Forecast_adjusted"),
        ('ombg', df_133_b, "Obs_Minus_Forecast_adjusted"),
        ('height', df_133_a, "Station_Elevation"),
    )
}

In [None]:
from matplotlib.ticker import AutoMinorLocator

# Height-bin width handed to `fit_rate`, reused for the y tick spacing
dz = 1000
grouped = fit_rate(t133, dz=dz)

# Vertical profile: fit rate along x, height bin along y
fig, ax = plt.subplots(figsize=(7, 6))
ax.plot(grouped["fit_rate"], grouped["height_bin"], marker="o", color="blue")

# Axis labels and title; x values are fractions rendered as percentages
ax.set_xlabel("Fit Rate (%)")
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x*100:.0f}%'))
ax.set_ylabel("Height Bin (m)")
ax.set_title("Vertical Profile of Fit Rate")

# Major ticks at fixed positions, automatic minor ticks in between
ax.set_xticks(np.arange(0, 0.25, 0.05))
ax.set_yticks(np.arange(0, 13000, dz))
ax.xaxis.set_minor_locator(AutoMinorLocator())
ax.yaxis.set_minor_locator(AutoMinorLocator())
ax.grid(True)

# y axis runs from 0 at the bottom to 13,000 at the top
ax.set_ylim(0, 13000)
fig.tight_layout()
plt.show()

## For reference, check diag file contents using netCDF4 directly

In [None]:
# Open the analysis diag file read-only with netCDF4 and hand it to the
# DAmonitor `query_dataset` helper.
# NOTE(review): the handle is not closed in the cells visible here, and the
# following cells reuse `dataset`; call `dataset.close()` once finished.
dataset=Dataset(diag_ana, mode='r')
query_dataset(dataset)

### check the shape/ndim of each variable in the netCDF4 dataset

In [None]:
# Print each variable name with its shape. `Variable.shape` comes from the
# file metadata, whereas slicing with `[:]` would load every variable's full
# data into memory just to measure it.
for var, nc_var in dataset.variables.items():
    print(var, nc_var.shape)  # use len(nc_var.shape) or nc_var.ndim for the rank

In [None]:
# Read the whole `Bias_Correction_Terms` variable (`[:]`) and print its values.
print(dataset.variables["Bias_Correction_Terms"][:])

### check attributes

In [None]:
# Print the attribute names attached to the `Data_Pof` variable, one per line.
for stmp in dataset.variables["Data_Pof"].ncattrs():
    print(stmp)

In [None]:
# Print the attribute names of the `Data_Pof` variable as a single list.
print(dataset.variables["Data_Pof"].ncattrs())